Add TencentSpider; add a self-defined command to run multiple spiders
This commit is contained in:
parent 2a312aa769
commit 3201d09c43
.gitignore (vendored, new file, +3)
@@ -0,0 +1,3 @@
news_spider/tt.json
news_spider/ne.json
news_spider/te.json
news_spider/news_spider/commands/__init__.py (new file, empty)
news_spider/news_spider/commands/__init__.pyc (new binary file, not shown)
news_spider/news_spider/commands/crawlall.py (new file, +39)
@@ -0,0 +1,39 @@
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError  # needed by process_options below
from scrapy.utils.conf import arglist_to_dict


class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    def run(self, args, opts):
        # Schedule every spider (or only those named on the command line),
        # then start the reactor once so they all run in the same process.
        spider_loader = self.crawler_process.spider_loader
        for spidername in args or spider_loader.list():
            print "*********crawlall spidername************" + spidername
            self.crawler_process.crawl(spidername, **opts.spargs)

        self.crawler_process.start()
news_spider/news_spider/commands/crawlall.pyc (new binary file, not shown)
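Once registered (via COMMANDS_MODULE in settings.py below, or the setup.py entry point further down), the command is invoked like any built-in one, e.g. `scrapy crawlall -a key=value -o items.json`; with no spider names given as arguments it runs every spider the project's spider loader can find.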
news_spider/news_spider/settings.py
@@ -13,6 +13,7 @@ BOT_NAME = 'news_spider'

SPIDER_MODULES = ['news_spider.spiders']
NEWSPIDER_MODULE = 'news_spider.spiders'
+COMMANDS_MODULE = 'news_spider.commands'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
Binary file not shown.
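With COMMANDS_MODULE pointing at news_spider.commands, running `scrapy -h` inside the project should now list crawlall next to the built-in commands. The layout this relies on, reconstructed from the files added above:

news_spider/
└── news_spider/
    ├── commands/
    │   ├── __init__.py
    │   └── crawlall.py      # class Command(ScrapyCommand) -> `scrapy crawlall`
    ├── settings.py          # COMMANDS_MODULE = 'news_spider.commands'
    └── spiders/             # per SPIDER_MODULES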
news_spider/news_spider/setup.py (new file, +9)
@@ -0,0 +1,9 @@
from setuptools import setup, find_packages

setup(name='scrapy-mymodule',
      entry_points={
          'scrapy.commands': [
              'crawlall=news_spider.commands:crawlall',
          ],
      },
      )
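This setup.py offers a second registration path: the scrapy.commands entry point makes crawlall available to any Scrapy project once the package is installed (for development, `pip install -e .` from the directory containing setup.py), whereas COMMANDS_MODULE only takes effect inside this project. Note that find_packages is imported but never passed to setup(), so as written the entry point only resolves when news_spider is already importable by other means.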
news_spider/news_spider/spiders/NetEase.py
@@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
    allowed_domains=['news.163.com']

    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
-    # year = ['2016','2015']
-    # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-    # day = ['31','30','29','28','27','26','25','24','23','22','21',
-    #        '20','19','18','17','16','15','14','13','12','11','10',
-    #        '09','08','07','06','05','04','03','02','01']
-    day = ['31']
-    year = ['2016']
-    month = ['03']
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+           '20','19','18','17','16','15','14','13','12','11','10',
+           '09','08','07','06','05','04','03','02','01']
+    # day = ['31']
+    # year = ['2016']
+    # month = ['03']

    def parse(self,response):
        for y in self.year:
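This change swaps the single-day debug lists for the full year/month/day lists; the (truncated) parse body walks year × month × day to enumerate snapshot pages. A minimal sketch of the same enumeration, assuming only the lists above — note that a flat day list also yields impossible dates such as 2016-02-31, whose requests will simply come back as errors:

from itertools import product

year = ['2016', '2015']
month = ['12', '11', '10', '09', '08', '07', '06', '05', '04', '03', '02', '01']
day = ['%02d' % d for d in range(31, 0, -1)]

for y, m, d in product(year, month, day):
    # build the snapshot URL from y, m, d here and yield a Request;
    # the exact URL tail is truncated in the hunk above
    pass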
(deleted file, -10; filename not shown in this view)
@@ -1,10 +0,0 @@
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from TouTiaoSpider import TouTiaoSpider
-from NetEase import NetEaseSpider
-
-process = CrawlerProcess()
-process.crawl(TouTiaoSpider)
-process.crawl(NetEaseSpider)
-process.start()
-
Binary file not shown.
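This hard-coded runner is what the new crawlall command replaces. For comparison, a generalized standalone sketch that discovers spiders instead of importing them by hand (run from the project root so get_project_settings can pick up settings.py):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
for name in process.spider_loader.list():  # same discovery crawlall uses
    process.crawl(name)                    # schedule each spider by name
process.start()                            # block until all spiders finish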
(Tencent spider module; filename not shown in this view)
@@ -5,14 +5,14 @@ import json
import time
import re

-class NetEaseSpider(scrapy.Spider):
+class TencentSpider(scrapy.Spider):

    start_urls = ['http://news.qq.com']
    name='tencent'
    allowed_domains=['news.qq.com']

-    base_url = 'http://news.qq.com/b/history/index20160419am.shtml?'
-    year = ['2016','2015']
+    base_url = 'http://news.qq.com/b/history/index'
+    year = ['2016','2015','2014']
    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
    day = ['31','30','29','28','27','26','25','24','23','22','21',
           '20','19','18','17','16','15','14','13','12','11','10',
@@ -23,7 +23,7 @@ class NetEaseSpider(scrapy.Spider):
        for y in self.year:
            for m in self.month:
                for d in self.day:
-                    for t in tp:
+                    for t in self.tp:
                        url = self.base_url+y+m+d+t+'.shtml?'
                        yield scrapy.Request(url,self.parseList)

@@ -31,14 +31,15 @@
    def parseList(self,response):
        urls = response.xpath("//a/@href").extract()
        for url in urls:
-            yield scrapy.Request(url,self.parseNews)
+            if 'http' in url:
+                yield scrapy.Request(url,self.parseNews)

    def parseNews(self,response):
        data = response.xpath("//div[@id='C-Main-Article-QQ']")
        item = NewsSpiderItem()
        time = data.xpath("//span[@class='article-time']/text()").extract()
        title = data.xpath("//div[@class='hd']//h1/text()").extract()
-        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+        content = data.xpath("//p/text()").extract()

        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
        if(len(time)!=0 and len(title)!=0 and len(content)!=0):
Binary file not shown.
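How the history list-page URLs get assembled: the old hard-coded base_url ('...index20160419am.shtml?') reveals the pattern index + YYYYMMDD + am/pm + '.shtml?', so the tp attribute (not shown in these hunks) is presumably ['am','pm']. A quick check under that assumption:

base_url = 'http://news.qq.com/b/history/index'
y, m, d, t = '2016', '04', '19', 'am'  # tp = ['am', 'pm'] is an assumption
url = base_url + y + m + d + t + '.shtml?'
assert url == 'http://news.qq.com/b/history/index20160419am.shtml?'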