Add TencentSpider; add a self-defined command to run multiple spiders
This commit is contained in:
parent 2a312aa769
commit 3201d09c43
.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
+news_spider/tt.json
+news_spider/ne.json
+news_spider/te.json
news_spider/news_spider/commands/__init__.py (new file, 0 lines)
news_spider/news_spider/commands/__init__.pyc (new binary file, not shown)
news_spider/news_spider/commands/crawlall.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+from scrapy.commands import ScrapyCommand
+from scrapy.exceptions import UsageError
+from scrapy.utils.conf import arglist_to_dict
+
+class Command(ScrapyCommand):
+
+    requires_project = True
+
+    def syntax(self):
+        return '[options]'
+
+    def short_desc(self):
+        return 'Runs all of the spiders'
+
+    def add_options(self, parser):
+        ScrapyCommand.add_options(self, parser)
+        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
+                          help="set spider argument (may be repeated)")
+        parser.add_option("-o", "--output", metavar="FILE",
+                          help="dump scraped items into FILE (use - for stdout)")
+        parser.add_option("-t", "--output-format", metavar="FORMAT",
+                          help="format to use for dumping items with -o")
+
+    def process_options(self, args, opts):
+        ScrapyCommand.process_options(self, args, opts)
+        try:
+            opts.spargs = arglist_to_dict(opts.spargs)
+        except ValueError:
+            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
+
+    def run(self, args, opts):
+        # settings = get_project_settings()
+
+        spider_loader = self.crawler_process.spider_loader
+        for spidername in args or spider_loader.list():
+            print "*********crawlall spidername************" + spidername
+            self.crawler_process.crawl(spidername, **opts.spargs)
+
+        self.crawler_process.start()
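For reference, run() above only schedules each spider with crawler_process.crawl() and then fires a single start(), so all spiders share one crawl loop instead of one process each. A rough standalone equivalent, sketched here for illustration (not part of this commit, and assuming default project settings):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Sketch: schedule every spider registered in the project, then block
# until all of them have finished.
process = CrawlerProcess(get_project_settings())
for name in process.spider_loader.list():  # same listing the command uses
    process.crawl(name)                    # crawl() only schedules a spider
process.start()                            # one call runs all scheduled crawls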
news_spider/news_spider/commands/crawlall.pyc (new binary file, not shown)
news_spider/news_spider/settings.py (modified)
@@ -13,6 +13,7 @@ BOT_NAME = 'news_spider'
 
 SPIDER_MODULES = ['news_spider.spiders']
 NEWSPIDER_MODULE = 'news_spider.spiders'
+COMMANDS_MODULE='news_spider.commands'
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
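COMMANDS_MODULE is what makes the new command discoverable: Scrapy looks in the named package for command modules, so the crawlall.py module above can be invoked as "scrapy crawlall" from the project directory.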
Binary file not shown.
news_spider/news_spider/setup.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+from setuptools import setup, find_packages
+
+setup(name='scrapy-mymodule',
+      entry_points={
+          'scrapy.commands': [
+              'crawlall=news_spider.commands:crawlall',
+          ],
+      },
+      )
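This setup.py appears to be adapted from the custom-commands example in the Scrapy docs (the package name 'scrapy-mymodule' is left unchanged from it). It offers a second registration route: COMMANDS_MODULE already exposes crawlall inside this project, while installing this package (for instance with "pip install -e .") would register the command for any project.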
news_spider/news_spider/spiders/NetEase.py (modified)
@@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
     allowed_domains=['news.163.com']
 
     base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
-    # year = ['2016','2015']
-    # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-    # day = ['31','30','29','28','27','26','25','24','23','22','21',
-    #       '20','19','18','17','16','15','14','13','12','11','10',
-    #       '09','08','07','06','05','04','03','02','01']
-    day = ['31']
-    year = ['2016']
-    month = ['03']
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+          '20','19','18','17','16','15','14','13','12','11','10',
+          '09','08','07','06','05','04','03','02','01']
+    # day = ['31']
+    # year = ['2016']
+    # month = ['03']
 
     def parse(self,response):
         for y in self.year:
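The single-day values that are now commented out look like leftover debug settings; with the full ranges restored, NetEaseSpider walks the entire 2015-2016 snapshot archive rather than a single day.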
Deleted file (10 lines): the old standalone runner, superseded by the crawlall command
@@ -1,10 +0,0 @@
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from TouTiaoSpider import TouTiaoSpider
-from NetEase import NetEaseSpider
-
-process = CrawlerProcess()
-process.crawl(TouTiaoSpider)
-process.crawl(NetEaseSpider)
-process.start()
-
Binary file not shown.
Tencent spider (modified)
@@ -5,14 +5,14 @@ import json
 import time
 import re
 
-class NetEaseSpider(scrapy.Spider):
+class TencentSpider(scrapy.Spider):
 
     start_urls = ['http://news.qq.com']
     name='tencent'
     allowed_domains=['news.qq.com']
 
-    base_url = 'http://news.qq.com/b/history/index20160419am.shtml?'
-    year = ['2016','2015']
+    base_url = 'http://news.qq.com/b/history/index'
+    year = ['2016','2015','2014']
     month = ['12','11','10','09','08','07','06','05','04','03','02','01']
     day = ['31','30','29','28','27','26','25','24','23','22','21',
           '20','19','18','17','16','15','14','13','12','11','10',
@@ -23,7 +23,7 @@ class NetEaseSpider(scrapy.Spider):
         for y in self.year:
             for m in self.month:
                 for d in self.day:
-                    for t in tp:
+                    for t in self.tp:
                         url = self.base_url+y+m+d+t+'.shtml?'
                         yield scrapy.Request(url,self.parseList)
 
@@ -31,14 +31,15 @@ class NetEaseSpider(scrapy.Spider):
     def parseList(self,response):
         urls = response.xpath("//a/@href").extract()
         for url in urls:
-            yield scrapy.Request(url,self.parseNews)
+            if 'http' in url:
+                yield scrapy.Request(url,self.parseNews)
 
     def parseNews(self,response):
         data = response.xpath("//div[@id='C-Main-Article-QQ']")
         item = NewsSpiderItem()
         time = data.xpath("//span[@class='article-time']/text()").extract()
         title = data.xpath("//div[@class='hd']//h1/text()").extract()
-        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+        content = data.xpath("//p/text()").extract()
 
         time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
         if(len(time)!=0 and len(title)!=0 and len(content)!=0):
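The new 'http' substring check keeps parseList from yielding requests for relative or javascript: links on the history pages, at the cost of skipping relative links entirely. A sketch of an alternative (an assumption for illustration, not what this commit does) would resolve such links against the page URL instead; Python 2 is assumed, matching the print syntax used elsewhere in the project:

import scrapy
from urlparse import urljoin  # Python 2 stdlib; urllib.parse in Python 3

def parseList(self, response):
    # Sketch only: resolve each href against the current page URL rather
    # than dropping non-absolute links.
    for url in response.xpath("//a/@href").extract():
        yield scrapy.Request(urljoin(response.url, url), self.parseNews)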
Binary file not shown.