Add TencentSpider, add a custom command to run multiple spiders

lzjqsdd 2016-04-22 10:13:34 +08:00
parent 2a312aa769
commit 3201d09c43
13 changed files with 67 additions and 24 deletions

.gitignore (new file)

@@ -0,0 +1,3 @@
news_spider/tt.json
news_spider/ne.json
news_spider/te.json
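(tt.json, ne.json and te.json are presumably the scraped-output files of the TouTiao, NetEase and Tencent spiders; ignoring them keeps crawl results out of version control.)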


@@ -0,0 +1,39 @@
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.conf import arglist_to_dict

class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    def run(self, args, opts):
        # Schedule every spider in the project (or only those named on the
        # command line), then start the single shared crawler process.
        spider_loader = self.crawler_process.spider_loader
        for spidername in args or spider_loader.list():
            print("*********crawlall spidername************" + spidername)
            self.crawler_process.crawl(spidername, **opts.spargs)
        self.crawler_process.start()
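Once registered (via the COMMANDS_MODULE setting below, or the setup.py entry point), the command is invoked like any built-in Scrapy command. A hypothetical session, with illustrative arguments:

scrapy crawlall                  # schedule and run every spider in the project
scrapy crawlall tencent          # run only the named spider(s)
scrapy crawlall -a key=value     # forward the same -a argument to each scheduled spider

Because every spider is queued on the same crawler_process before start() is called, all of them run concurrently in one reactor rather than one after another.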


@@ -13,6 +13,7 @@ BOT_NAME = 'news_spider'
 SPIDER_MODULES = ['news_spider.spiders']
 NEWSPIDER_MODULE = 'news_spider.spiders'
+COMMANDS_MODULE='news_spider.commands'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
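COMMANDS_MODULE points Scrapy at a package to scan for extra commands; each submodule that defines a ScrapyCommand subclass is exposed as a command named after that submodule, so the Command class above presumably lives in news_spider/commands/crawlall.py.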


@@ -0,0 +1,9 @@
from setuptools import setup, find_packages

setup(name='scrapy-mymodule',
      entry_points={
          'scrapy.commands': [
              'crawlall=news_spider.commands:crawlall',
          ],
      },
)
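The 'scrapy.commands' entry point is the distribution-friendly alternative to COMMANDS_MODULE: once this package is installed (e.g. pip install -e . during development), Scrapy discovers crawlall from any project, with no per-project setting required. Either registration mechanism alone is sufficient.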


@@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
     allowed_domains=['news.163.com']
     base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
-    # year = ['2016','2015']
-    # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-    # day = ['31','30','29','28','27','26','25','24','23','22','21',
-    #        '20','19','18','17','16','15','14','13','12','11','10',
-    #        '09','08','07','06','05','04','03','02','01']
-    day = ['31']
-    year = ['2016']
-    month = ['03']
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+           '20','19','18','17','16','15','14','13','12','11','10',
+           '09','08','07','06','05','04','03','02','01']
+    # day = ['31']
+    # year = ['2016']
+    # month = ['03']
     def parse(self,response):
         for y in self.year:
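This flips the spider from a single test date (2016-03-31) to the full 2015-2016 archive: 2 years x 12 months x 31 days = 744 candidate snapshot pages (impossible dates presumably just fail to resolve). As a rough sketch of the expansion (the exact path format inside parse() is not shown in this hunk, so the join below is an assumption):

import itertools

base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
year = ['2016', '2015']
month = ['12', '11']  # lists truncated for brevity
day = ['31', '30']

# hypothetical join; the real parse() may assemble the path differently
for y, m, d in itertools.product(year, month, day):
    print(base_url + '/' + y + '/' + m + d)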


@@ -1,10 +0,0 @@
import scrapy
from scrapy.crawler import CrawlerProcess
from TouTiaoSpider import TouTiaoSpider
from NetEase import NetEaseSpider
process = CrawlerProcess()
process.crawl(TouTiaoSpider)
process.crawl(NetEaseSpider)
process.start()
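Removing this launcher is the point of the new command: instead of importing spider classes by hand and driving a bare CrawlerProcess, crawlall goes through the project's spider loader and settings, so newly added spiders are picked up without touching any run script.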


@@ -5,14 +5,14 @@ import json
 import time
 import re
-class NetEaseSpider(scrapy.Spider):
+class TencentSpider(scrapy.Spider):
     start_urls = ['http://news.qq.com']
     name='tencent'
     allowed_domains=['news.qq.com']
-    base_url = 'http://news.qq.com/b/history/index20160419am.shtml?'
-    year = ['2016','2015']
+    base_url = 'http://news.qq.com/b/history/index'
+    year = ['2016','2015','2014']
     month = ['12','11','10','09','08','07','06','05','04','03','02','01']
     day = ['31','30','29','28','27','26','25','24','23','22','21',
            '20','19','18','17','16','15','14','13','12','11','10',
@@ -23,7 +23,7 @@ class NetEaseSpider(scrapy.Spider):
         for y in self.year:
             for m in self.month:
                 for d in self.day:
-                    for t in tp:
+                    for t in self.tp:
                         url = self.base_url+y+m+d+t+'.shtml?'
                         yield scrapy.Request(url,self.parseList)
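tp is presumably a class attribute listing the page suffixes (hence self.tp; the bare name raised a NameError at runtime). Reassembling the URL shows the refactored base_url is equivalent to the old hard-coded one:

# with y='2016', m='04', d='19' and t='am' (tp presumably holds 'am'/'pm'):
# 'http://news.qq.com/b/history/index' + '2016' + '04' + '19' + 'am' + '.shtml?'
# -> 'http://news.qq.com/b/history/index20160419am.shtml?'  (the old base_url)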
@@ -31,14 +31,15 @@ class NetEaseSpider(scrapy.Spider):
     def parseList(self,response):
         urls = response.xpath("//a/@href").extract()
         for url in urls:
-            yield scrapy.Request(url,self.parseNews)
+            if 'http' in url:
+                yield scrapy.Request(url,self.parseNews)
     def parseNews(self,response):
         data = response.xpath("//div[@id='C-Main-Article-QQ']")
         item = NewsSpiderItem()
         time = data.xpath("//span[@class='article-time']/text()").extract()
         title = data.xpath("//div[@class='hd']//h1/text()").extract()
-        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+        content = data.xpath("//p/text()").extract()
         time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
         if(len(time)!=0 and len(title)!=0 and len(content)!=0):
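For reference, time_pattern accepts timestamps of the form YYYY-MM-DD HH:MM; a minimal standalone check (a raw-string variant of the same pattern):

import re

time_pattern = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
print(bool(time_pattern.match('2016-04-22 10:13')))  # True
print(bool(time_pattern.match('no timestamp here')))  # False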