Add TencentSpider; add a self-defined command to run multiple spiders

lzjqsdd 2016-04-22 10:13:34 +08:00
parent 2a312aa769
commit 3201d09c43
13 changed files with 67 additions and 24 deletions

.gitignore (vendored, new file)
@@ -0,0 +1,3 @@
news_spider/tt.json
news_spider/ne.json
news_spider/te.json
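These are presumably the -o output dumps of the three spiders (tt = Toutiao, ne = NetEase, te = Tencent), kept out of version control.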

news_spider/commands/crawlall.py (new file; path inferred from the entry point registered in setup.py below)
@@ -0,0 +1,39 @@
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.conf import arglist_to_dict


class Command(ScrapyCommand):
    """Custom 'crawlall' command: run every spider in the project."""

    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            # Turn the repeated -a NAME=VALUE options into a dict of spider kwargs.
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    def run(self, args, opts):
        spider_loader = self.crawler_process.spider_loader
        # Crawl the spiders named on the command line, or every registered spider.
        for spidername in args or spider_loader.list():
            print("********* crawlall spidername ************ " + spidername)
            self.crawler_process.crawl(spidername, **opts.spargs)
        # Start all scheduled crawls in one reactor and block until they finish.
        self.crawler_process.start()
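Once registered (see the settings and setup.py changes below), "scrapy crawlall" starts every spider in a single reactor, and "scrapy crawlall -a NAME=VALUE" forwards the same argument to each spider. For comparison, here is a minimal standalone sketch of the same behavior using Scrapy's public CrawlerProcess API (the script name run_all.py is an assumption); it is close to what the deleted run script further below did, except that spiders are discovered by name instead of imported as classes:

# run_all.py -- hypothetical standalone equivalent of the crawlall command
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
for name in process.spider_loader.list():  # every spider under SPIDER_MODULES
    process.crawl(name)
process.start()  # blocks until all crawls finish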

news_spider/settings.py
@@ -13,6 +13,7 @@ BOT_NAME = 'news_spider'
 SPIDER_MODULES = ['news_spider.spiders']
 NEWSPIDER_MODULE = 'news_spider.spiders'
+COMMANDS_MODULE='news_spider.commands'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
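COMMANDS_MODULE is what makes Scrapy search the project itself for extra commands. A sketch of the layout this assumes (the commands package needs an __init__.py to be importable, and the module name doubles as the command name):

news_spider/
    commands/
        __init__.py    # assumed present; required for the import to work
        crawlall.py    # the Command subclass above -> 'scrapy crawlall'
    settings.py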

setup.py (new file)
@@ -0,0 +1,9 @@
from setuptools import setup, find_packages

setup(
    name='scrapy-mymodule',
    packages=find_packages(),  # include the project packages in the install
    entry_points={
        'scrapy.commands': [
            'crawlall=news_spider.commands:crawlall',
        ],
    },
)
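Registering the command under the scrapy.commands entry point is the redistributable alternative to COMMANDS_MODULE: after pip install (or python setup.py develop), crawlall is available in any Scrapy project on the machine, whereas COMMANDS_MODULE only covers this project. Either mechanism alone would suffice here.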

news_spider/spiders/NetEase.py (restores the full date lists that were commented out for testing)
@@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
     allowed_domains=['news.163.com']
     base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
-    # year = ['2016','2015']
-    # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-    # day = ['31','30','29','28','27','26','25','24','23','22','21',
-    #       '20','19','18','17','16','15','14','13','12','11','10',
-    #       '09','08','07','06','05','04','03','02','01']
-    day = ['31']
-    year = ['2016']
-    month = ['03']
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+          '20','19','18','17','16','15','14','13','12','11','10',
+          '09','08','07','06','05','04','03','02','01']
+    # day = ['31']
+    # year = ['2016']
+    # month = ['03']
     def parse(self,response):
         for y in self.year:

(deleted) standalone run script, superseded by the crawlall command
@@ -1,10 +0,0 @@
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from TouTiaoSpider import TouTiaoSpider
-from NetEase import NetEaseSpider
-
-process = CrawlerProcess()
-process.crawl(TouTiaoSpider)
-process.crawl(NetEaseSpider)
-process.start()

Tencent spider (under news_spider/spiders): class renamed from NetEaseSpider, history-URL generation generalized, link and content extraction fixed
@@ -5,14 +5,14 @@ import json
 import time
 import re
-class NetEaseSpider(scrapy.Spider):
+class TencentSpider(scrapy.Spider):
     start_urls = ['http://news.qq.com']
     name='tencent'
     allowed_domains=['news.qq.com']
-    base_url = 'http://news.qq.com/b/history/index20160419am.shtml?'
-    year = ['2016','2015']
+    base_url = 'http://news.qq.com/b/history/index'
+    year = ['2016','2015','2014']
     month = ['12','11','10','09','08','07','06','05','04','03','02','01']
     day = ['31','30','29','28','27','26','25','24','23','22','21',
           '20','19','18','17','16','15','14','13','12','11','10',
@@ -23,7 +23,7 @@ class NetEaseSpider(scrapy.Spider):
         for y in self.year:
             for m in self.month:
                 for d in self.day:
-                    for t in tp:
+                    for t in self.tp:
                         url = self.base_url+y+m+d+t+'.shtml?'
                         yield scrapy.Request(url,self.parseList)
@@ -31,14 +31,15 @@ class NetEaseSpider(scrapy.Spider):
     def parseList(self,response):
         urls = response.xpath("//a/@href").extract()
         for url in urls:
-            yield scrapy.Request(url,self.parseNews)
+            if 'http' in url:
+                yield scrapy.Request(url,self.parseNews)
     def parseNews(self,response):
         data = response.xpath("//div[@id='C-Main-Article-QQ']")
         item = NewsSpiderItem()
         time = data.xpath("//span[@class='article-time']/text()").extract()
         title = data.xpath("//div[@class='hd']//h1/text()").extract()
-        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+        content = data.xpath("//p/text()").extract()
         time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
         if(len(time)!=0 and len(title)!=0 and len(content)!=0):
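To make the URL refactor concrete, here is a sketch of what the spider now assembles from its date lists; tp itself is not visible in this hunk, so ['am','pm'] is an assumption inferred from the old hard-coded 'index20160419am.shtml?' URL:

# Hypothetical reconstruction of TencentSpider's URL building
base_url = 'http://news.qq.com/b/history/index'
tp = ['am', 'pm']  # assumed: morning/afternoon history pages
y, m, d, t = '2016', '04', '19', 'am'
url = base_url + y + m + d + t + '.shtml?'
# -> 'http://news.qq.com/b/history/index20160419am.shtml?'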