Add TencentSpider; add a self-defined command to run multiple spiders
This commit is contained in:
parent 2a312aa769
commit 3201d09c43
.gitignore (vendored, new file, +3)
@@ -0,0 +1,3 @@
news_spider/tt.json
news_spider/ne.json
news_spider/te.json
news_spider/news_spider/commands/__init__.py (new file, empty)
news_spider/news_spider/commands/__init__.pyc (new binary file, not shown)
news_spider/news_spider/commands/crawlall.py (new file, +39)
@@ -0,0 +1,39 @@
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError  # needed by process_options below
from scrapy.utils.conf import arglist_to_dict


class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    def run(self, args, opts):
        # Schedule every spider (or only those named on the command line),
        # then start the reactor once so they all run in the same process.
        spider_loader = self.crawler_process.spider_loader
        for spidername in args or spider_loader.list():
            print "*********crawlall spidername************" + spidername
            self.crawler_process.crawl(spidername, **opts.spargs)

        self.crawler_process.start()
news_spider/news_spider/commands/crawlall.pyc (new binary file, not shown)
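Once registered (via COMMANDS_MODULE in settings.py below, or the setup.py entry point further down), the command is invoked like any built-in one, e.g. `scrapy crawlall -a key=value -o items.json`; with no spider names given as arguments it runs every spider the project's spider loader can find.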
news_spider/news_spider/settings.py
@@ -13,6 +13,7 @@ BOT_NAME = 'news_spider'

SPIDER_MODULES = ['news_spider.spiders']
NEWSPIDER_MODULE = 'news_spider.spiders'
+COMMANDS_MODULE = 'news_spider.commands'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
Binary file not shown.
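With COMMANDS_MODULE pointing at news_spider.commands, running `scrapy -h` inside the project should now list crawlall next to the built-in commands. The layout this relies on, reconstructed from the files added above:

news_spider/
└── news_spider/
    ├── commands/
    │   ├── __init__.py
    │   └── crawlall.py      # class Command(ScrapyCommand) -> `scrapy crawlall`
    ├── settings.py          # COMMANDS_MODULE = 'news_spider.commands'
    └── spiders/             # per SPIDER_MODULES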
news_spider/news_spider/setup.py (new file, +9)
@@ -0,0 +1,9 @@
from setuptools import setup, find_packages

setup(name='scrapy-mymodule',
      entry_points={
          'scrapy.commands': [
              'crawlall=news_spider.commands:crawlall',
          ],
      },
      )
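This setup.py offers a second registration path: the scrapy.commands entry point makes crawlall available to any Scrapy project once the package is installed (for development, `pip install -e .` from the directory containing setup.py), whereas COMMANDS_MODULE only takes effect inside this project. Note that find_packages is imported but never passed to setup(), so as written the entry point only resolves when news_spider is already importable by other means.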
news_spider/news_spider/spiders/NetEase.py
@@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
    allowed_domains=['news.163.com']

    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
-    # year = ['2016','2015']
-    # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-    # day = ['31','30','29','28','27','26','25','24','23','22','21',
-    #        '20','19','18','17','16','15','14','13','12','11','10',
-    #        '09','08','07','06','05','04','03','02','01']
-    day = ['31']
-    year = ['2016']
-    month = ['03']
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+           '20','19','18','17','16','15','14','13','12','11','10',
+           '09','08','07','06','05','04','03','02','01']
+    # day = ['31']
+    # year = ['2016']
+    # month = ['03']

    def parse(self,response):
        for y in self.year:
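This change swaps the single-day debug lists for the full year/month/day lists; the (truncated) parse body walks year × month × day to enumerate snapshot pages. A minimal sketch of the same enumeration, assuming only the lists above — note that a flat day list also yields impossible dates such as 2016-02-31, whose requests will simply come back as errors:

from itertools import product

year = ['2016', '2015']
month = ['12', '11', '10', '09', '08', '07', '06', '05', '04', '03', '02', '01']
day = ['%02d' % d for d in range(31, 0, -1)]

for y, m, d in product(year, month, day):
    # build the snapshot URL from y, m, d here and yield a Request;
    # the exact URL tail is truncated in the hunk above
    pass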
(deleted file, -10; filename not shown in this view)
@@ -1,10 +0,0 @@
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from TouTiaoSpider import TouTiaoSpider
-from NetEase import NetEaseSpider
-
-process = CrawlerProcess()
-process.crawl(TouTiaoSpider)
-process.crawl(NetEaseSpider)
-process.start()
-
Binary file not shown.
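This hard-coded runner is what the new crawlall command replaces. For comparison, a generalized standalone sketch that discovers spiders instead of importing them by hand (run from the project root so get_project_settings can pick up settings.py):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
for name in process.spider_loader.list():  # same discovery crawlall uses
    process.crawl(name)                    # schedule each spider by name
process.start()                            # block until all spiders finish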
(Tencent spider module; filename not shown in this view)
@@ -5,14 +5,14 @@ import json
import time
import re

-class NetEaseSpider(scrapy.Spider):
+class TencentSpider(scrapy.Spider):

    start_urls = ['http://news.qq.com']
    name='tencent'
    allowed_domains=['news.qq.com']

-    base_url = 'http://news.qq.com/b/history/index20160419am.shtml?'
-    year = ['2016','2015']
+    base_url = 'http://news.qq.com/b/history/index'
+    year = ['2016','2015','2014']
    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
    day = ['31','30','29','28','27','26','25','24','23','22','21',
           '20','19','18','17','16','15','14','13','12','11','10',
@@ -23,7 +23,7 @@ class NetEaseSpider(scrapy.Spider):
        for y in self.year:
            for m in self.month:
                for d in self.day:
-                    for t in tp:
+                    for t in self.tp:
                        url = self.base_url+y+m+d+t+'.shtml?'
                        yield scrapy.Request(url,self.parseList)

@@ -31,14 +31,15 @@
    def parseList(self,response):
        urls = response.xpath("//a/@href").extract()
        for url in urls:
-            yield scrapy.Request(url,self.parseNews)
+            if 'http' in url:
+                yield scrapy.Request(url,self.parseNews)

    def parseNews(self,response):
        data = response.xpath("//div[@id='C-Main-Article-QQ']")
        item = NewsSpiderItem()
        time = data.xpath("//span[@class='article-time']/text()").extract()
        title = data.xpath("//div[@class='hd']//h1/text()").extract()
-        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+        content = data.xpath("//p/text()").extract()

        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
        if(len(time)!=0 and len(title)!=0 and len(content)!=0):
Binary file not shown.
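How the history list-page URLs get assembled: the old hard-coded base_url ('...index20160419am.shtml?') reveals the pattern index + YYYYMMDD + am/pm + '.shtml?', so the tp attribute (not shown in these hunks) is presumably ['am','pm']. A quick check under that assumption:

base_url = 'http://news.qq.com/b/history/index'
y, m, d, t = '2016', '04', '19', 'am'  # tp = ['am', 'pm'] is an assumption
url = base_url + y + m + d + t + '.shtml?'
assert url == 'http://news.qq.com/b/history/index20160419am.shtml?'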