Fix the time-extraction error in NetEase news scraping; extend the extracted news data (time and title only)
parent 37288e7260
commit 2a312aa769
File diff suppressed because one or more lines are too long
news_spider/news_spider/spiders/NetEase.py
@@ -3,6 +3,7 @@ import scrapy
 from news_spider.items import NewsSpiderItem
 import json
 import time
+import re
 
 class NetEaseSpider(scrapy.Spider):
 
@@ -13,14 +14,18 @@ class NetEaseSpider(scrapy.Spider):
     base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
-    year = ['2016','2015']
-    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-    day = ['31','30','29','28','27','26','25','24','23','22','21',
-           '20','19','18','17','16','15','14','13','12','11','10',
-           '09','08','07','06','05','04','03','02','01']
+    # year = ['2016','2015']
+    # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    # day = ['31','30','29','28','27','26','25','24','23','22','21',
+    #        '20','19','18','17','16','15','14','13','12','11','10',
+    #        '09','08','07','06','05','04','03','02','01']
+    day = ['31']
+    year = ['2016']
+    month = ['03']
 
     def parse(self,response):
         for y in self.year:
             for m in self.month:
-                for d in range(1,30):
-                    url = self.base_url+'/'+y+'-'+m+'/'+str(d)+'/12.html'
+                for d in self.day:
+                    url = self.base_url+'/'+y+'-'+m+'/'+d+'/12.html'
                     yield scrapy.Request(url,self.parseList)
 
@@ -36,8 +41,9 @@ class NetEaseSpider(scrapy.Spider):
         title = data.xpath("//h1/text()").extract()
         content = data.xpath("//div[@class='post_text']/p/text()").extract()
 
+        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
         if(len(time)!=0 and len(title)!=0 and len(content)!=0):
-            item['time'] = time[0][13:-5]
+            item['time'] = time_pattern.findall(time[0])[0]
             item['title'] = title[0]
             cc=''
             if(len(content)!=0):
@@ -45,4 +51,3 @@ class NetEaseSpider(scrapy.Spider):
                     cc = cc+c+'\n'
             item['content'] = cc
             yield item
-
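The substance of the fix: the old code pulled the timestamp out with fixed slice offsets (time[0][13:-5]), which silently breaks whenever the text around the timestamp changes, while the new code matches the timestamp pattern wherever it appears. A minimal sketch of the difference, reusing the sample string from test.py further down (the slice offsets here are illustrative, not the exact snapshot markup):

    import re

    raw = " - -- 2015-06-15 15:34 "  # sample raw xpath extract, as in test.py

    # Old approach: fixed offsets; breaks when the surrounding text shifts.
    # print raw[13:-5]

    # New approach: find the timestamp wherever it sits in the string.
    time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
    print time_pattern.findall(raw)[0]  # -> 2015-06-15 15:34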
Binary file not shown.
10 news_spider/news_spider/spiders/SpiderAll.py Normal file
@@ -0,0 +1,10 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from TouTiaoSpider import TouTiaoSpider
+from NetEase import NetEaseSpider
+
+process = CrawlerProcess()
+process.crawl(TouTiaoSpider)
+process.crawl(NetEaseSpider)
+process.start()
+
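One caveat with the runner above: a bare CrawlerProcess() uses Scrapy's built-in defaults, so anything configured in the project's settings.py (pipelines, feed exports, throttling) is ignored. A variant sketch that picks the project settings up, assuming the script is run from inside the Scrapy project; get_project_settings is the stock Scrapy helper for this:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from TouTiaoSpider import TouTiaoSpider
    from NetEase import NetEaseSpider

    # Load news_spider/settings.py instead of Scrapy's built-in defaults.
    process = CrawlerProcess(get_project_settings())
    process.crawl(TouTiaoSpider)
    process.crawl(NetEaseSpider)
    process.start()  # blocks until both spiders have finished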
BIN news_spider/news_spider/spiders/SpiderAll.pyc Normal file
Binary file not shown.
52 news_spider/news_spider/spiders/Tencent.py Normal file
@@ -0,0 +1,52 @@
+#encoding=utf-8
+import scrapy
+from news_spider.items import NewsSpiderItem
+import json
+import time
+import re
+
+class TencentSpider(scrapy.Spider):
+
+    start_urls = ['http://news.qq.com']
+    name='tencent'
+    allowed_domains=['news.qq.com']
+
+    base_url = 'http://news.qq.com/b/history/index'
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+           '20','19','18','17','16','15','14','13','12','11','10',
+           '09','08','07','06','05','04','03','02','01']
+    tp = ['am','pm']
+
+    def parse(self,response):
+        for y in self.year:
+            for m in self.month:
+                for d in self.day:
+                    for t in self.tp:
+                        url = self.base_url+y+m+d+t+'.shtml'
+                        yield scrapy.Request(url,self.parseList)
+
+
+    def parseList(self,response):
+        urls = response.xpath("//a/@href").extract()
+        for url in urls:
+            yield scrapy.Request(url,self.parseNews)
+
+    def parseNews(self,response):
+        data = response.xpath("//div[@id='C-Main-Article-QQ']")
+        item = NewsSpiderItem()
+        time = data.xpath("//span[@class='article-time']/text()").extract()
+        title = data.xpath("//div[@class='hd']//h1/text()").extract()
+        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+
+        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
+        if(len(time)!=0 and len(title)!=0 and len(content)!=0):
+            item['time'] = time_pattern.findall(time[0])[0]
+            item['title'] = title[0]
+            cc=''
+            if(len(content)!=0):
+                for c in content:
+                    cc = cc+c+'\n'
+            item['content'] = cc
+            yield item
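Because year, month, and day are combined blindly, the loop above also requests impossible dates such as 20160231. A small sketch of how the combinations could be validated first, using the standard-library datetime (this filter is an illustration, not part of the commit):

    from datetime import datetime

    def valid_dates(years, months, days):
        # Yield only (y, m, d) combinations that form real calendar dates.
        for y in years:
            for m in months:
                for d in days:
                    try:
                        datetime.strptime(y + m + d, '%Y%m%d')
                    except ValueError:
                        continue  # e.g. 20160231 is skipped
                    yield y, m, d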
BIN news_spider/news_spider/spiders/Tencent.pyc Normal file
Binary file not shown.
@@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
     ]
     base_class_url = 'http://toutiao.com/articles_news_society'
     base_url = 'http://toutiao.com'
-    maxpage = 2;# maximum number of pages allowed to crawl
+    maxpage = 501;# maximum number of pages allowed to crawl
     category = ['articles_news_society','articles_news_entertainment',
         'articles_movie','articles_news_tech','articles_digital',
         'articels_news_sports','articles_news_finance','articles_news_military',
Binary file not shown.
@@ -2,3 +2,4 @@
 #
 # Please refer to the documentation for information on how to create and manage
 # your spiders.
+
Binary file not shown.
@@ -1,10 +1,17 @@
 #!/usr/bin/python
 import json
+import sys
+reload(sys)
+sys.setdefaultencoding( "utf-8" )
 
-file = open('ne.json')
+file = open(sys.argv[1])
 data = json.load(file)
 
+c = 0
 for article in data:
-    print "[----Time-----]\n",article['time']
-    print "[----Title----]\n",article['title']
-    print "[----Article--]\n",article['content'],"\n\n"
+    c+=1
+    print article['time'],"--------",article['title']
+#    print "[----Time-----]\n",article['time'],article['title']
+#    print "[----Title----]\n",article['title']
+#    print "[----Article--]\n",article['content'],"\n\n"
+print c
11 news_spider/test.py Normal file
@@ -0,0 +1,11 @@
+import re
+
+time = " - -- 2015-06-15 15:34 "
+
+day = ['31','30','29','28','27','26','25','24','23','22','21',
+       '20','19','18','17','16','15','14','13','12','11','10',
+       '09','08','07','06','05','04','03','02','01']
+pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
+#pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
+#pattern = re.compile("[0-9]")
+print pattern.findall(time)[0]
81691 news_spider/toutiao.data Normal file
File diff suppressed because it is too large