Fix the time-extraction error in the NetEase news crawler; add the extracted news data (time and title only)

This commit is contained in:
lzjqsdd 2016-04-21 22:44:36 +08:00
parent 37288e7260
commit 2a312aa769
14 changed files with 81856 additions and 82 deletions

File diff suppressed because one or more lines are too long


@@ -3,6 +3,7 @@ import scrapy
 from news_spider.items import NewsSpiderItem
 import json
 import time
+import re
 class NetEaseSpider(scrapy.Spider):
@@ -13,14 +14,18 @@ class NetEaseSpider(scrapy.Spider):
     base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
     # year = ['2016','2015']
     # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    # day = ['31','30','29','28','27','26','25','24','23','22','21',
+    #        '20','19','18','17','16','15','14','13','12','11','10',
+    #        '09','08','07','06','05','04','03','02','01']
+    day = ['31']
     year = ['2016']
     month = ['03']
     def parse(self,response):
         for y in self.year:
             for m in self.month:
-                for d in range(1,30):
-                    url = self.base_url+'/'+y+'-'+m+'/'+str(d)+'/12.html'
+                for d in self.day:
+                    url = self.base_url+'/'+y+'-'+m+'/'+d+'/12.html'
                     yield scrapy.Request(url,self.parseList)
@@ -36,8 +41,9 @@ class NetEaseSpider(scrapy.Spider):
         title = data.xpath("//h1/text()").extract()
         content = data.xpath("//div[@class='post_text']/p/text()").extract()
+        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
         if(len(time)!=0 and len(title)!=0 and len(content)!=0):
-            item['time'] = time[0][13:-5]
+            item['time'] = time_pattern.findall(time[0])[0]
             item['title'] = title[0]
             cc=''
             if(len(content)!=0):
@@ -45,4 +51,3 @@ class NetEaseSpider(scrapy.Spider):
                     cc = cc+c+'\n'
             item['content'] = cc
             yield item
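The point of the fix: the old slice time[0][13:-5] only works when the timestamp sits at a fixed offset inside the extracted string, while the regex finds a YYYY-MM-DD HH:MM stamp wherever it appears. A minimal sketch of the difference, using the sample string from news_spider/test.py further down in this commit:

import re

# Sample raw value, padded the way the snapshot pages pad it.
raw = " - -- 2015-06-15 15:34 "

# Fixed-offset slicing is fragile: it depends on the exact surrounding text,
# and here returns garbage instead of the timestamp.
print(raw[13:-5])

# The regex locates the timestamp regardless of its position.
pattern = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
print(pattern.findall(raw)[0])   # '2015-06-15 15:34'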


@@ -0,0 +1,10 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from TouTiaoSpider import TouTiaoSpider
+from NetEase import NetEaseSpider
+
+process = CrawlerProcess()
+process.crawl(TouTiaoSpider)
+process.crawl(NetEaseSpider)
+process.start()
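Running both spiders in one CrawlerProcess works, but constructed with no arguments the process ignores the project's settings.py, so item pipelines and feed exports never fire. A sketch of one way to wire them in, assuming the script lives inside the Scrapy project; the FEED_FORMAT/FEED_URI pair is the 2016-era feed-export API (newer Scrapy uses the FEEDS dict), and the output path is illustrative:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from TouTiaoSpider import TouTiaoSpider
from NetEase import NetEaseSpider

# Load settings.py so pipelines and feed exports apply.
settings = get_project_settings()
settings.set('FEED_FORMAT', 'json')   # export scraped items as JSON
settings.set('FEED_URI', 'ne.json')   # illustrative output path

process = CrawlerProcess(settings)
process.crawl(TouTiaoSpider)
process.crawl(NetEaseSpider)
process.start()  # blocks until both spiders finish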

Binary file not shown.


@@ -0,0 +1,52 @@
+#encoding=utf-8
+import scrapy
+from news_spider.items import NewsSpiderItem
+import json
+import time
+import re
+
+class NetEaseSpider(scrapy.Spider):
+    start_urls = ['http://news.qq.com']
+    name='tencent'
+    allowed_domains=['news.qq.com']
+    # History pages follow the pattern index<YYYYMMDD><am|pm>.shtml;
+    # the committed base_url embedded a full sample page
+    # ('.../index20160419am.shtml?'), which broke the concatenation below.
+    base_url = 'http://news.qq.com/b/history/index'
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+           '20','19','18','17','16','15','14','13','12','11','10',
+           '09','08','07','06','05','04','03','02','01']
+    tp = ['am','pm']
+
+    def parse(self,response):
+        for y in self.year:
+            for m in self.month:
+                for d in self.day:
+                    for t in self.tp:  # was bare `tp`, a NameError inside the method
+                        url = self.base_url+y+m+d+t+'.shtml?'
+                        yield scrapy.Request(url,self.parseList)
+
+    def parseList(self,response):
+        urls = response.xpath("//a/@href").extract()
+        for url in urls:
+            yield scrapy.Request(url,self.parseNews)
+
+    def parseNews(self,response):
+        data = response.xpath("//div[@id='C-Main-Article-QQ']")
+        item = NewsSpiderItem()
+        time = data.xpath("//span[@class='article-time']/text()").extract()
+        title = data.xpath("//div[@class='hd']//h1/text()").extract()
+        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
+        if(len(time)!=0 and len(title)!=0 and len(content)!=0):
+            item['time'] = time_pattern.findall(time[0])[0]
+            item['title'] = title[0]
+            cc=''
+            if(len(content)!=0):
+                for c in content:
+                    cc = cc+c+'\n'
+            item['content'] = cc
+            yield item
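The hard-coded year/month/day lists also request impossible dates such as 2016-02-31, wasting requests on pages that cannot exist. A hypothetical alternative sketch (the helper name and date range are illustrative, not part of the commit) that walks a real calendar instead:

from datetime import date, timedelta

def history_slugs(start, end):
    """Yield index<YYYYMMDD><am|pm>.shtml slugs for each real calendar day."""
    d = start
    while d <= end:
        for half in ('am', 'pm'):
            yield 'index%s%s.shtml' % (d.strftime('%Y%m%d'), half)
        d += timedelta(days=1)

for slug in history_slugs(date(2015, 1, 1), date(2015, 1, 3)):
    print(slug)
# index20150101am.shtml, index20150101pm.shtml, index20150102am.shtml, ...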

Binary file not shown.


@@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
     ]
     base_class_url = 'http://toutiao.com/articles_news_society'
     base_url = 'http://toutiao.com'
-    maxpage = 2;  # maximum number of pages allowed to crawl
+    maxpage = 501;  # maximum number of pages allowed to crawl
     category = ['articles_news_society','articles_news_entertainment',
                 'articles_movie','articles_news_tech','articles_digital',
                 'articels_news_sports','articles_news_finance','articles_news_military',


@@ -2,3 +2,4 @@
 #
 # Please refer to the documentation for information on how to create and manage
 # your spiders.


@@ -1,10 +1,17 @@
 #!/usr/bin/python
 import json
+import sys
+reload(sys)
+sys.setdefaultencoding( "utf-8" )
-file = open('ne.json')
+file = open(sys.argv[1])
 data = json.load(file)
+c = 0
 for article in data:
-    print "[----Time-----]\n",article['time']
-    print "[----Title----]\n",article['title']
-    print "[----Article--]\n",article['content'],"\n\n"
+    c+=1
+    print article['time'],"--------",article['title']
+    # print "[----Time-----]\n",article['time'],article['title']
+    # print "[----Title----]\n",article['title']
+    # print "[----Article--]\n",article['content'],"\n\n"
+print c
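The reload(sys)/sys.setdefaultencoding("utf-8") trick is a Python 2-only hack for printing UTF-8 text. A sketch of an equivalent viewer that also runs on Python 3, assuming the same JSON layout (a list of objects with time/title/content keys):

#!/usr/bin/python
# -*- coding: utf-8 -*-
import io
import json
import sys

# Open the file explicitly as UTF-8 instead of patching the default encoding.
with io.open(sys.argv[1], encoding='utf-8') as f:
    data = json.load(f)

for article in data:
    print(u'%s -------- %s' % (article['time'], article['title']))

print(len(data))  # article count, as the commit's `print c` does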

news_spider/test.py (new file, +11 lines)

@@ -0,0 +1,11 @@
+import re
+# Scratch test for the timestamp regex used by the spiders.
+time = " - -- 2015-06-15 15:34 "
+day = ['31','30','29','28','27','26','25','24','23','22','21',
+       '20','19','18','17','16','15','14','13','12','11','10',
+       '09','08','07','06','05','04','03','02','01']
+pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
+#pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
+#pattern = re.compile("[0-9]")
+print pattern.findall(time)[0]

news_spider/toutiao.data (new file, +81691 lines)

File diff suppressed because it is too large.