Fix the time-extraction error in NetEase news scraping; extend the extracted news data (time and title only)
parent 37288e7260
commit 2a312aa769
File diff suppressed because one or more lines are too long
news_spider/news_spider/spiders/NetEase.py
@@ -3,6 +3,7 @@ import scrapy
 from news_spider.items import NewsSpiderItem
 import json
 import time
+import re
 
 class NetEaseSpider(scrapy.Spider):
 
@@ -13,14 +14,18 @@ class NetEaseSpider(scrapy.Spider):
     base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
-    year = ['2016','2015']
-    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-    day = ['31','30','29','28','27','26','25','24','23','22','21',
-           '20','19','18','17','16','15','14','13','12','11','10',
-           '09','08','07','06','05','04','03','02','01']
+    # year = ['2016','2015']
+    # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    # day = ['31','30','29','28','27','26','25','24','23','22','21',
+    #        '20','19','18','17','16','15','14','13','12','11','10',
+    #        '09','08','07','06','05','04','03','02','01']
+    day = ['31']
+    year = ['2016']
+    month = ['03']
 
     def parse(self,response):
         for y in self.year:
             for m in self.month:
-                for d in range(1,30):
-                    url = self.base_url+'/'+y+'-'+m+'/'+str(d)+'/12.html'
+                for d in self.day:
+                    url = self.base_url+'/'+y+'-'+m+'/'+d+'/12.html'
                     yield scrapy.Request(url,self.parseList)
 
@@ -36,8 +41,9 @@ class NetEaseSpider(scrapy.Spider):
         title = data.xpath("//h1/text()").extract()
         content = data.xpath("//div[@class='post_text']/p/text()").extract()
 
+        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
         if(len(time)!=0 and len(title)!=0 and len(content)!=0):
-            item['time'] = time[0][13:-5]
+            item['time'] = time_pattern.findall(time[0])[0]
             item['title'] = title[0]
             cc=''
             if(len(content)!=0):
@@ -45,4 +51,3 @@ class NetEaseSpider(scrapy.Spider):
                     cc = cc+c+'\n'
             item['content'] = cc
             yield item
-
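The substance of the fix: the old code pulled the timestamp out with fixed slice offsets (time[0][13:-5]), which silently breaks whenever the text around the timestamp changes, while the new code matches the timestamp pattern wherever it appears. A minimal sketch of the difference, reusing the sample string from test.py further down (the slice offsets here are illustrative, not the exact snapshot markup):

    import re

    raw = " - -- 2015-06-15 15:34 "  # sample raw xpath extract, as in test.py

    # Old approach: fixed offsets; breaks when the surrounding text shifts.
    # print raw[13:-5]

    # New approach: find the timestamp wherever it sits in the string.
    time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
    print time_pattern.findall(raw)[0]  # -> 2015-06-15 15:34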
Binary file not shown.
10 news_spider/news_spider/spiders/SpiderAll.py Normal file
@@ -0,0 +1,10 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from TouTiaoSpider import TouTiaoSpider
+from NetEase import NetEaseSpider
+
+process = CrawlerProcess()
+process.crawl(TouTiaoSpider)
+process.crawl(NetEaseSpider)
+process.start()
+
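One caveat with the runner above: a bare CrawlerProcess() uses Scrapy's built-in defaults, so anything configured in the project's settings.py (pipelines, feed exports, throttling) is ignored. A variant sketch that picks the project settings up, assuming the script is run from inside the Scrapy project; get_project_settings is the stock Scrapy helper for this:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from TouTiaoSpider import TouTiaoSpider
    from NetEase import NetEaseSpider

    # Load news_spider/settings.py instead of Scrapy's built-in defaults.
    process = CrawlerProcess(get_project_settings())
    process.crawl(TouTiaoSpider)
    process.crawl(NetEaseSpider)
    process.start()  # blocks until both spiders have finished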
BIN news_spider/news_spider/spiders/SpiderAll.pyc Normal file
Binary file not shown.
52 news_spider/news_spider/spiders/Tencent.py Normal file
@@ -0,0 +1,52 @@
+#encoding=utf-8
+import scrapy
+from news_spider.items import NewsSpiderItem
+import json
+import time
+import re
+
+class TencentSpider(scrapy.Spider):
+
+    start_urls = ['http://news.qq.com']
+    name='tencent'
+    allowed_domains=['news.qq.com']
+
+    base_url = 'http://news.qq.com/b/history/index'
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+           '20','19','18','17','16','15','14','13','12','11','10',
+           '09','08','07','06','05','04','03','02','01']
+    tp = ['am','pm']
+
+    def parse(self,response):
+        for y in self.year:
+            for m in self.month:
+                for d in self.day:
+                    for t in self.tp:
+                        url = self.base_url+y+m+d+t+'.shtml'
+                        yield scrapy.Request(url,self.parseList)
+
+
+    def parseList(self,response):
+        urls = response.xpath("//a/@href").extract()
+        for url in urls:
+            yield scrapy.Request(url,self.parseNews)
+
+    def parseNews(self,response):
+        data = response.xpath("//div[@id='C-Main-Article-QQ']")
+        item = NewsSpiderItem()
+        time = data.xpath("//span[@class='article-time']/text()").extract()
+        title = data.xpath("//div[@class='hd']//h1/text()").extract()
+        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+
+        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
+        if(len(time)!=0 and len(title)!=0 and len(content)!=0):
+            item['time'] = time_pattern.findall(time[0])[0]
+            item['title'] = title[0]
+            cc=''
+            if(len(content)!=0):
+                for c in content:
+                    cc = cc+c+'\n'
+            item['content'] = cc
+            yield item
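Because year, month, and day are combined blindly, the loop above also requests impossible dates such as 20160231. A small sketch of how the combinations could be validated first, using the standard-library datetime (this filter is an illustration, not part of the commit):

    from datetime import datetime

    def valid_dates(years, months, days):
        # Yield only (y, m, d) combinations that form real calendar dates.
        for y in years:
            for m in months:
                for d in days:
                    try:
                        datetime.strptime(y + m + d, '%Y%m%d')
                    except ValueError:
                        continue  # e.g. 20160231 is skipped
                    yield y, m, d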
BIN news_spider/news_spider/spiders/Tencent.pyc Normal file
Binary file not shown.
@@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
     ]
     base_class_url = 'http://toutiao.com/articles_news_society'
     base_url = 'http://toutiao.com'
-    maxpage = 2;# maximum number of pages allowed to crawl
+    maxpage = 501;# maximum number of pages allowed to crawl
     category = ['articles_news_society','articles_news_entertainment',
         'articles_movie','articles_news_tech','articles_digital',
         'articels_news_sports','articles_news_finance','articles_news_military',
Binary file not shown.
@@ -2,3 +2,4 @@
 #
 # Please refer to the documentation for information on how to create and manage
 # your spiders.
+
Binary file not shown.
@@ -1,10 +1,17 @@
 #!/usr/bin/python
 import json
+import sys
+reload(sys)
+sys.setdefaultencoding( "utf-8" )
 
-file = open('ne.json')
+file = open(sys.argv[1])
 data = json.load(file)
 
+c = 0
 for article in data:
-    print "[----Time-----]\n",article['time']
-    print "[----Title----]\n",article['title']
-    print "[----Article--]\n",article['content'],"\n\n"
+    c+=1
+    print article['time'],"--------",article['title']
+#    print "[----Time-----]\n",article['time'],article['title']
+#    print "[----Title----]\n",article['title']
+#    print "[----Article--]\n",article['content'],"\n\n"
+print c
11 news_spider/test.py Normal file
@@ -0,0 +1,11 @@
+import re
+
+time = " - -- 2015-06-15 15:34 "
+
+day = ['31','30','29','28','27','26','25','24','23','22','21',
+       '20','19','18','17','16','15','14','13','12','11','10',
+       '09','08','07','06','05','04','03','02','01']
+pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
+#pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
+#pattern = re.compile("[0-9]")
+print pattern.findall(time)[0]
81691 news_spider/toutiao.data Normal file
File diff suppressed because it is too large