Fix the publish-time extraction bug in the NetEase news spider; add extracted news data (time and title only)
This commit is contained in:
parent 37288e7260
commit 2a312aa769
File diff suppressed because one or more lines are too long
@@ -3,6 +3,7 @@ import scrapy
 from news_spider.items import NewsSpiderItem
 import json
 import time
+import re
 
 class NetEaseSpider(scrapy.Spider):
 
@@ -13,14 +14,18 @@ class NetEaseSpider(scrapy.Spider):
     base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
     # year = ['2016','2015']
     # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    # day = ['31','30','29','28','27','26','25','24','23','22','21',
+    #        '20','19','18','17','16','15','14','13','12','11','10',
+    #        '09','08','07','06','05','04','03','02','01']
+    day = ['31']
     year = ['2016']
     month = ['03']
 
     def parse(self,response):
         for y in self.year:
             for m in self.month:
-                for d in range(1,30):
-                    url = self.base_url+'/'+y+'-'+m+'/'+str(d)+'/12.html'
+                for d in self.day:
+                    url = self.base_url+'/'+y+'-'+m+'/'+d+'/12.html'
                     yield scrapy.Request(url,self.parseList)
 
 
@@ -36,8 +41,9 @@ class NetEaseSpider(scrapy.Spider):
         title = data.xpath("//h1/text()").extract()
         content = data.xpath("//div[@class='post_text']/p/text()").extract()
 
+        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
         if(len(time)!=0 and len(title)!=0 and len(content)!=0):
-            item['time'] = time[0][13:-5]
+            item['time'] = time_pattern.findall(time[0])[0]
             item['title'] = title[0]
             cc=''
             if(len(content)!=0):
@@ -45,4 +51,3 @@ class NetEaseSpider(scrapy.Spider):
                     cc = cc+c+'\n'
                 item['content'] = cc
                 yield item
-
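The change above is the core of the commit: the old code sliced a fixed character range out of the snapshot page's time string, while the new code pulls the timestamp out with a regex. A minimal sketch of the difference (not part of the commit), using the sample string from the news_spider/test.py file added further down:

import re

# Sample string copied from news_spider/test.py; the snapshot pages wrap the
# publish time in variable leading text, so a fixed slice lands in the wrong place.
raw = " - -- 2015-06-15 15:34 "
pattern = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")

print(raw[13:-5])               # old approach: '-15 1' (wrong characters)
print(pattern.findall(raw)[0])  # new approach: '2015-06-15 15:34'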
Binary file not shown.
news_spider/news_spider/spiders/SpiderAll.py (new file, 10 lines)
@@ -0,0 +1,10 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from TouTiaoSpider import TouTiaoSpider
+from NetEase import NetEaseSpider
+
+process = CrawlerProcess()
+process.crawl(TouTiaoSpider)
+process.crawl(NetEaseSpider)
+process.start()
+
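SpiderAll.py simply drives both spiders from one CrawlerProcess. A hedged sketch of how the same script could also route the scraped items into a single JSON feed; the file name ne.json is an assumption taken from the reader script further down, and FEED_FORMAT / FEED_URI are the feed-export settings of Scrapy 1.x-era releases:

from scrapy.crawler import CrawlerProcess

from TouTiaoSpider import TouTiaoSpider
from NetEase import NetEaseSpider

# Sketch only: pass feed-export settings so both spiders write to one JSON file.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',   # Scrapy 1.x feed-export settings (assumed Scrapy version)
    'FEED_URI': 'ne.json',   # hypothetical output path, mirroring the reader script below
})
process.crawl(TouTiaoSpider)
process.crawl(NetEaseSpider)
process.start()              # blocks until both spiders have finished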
BIN news_spider/news_spider/spiders/SpiderAll.pyc (new file)
Binary file not shown.
news_spider/news_spider/spiders/Tencent.py (new file, 52 lines)
@@ -0,0 +1,52 @@
+#encoding=utf-8
+import scrapy
+from news_spider.items import NewsSpiderItem
+import json
+import time
+import re
+
+class NetEaseSpider(scrapy.Spider):
+
+    start_urls = ['http://news.qq.com']
+    name='tencent'
+    allowed_domains=['news.qq.com']
+
+    base_url = 'http://news.qq.com/b/history/index20160419am.shtml?'
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+           '20','19','18','17','16','15','14','13','12','11','10',
+           '09','08','07','06','05','04','03','02','01']
+    tp = ['am','pm']
+
+    def parse(self,response):
+        for y in self.year:
+            for m in self.month:
+                for d in self.day:
+                    for t in tp:
+                        url = self.base_url+y+m+d+t+'.shtml?'
+                        yield scrapy.Request(url,self.parseList)
+
+
+    def parseList(self,response):
+        urls = response.xpath("//a/@href").extract()
+        for url in urls:
+            yield scrapy.Request(url,self.parseNews)
+
+    def parseNews(self,response):
+        data = response.xpath("//div[@id='C-Main-Article-QQ']")
+        item = NewsSpiderItem()
+        time = data.xpath("//span[@class='article-time']/text()").extract()
+        title = data.xpath("//div[@class='hd']//h1/text()").extract()
+        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+
+        time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
+        if(len(time)!=0 and len(title)!=0 and len(content)!=0):
+            item['time'] = time_pattern.findall(time[0])[0]
+            item['title'] = title[0]
+            cc=''
+            if(len(content)!=0):
+                for c in content:
+                    cc = cc+c+'\n'
+                item['content'] = cc
+                yield item
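Worth noting about the new Tencent spider: parse iterates for t in tp: over a bare name rather than self.tp, and base_url already ends in a full dated filename, so the generated URLs are unlikely to be what was intended. A minimal sketch of the URL construction the loop presumably aims for, assuming the history index pages follow the pattern http://news.qq.com/b/history/index<YYYYMMDD><am|pm>.shtml:

# Sketch only, not the committed code; the date lists are truncated for brevity.
base_url = 'http://news.qq.com/b/history/index'
year  = ['2016', '2015']
month = ['12', '11', '10']
day   = ['31', '30', '29']
tp    = ['am', 'pm']

urls = [base_url + y + m + d + t + '.shtml'
        for y in year for m in month for d in day for t in tp]
print(urls[0])   # http://news.qq.com/b/history/index20161231am.shtml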
BIN news_spider/news_spider/spiders/Tencent.pyc (new file)
Binary file not shown.
@@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
     ]
     base_class_url = 'http://toutiao.com/articles_news_society'
     base_url = 'http://toutiao.com'
-    maxpage = 2;    # maximum number of pages allowed to crawl
+    maxpage = 501;  # maximum number of pages allowed to crawl
     category = ['articles_news_society','articles_news_entertainment',
                 'articles_movie','articles_news_tech','articles_digital',
                 'articels_news_sports','articles_news_finance','articles_news_military',
Binary file not shown.
@@ -2,3 +2,4 @@
 #
 # Please refer to the documentation for information on how to create and manage
 # your spiders.
+
Binary file not shown.
@@ -1,10 +1,17 @@
 #!/usr/bin/python
 import json
+import sys
+reload(sys)
+sys.setdefaultencoding( "utf-8" )
+
-file = open('ne.json')
+file = open(sys.argv[1])
 data = json.load(file)
 
+c = 0
 for article in data:
-    print "[----Time-----]\n",article['time']
-    print "[----Title----]\n",article['title']
-    print "[----Article--]\n",article['content'],"\n\n"
+    c+=1
+    print article['time'],"--------",article['title']
+    # print "[----Time-----]\n",article['time'],article['title']
+    # print "[----Title----]\n",article['title']
+    # print "[----Article--]\n",article['content'],"\n\n"
-
+print c
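For reference, the updated reader script takes the feed path as its first argument and expects a JSON array of article objects with time, title, and content keys, the fields filled in by the spiders. A hypothetical example of one record, with the timestamp in the format the regex extracts:

import json

# Hypothetical sample record; only the field names and the time format are taken from the spiders above.
sample = json.loads('[{"time": "2015-06-15 15:34", "title": "Example headline", "content": "First paragraph\\nSecond paragraph\\n"}]')
print(sample[0]['time'])   # 2015-06-15 15:34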
news_spider/test.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+import re
+
+time = " - -- 2015-06-15 15:34 "
+
+day = ['31','30','29','28','27','26','25','24','23','22','21',
+       '20','19','18','17','16','15','14','13','12','11','10',
+       '09','08','07','06','05','04','03','02','01']
+pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
+#pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
+#pattern = re.compile("[0-9]")
+print pattern.findall(time)[0]
news_spider/toutiao.data (new file, 81691 lines)
File diff suppressed because it is too large.