Add scraping of NetEase historical news data; since the time layout varies from page to page, the time should be extracted with a regular expression

lzjqsdd 2016-04-19 23:49:36 +08:00
parent 47865e367d
commit 37288e7260
3 changed files with 12 additions and 34 deletions


@@ -11,9 +11,20 @@ class NetEaseSpider(scrapy.Spider):
    allowed_domains=['news.163.com']
    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
    # year = ['2016','2015']
    # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
    year = ['2016']
    month = ['03']
    def parse(self,response):
        count = 1
        for y in self.year:
            for m in self.month:
                for d in range(1,30):
                    url = self.base_url+'/'+y+'-'+m+'/'+str(d)+'/12.html'
                    yield scrapy.Request(url,self.parseList)
    def parseList(self,response):
        urls = response.xpath("//a/@href").extract()
        for url in urls:
            yield scrapy.Request(url,self.parseNews)
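
The commit message says the article time should be pulled out with a regular expression because the snapshot pages lay the timestamp out differently. The hunk above stops at parseList, so here is a minimal sketch of what a regex-based parseNews could look like; the date pattern, the fallback handling, and the sketch class name are illustrative assumptions, not code from this commit (only the NewsSpiderItem 'time'/'title' fields come from the repo):

```python
# -*- coding: utf-8 -*-
import re
import scrapy
from news_spider.items import NewsSpiderItem

# Matches timestamps like "2016-03-05 12:00" or "2016-03-05 12:00:00".
# Assumption: snapshot pages embed the time in this dashed format somewhere.
TIME_RE = re.compile(r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}(?::\d{2})?')

class NetEaseSpiderSketch(scrapy.Spider):
    name = 'netease_sketch'  # hypothetical name, to avoid clashing with 'netease'

    def parseNews(self, response):
        title = response.xpath("//h1/text()").extract()
        if not title:
            return  # not an article page; emit nothing
        item = NewsSpiderItem()
        item['title'] = title[0]
        # Search the whole decoded page instead of one fixed div, since the
        # time markup differs between snapshot layouts.
        match = TIME_RE.search(response.text)  # Scrapy >= 1.1; older versions: response.body_as_unicode()
        if match:
            item['time'] = match.group(0)
        yield item
```

Scanning the decoded page once with a tolerant pattern sidesteps per-layout XPath rules such as the //div[@class='post_time_source'] lookup that the deleted spider below relied on.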


@@ -1,33 +0,0 @@
#encoding=utf-8
import scrapy
from news_spider.items import NewsSpiderItem
import json
import time
class NetEaseSpider(scrapy.Spider):
    start_urls = ['http://snapshot.news.163.com/wgethtml/http+!!news.163.com!/2016-04/17/12.html']
    name='netease'
    allowed_domains=['news.163.com']
    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
    def parse(self,response):
        count = 1
        urls = response.xpath("//a/@href").extract()
        for url in urls:
            yield scrapy.Request(url,self.parseNews)
    def parseNews(self,response):
        content = response.xpath("//div[@class='post_content_main']")
        item = NewsSpiderItem()
        item['time'] = content.xpath("//div[@class='post_time_source']").extract()[0]
        item['title'] = content.xpath("//h1/text()").extract()[0]
        # content = content.xpath("//div[@class='post_text']/p/text()")
        # cc=''
        # if(len(content)!=0):
        #     for cc in content:
        #         cc = cc+content+'\n'
        # item['content'] = cc
        yield item