Add crawling of NetEase historical news snapshots; because the time markup differs from page to page, the publish time should be extracted with a regular expression.
This commit is contained in:
parent 47865e367d
commit 37288e7260
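The hunk below only adds the snapshot crawl loop; the regex-based time extraction that the commit message calls for is not visible in it. A minimal sketch of what such an extraction could look like, given that pages carry the time in differing layouts (the helper name extract_time and the two date patterns are assumptions for illustration, not code from this commit):

import re

def extract_time(raw):
    # Try a few date layouts seen on different page versions, e.g.
    # "2016-03-17 12:00:00" or "2016年03月17日 12:00"; return None if nothing matches.
    patterns = [
        r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}:\d{2}(?::\d{2})?)',
        r'(\d{4})年(\d{2})月(\d{2})日\s*(\d{2}:\d{2})',
    ]
    for p in patterns:
        m = re.search(p, raw)
        if m:
            y, mo, d, t = m.groups()
            return '%s-%s-%s %s' % (y, mo, d, t)
    return None

A callback such as parseNews could then run this over the text of the post_time_source div instead of storing the raw markup.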
@@ -11,9 +11,20 @@ class NetEaseSpider(scrapy.Spider):
    allowed_domains=['news.163.com']

    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
    # year = ['2016','2015']
    # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
    year = ['2016']
    month = ['03']

    def parse(self,response):
        count = 1
        for y in self.year:
            for m in self.month:
                for d in range(1,30):
                    url = self.base_url+'/'+y+'-'+m+'/'+str(d)+'/12.html'
                    yield scrapy.Request(url,self.parseList)


    def parseList(self,response):
        urls = response.xpath("//a/@href").extract()
        for url in urls:
            yield scrapy.Request(url,self.parseNews)
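For reference, with year = ['2016'], month = ['03'] and d = 17, the string concatenation in parse() produces http://snapshot.news.163.com/wgethtml/http+!!news.163.com!/2016-03/17/12.html, the same shape of URL as the start_urls entry in the file deleted below; parseList then follows every link on that listing page into self.parseNews.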
Binary file not shown.
@@ -1,33 +0,0 @@
#encoding=utf-8
import scrapy
from news_spider.items import NewsSpiderItem
import json
import time

class NetEaseSpider(scrapy.Spider):

    start_urls = ['http://snapshot.news.163.com/wgethtml/http+!!news.163.com!/2016-04/17/12.html']
    name='netease'
    allowed_domains=['news.163.com']

    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'

    def parse(self,response):
        count = 1
        urls = response.xpath("//a/@href").extract()
        for url in urls:
            yield scrapy.Request(url,self.parseNews)

    def parseNews(self,response):
        content = response.xpath("//div[@class='post_content_main']")
        item = NewsSpiderItem()
        item['time'] = content.xpath("//div[@class='post_time_source']").extract()[0]
        item['title'] = content.xpath("//h1/text()").extract()[0]
        # content = content.xpath("//div[@class='post_text']/p/text()")
        # cc=''
        # if(len(content)!=0):
        #     for cc in content:
        #         cc = cc+content+'\n'
        # item['content'] = cc
        yield item