增加网易历史新闻数据抓取,由于各个页面的时间布局有出入,应采用正则提取时间
This commit is contained in:
parent
47865e367d
commit
37288e7260
@ -11,9 +11,20 @@ class NetEaseSpider(scrapy.Spider):
|
|||||||
allowed_domains=['news.163.com']
|
allowed_domains=['news.163.com']
|
||||||
|
|
||||||
base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
|
base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
|
||||||
|
# year = ['2016','2015']
|
||||||
|
# month = ['12','11','10','09','08','07','06','05','04','03','02','01']
|
||||||
|
year = ['2016']
|
||||||
|
month = ['03']
|
||||||
|
|
||||||
def parse(self,response):
|
def parse(self,response):
|
||||||
count = 1
|
for y in self.year:
|
||||||
|
for m in self.month:
|
||||||
|
for d in range(1,30):
|
||||||
|
url = self.base_url+'/'+y+'-'+m+'/'+str(d)+'/12.html'
|
||||||
|
yield scrapy.Request(url,self.parseList)
|
||||||
|
|
||||||
|
|
||||||
|
def parseList(self,response):
|
||||||
urls = response.xpath("//a/@href").extract()
|
urls = response.xpath("//a/@href").extract()
|
||||||
for url in urls:
|
for url in urls:
|
||||||
yield scrapy.Request(url,self.parseNews)
|
yield scrapy.Request(url,self.parseNews)
|
||||||
|
Binary file not shown.
@ -1,33 +0,0 @@
|
|||||||
#encoding=utf-8
|
|
||||||
import scrapy
|
|
||||||
from news_spider.items import NewsSpiderItem
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
|
|
||||||
class NetEaseSpider(scrapy.Spider):
|
|
||||||
|
|
||||||
start_urls = ['http://snapshot.news.163.com/wgethtml/http+!!news.163.com!/2016-04/17/12.html']
|
|
||||||
name='netease'
|
|
||||||
allowed_domains=['news.163.com']
|
|
||||||
|
|
||||||
base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
|
|
||||||
|
|
||||||
def parse(self,response):
|
|
||||||
count = 1
|
|
||||||
urls = response.xpath("//a/@href").extract()
|
|
||||||
for url in urls:
|
|
||||||
yield scrapy.Request(url,self.parseNews)
|
|
||||||
|
|
||||||
def parseNews(self,response):
|
|
||||||
content = response.xpath("//div[@class='post_content_main']")
|
|
||||||
item = NewsSpiderItem()
|
|
||||||
item['time'] = content.xpath("//div[@class='post_time_source']").extract()[0]
|
|
||||||
item['title'] = content.xpath("//h1/text()").extract()[0]
|
|
||||||
# content = content.xpath("//div[@class='post_text']/p/text()")
|
|
||||||
# cc=''
|
|
||||||
# if(len(content)!=0):
|
|
||||||
# for cc in content:
|
|
||||||
# cc = cc+content+'\n'
|
|
||||||
# item['content'] = cc
|
|
||||||
yield item
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user