Add crawling of the other news categories
This commit is contained in:
parent a37d701063
commit d10ebdec47
@@ -1,3 +1,4 @@
#encoding=utf-8
import scrapy
from news_spider.items import NewsSpiderItem
import json
@@ -7,22 +8,32 @@ class TouTiaoSpider(scrapy.Spider):
    name = 'toutiao'
    allowed_domains = ["toutiao.com"]
    start_urls = [
        'http://toutiao.com/articles_news_society',
        'http://toutiao.com/articles_news_society/p1',
    ]
    base_class_url = 'http://toutiao.com/articles_news_society'
    base_url = 'http://toutiao.com'
    page = 1
    maxpage = 3  # maximum number of list pages to crawl per category
    category = ['articles_news_society', 'articles_news_entertainment',
                'articles_movie', 'articles_news_tech', 'articles_digital',
                'articles_news_sports', 'articles_news_finance', 'articles_news_military',
                'articles_news_culture', 'articles_science_all',
                ]

    # Request every category, page by page
    def parse(self, response):
        print(self.page)
        for ctg in self.category:
            for page in range(1, self.maxpage + 1):  # list pages are 1-based (.../p1)
                url = self.base_url + '/' + ctg + '/p' + str(page)
                yield scrapy.Request(url, self.parseNewsHref)

    # Parse the article URLs on each news-list page
    def parseNewsHref(self, response):
        urls = response.xpath("//div[@class='info']//a/@href").extract()
        for url in urls:
            news_url = self.base_url + url
            yield scrapy.Request(news_url, self.parseNews)
        # Keep paging the base category, capped at page 30
        self.page += 1
        if self.page <= 30:
            yield scrapy.Request(self.base_class_url + '/p' + str(self.page),
                                 self.parseNewsHref)

    # Parse the content of an individual article
    def parseNews(self, response):
        articles = response.xpath("//div[@id='pagelet-article']")
        for article in articles:
Binary file not shown.
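
The new parse() fans out one request per category per list page instead of walking a single category. Below is a minimal standalone sketch (not part of the commit) of that URL generation; BASE_URL, CATEGORIES and MAX_PAGE are stand-ins for the spider's base_url, category and maxpage attributes, with CATEGORIES truncated for brevity:

# Hypothetical sketch reproducing parse()'s request fan-out.
BASE_URL = 'http://toutiao.com'
CATEGORIES = ['articles_news_society', 'articles_news_entertainment']
MAX_PAGE = 3

def list_page_urls():
    for ctg in CATEGORIES:
        for page in range(1, MAX_PAGE + 1):  # list pages are 1-based (.../p1)
            yield BASE_URL + '/' + ctg + '/p' + str(page)

for url in list_page_urls():
    print(url)  # e.g. http://toutiao.com/articles_news_society/p1

With maxpage = 3 and ten categories, parse() schedules 30 list-page requests; Scrapy's default duplicate filter drops any overlap with start_urls.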
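
parseNewsHref depends on a single XPath to collect the relative article links. Here is a quick offline check of that selector with Scrapy's HtmlResponse; the HTML fragment is invented for illustration, and real toutiao.com list pages may be shaped differently:

from scrapy.http import HtmlResponse

# Made-up markup matching the shape the spider's XPath expects.
body = b'''
<div class="info"><a href="/item/1">headline 1</a></div>
<div class="info"><a href="/item/2">headline 2</a></div>
'''
response = HtmlResponse(url='http://toutiao.com/articles_news_society/p1',
                        body=body, encoding='utf-8')
hrefs = response.xpath("//div[@class='info']//a/@href").extract()
print(hrefs)  # ['/item/1', '/item/2'] -- relative, hence base_url + url in the spider

Since the extracted hrefs are relative, the spider prefixes base_url before issuing the article request; response.urljoin(url) would be the more robust Scrapy idiom for the same step.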