Add crawling for other categories

lzjqsdd 2016-04-19 11:26:16 +08:00
parent a37d701063
commit d10ebdec47
2 changed files with 18 additions and 7 deletions
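
In outline, the commit turns the spider from fetching a single hard-coded society listing into a loop over ten category listings, requesting pages p0 through p(maxpage-1) of each. A minimal, self-contained sketch of the URL scheme the updated parse() generates (two sample categories shown; the full list is in the diff below):

    base_url = 'http://toutiao.com'
    category = ['articles_news_society', 'articles_news_tech']  # full list in the diff
    maxpage = 3
    # one listing URL per (category, page) pair, e.g.
    # http://toutiao.com/articles_news_society/p0 ... /p2
    urls = [base_url + '/' + ctg + '/p' + str(page)
            for ctg in category
            for page in range(0, maxpage)]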


@@ -1,3 +1,4 @@
#encoding=utf-8
import scrapy
from news_spider.items import NewsSpiderItem
import json
@@ -7,22 +8,32 @@ class TouTiaoSpider(scrapy.Spider):
    name = 'toutiao'
    allowed_domains = ["toutiao.com"]
    start_urls = [
-       'http://toutiao.com/articles_news_society'
+       'http://toutiao.com/articles_news_society/p1'
    ]
    base_class_url = 'http://toutiao.com/articles_news_society'
    base_url = 'http://toutiao.com'
    page = 1
    maxpage = 3  # maximum number of pages allowed to crawl
    category = ['articles_news_society', 'articles_news_entertainment',
                'articles_movie', 'articles_news_tech', 'articles_digital',
                'articles_news_sports', 'articles_news_finance', 'articles_news_military',
                'articles_news_culture', 'articles_science_all'
                ]
    # request every category, page by page
    def parse(self, response):
        print self.page
        for ctg in self.category:
            for page in range(0, self.maxpage):
                url = self.base_url + '/' + ctg + '/p' + str(page)
                yield scrapy.Request(url, self.parseNewsHref)
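        # The loops above schedule one listing request per (category, page)
        # pair: 10 categories x maxpage pages. Note that range(0, maxpage)
        # begins at p0; if toutiao.com numbers listing pages from p1 (as
        # start_urls suggests), the first iteration may hit a missing page.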
    # parse the URLs of the news items on each listing page
    def parseNewsHref(self, response):
        urls = response.xpath("//div[@class='info']//a/@href").extract()
        for url in urls:
            news_url = self.base_url + url
            yield scrapy.Request(news_url, self.parseNews)
        self.page += 1
        if self.page <= 30:
            yield scrapy.Request(self.base_class_url + '/p' + str(self.page))
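        # Leftover single-category pagination: self.page is shared across all
        # responses, so every parsed listing advances it and requests the next
        # page of base_class_url, up to p30. With no explicit callback, that
        # request falls through to parse(), re-running the category loop
        # (already-seen URLs are then dropped by Scrapy's duplicate filter).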
    # parse the actual news content
    def parseNews(self, response):
        articles = response.xpath("//div[@id='pagelet-article']")
        for article in articles:
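
The diff is truncated here, inside the loop over articles in parseNews. For reference, a minimal sketch of how that loop body might populate NewsSpiderItem (imported at the top of the file); the field names and XPath expressions are illustrative assumptions, not part of the commit:

            # hypothetical continuation -- field names and XPaths are guesses,
            # not taken from the commit
            item = NewsSpiderItem()
            item['title'] = article.xpath(".//h1/text()").extract()
            item['content'] = article.xpath(".//p//text()").extract()
            yield item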