开始添加检索模块

This commit is contained in:
lzjqsdd 2016-04-24 00:16:16 +08:00
parent d7a3e28f59
commit 3541ef0e7e
8 changed files with 22 additions and 16 deletions

Binary file not shown.

View File

@ -12,14 +12,16 @@ class NetEaseSpider(scrapy.Spider):
allowed_domains=['news.163.com']
base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
# year = ['2016','2015']
# month = ['12','11','10','09','08','07','06','05','04','03','02','01']
year = ['2016','2015']
month = ['12','11','10','09','08','07','06','05','04','03','02','01']
day = ['31','30','29','28','27','26','25','24','23','22','21',
'20','19','18','17','16','15','14','13','12','11','10',
'09','08','07','06','05','04','03','02','01']
# year = ['2016']
# month = ['03']
# day = ['31','30','29','28','27','26','25','24','23','22','21',
# '20','19','18','17','16','15','14','13','12','11','10',
# '09','08','07','06','05','04','03','02','01']
day = ['31']
year = ['2016']
month = ['03']
def parse(self,response):
for y in self.year:

View File

@ -11,17 +11,17 @@ class TencentSpider(scrapy.Spider):
name='tencent'
allowed_domains=['news.qq.com']
# base_url = 'http://news.qq.com/b/history/index'
# year = ['2016','2015','2014']
# month = ['12','11','10','09','08','07','06','05','04','03','02','01']
# day = ['31','30','29','28','27','26','25','24','23','22','21',
# '20','19','18','17','16','15','14','13','12','11','10',
# '09','08','07','06','05','04','03','02','01']
base_url = 'http://news.qq.com/b/history/index'
year = ['2016','2015','2014']
month = ['12','11','10','09','08','07','06','05','04','03','02','01']
day = ['31','30','29','28','27','26','25','24','23','22','21',
'20','19','18','17','16','15','14','13','12','11','10',
'09','08','07','06','05','04','03','02','01']
tp = ['am','pm']
day = ['31']
year = ['2016']
month = ['03']
# day = ['31']
# year = ['2016']
# month = ['03']
def parse(self,response):
for y in self.year:

View File

@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
]
base_class_url = 'http://toutiao.com/articles_news_society'
base_url = 'http://toutiao.com'
maxpage = 10;# maximum number of pages the spider is allowed to crawl
maxpage = 501;# maximum number of pages the spider is allowed to crawl
category = ['articles_news_society','articles_news_entertainment',
'articles_movie','articles_news_tech','articles_digital',
'articels_news_sports','articles_news_finance','articles_news_military',

View File

@ -12,7 +12,11 @@ while 1:
break
data = json.loads(line)
c+=1
print data['time'],data['title'],data['url']
# Output mode is selected by the second command-line argument:
# '1' prints the running counter, time, title, url and the full content;
# any other value prints the same fields without the content.
# NOTE(review): sys.argv[2] is read unconditionally, so this presumably
# raises IndexError when fewer than two arguments are given -- confirm
# against how the script is invoked.
if sys.argv[2] == '1':
print c,"-->",data['time'],data['title'],data['url'],data['content']
else:
print c,"-->",data['time'],data['title'],data['url']
#data = json.load(file)
#c = 0