diff --git a/news_spider/news.db b/news_spider/news.db index c97a9ef..454515f 100644 Binary files a/news_spider/news.db and b/news_spider/news.db differ diff --git a/news_spider/news_spider/spiders/NetEase.py b/news_spider/news_spider/spiders/NetEase.py index ef486bc..62e7abe 100644 --- a/news_spider/news_spider/spiders/NetEase.py +++ b/news_spider/news_spider/spiders/NetEase.py @@ -12,14 +12,16 @@ class NetEaseSpider(scrapy.Spider): allowed_domains=['news.163.com'] base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!' -# year = ['2016','2015'] -# month = ['12','11','10','09','08','07','06','05','04','03','02','01'] + year = ['2016','2015'] + month = ['12','11','10','09','08','07','06','05','04','03','02','01'] + day = ['31','30','29','28','27','26','25','24','23','22','21', + '20','19','18','17','16','15','14','13','12','11','10', + '09','08','07','06','05','04','03','02','01'] +# year = ['2016'] +# month = ['03'] # day = ['31','30','29','28','27','26','25','24','23','22','21', # '20','19','18','17','16','15','14','13','12','11','10', # '09','08','07','06','05','04','03','02','01'] - day = ['31'] - year = ['2016'] - month = ['03'] def parse(self,response): for y in self.year: diff --git a/news_spider/news_spider/spiders/NetEase.pyc b/news_spider/news_spider/spiders/NetEase.pyc index 42dc16d..4f51bf1 100644 Binary files a/news_spider/news_spider/spiders/NetEase.pyc and b/news_spider/news_spider/spiders/NetEase.pyc differ diff --git a/news_spider/news_spider/spiders/Tencent.py b/news_spider/news_spider/spiders/Tencent.py index 549f8c7..1e9e1c8 100644 --- a/news_spider/news_spider/spiders/Tencent.py +++ b/news_spider/news_spider/spiders/Tencent.py @@ -11,17 +11,17 @@ class TencentSpider(scrapy.Spider): name='tencent' allowed_domains=['news.qq.com'] -# base_url = 'http://news.qq.com/b/history/index' -# year = ['2016','2015','2014'] -# month = ['12','11','10','09','08','07','06','05','04','03','02','01'] -# day = ['31','30','29','28','27','26','25','24','23','22','21', -# '20','19','18','17','16','15','14','13','12','11','10', -# '09','08','07','06','05','04','03','02','01'] + base_url = 'http://news.qq.com/b/history/index' + year = ['2016','2015','2014'] + month = ['12','11','10','09','08','07','06','05','04','03','02','01'] + day = ['31','30','29','28','27','26','25','24','23','22','21', + '20','19','18','17','16','15','14','13','12','11','10', + '09','08','07','06','05','04','03','02','01'] tp = ['am','pm'] - day = ['31'] - year = ['2016'] - month = ['03'] +# day = ['31'] +# year = ['2016'] +# month = ['03'] def parse(self,response): for y in self.year: diff --git a/news_spider/news_spider/spiders/Tencent.pyc b/news_spider/news_spider/spiders/Tencent.pyc index 6488437..ec17540 100644 Binary files a/news_spider/news_spider/spiders/Tencent.pyc and b/news_spider/news_spider/spiders/Tencent.pyc differ diff --git a/news_spider/news_spider/spiders/TouTiaoSpider.py b/news_spider/news_spider/spiders/TouTiaoSpider.py index 76f500c..8e3db0d 100644 --- a/news_spider/news_spider/spiders/TouTiaoSpider.py +++ b/news_spider/news_spider/spiders/TouTiaoSpider.py @@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider): ] base_class_url = 'http://toutiao.com/articles_news_society' base_url = 'http://toutiao.com' - maxpage = 10;#允许爬的最大的页数 + maxpage = 501;#允许爬的最大的页数 category = ['articles_news_society','articles_news_entertainment', 'articles_movie','articles_news_tech','articles_digital', 'articels_news_sports','articles_news_finance','articles_news_military', diff --git a/news_spider/news_spider/spiders/TouTiaoSpider.pyc b/news_spider/news_spider/spiders/TouTiaoSpider.pyc index d468fb9..c4188da 100644 Binary files a/news_spider/news_spider/spiders/TouTiaoSpider.pyc and b/news_spider/news_spider/spiders/TouTiaoSpider.pyc differ diff --git a/news_spider/show.py b/news_spider/show.py index 2ed65f9..370931a 100755 --- a/news_spider/show.py +++ b/news_spider/show.py @@ -12,7 +12,11 @@ while 1: break data = json.loads(line) c+=1 - print data['time'],data['title'],data['url'] + if sys.argv[2] == '1': + print c,"-->",data['time'],data['title'],data['url'],data['content'] + else: + print c,"-->",data['time'],data['title'],data['url'] + #data = json.load(file) #c = 0