diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fa6a5d5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+news_spider/tt.json
+news_spider/ne.json
+news_spider/te.json
diff --git a/news_spider/news_spider/commands/__init__.py b/news_spider/news_spider/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/news_spider/news_spider/commands/__init__.pyc b/news_spider/news_spider/commands/__init__.pyc
new file mode 100644
index 0000000..b24566e
Binary files /dev/null and b/news_spider/news_spider/commands/__init__.pyc differ
diff --git a/news_spider/news_spider/commands/crawlall.py b/news_spider/news_spider/commands/crawlall.py
new file mode 100644
index 0000000..dee1001
--- /dev/null
+++ b/news_spider/news_spider/commands/crawlall.py
@@ -0,0 +1,39 @@
+from scrapy.commands import ScrapyCommand
+from scrapy.exceptions import UsageError
+from scrapy.utils.conf import arglist_to_dict
+
+class Command(ScrapyCommand):
+
+    requires_project = True
+
+    def syntax(self):
+        return '[options]'
+
+    def short_desc(self):
+        return 'Runs all of the spiders'
+
+    def add_options(self, parser):
+        ScrapyCommand.add_options(self, parser)
+        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
+                          help="set spider argument (may be repeated)")
+        parser.add_option("-o", "--output", metavar="FILE",
+                          help="dump scraped items into FILE (use - for stdout)")
+        parser.add_option("-t", "--output-format", metavar="FORMAT",
+                          help="format to use for dumping items with -o")
+
+    def process_options(self, args, opts):
+        ScrapyCommand.process_options(self, args, opts)
+        try:
+            opts.spargs = arglist_to_dict(opts.spargs)
+        except ValueError:
+            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
+
+    def run(self, args, opts):
+        # Schedule the named spiders (or every spider in the project),
+        # then start the shared reactor once so they crawl concurrently.
+        spider_loader = self.crawler_process.spider_loader
+        for spidername in args or spider_loader.list():
+            print("*********crawlall spidername************" + spidername)
+            self.crawler_process.crawl(spidername, **opts.spargs)
+
+        self.crawler_process.start()
diff --git a/news_spider/news_spider/commands/crawlall.pyc b/news_spider/news_spider/commands/crawlall.pyc
new file mode 100644
index 0000000..3862337
Binary files /dev/null and b/news_spider/news_spider/commands/crawlall.pyc differ
diff --git a/news_spider/news_spider/settings.py b/news_spider/news_spider/settings.py
index 23814b9..3a332c9 100644
--- a/news_spider/news_spider/settings.py
+++ b/news_spider/news_spider/settings.py
@@ -13,6 +13,7 @@ BOT_NAME = 'news_spider'
 
 SPIDER_MODULES = ['news_spider.spiders']
 NEWSPIDER_MODULE = 'news_spider.spiders'
+COMMANDS_MODULE = 'news_spider.commands'
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
diff --git a/news_spider/news_spider/settings.pyc b/news_spider/news_spider/settings.pyc
index 4acf92c..cf132d2 100644
Binary files a/news_spider/news_spider/settings.pyc and b/news_spider/news_spider/settings.pyc differ
diff --git a/news_spider/news_spider/setup.py b/news_spider/news_spider/setup.py
new file mode 100644
index 0000000..6ce611c
--- /dev/null
+++ b/news_spider/news_spider/setup.py
@@ -0,0 +1,10 @@
+from setuptools import setup, find_packages
+
+setup(name='scrapy-mymodule',
+      packages=find_packages(),
+      entry_points={
+          'scrapy.commands': [
+              'crawlall=news_spider.commands.crawlall:Command',
+          ],
+      },
+      )
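
With COMMANDS_MODULE pointing at news_spider.commands, the command above becomes available as `scrapy crawlall` inside the project; the setup.py entry point only matters if the command should also work once the package is pip-installed. For reference, a standalone script equivalent to Command.run() might look like the sketch below (an illustration, assuming it is executed from the project root so get_project_settings() can find scrapy.cfg):

    # run_all.py -- hypothetical standalone equivalent of `scrapy crawlall`
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())

    # spider_loader.list() yields every spider name found under
    # SPIDER_MODULES, mirroring self.crawler_process.spider_loader above.
    for spidername in process.spider_loader.list():
        process.crawl(spidername)

    process.start()  # blocks until all scheduled crawls finish
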
diff --git a/news_spider/news_spider/spiders/NetEase.py b/news_spider/news_spider/spiders/NetEase.py
index 18c82ab..c061c95 100644
--- a/news_spider/news_spider/spiders/NetEase.py
+++ b/news_spider/news_spider/spiders/NetEase.py
@@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
     allowed_domains=['news.163.com']
     base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
 
-#    year = ['2016','2015']
-#    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-#    day = ['31','30','29','28','27','26','25','24','23','22','21',
-#           '20','19','18','17','16','15','14','13','12','11','10',
-#           '09','08','07','06','05','04','03','02','01']
-    day = ['31']
-    year = ['2016']
-    month = ['03']
+    year = ['2016','2015']
+    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+    day = ['31','30','29','28','27','26','25','24','23','22','21',
+           '20','19','18','17','16','15','14','13','12','11','10',
+           '09','08','07','06','05','04','03','02','01']
+#    day = ['31']
+#    year = ['2016']
+#    month = ['03']
 
     def parse(self,response):
         for y in self.year:
diff --git a/news_spider/news_spider/spiders/SpiderAll.py b/news_spider/news_spider/spiders/SpiderAll.py
deleted file mode 100644
index 5a8640c..0000000
--- a/news_spider/news_spider/spiders/SpiderAll.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from TouTiaoSpider import TouTiaoSpider
-from NetEase import NetEaseSpider
-
-process = CrawlerProcess()
-process.crawl(TouTiaoSpider)
-process.crawl(NetEaseSpider)
-process.start()
-
diff --git a/news_spider/news_spider/spiders/SpiderAll.pyc b/news_spider/news_spider/spiders/SpiderAll.pyc
deleted file mode 100644
index c968a1c..0000000
Binary files a/news_spider/news_spider/spiders/SpiderAll.pyc and /dev/null differ
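
Dropping SpiderAll.py is more than cleanup: Scrapy imports every module under SPIDER_MODULES during spider discovery, so a module-level CrawlerProcess inside the spiders package would start crawling as a side effect of any ordinary `scrapy crawl` invocation. Should a standalone runner ever be wanted again, a sketch like this avoids that problem (the absolute import paths are assumptions; the deleted file used bare Python 2 relative imports):

    # runner.py -- kept OUTSIDE news_spider/spiders/ so spider discovery
    # never imports (and thereby executes) it.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # Assumed module paths, based on the deleted file's imports.
    from news_spider.spiders.TouTiaoSpider import TouTiaoSpider
    from news_spider.spiders.NetEase import NetEaseSpider

    if __name__ == '__main__':
        # Pass the project settings so pipelines and middlewares still apply;
        # the old SpiderAll.py built a bare CrawlerProcess() without them.
        process = CrawlerProcess(get_project_settings())
        process.crawl(TouTiaoSpider)
        process.crawl(NetEaseSpider)
        process.start()
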
diff --git a/news_spider/news_spider/spiders/Tencent.py b/news_spider/news_spider/spiders/Tencent.py
index d5c1a88..93668b1 100644
--- a/news_spider/news_spider/spiders/Tencent.py
+++ b/news_spider/news_spider/spiders/Tencent.py
@@ -5,14 +5,14 @@ import json
 import time
 import re
 
-class NetEaseSpider(scrapy.Spider):
+class TencentSpider(scrapy.Spider):
 
     start_urls = ['http://news.qq.com']
     name='tencent'
     allowed_domains=['news.qq.com']
-    base_url = 'http://news.qq.com/b/history/index20160419am.shtml?'
-    year = ['2016','2015']
+    base_url = 'http://news.qq.com/b/history/index'
+    year = ['2016','2015','2014']
     month = ['12','11','10','09','08','07','06','05','04','03','02','01']
     day = ['31','30','29','28','27','26','25','24','23','22','21',
            '20','19','18','17','16','15','14','13','12','11','10',
@@ -23,7 +23,7 @@ class NetEaseSpider(scrapy.Spider):
         for y in self.year:
             for m in self.month:
                 for d in self.day:
-                    for t in tp:
+                    for t in self.tp:
                         url = self.base_url+y+m+d+t+'.shtml?'
                         yield scrapy.Request(url,self.parseList)
 
@@ -31,14 +31,15 @@ class NetEaseSpider(scrapy.Spider):
     def parseList(self,response):
         urls = response.xpath("//a/@href").extract()
         for url in urls:
-            yield scrapy.Request(url,self.parseNews)
+            if 'http' in url:
+                yield scrapy.Request(url,self.parseNews)
 
     def parseNews(self,response):
         data = response.xpath("//div[@id='C-Main-Article-QQ']")
         item = NewsSpiderItem()
         time = data.xpath("//span[@class='article-time']/text()").extract()
         title = data.xpath("//div[@class='hd']//h1/text()").extract()
-        content = data.xpath("//div[@class='post_text']/p/text()").extract()
+        content = data.xpath("//p/text()").extract()
 
         time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
         if(len(time)!=0 and len(title)!=0 and len(content)!=0):
diff --git a/news_spider/news_spider/spiders/Tencent.pyc b/news_spider/news_spider/spiders/Tencent.pyc
index 9623b1c..c60e15f 100644
Binary files a/news_spider/news_spider/spiders/Tencent.pyc and b/news_spider/news_spider/spiders/Tencent.pyc differ
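
The Tencent spider now assembles the dated archive URLs from parts instead of hard-coding a single index20160419am.shtml page. Note that self.tp is referenced but not defined in this hunk; judging from the 'am' suffix in the old hard-coded URL, it presumably lists the am/pm editions. A sketch of the resulting URL scheme (tp and the truncated lists are assumptions for illustration):

    # How parse() builds the history-page URLs after this change.
    base_url = 'http://news.qq.com/b/history/index'
    year = ['2016', '2015', '2014']
    month = ['12', '11']   # truncated for the example
    day = ['31', '30']     # truncated for the example
    tp = ['am', 'pm']      # assumed morning/afternoon page suffixes

    urls = [base_url + y + m + d + t + '.shtml?'
            for y in year for m in month for d in day for t in tp]
    print(urls[0])  # -> http://news.qq.com/b/history/index20161231am.shtml?
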