diff --git a/.gitignore b/.gitignore index fa6a5d5..0d243de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ news_spider/tt.json news_spider/ne.json news_spider/te.json +news_spider/title.json +news_spider/news.json diff --git a/news_spider/news_spider/commands/crawlall.py b/news_spider/news_spider/commands/crawlall.py index 767df19..446c44e 100644 --- a/news_spider/news_spider/commands/crawlall.py +++ b/news_spider/news_spider/commands/crawlall.py @@ -33,6 +33,6 @@ class Command(ScrapyCommand): spider_loader = self.crawler_process.spider_loader for spidername in args or spider_loader.list(): - print "*********cralall spidername************" + spidername + print "*********crawlall spidername************" + spidername self.crawler_process.crawl(spidername, **opts.spargs) self.crawler_process.start() diff --git a/news_spider/news_spider/commands/crawlall.pyc b/news_spider/news_spider/commands/crawlall.pyc index 8437e3c..40e640b 100644 Binary files a/news_spider/news_spider/commands/crawlall.pyc and b/news_spider/news_spider/commands/crawlall.pyc differ diff --git a/news_spider/news_spider/pipelines.py b/news_spider/news_spider/pipelines.py index d909e1b..f4c69ab 100644 --- a/news_spider/news_spider/pipelines.py +++ b/news_spider/news_spider/pipelines.py @@ -7,33 +7,51 @@ import codecs import json from items import TitleSpiderItem -import fcntl +import threading class NewsSpiderPipeline(object): + lock = threading.Lock() + file = open('news.json','a') def __init__(self): - self.file = open('news.json','wb') + pass def process_item(self,item,spider): - fcntl.flock(self.file,fcntl.LOCK_EX) line = json.dumps(dict(item))+'\n' - self.file.write(line) - fcntl.flock(self.file,fcntl.LOCK_UN) + try: + NewsSpiderPipeline.lock.acquire() + NewsSpiderPipeline.file.write(line) + except: + pass + finally: + NewsSpiderPipeline.lock.release() return item + def spider_closed(self,spider): + pass class TitlePipeline(object): - def __init__(self): - file_title = open('title.json','wb') + lock = threading.Lock() + file_title = open('title.json','a') + def __init__(self): + pass def process_item(self,item,spider): - fcntl.flock(file_title,fcntl.LOCK_EX) title_item = TitleSpiderItem() title_item['title'] = item['title'] title_item['time'] = item['time'] title_item['url'] = item['url'] line = json.dumps(dict(title_item))+'\n' - file_title.write(line) - fcntl.flock(file_title,fcntl.LOCK_UN) + + try: + TitlePipeline.lock.acquire() + TitlePipeline.file_title.write(line) + except: + pass + finally: + TitlePipeline.lock.release() return item + + def spider_closed(self,spider): + pass diff --git a/news_spider/news_spider/pipelines.pyc b/news_spider/news_spider/pipelines.pyc index 19bc0a8..638ac80 100644 Binary files a/news_spider/news_spider/pipelines.pyc and b/news_spider/news_spider/pipelines.pyc differ diff --git a/news_spider/news_spider/rotateuseragent.py b/news_spider/news_spider/rotateuseragent.py new file mode 100644 index 0000000..1caebbf --- /dev/null +++ b/news_spider/news_spider/rotateuseragent.py @@ -0,0 +1,66 @@ +# -*-coding:utf-8-*- + +from scrapy import log + +"""避免被ban策略之一:使用useragent池。 + +使用注意:需在settings.py中进行相应的设置。 +""" + +import random +from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware + +class RotateUserAgentMiddleware(UserAgentMiddleware): + + def __init__(self, user_agent=''): + self.user_agent = user_agent + + def process_request(self, request, spider): + ua = random.choice(self.user_agent_list) + if ua: + #显示当前使用的useragent + print "********Current UserAgent:%s************",ua + log.msg('Current UserAgent:'+ua,log.INFO) + request.headers.setdefault('User-Agent', ua) + + #the default user_agent_list composes chrome,I E,firefox,Mozilla,opera,netscape + #for more user agent strings,you can find it in http://www.useragentstring.com/pages/useragentstring.php + user_agent_list = [\ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " + "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", + "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 " + "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 " + "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 " + "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 " + "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 " + "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", + "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 " + "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " + "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", + "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 " + "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 " + "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " + "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " + "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " + "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " + "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 " + "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " + "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 " + "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 " + "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" + ] + diff --git a/news_spider/news_spider/rotateuseragent.pyc b/news_spider/news_spider/rotateuseragent.pyc new file mode 100644 index 0000000..4b640fc Binary files /dev/null and b/news_spider/news_spider/rotateuseragent.pyc differ diff --git a/news_spider/news_spider/settings.py b/news_spider/news_spider/settings.py index dad51e0..3a12d0e 100644 --- a/news_spider/news_spider/settings.py +++ b/news_spider/news_spider/settings.py @@ -18,7 +18,8 @@ COMMANDS_MODULE='news_spider.commands' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'news_spider (+http://www.yourdomain.com)' -USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36' +#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36' + # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS=32 @@ -51,9 +52,10 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'news_spider.middlewares.MyCustomDownloaderMiddleware': 543, -#} +DOWNLOADER_MIDDLEWARES = { + 'news_spider.middlewares.MyCustomDownloaderMiddleware': None, + 'news_spider.rotateuseragent.RotateUserAgentMiddleware':400 +} # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html @@ -64,7 +66,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { -# 'news_spider.pipelines.TouTiaoPipeline': 300, + 'news_spider.pipelines.NewsSpiderPipeline': 300, 'news_spider.pipelines.TitlePipeline': 500, } diff --git a/news_spider/news_spider/settings.pyc b/news_spider/news_spider/settings.pyc index 663d551..31c6588 100644 Binary files a/news_spider/news_spider/settings.pyc and b/news_spider/news_spider/settings.pyc differ diff --git a/news_spider/news_spider/spiders/NetEase.pyc b/news_spider/news_spider/spiders/NetEase.pyc index 976b66e..42dc16d 100644 Binary files a/news_spider/news_spider/spiders/NetEase.pyc and b/news_spider/news_spider/spiders/NetEase.pyc differ diff --git a/news_spider/news_spider/spiders/Tencent.py b/news_spider/news_spider/spiders/Tencent.py index de35f9f..549f8c7 100644 --- a/news_spider/news_spider/spiders/Tencent.py +++ b/news_spider/news_spider/spiders/Tencent.py @@ -11,7 +11,7 @@ class TencentSpider(scrapy.Spider): name='tencent' allowed_domains=['news.qq.com'] - base_url = 'http://news.qq.com/b/history/index' +# base_url = 'http://news.qq.com/b/history/index' # year = ['2016','2015','2014'] # month = ['12','11','10','09','08','07','06','05','04','03','02','01'] # day = ['31','30','29','28','27','26','25','24','23','22','21', diff --git a/news_spider/news_spider/spiders/Tencent.pyc b/news_spider/news_spider/spiders/Tencent.pyc index 8b1f959..6488437 100644 Binary files a/news_spider/news_spider/spiders/Tencent.pyc and b/news_spider/news_spider/spiders/Tencent.pyc differ diff --git a/news_spider/news_spider/spiders/TouTiaoSpider.pyc b/news_spider/news_spider/spiders/TouTiaoSpider.pyc index b4599aa..d468fb9 100644 Binary files a/news_spider/news_spider/spiders/TouTiaoSpider.pyc and b/news_spider/news_spider/spiders/TouTiaoSpider.pyc differ diff --git a/news_spider/show.py b/news_spider/show.py index c83c1c9..2ed65f9 100755 --- a/news_spider/show.py +++ b/news_spider/show.py @@ -5,11 +5,13 @@ reload(sys) sys.setdefaultencoding( "utf-8" ) file = open(sys.argv[1]) +c=0 while 1: line = file.readline() if not line: break data = json.loads(line) + c+=1 print data['time'],data['title'],data['url'] #data = json.load(file) @@ -21,3 +23,4 @@ while 1: # print article['time'],article['title'] # print "[----Article--]\n",article['content'],"\n\n" +print c