Add a UserAgent pool to keep the crawler from being banned; add a process lock so that several crawlers started at the same time cannot corrupt the shared output file

lzjqsdd 2016-04-23 12:34:35 +08:00
parent cfed93f3ef
commit d7a3e28f59
14 changed files with 108 additions and 17 deletions

.gitignore
View File

@@ -1,3 +1,5 @@
 news_spider/tt.json
 news_spider/ne.json
 news_spider/te.json
+news_spider/title.json
+news_spider/news.json

View File

@@ -33,6 +33,6 @@ class Command(ScrapyCommand):
     spider_loader = self.crawler_process.spider_loader
     for spidername in args or spider_loader.list():
-        print "*********cralall spidername************" + spidername
+        print "*********crawlall spidername************" + spidername
         self.crawler_process.crawl(spidername, **opts.spargs)
     self.crawler_process.start()
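
Since settings.py sets COMMANDS_MODULE='news_spider.commands', Scrapy exposes this class as a custom subcommand. Assuming the module file is named crawlall.py (the filename is not shown in this diff), running every spider in one go would look like:

    $ scrapy crawlall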

View File

@@ -7,33 +7,51 @@
 import codecs
 import json
 from items import TitleSpiderItem
 import fcntl
+import threading

 class NewsSpiderPipeline(object):
+    lock = threading.Lock()
+    file = open('news.json','a')
     def __init__(self):
-        self.file = open('news.json','wb')
+        pass

     def process_item(self,item,spider):
-        fcntl.flock(self.file,fcntl.LOCK_EX)
         line = json.dumps(dict(item))+'\n'
-        self.file.write(line)
-        fcntl.flock(self.file,fcntl.LOCK_UN)
+        try:
+            NewsSpiderPipeline.lock.acquire()
+            NewsSpiderPipeline.file.write(line)
+        except:
+            pass
+        finally:
+            NewsSpiderPipeline.lock.release()
         return item

     def spider_closed(self,spider):
         pass

 class TitlePipeline(object):
-    def __init__(self):
-        file_title = open('title.json','wb')
+    lock = threading.Lock()
+    file_title = open('title.json','a')
+    def __init__(self):
+        pass

     def process_item(self,item,spider):
-        fcntl.flock(file_title,fcntl.LOCK_EX)
         title_item = TitleSpiderItem()
         title_item['title'] = item['title']
         title_item['time'] = item['time']
         title_item['url'] = item['url']
         line = json.dumps(dict(title_item))+'\n'
-        file_title.write(line)
-        fcntl.flock(file_title,fcntl.LOCK_UN)
+        try:
+            TitlePipeline.lock.acquire()
+            TitlePipeline.file_title.write(line)
+        except:
+            pass
+        finally:
+            TitlePipeline.lock.release()
         return item

     def spider_closed(self,spider):
         pass
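
A note on this locking change: threading.Lock serializes writers only inside a single Python process, while the removed fcntl.flock calls are what provided cross-process exclusion, so the "process lock" from the commit message is really a thread lock; the bare except also silently swallows write errors. A minimal sketch that keeps both levels of locking (a hypothetical helper, not part of this commit; the news.json path is the one used above):

    # locked_writer.py -- illustrative sketch only, not part of this commit
    import fcntl
    import json
    import threading

    class LockedJsonWriter(object):
        """Append JSON lines safely across threads (Lock) and processes (flock)."""
        lock = threading.Lock()

        def __init__(self, path='news.json'):
            self.file = open(path, 'a')

        def write_item(self, item):
            line = json.dumps(dict(item)) + '\n'
            with LockedJsonWriter.lock:                # thread-level exclusion
                fcntl.flock(self.file, fcntl.LOCK_EX)  # process-level exclusion
                try:
                    self.file.write(line)
                    self.file.flush()                  # flush before releasing the lock
                finally:
                    fcntl.flock(self.file, fcntl.LOCK_UN)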

View File

@@ -0,0 +1,66 @@
# -*-coding:utf-8-*-
from scrapy import log
"""One strategy for avoiding bans: use a pool of user agents.
Note: the corresponding settings must be configured in settings.py.
"""
import random
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # Show the user agent currently in use
            print "********Current UserAgent:%s************" % ua
            log.msg('Current UserAgent:' + ua, log.INFO)
            request.headers.setdefault('User-Agent', ua)

    # The default user_agent_list is composed of Chrome, IE, Firefox, Mozilla,
    # Opera and Netscape strings; more user agent strings can be found at
    # http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
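
The scrapy.contrib.downloadermiddleware.useragent path and scrapy.log used above are the pre-1.0 Scrapy APIs. On Scrapy 1.0 or later the same middleware could be written as follows (a sketch assuming Scrapy >= 1.0, not part of this commit; it reuses the user_agent_list pool shown above, truncated here for brevity):

    # Sketch for Scrapy >= 1.0 (assumption; not part of this commit)
    import logging
    import random

    from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

    logger = logging.getLogger(__name__)

    class RotateUserAgentMiddleware(UserAgentMiddleware):
        # Same pool as above; two entries shown for brevity
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        ]

        def process_request(self, request, spider):
            ua = random.choice(self.user_agent_list)
            if ua:
                logger.info('Current UserAgent: %s', ua)
                request.headers.setdefault('User-Agent', ua)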

Binary file not shown.

View File

@@ -18,7 +18,8 @@ COMMANDS_MODULE='news_spider.commands'

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'news_spider (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'
+#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS=32

@@ -51,9 +52,10 @@

 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    'news_spider.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+DOWNLOADER_MIDDLEWARES = {
+    'news_spider.middlewares.MyCustomDownloaderMiddleware': None,
+    'news_spider.rotateuseragent.RotateUserAgentMiddleware': 400
+}

 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html

@@ -64,7 +66,7 @@

 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-#    'news_spider.pipelines.TouTiaoPipeline': 300,
     'news_spider.pipelines.NewsSpiderPipeline': 300,
     'news_spider.pipelines.TitlePipeline': 500,
 }
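
One thing to watch: Scrapy's stock UserAgentMiddleware also sits at priority 400 in DOWNLOADER_MIDDLEWARES_BASE, so registering the rotating middleware at the same number leaves both enabled. A common refinement, which is an assumption on my part and not something this commit does, is to disable the stock middleware explicitly so that only the pool ever sets the header:

    # Hypothetical refinement of the settings above (not part of this commit)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
        'news_spider.rotateuseragent.RotateUserAgentMiddleware': 400,
    }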

Binary file not shown.

View File

@@ -11,7 +11,7 @@ class TencentSpider(scrapy.Spider):
     name='tencent'
     allowed_domains=['news.qq.com']
-    base_url = 'http://news.qq.com/b/history/index'
+#    base_url = 'http://news.qq.com/b/history/index'
 #    year = ['2016','2015','2014']
 #    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
 #    day = ['31','30','29','28','27','26','25','24','23','22','21',

View File

@@ -5,11 +5,13 @@ reload(sys)
 sys.setdefaultencoding( "utf-8" )

 file = open(sys.argv[1])
+c=0
 while 1:
     line = file.readline()
     if not line:
         break
     data = json.loads(line)
+    c+=1
     print data['time'],data['title'],data['url']

 #data = json.load(file)
@@ -21,3 +23,4 @@ while 1:
 #    print article['time'],article['title']
 #    print "[----Article--]\n",article['content'],"\n\n"
+print c
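
For reference, this reader script consumes one JSON object per line (the format both pipelines write), prints each record, and ends with the running total. Assuming it is saved as read_news.py (the actual filename is not visible in this diff), a typical invocation would be:

    $ python read_news.py news.json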