Add a UserAgent pool to keep the crawler from being banned; add a process lock so that several crawlers started at the same time cannot corrupt the shared output file

lzjqsdd 2016-04-23 12:34:35 +08:00
parent cfed93f3ef
commit d7a3e28f59
14 changed files with 108 additions and 17 deletions

.gitignore
View File

@@ -1,3 +1,5 @@
 news_spider/tt.json
 news_spider/ne.json
 news_spider/te.json
+news_spider/title.json
+news_spider/news.json

View File

@@ -33,6 +33,6 @@ class Command(ScrapyCommand):
     spider_loader = self.crawler_process.spider_loader
     for spidername in args or spider_loader.list():
-        print "*********cralall spidername************" + spidername
+        print "*********crawlall spidername************" + spidername
         self.crawler_process.crawl(spidername, **opts.spargs)
     self.crawler_process.start()
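
Since settings.py sets COMMANDS_MODULE='news_spider.commands', Scrapy exposes this class as a custom subcommand. Assuming the module file is named crawlall.py (the filename is not shown in this diff), running every spider in one go would look like:

    $ scrapy crawlall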

View File

@@ -7,33 +7,51 @@
 import codecs
 import json
 from items import TitleSpiderItem
 import fcntl
+import threading

 class NewsSpiderPipeline(object):
+    lock = threading.Lock()
+    file = open('news.json','a')
     def __init__(self):
-        self.file = open('news.json','wb')
+        pass

     def process_item(self,item,spider):
-        fcntl.flock(self.file,fcntl.LOCK_EX)
         line = json.dumps(dict(item))+'\n'
-        self.file.write(line)
-        fcntl.flock(self.file,fcntl.LOCK_UN)
+        try:
+            NewsSpiderPipeline.lock.acquire()
+            NewsSpiderPipeline.file.write(line)
+        except:
+            pass
+        finally:
+            NewsSpiderPipeline.lock.release()
         return item

     def spider_closed(self,spider):
         pass

 class TitlePipeline(object):
-    def __init__(self):
-        file_title = open('title.json','wb')
+    lock = threading.Lock()
+    file_title = open('title.json','a')
+    def __init__(self):
+        pass

     def process_item(self,item,spider):
-        fcntl.flock(file_title,fcntl.LOCK_EX)
         title_item = TitleSpiderItem()
         title_item['title'] = item['title']
         title_item['time'] = item['time']
         title_item['url'] = item['url']
         line = json.dumps(dict(title_item))+'\n'
-        file_title.write(line)
-        fcntl.flock(file_title,fcntl.LOCK_UN)
+        try:
+            TitlePipeline.lock.acquire()
+            TitlePipeline.file_title.write(line)
+        except:
+            pass
+        finally:
+            TitlePipeline.lock.release()
         return item

     def spider_closed(self,spider):
         pass
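
A note on this locking change: threading.Lock serializes writers only inside a single Python process, while the removed fcntl.flock calls are what provided cross-process exclusion, so the "process lock" from the commit message is really a thread lock; the bare except also silently swallows write errors. A minimal sketch that keeps both levels of locking (a hypothetical helper, not part of this commit; the news.json path is the one used above):

    # locked_writer.py -- illustrative sketch only, not part of this commit
    import fcntl
    import json
    import threading

    class LockedJsonWriter(object):
        """Append JSON lines safely across threads (Lock) and processes (flock)."""
        lock = threading.Lock()

        def __init__(self, path='news.json'):
            self.file = open(path, 'a')

        def write_item(self, item):
            line = json.dumps(dict(item)) + '\n'
            with LockedJsonWriter.lock:                # thread-level exclusion
                fcntl.flock(self.file, fcntl.LOCK_EX)  # process-level exclusion
                try:
                    self.file.write(line)
                    self.file.flush()                  # flush before releasing the lock
                finally:
                    fcntl.flock(self.file, fcntl.LOCK_UN)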

View File

@@ -0,0 +1,66 @@
# -*-coding:utf-8-*-
from scrapy import log
"""One strategy for avoiding bans: use a pool of user agents.
Note: the corresponding settings must be configured in settings.py.
"""
import random
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # Show the user agent currently in use
            print "********Current UserAgent:%s************" % ua
            log.msg('Current UserAgent:' + ua, log.INFO)
            request.headers.setdefault('User-Agent', ua)

    # The default user_agent_list is composed of Chrome, IE, Firefox, Mozilla,
    # Opera and Netscape strings; more user agent strings can be found at
    # http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
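
The scrapy.contrib.downloadermiddleware.useragent path and scrapy.log used above are the pre-1.0 Scrapy APIs. On Scrapy 1.0 or later the same middleware could be written as follows (a sketch assuming Scrapy >= 1.0, not part of this commit; it reuses the user_agent_list pool shown above, truncated here for brevity):

    # Sketch for Scrapy >= 1.0 (assumption; not part of this commit)
    import logging
    import random

    from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

    logger = logging.getLogger(__name__)

    class RotateUserAgentMiddleware(UserAgentMiddleware):
        # Same pool as above; two entries shown for brevity
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        ]

        def process_request(self, request, spider):
            ua = random.choice(self.user_agent_list)
            if ua:
                logger.info('Current UserAgent: %s', ua)
                request.headers.setdefault('User-Agent', ua)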

Binary file not shown.

View File

@@ -18,7 +18,8 @@ COMMANDS_MODULE='news_spider.commands'

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'news_spider (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'
+#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS=32

@@ -51,9 +52,10 @@

 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    'news_spider.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+DOWNLOADER_MIDDLEWARES = {
+    'news_spider.middlewares.MyCustomDownloaderMiddleware': None,
+    'news_spider.rotateuseragent.RotateUserAgentMiddleware': 400
+}

 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html

@@ -64,7 +66,7 @@

 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-#    'news_spider.pipelines.TouTiaoPipeline': 300,
     'news_spider.pipelines.NewsSpiderPipeline': 300,
     'news_spider.pipelines.TitlePipeline': 500,
 }
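
One thing to watch: Scrapy's stock UserAgentMiddleware also sits at priority 400 in DOWNLOADER_MIDDLEWARES_BASE, so registering the rotating middleware at the same number leaves both enabled. A common refinement, which is an assumption on my part and not something this commit does, is to disable the stock middleware explicitly so that only the pool ever sets the header:

    # Hypothetical refinement of the settings above (not part of this commit)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
        'news_spider.rotateuseragent.RotateUserAgentMiddleware': 400,
    }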

Binary file not shown.

View File

@@ -11,7 +11,7 @@ class TencentSpider(scrapy.Spider):
     name='tencent'
     allowed_domains=['news.qq.com']
-    base_url = 'http://news.qq.com/b/history/index'
+#    base_url = 'http://news.qq.com/b/history/index'
 #    year = ['2016','2015','2014']
 #    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
 #    day = ['31','30','29','28','27','26','25','24','23','22','21',

View File

@@ -5,11 +5,13 @@ reload(sys)
 sys.setdefaultencoding( "utf-8" )

 file = open(sys.argv[1])
+c=0
 while 1:
     line = file.readline()
     if not line:
         break
     data = json.loads(line)
+    c+=1
     print data['time'],data['title'],data['url']

 #data = json.load(file)
@@ -21,3 +23,4 @@ while 1:
 #    print article['time'],article['title']
 #    print "[----Article--]\n",article['content'],"\n\n"
+print c
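
For reference, this reader script consumes one JSON object per line (the format both pipelines write), prints each record, and ends with the running total. Assuming it is saved as read_news.py (the actual filename is not visible in this diff), a typical invocation would be:

    $ python read_news.py news.json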