Add a UserAgent pool to keep the crawler from being banned, and add a process lock so that several crawlers started at the same time cannot corrupt data by writing to the same file.
commit d7a3e28f59 (parent cfed93f3ef)
.gitignore (vendored, 2 changes)

@@ -1,3 +1,5 @@
 news_spider/tt.json
 news_spider/ne.json
 news_spider/te.json
+news_spider/title.json
+news_spider/news.json
@@ -33,6 +33,6 @@ class Command(ScrapyCommand):
         spider_loader = self.crawler_process.spider_loader
         for spidername in args or spider_loader.list():
-            print "*********cralall spidername************" + spidername
+            print "*********crawlall spidername************" + spidername
             self.crawler_process.crawl(spidername, **opts.spargs)
         self.crawler_process.start()
Binary file not shown.
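The hunk above edits a custom Scrapy command whose file name is not shown in this view; settings.py points COMMANDS_MODULE at 'news_spider.commands'. For orientation, a minimal sketch of what such a crawl-all command typically looks like on the old optparse-based Scrapy this project uses. The file location, short_desc text, and option handling are assumptions, not the project's exact code, and import paths vary across Scrapy versions:

```python
# Sketch of a crawl-all command (assumed location: news_spider/commands/crawlall.py).
from scrapy.commands import ScrapyCommand
from scrapy.utils.conf import arglist_to_dict

class Command(ScrapyCommand):
    requires_project = True

    def short_desc(self):
        return 'Run every spider in the project'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option('-a', dest='spargs', action='append', default=[],
                          metavar='NAME=VALUE', help='set spider argument (may be repeated)')

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        opts.spargs = arglist_to_dict(opts.spargs)

    def run(self, args, opts):
        spider_loader = self.crawler_process.spider_loader
        # schedule the named spiders, or every spider the project defines
        for spidername in args or spider_loader.list():
            self.crawler_process.crawl(spidername, **opts.spargs)
        # start() blocks until all scheduled crawls finish
        self.crawler_process.start()
```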
@@ -7,33 +7,51 @@
 import codecs
 import json
 from items import TitleSpiderItem
-import fcntl
+import threading


 class NewsSpiderPipeline(object):
+    lock = threading.Lock()
+    file = open('news.json','a')
+
     def __init__(self):
-        self.file = open('news.json','wb')
+        pass

     def process_item(self,item,spider):
-        fcntl.flock(self.file,fcntl.LOCK_EX)
         line = json.dumps(dict(item))+'\n'
-        self.file.write(line)
-        fcntl.flock(self.file,fcntl.LOCK_UN)
+        try:
+            NewsSpiderPipeline.lock.acquire()
+            NewsSpiderPipeline.file.write(line)
+        except:
+            pass
+        finally:
+            NewsSpiderPipeline.lock.release()
         return item

     def spider_closed(self,spider):
         pass


 class TitlePipeline(object):
-    def __init__(self):
-        file_title = open('title.json','wb')
+    lock = threading.Lock()
+    file_title = open('title.json','a')
+
+    def __init__(self):
+        pass
+
     def process_item(self,item,spider):
-        fcntl.flock(file_title,fcntl.LOCK_EX)
         title_item = TitleSpiderItem()
         title_item['title'] = item['title']
         title_item['time'] = item['time']
         title_item['url'] = item['url']
         line = json.dumps(dict(title_item))+'\n'
-        file_title.write(line)
-        fcntl.flock(file_title,fcntl.LOCK_UN)
+        try:
+            TitlePipeline.lock.acquire()
+            TitlePipeline.file_title.write(line)
+        except:
+            pass
+        finally:
+            TitlePipeline.lock.release()
         return item

     def spider_closed(self,spider):
         pass
Binary file not shown.
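The change above swaps per-instance files plus fcntl.flock for a class-level file handle guarded by a class-level threading.Lock, and switches from 'wb' to append mode. A condensed sketch of that pattern with illustrative names (not the project's exact code); note that a threading.Lock only serializes writers within a single process:

```python
# Condensed sketch of the write-with-lock pattern the new pipelines use.
# Class-level state is shared by every instance of the pipeline.
import json
import threading

class JsonLinesPipeline(object):
    lock = threading.Lock()          # one lock shared across instances
    file = open('items.json', 'a')   # append mode instead of 'wb'

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + '\n'
        # 'with lock' is equivalent to acquire()/release() in try/finally
        with JsonLinesPipeline.lock:
            JsonLinesPipeline.file.write(line)
        return item
```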
news_spider/news_spider/rotateuseragent.py (new file, 66 lines)

@@ -0,0 +1,66 @@
+# -*-coding:utf-8-*-
+
+from scrapy import log
+
+"""One of the anti-ban strategies: use a UserAgent pool.
+
+Usage note: the corresponding settings must be configured in settings.py.
+"""
+
+import random
+from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
+
+class RotateUserAgentMiddleware(UserAgentMiddleware):
+
+    def __init__(self, user_agent=''):
+        self.user_agent = user_agent
+
+    def process_request(self, request, spider):
+        ua = random.choice(self.user_agent_list)
+        if ua:
+            # show the UserAgent currently in use
+            print "********Current UserAgent:%s************",ua
+            log.msg('Current UserAgent:'+ua,log.INFO)
+            request.headers.setdefault('User-Agent', ua)
+
+    # the default user_agent_list includes Chrome, IE, Firefox, Mozilla, Opera, Netscape
+    # for more user agent strings, see http://www.useragentstring.com/pages/useragentstring.php
+    user_agent_list = [\
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
+        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
+        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
+        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
+        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
+        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
+        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
+        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
+        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
+        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
+        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
+        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
+    ]
news_spider/news_spider/rotateuseragent.pyc (new binary file; not shown)
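scrapy.contrib and scrapy.log were removed in later Scrapy releases; if this middleware is ever ported forward, the same rotation idea looks roughly like this. A sketch assuming Scrapy 1.0+ import paths, not part of this commit:

```python
# UA rotation with post-contrib Scrapy paths (sketch).
import logging
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

logger = logging.getLogger(__name__)

class RotateUserAgentMiddleware(UserAgentMiddleware):
    # reuse the same pool of UA strings as the file above
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        # ... remaining entries from the pool ...
    ]

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            logger.info('Current UserAgent: %s', ua)
            request.headers.setdefault('User-Agent', ua)
```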
@@ -18,7 +18,8 @@ COMMANDS_MODULE='news_spider.commands'

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'news_spider (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'
+#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS=32

@@ -51,9 +52,10 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML

 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    'news_spider.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+DOWNLOADER_MIDDLEWARES = {
+    'news_spider.middlewares.MyCustomDownloaderMiddleware': None,
+    'news_spider.rotateuseragent.RotateUserAgentMiddleware':400
+}

 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html

@@ -64,7 +66,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML

 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    # 'news_spider.pipelines.TouTiaoPipeline': 300,
+    'news_spider.pipelines.NewsSpiderPipeline': 300,
     'news_spider.pipelines.TitlePipeline': 500,
 }
Binary file not shown.
Binary file not shown.
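One detail worth noting about the new DOWNLOADER_MIDDLEWARES block: order 400 is the slot Scrapy's stock UserAgentMiddleware occupies in DOWNLOADER_MIDDLEWARES_BASE, so the rotating version runs in its place in the chain. If the stock middleware should not also run, it can be disabled explicitly. A hypothetical variant of the setting, using this project's old scrapy.contrib path:

```python
# Hypothetical variant: explicitly disable the stock UA middleware.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'news_spider.rotateuseragent.RotateUserAgentMiddleware': 400,
}
```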
@@ -11,7 +11,7 @@ class TencentSpider(scrapy.Spider):
     name='tencent'
     allowed_domains=['news.qq.com']

-    base_url = 'http://news.qq.com/b/history/index'
+    # base_url = 'http://news.qq.com/b/history/index'
     # year = ['2016','2015','2014']
     # month = ['12','11','10','09','08','07','06','05','04','03','02','01']
     # day = ['31','30','29','28','27','26','25','24','23','22','21',
Binary file not shown.
Binary file not shown.
@@ -5,11 +5,13 @@ reload(sys)
 sys.setdefaultencoding( "utf-8" )

 file = open(sys.argv[1])
+c=0
 while 1:
     line = file.readline()
     if not line:
         break
     data = json.loads(line)
+    c+=1
     print data['time'],data['title'],data['url']

 #data = json.load(file)

@@ -21,3 +23,4 @@ while 1:
 #    print article['time'],article['title']
 #    print "[----Article--]\n",article['content'],"\n\n"

+print c
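The reader script above is Python 2 only (reload(sys), sys.setdefaultencoding, print statements); its file name is not shown in this view. For reference, a Python 3 sketch of the same JSON-lines loop over the files the pipelines write:

```python
# Python 3 sketch of the JSON-lines reader: one JSON object per line.
import json
import sys

count = 0
with open(sys.argv[1], encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        count += 1
        print(data['time'], data['title'], data['url'])

print(count)
```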