优化代码,提高代码可移植性
This commit is contained in:
parent
77c7f322a5
commit
71c564deb8
2
.gitignore
vendored
2
.gitignore
vendored
@ -5,3 +5,5 @@ data/title.json
|
||||
data/cutnews
|
||||
data/orinews
|
||||
data/inversedata
|
||||
**/*.pyc
|
||||
tools/news.db
|
||||
|
@ -1,6 +1,6 @@
|
||||
# -*-coding:utf-8-*-
|
||||
|
||||
from scrapy import log
|
||||
import logging
|
||||
|
||||
"""避免被ban策略之一:使用useragent池。
|
||||
|
||||
@ -8,7 +8,26 @@ from scrapy import log
|
||||
"""
|
||||
|
||||
import random
|
||||
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
|
||||
from scrapy import signals
|
||||
|
||||
class UserAgentMiddleware(object):
|
||||
"""This middleware allows spiders to override the user_agent"""
|
||||
|
||||
def __init__(self, user_agent='Scrapy'):
|
||||
self.user_agent = user_agent
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.settings['USER_AGENT'])
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.user_agent = getattr(spider, 'user_agent', self.user_agent)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if self.user_agent:
|
||||
request.headers.setdefault(b'User-Agent', self.user_agent)
|
||||
|
||||
class RotateUserAgentMiddleware(UserAgentMiddleware):
|
||||
|
||||
@ -19,8 +38,8 @@ class RotateUserAgentMiddleware(UserAgentMiddleware):
|
||||
ua = random.choice(self.user_agent_list)
|
||||
if ua:
|
||||
#显示当前使用的useragent
|
||||
print "********Current UserAgent:%s************",ua
|
||||
log.msg('Current UserAgent:'+ua,log.INFO)
|
||||
print "********Current UserAgent:%s************" % ua
|
||||
logging.info('Current UserAgent:'+ua)
|
||||
request.headers.setdefault('User-Agent', ua)
|
||||
|
||||
# The default user_agent_list comprises Chrome, IE, Firefox, Mozilla, Opera, and Netscape
|
||||
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
Scrapy==1.7.3
|
@ -10,6 +10,15 @@ sys.setdefaultencoding('utf-8')
|
||||
|
||||
file = open(Global.content_dir)
|
||||
conn = sqlite3.connect('news.db')
|
||||
|
||||
# Check whether the table exists
|
||||
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
|
||||
result = cursor.fetchall()
|
||||
tables = [tables[0] for tables in result]
|
||||
if 'news' not in tables:
|
||||
conn.execute("CREATE TABLE news (title, time, url)")
|
||||
conn.commit()
|
||||
|
||||
while 1:
|
||||
line = file.readline()
|
||||
if not line:
|
||||
|
Loading…
Reference in New Issue
Block a user