优化代码,提高代码可移植性

This commit is contained in:
dhc_king 2019-09-26 03:55:32 +00:00
parent 77c7f322a5
commit 71c564deb8
4 changed files with 35 additions and 4 deletions

2
.gitignore vendored
View File

@ -5,3 +5,5 @@ data/title.json
data/cutnews
data/orinews
data/inversedata
**/*.pyc
tools/news.db

View File

@ -1,6 +1,6 @@
# -*-coding:utf-8-*-
from scrapy import log
import logging
"""避免被ban策略之一使用useragent池。
@ -8,7 +8,26 @@ from scrapy import log
"""
import random
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
from scrapy import signals
class UserAgentMiddleware(object):
    """Middleware that lets a spider supply its own User-Agent string.

    The agent string is taken from the crawler's USER_AGENT setting and may
    be overridden per-spider via a ``user_agent`` attribute once the spider
    opens.
    """

    def __init__(self, user_agent='Scrapy'):
        # Fallback agent used until a crawler setting / spider overrides it.
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings and hook spider_opened."""
        middleware = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        # Prefer the spider's own user_agent attribute when it defines one.
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        # Only set the header when an agent string is configured, and never
        # clobber a header the request already carries (setdefault).
        if not self.user_agent:
            return
        request.headers.setdefault(b'User-Agent', self.user_agent)
class RotateUserAgentMiddleware(UserAgentMiddleware):
@ -19,8 +38,8 @@ class RotateUserAgentMiddleware(UserAgentMiddleware):
ua = random.choice(self.user_agent_list)
if ua:
#显示当前使用的useragent
print "********Current UserAgent:%s************",ua
log.msg('Current UserAgent:'+ua,log.INFO)
print "********Current UserAgent:%s************" % ua
logging.info('Current UserAgent:'+ua)
request.headers.setdefault('User-Agent', ua)
#the default user_agent_list includes Chrome, IE, Firefox, Mozilla, Opera, and Netscape

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
Scrapy==1.7.3

View File

@ -10,6 +10,15 @@ sys.setdefaultencoding('utf-8')
file = open(Global.content_dir)
conn = sqlite3.connect('news.db')
# Check whether the 'news' table exists; create it on first run.
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
result = cursor.fetchall()
# Each row is a 1-tuple (table_name,); use a distinct loop variable instead of
# shadowing the target list ('tables') with the per-row variable as before.
tables = [row[0] for row in result]
if 'news' not in tables:
    conn.execute("CREATE TABLE news (title, time, url)")
    conn.commit()
while 1:
line = file.readline()
if not line: