From 71c564deb86895a2eb663ae03e6d7c04821066a0 Mon Sep 17 00:00:00 2001 From: dhc_king <601767890@qq.com> Date: Thu, 26 Sep 2019 03:55:32 +0000 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E4=BB=A3=E7=A0=81=EF=BC=8C?= =?UTF-8?q?=E6=8F=90=E9=AB=98=E4=BB=A3=E7=A0=81=E5=8F=AF=E7=A7=BB=E6=A4=8D?= =?UTF-8?q?=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 ++ news_spider/news_spider/rotateuseragent.py | 27 ++++++++++++++++++---- requirements.txt | 1 + tools/news2db.py | 9 ++++++++ 4 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index c44ff6c..21eeebe 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ data/title.json data/cutnews data/orinews data/inversedata +**/*.pyc +tools/news.db diff --git a/news_spider/news_spider/rotateuseragent.py b/news_spider/news_spider/rotateuseragent.py index 1caebbf..dc10737 100644 --- a/news_spider/news_spider/rotateuseragent.py +++ b/news_spider/news_spider/rotateuseragent.py @@ -1,6 +1,6 @@ # -*-coding:utf-8-*- -from scrapy import log +import logging """避免被ban策略之一:使用useragent池。 @@ -8,7 +8,26 @@ from scrapy import log """ import random -from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware +from scrapy import signals + +class UserAgentMiddleware(object): + """This middleware allows spiders to override the user_agent""" + + def __init__(self, user_agent='Scrapy'): + self.user_agent = user_agent + + @classmethod + def from_crawler(cls, crawler): + o = cls(crawler.settings['USER_AGENT']) + crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) + return o + + def spider_opened(self, spider): + self.user_agent = getattr(spider, 'user_agent', self.user_agent) + + def process_request(self, request, spider): + if self.user_agent: + request.headers.setdefault(b'User-Agent', self.user_agent) class RotateUserAgentMiddleware(UserAgentMiddleware): @@ -19,8 +38,8 
@@ class RotateUserAgentMiddleware(UserAgentMiddleware): ua = random.choice(self.user_agent_list) if ua: #显示当前使用的useragent - print "********Current UserAgent:%s************",ua - log.msg('Current UserAgent:'+ua,log.INFO) + print "********Current UserAgent:%s************" % ua + logging.info('Current UserAgent:'+ua) request.headers.setdefault('User-Agent', ua) #the default user_agent_list composes chrome,I E,firefox,Mozilla,opera,netscape diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9045790 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +Scrapy==1.7.3 diff --git a/tools/news2db.py b/tools/news2db.py index 603cc04..8f7a0ec 100644 --- a/tools/news2db.py +++ b/tools/news2db.py @@ -10,6 +10,15 @@ sys.setdefaultencoding('utf-8') file = open(Global.content_dir) conn = sqlite3.connect('news.db') + +# Check whether the table exists +cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';") +result = cursor.fetchall() +tables = [tables[0] for tables in result] +if 'news' not in tables: + conn.execute("CREATE TABLE news (title, time, url)") + conn.commit() + while 1: line = file.readline() if not line: