diff --git a/.gitignore b/.gitignore
index c44ff6c..21eeebe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@ data/title.json
 data/cutnews
 data/orinews
 data/inversedata
+**/*.pyc
+tools/news.db
diff --git a/news_spider/news_spider/rotateuseragent.py b/news_spider/news_spider/rotateuseragent.py
index 1caebbf..dc10737 100644
--- a/news_spider/news_spider/rotateuseragent.py
+++ b/news_spider/news_spider/rotateuseragent.py
@@ -1,6 +1,6 @@
 # -*-coding:utf-8-*-
 
-from scrapy import log
+import logging
 
 """One of the anti-ban strategies: rotate through a pool of user agents.
 
@@ -8,7 +8,26 @@ from scrapy import log
 """
 
 import random
-from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
+from scrapy import signals
+
+class UserAgentMiddleware(object):
+    """This middleware allows spiders to override the user_agent"""
+
+    def __init__(self, user_agent='Scrapy'):
+        self.user_agent = user_agent
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        o = cls(crawler.settings['USER_AGENT'])
+        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+        return o
+
+    def spider_opened(self, spider):
+        self.user_agent = getattr(spider, 'user_agent', self.user_agent)
+
+    def process_request(self, request, spider):
+        if self.user_agent:
+            request.headers.setdefault(b'User-Agent', self.user_agent)
 
 
 class RotateUserAgentMiddleware(UserAgentMiddleware):
@@ -19,8 +38,8 @@ class RotateUserAgentMiddleware(UserAgentMiddleware):
         ua = random.choice(self.user_agent_list)
         if ua:
             # Show the user agent currently in use
-            print "********Current UserAgent:%s************",ua
-            log.msg('Current UserAgent:'+ua,log.INFO)
+            print "********Current UserAgent:%s************" % ua
+            logging.info('Current UserAgent:'+ua)
             request.headers.setdefault('User-Agent', ua)
 
     # the default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9045790
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+Scrapy==1.7.3
diff --git a/tools/news2db.py b/tools/news2db.py
index 603cc04..8f7a0ec 100644
--- a/tools/news2db.py
+++ b/tools/news2db.py
@@ -10,6 +10,15 @@ sys.setdefaultencoding('utf-8')
 
 file = open(Global.content_dir)
 conn = sqlite3.connect('news.db')
+
+# Create the news table if it does not exist yet
+cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
+result = cursor.fetchall()
+tables = [row[0] for row in result]
+if 'news' not in tables:
+    conn.execute("CREATE TABLE news (title, time, url)")
+    conn.commit()
+
 while 1:
     line = file.readline()
     if not line: