Change the time field to Unix-timestamp format; when three spiders run at the same time and write to one file, dirty data appears

lzjqsdd 2016-04-22 17:16:06 +08:00
parent 3201d09c43
commit cfed93f3ef
18 changed files with 87 additions and 81737 deletions

@@ -35,5 +35,4 @@ class Command(ScrapyCommand):
         for spidername in args or spider_loader.list():
             print "*********cralall spidername************" + spidername
             self.crawler_process.crawl(spidername, **opts.spargs)
         self.crawler_process.start()
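For context on the hunk above: crawlall is a custom project command that schedules every spider into one shared CrawlerProcess and then starts the Twisted reactor once, which is exactly how three spiders end up writing output files concurrently. A minimal sketch of how such a command is typically wired up (the file path and the COMMANDS_MODULE registration are assumptions on my part; the commit only shows the run loop):

# news_spider/commands/crawlall.py  (hypothetical location)
from scrapy.commands import ScrapyCommand

class Command(ScrapyCommand):
    requires_project = True

    def short_desc(self):
        return "Run all spiders of the project in one process"

    def run(self, args, opts):
        spider_loader = self.crawler_process.spider_loader
        # queue every spider first; start() blocks until all of them finish
        for spidername in args or spider_loader.list():
            self.crawler_process.crawl(spidername)
        self.crawler_process.start()

Scrapy discovers such a command through a COMMANDS_MODULE = 'news_spider.commands' entry in settings.py.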

@@ -13,3 +13,10 @@ class NewsSpiderItem(scrapy.Item):
     title = scrapy.Field()
     time = scrapy.Field()
     content = scrapy.Field()
+    url = scrapy.Field()
+
+class TitleSpiderItem(scrapy.Item):
+    title = scrapy.Field()
+    time = scrapy.Field()
+    url = scrapy.Field()

Binary file not shown.

@@ -6,21 +6,34 @@
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 import codecs
 import json
+from items import TitleSpiderItem
+import fcntl

 class NewsSpiderPipeline(object):
+    def process_item(self,item,spider):
+        return item
+
+class TouTiaoPipeline(object):
     def __init__(self):
-        # self.file = codecs.open('toutiao.json','wb',encoding='utf-8')
-        self.file = open('toutiao.json','wb')
+        self.file = open('news.json','wb')
     def process_item(self,item,spider):
+        fcntl.flock(self.file,fcntl.LOCK_EX)
         line = json.dumps(dict(item))+'\n'
-        # self.file.write(line.decode("unicode_escape"))
         self.file.write(line)
+        fcntl.flock(self.file,fcntl.LOCK_UN)
+        return item
+
+class TitlePipeline(object):
+    def __init__(self):
+        self.file_title = open('title.json','wb')
+    def process_item(self,item,spider):
+        fcntl.flock(self.file_title,fcntl.LOCK_EX)
+        title_item = TitleSpiderItem()
+        title_item['title'] = item['title']
+        title_item['time'] = item['time']
+        title_item['url'] = item['url']
+        line = json.dumps(dict(title_item))+'\n'
+        self.file_title.write(line)
+        fcntl.flock(self.file_title,fcntl.LOCK_UN)
         return item
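The flock calls above are the commit's answer to the dirty data mentioned in the message: with several spiders writing the same JSON file, an exclusive lock keeps whole lines from interleaving. Two caveats apply: fcntl.flock is advisory, so every writer must take the lock, and Python buffers file writes, so the buffer should be flushed before the lock is released or bytes can still land outside the locked window. Opening with 'wb' also truncates the file each time a pipeline is constructed. A sketch of the safer append pattern (the function name, path, and 'a' mode are my assumptions, not part of the commit):

import fcntl
import json

def append_record(path, record):
    # 'a' appends instead of truncating, so writers never wipe each other
    with open(path, 'a') as f:
        fcntl.flock(f, fcntl.LOCK_EX)      # advisory: all writers must flock
        try:
            f.write(json.dumps(record) + '\n')
            f.flush()                      # flush while the lock is held,
                                           # otherwise lines can still tear
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)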

@@ -63,9 +63,10 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+ITEM_PIPELINES = {
 #    'news_spider.pipelines.TouTiaoPipeline': 300,
-#}
+    'news_spider.pipelines.TitlePipeline': 500,
+}

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
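The integers in ITEM_PIPELINES are ordering priorities: Scrapy runs enabled pipelines from the lowest number to the highest, conventionally within 0-1000. As committed, only TitlePipeline is active; enabling both would look like the following sketch (not what this commit does):

ITEM_PIPELINES = {
    'news_spider.pipelines.TouTiaoPipeline': 300,  # runs first (lower value)
    'news_spider.pipelines.TitlePipeline': 500,    # then receives the item
}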

Binary file not shown.

@@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
     allowed_domains=['news.163.com']
     base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'

-    year = ['2016','2015']
-    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-    day = ['31','30','29','28','27','26','25','24','23','22','21',
-           '20','19','18','17','16','15','14','13','12','11','10',
-           '09','08','07','06','05','04','03','02','01']
-#    day = ['31']
-#    year = ['2016']
-#    month = ['03']
+#    year = ['2016','2015']
+#    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+#    day = ['31','30','29','28','27','26','25','24','23','22','21',
+#           '20','19','18','17','16','15','14','13','12','11','10',
+#           '09','08','07','06','05','04','03','02','01']
+    day = ['31']
+    year = ['2016']
+    month = ['03']

     def parse(self,response):
         for y in self.year:
@@ -37,14 +37,16 @@ class NetEaseSpider(scrapy.Spider):
     def parseNews(self,response):
         data = response.xpath("//div[@class='post_content_main']")
         item = NewsSpiderItem()
-        time = data.xpath("//div[@class='post_time_source']/text()").extract()
+        timee = data.xpath("//div[@class='post_time_source']/text()").extract()
         title = data.xpath("//h1/text()").extract()
         content = data.xpath("//div[@class='post_text']/p/text()").extract()
         time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
-        if(len(time)!=0 and len(title)!=0 and len(content)!=0):
-            item['time'] = time_pattern.findall(time[0])[0]
+        if(len(timee)!=0 and len(title)!=0 and len(content)!=0):
+            tm = time_pattern.findall(timee[0])[0]
+            item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
             item['title'] = title[0]
+            item['url'] = response.url
             cc=''
             if(len(content)!=0):
                 for c in content:
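The rename from time to timee is what lets the conversion work at all: the old local variable shadowed the time module inside parseNews, so calling time.mktime() there without the rename would raise AttributeError (a list has no mktime). The conversion in isolation, with one caveat: time.mktime() interprets the parsed value in the machine's local timezone, so the integer stored in item['time'] is timezone-dependent.

import time

tm = '2016-04-22 17:16'
ts = int(time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M')))
print(ts)  # 1461316560 on a UTC+8 machine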

@@ -12,13 +12,17 @@ class TencentSpider(scrapy.Spider):
     allowed_domains=['news.qq.com']
     base_url = 'http://news.qq.com/b/history/index'

-    year = ['2016','2015','2014']
-    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
-    day = ['31','30','29','28','27','26','25','24','23','22','21',
-           '20','19','18','17','16','15','14','13','12','11','10',
-           '09','08','07','06','05','04','03','02','01']
+#    year = ['2016','2015','2014']
+#    month = ['12','11','10','09','08','07','06','05','04','03','02','01']
+#    day = ['31','30','29','28','27','26','25','24','23','22','21',
+#           '20','19','18','17','16','15','14','13','12','11','10',
+#           '09','08','07','06','05','04','03','02','01']
     tp = ['am','pm']
+    day = ['31']
+    year = ['2016']
+    month = ['03']

     def parse(self,response):
         for y in self.year:
             for m in self.month:
@@ -37,14 +41,16 @@ class TencentSpider(scrapy.Spider):
     def parseNews(self,response):
         data = response.xpath("//div[@id='C-Main-Article-QQ']")
         item = NewsSpiderItem()
-        time = data.xpath("//span[@class='article-time']/text()").extract()
+        timee = data.xpath("//span[@class='article-time']/text()").extract()
         title = data.xpath("//div[@class='hd']//h1/text()").extract()
         content = data.xpath("//p/text()").extract()
         time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
-        if(len(time)!=0 and len(title)!=0 and len(content)!=0):
-            item['time'] = time_pattern.findall(time[0])[0]
+        if(len(timee)!=0 and len(title)!=0 and len(content)!=0):
+            tm = time_pattern.findall(timee[0])[0]
+            item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
             item['title'] = title[0]
+            item['url'] = response.url
             cc=''
             if(len(content)!=0):
                 for c in content:

@@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
     ]
     base_class_url = 'http://toutiao.com/articles_news_society'
     base_url = 'http://toutiao.com'
-    maxpage = 501; # maximum number of pages allowed to crawl
+    maxpage = 10; # maximum number of pages allowed to crawl
     category = ['articles_news_society','articles_news_entertainment',
                 'articles_movie','articles_news_tech','articles_digital',
                 'articels_news_sports','articles_news_finance','articles_news_military',
@@ -36,12 +36,15 @@ class TouTiaoSpider(scrapy.Spider):
     # parse the body of an individual news article
     def parseNews(self,response):
         articles = response.xpath("//div[@id='pagelet-article']")
-        for article in articles:
-            item = NewsSpiderItem()
-            item['title'] = article.xpath("//div[@class='article-header']/h1/text()").extract()[0]
-            item['time'] = article.xpath("//div[@id='pagelet-article']//span[@class='time']/text()").extract()[0]
-            content = article.xpath("//div[@class='article-content']//p/text()").extract()
-            #item['content'] = article.xpath("//div[@class='article-content']//p/text()").extract()
+        item = NewsSpiderItem()
+        title = articles.xpath("//div[@class='article-header']/h1/text()").extract()[0]
+        tm = articles.xpath("//div[@id='pagelet-article']//span[@class='time']/text()").extract()[0]
+        content = articles.xpath("//div[@class='article-content']//p/text()").extract()
+        if(len(title)!=0 and len(tm)!=0 and len(content)!=0):
+            item['title'] = title
+            item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
+            item['url'] = response.url
             cc=''
             if(len(content) != 0):
                 for c in content:
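One caveat on the surviving extract()[0] calls: when the XPath matches nothing, extract() returns an empty list and the [0] raises IndexError before the len() guard in the hunk above is ever reached. Scrapy 1.0+ provides extract_first(), which returns None instead; a sketch of the guard rebuilt around it (my variant, not the commit's):

title = articles.xpath("//div[@class='article-header']/h1/text()").extract_first()
tm = articles.xpath("//span[@class='time']/text()").extract_first()
if title is None or tm is None:
    return  # page did not match the expected layout; skip it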

@@ -7,5 +7,5 @@
 default = news_spider.settings

 [deploy]
-#url = http://localhost:6800/
+url = http://localhost:6800/
 project = news_spider
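Uncommenting url points the [deploy] target at a scrapyd service on the local machine. Assuming scrapyd is listening on port 6800 and scrapyd-client is installed, the project would then be pushed with something like:

scrapyd-deploy -p news_spider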

@@ -5,13 +5,19 @@ reload(sys)
 sys.setdefaultencoding( "utf-8" )

 file = open(sys.argv[1])
-data = json.load(file)
+while 1:
+    line = file.readline()
+    if not line:
+        break
+    data = json.loads(line)
+    print data['time'],data['title'],data['url']

-c = 0
-for article in data:
-    c+=1
-    print article['time'],"--------",article['title']
+#data = json.load(file)
+#c = 0
+#for article in data:
+#    c+=1
 #    print "[----Time-----]\n",article['time'],article['title']
 #    print "[----Title----]\n",article['title']
-#    print article['time'],article['title']
 #    print "[----Article--]\n",article['content'],"\n\n"
-print c
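Since the output files are now JSON Lines (one object per line), a torn line from the concurrent-write problem described in the commit message only breaks its own json.loads call, not the whole file. A tolerant variant of the reader (Python 3 syntax, my sketch):

import json
import sys

with open(sys.argv[1]) as f:
    for line in f:
        try:
            data = json.loads(line)
        except ValueError:  # torn or interleaved line; skip it
            continue
        print(data['time'], data['title'], data['url'])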

@@ -1,6 +1,7 @@
 import re
+import time

-time = " - -- 2015-06-15 15:34 "
+timee = " - -- 2015-06-15 15:34 "
 day = ['31','30','29','28','27','26','25','24','23','22','21',
        '20','19','18','17','16','15','14','13','12','11','10',
@@ -8,4 +9,7 @@ day = ['31','30','29','28','27','26','25','24','23','22','21',
 pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
 #pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
 #pattern = re.compile("[0-9]")
-print pattern.findall(time)[0]
+tm = pattern.findall(timee)[0]
+a = time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M'))
+print int(a)

File diff suppressed because it is too large