修改时间为时间戳格式,同时运行三个spider写入一个文件会出现脏数据

This commit is contained in:
lzjqsdd 2016-04-22 17:16:06 +08:00
parent 3201d09c43
commit cfed93f3ef
18 changed files with 87 additions and 81737 deletions

View File

@ -35,5 +35,4 @@ class Command(ScrapyCommand):
for spidername in args or spider_loader.list():
print "*********cralall spidername************" + spidername
self.crawler_process.crawl(spidername, **opts.spargs)
self.crawler_process.start()

View File

@ -13,3 +13,10 @@ class NewsSpiderItem(scrapy.Item):
title = scrapy.Field()
time = scrapy.Field()
content = scrapy.Field()
url = scrapy.Field()
class TitleSpiderItem(scrapy.Item):
    """Item holding only the title, time and url of a scraped article."""

    # Each attribute is declared as a plain scrapy.Field with no metadata.
    title = scrapy.Field()
    time = scrapy.Field()
    url = scrapy.Field()

Binary file not shown.

View File

@ -6,21 +6,34 @@
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import json
from items import TitleSpiderItem
import fcntl
class NewsSpiderPipeline(object):
    """Pass-through pipeline: every item is handed back unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received; ``spider`` is not used."""
        return item
class TouTiaoPipeline(object):
    """Write every scraped item to ``news.json`` as one JSON object per line.

    Each write is wrapped in an exclusive ``flock`` on the output file.
    """

    def __init__(self):
        # Open a single output handle. The earlier version also opened
        # 'toutiao.json' and immediately rebound self.file, which leaked that
        # handle and left an empty file behind.
        # NOTE(review): every 'wb' handle keeps its own file offset, so several
        # writers on the same path can still overwrite each other -- consider
        # append mode if multiple spiders really share this file.
        self.file = open('news.json','wb')

    def process_item(self,item,spider):
        """Serialize ``item`` to a JSON line, write it under the lock, return it.

        ``spider`` is not used.
        """
        line = json.dumps(dict(item))+'\n'
        fcntl.flock(self.file,fcntl.LOCK_EX)
        try:
            self.file.write(line)
            # Flush while the lock is still held; otherwise the buffered data
            # reaches the file after the lock has been released.
            self.file.flush()
        finally:
            # Always release the lock, even if the write failed.
            fcntl.flock(self.file,fcntl.LOCK_UN)
        return item

    def close_spider(self,spider):
        """Close the output file when the spider finishes."""
        self.file.close()
class TitlePipeline(object):
    """Write the title, time and url of every item to ``title.json``.

    One JSON object is written per line, and each write is wrapped in an
    exclusive ``flock`` on the output file.
    """

    def __init__(self):
        # Keep the handle on the instance. It used to be a local variable, so
        # process_item raised NameError on the first item.
        self.file_title = open('title.json','wb')

    def process_item(self,item,spider):
        """Write the reduced record for ``item`` and return ``item`` unchanged.

        Reads ``item['title']``, ``item['time']`` and ``item['url']``.
        ``spider`` is not used.
        """
        title_item = TitleSpiderItem()
        title_item['title'] = item['title']
        title_item['time'] = item['time']
        title_item['url'] = item['url']
        line = json.dumps(dict(title_item))+'\n'
        fcntl.flock(self.file_title,fcntl.LOCK_EX)
        try:
            self.file_title.write(line)
            # Flush while the lock is still held; otherwise the buffered data
            # reaches the file after the lock has been released.
            self.file_title.flush()
        finally:
            # Always release the lock, even if the write failed.
            fcntl.flock(self.file_title,fcntl.LOCK_UN)
        return item

    def close_spider(self,spider):
        """Close the output file when the spider finishes."""
        self.file_title.close()

View File

@ -63,9 +63,10 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
ITEM_PIPELINES = {
# 'news_spider.pipelines.TouTiaoPipeline': 300,
#}
'news_spider.pipelines.TitlePipeline': 500,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html

Binary file not shown.

View File

@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
allowed_domains=['news.163.com']
base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
year = ['2016','2015']
month = ['12','11','10','09','08','07','06','05','04','03','02','01']
day = ['31','30','29','28','27','26','25','24','23','22','21',
'20','19','18','17','16','15','14','13','12','11','10',
'09','08','07','06','05','04','03','02','01']
# day = ['31']
# year = ['2016']
# month = ['03']
# year = ['2016','2015']
# month = ['12','11','10','09','08','07','06','05','04','03','02','01']
# day = ['31','30','29','28','27','26','25','24','23','22','21',
# '20','19','18','17','16','15','14','13','12','11','10',
# '09','08','07','06','05','04','03','02','01']
day = ['31']
year = ['2016']
month = ['03']
def parse(self,response):
for y in self.year:
@ -37,14 +37,16 @@ class NetEaseSpider(scrapy.Spider):
def parseNews(self,response):
data = response.xpath("//div[@class='post_content_main']")
item = NewsSpiderItem()
time = data.xpath("//div[@class='post_time_source']/text()").extract()
timee = data.xpath("//div[@class='post_time_source']/text()").extract()
title = data.xpath("//h1/text()").extract()
content = data.xpath("//div[@class='post_text']/p/text()").extract()
time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
if(len(time)!=0 and len(title)!=0 and len(content)!=0):
item['time'] = time_pattern.findall(time[0])[0]
if(len(timee)!=0 and len(title)!=0 and len(content)!=0):
tm = time_pattern.findall(timee[0])[0]
item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
item['title'] = title[0]
item['url'] = response.url
cc=''
if(len(content)!=0):
for c in content:

View File

@ -12,13 +12,17 @@ class TencentSpider(scrapy.Spider):
allowed_domains=['news.qq.com']
base_url = 'http://news.qq.com/b/history/index'
year = ['2016','2015','2014']
month = ['12','11','10','09','08','07','06','05','04','03','02','01']
day = ['31','30','29','28','27','26','25','24','23','22','21',
'20','19','18','17','16','15','14','13','12','11','10',
'09','08','07','06','05','04','03','02','01']
# year = ['2016','2015','2014']
# month = ['12','11','10','09','08','07','06','05','04','03','02','01']
# day = ['31','30','29','28','27','26','25','24','23','22','21',
# '20','19','18','17','16','15','14','13','12','11','10',
# '09','08','07','06','05','04','03','02','01']
tp = ['am','pm']
day = ['31']
year = ['2016']
month = ['03']
def parse(self,response):
for y in self.year:
for m in self.month:
@ -37,14 +41,16 @@ class TencentSpider(scrapy.Spider):
def parseNews(self,response):
data = response.xpath("//div[@id='C-Main-Article-QQ']")
item = NewsSpiderItem()
time = data.xpath("//span[@class='article-time']/text()").extract()
timee = data.xpath("//span[@class='article-time']/text()").extract()
title = data.xpath("//div[@class='hd']//h1/text()").extract()
content = data.xpath("//p/text()").extract()
time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
if(len(time)!=0 and len(title)!=0 and len(content)!=0):
item['time'] = time_pattern.findall(time[0])[0]
if(len(timee)!=0 and len(title)!=0 and len(content)!=0):
tm = time_pattern.findall(timee[0])[0]
item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
item['title'] = title[0]
item['url'] = response.url
cc=''
if(len(content)!=0):
for c in content:

View File

@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
]
base_class_url = 'http://toutiao.com/articles_news_society'
base_url = 'http://toutiao.com'
maxpage = 501;#允许爬的最大的页数
maxpage = 10;#允许爬的最大的页数
category = ['articles_news_society','articles_news_entertainment',
'articles_movie','articles_news_tech','articles_digital',
'articels_news_sports','articles_news_finance','articles_news_military',
@ -36,12 +36,15 @@ class TouTiaoSpider(scrapy.Spider):
#解析具体新闻内容
def parseNews(self,response):
articles = response.xpath("//div[@id='pagelet-article']")
for article in articles:
item = NewsSpiderItem()
item['title'] = article.xpath("//div[@class='article-header']/h1/text()").extract()[0]
item['time'] = article.xpath("//div[@id='pagelet-article']//span[@class='time']/text()").extract()[0]
content = article.xpath("//div[@class='article-content']//p/text()").extract()
#item['content'] = article.xpath("//div[@class='article-content']//p/text()").extract()
item = NewsSpiderItem()
title = articles.xpath("//div[@class='article-header']/h1/text()").extract()[0]
tm = articles.xpath("//div[@id='pagelet-article']//span[@class='time']/text()").extract()[0]
content = articles.xpath("//div[@class='article-content']//p/text()").extract()
if(len(title)!=0 and len(tm)!=0 and len(content)!=0):
item['title'] = title
item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
item['url'] = response.url
cc=''
if(len(content) != 0):
for c in content:

View File

@ -7,5 +7,5 @@
default = news_spider.settings
[deploy]
#url = http://localhost:6800/
url = http://localhost:6800/
project = news_spider

View File

@ -5,13 +5,19 @@ reload(sys)
sys.setdefaultencoding( "utf-8" )
file = open(sys.argv[1])
data = json.load(file)
while 1:
line = file.readline()
if not line:
break
data = json.loads(line)
print data['time'],data['title'],data['url']
c = 0
for article in data:
c+=1
print article['time'],"--------",article['title']
#data = json.load(file)
#c = 0
#for article in data:
# c+=1
# print "[----Time-----]\n",article['time'],article['title']
# print "[----Title----]\n",article['title']
# print article['time'],article['title']
# print "[----Article--]\n",article['content'],"\n\n"
print c

View File

@ -1,6 +1,7 @@
import re
import time
time = " - -- 2015-06-15 15:34 "
timee = " - -- 2015-06-15 15:34 "
day = ['31','30','29','28','27','26','25','24','23','22','21',
'20','19','18','17','16','15','14','13','12','11','10',
@ -8,4 +9,7 @@ day = ['31','30','29','28','27','26','25','24','23','22','21',
pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
#pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
#pattern = re.compile("[0-9]")
print pattern.findall(time)[0]
tm = pattern.findall(timee)[0]
a = time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M'))
print int(a)

File diff suppressed because it is too large Load Diff