修改时间为时间戳格式,同时运行三个spider写入一个文件会出现脏数据
This commit is contained in:
parent
3201d09c43
commit
cfed93f3ef
@ -35,5 +35,4 @@ class Command(ScrapyCommand):
|
|||||||
for spidername in args or spider_loader.list():
|
for spidername in args or spider_loader.list():
|
||||||
print "*********cralall spidername************" + spidername
|
print "*********cralall spidername************" + spidername
|
||||||
self.crawler_process.crawl(spidername, **opts.spargs)
|
self.crawler_process.crawl(spidername, **opts.spargs)
|
||||||
|
|
||||||
self.crawler_process.start()
|
self.crawler_process.start()
|
||||||
|
Binary file not shown.
@ -13,3 +13,10 @@ class NewsSpiderItem(scrapy.Item):
|
|||||||
title = scrapy.Field()
|
title = scrapy.Field()
|
||||||
time = scrapy.Field()
|
time = scrapy.Field()
|
||||||
content = scrapy.Field()
|
content = scrapy.Field()
|
||||||
|
url = scrapy.Field()
|
||||||
|
|
||||||
|
class TitleSpiderItem(scrapy.Item):
|
||||||
|
title = scrapy.Field()
|
||||||
|
time = scrapy.Field()
|
||||||
|
url = scrapy.Field()
|
||||||
|
|
||||||
|
Binary file not shown.
@ -6,21 +6,34 @@
|
|||||||
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
import codecs
|
import codecs
|
||||||
import json
|
import json
|
||||||
|
from items import TitleSpiderItem
|
||||||
|
import fcntl
|
||||||
|
|
||||||
|
|
||||||
class NewsSpiderPipeline(object):
|
class NewsSpiderPipeline(object):
|
||||||
|
|
||||||
def process_item(self,item,spider):
|
|
||||||
return item
|
|
||||||
|
|
||||||
|
|
||||||
class TouTiaoPipeline(object):
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
# self.file = codecs.open('toutiao.json','wb',encoding='utf-8')
|
self.file = open('news.json','wb')
|
||||||
self.file = open('toutiao.json','wb')
|
|
||||||
|
|
||||||
def process_item(self,item,spider):
|
def process_item(self,item,spider):
|
||||||
|
fcntl.flock(self.file,fcntl.LOCK_EX)
|
||||||
line = json.dumps(dict(item))+'\n'
|
line = json.dumps(dict(item))+'\n'
|
||||||
# self.file.write(line.decode("unicode_escape"))
|
|
||||||
self.file.write(line)
|
self.file.write(line)
|
||||||
|
fcntl.flock(self.file,fcntl.LOCK_UN)
|
||||||
|
return item
|
||||||
|
|
||||||
|
|
||||||
|
class TitlePipeline(object):
|
||||||
|
def __init__(self):
|
||||||
|
file_title = open('title.json','wb')
|
||||||
|
|
||||||
|
def process_item(self,item,spider):
|
||||||
|
fcntl.flock(file_title,fcntl.LOCK_EX)
|
||||||
|
title_item = TitleSpiderItem()
|
||||||
|
title_item['title'] = item['title']
|
||||||
|
title_item['time'] = item['time']
|
||||||
|
title_item['url'] = item['url']
|
||||||
|
line = json.dumps(dict(title_item))+'\n'
|
||||||
|
file_title.write(line)
|
||||||
|
fcntl.flock(file_title,fcntl.LOCK_UN)
|
||||||
return item
|
return item
|
||||||
|
Binary file not shown.
@ -63,9 +63,10 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML
|
|||||||
|
|
||||||
# Configure item pipelines
|
# Configure item pipelines
|
||||||
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
|
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
|
||||||
#ITEM_PIPELINES = {
|
ITEM_PIPELINES = {
|
||||||
# 'news_spider.pipelines.TouTiaoPipeline': 300,
|
# 'news_spider.pipelines.TouTiaoPipeline': 300,
|
||||||
#}
|
'news_spider.pipelines.TitlePipeline': 500,
|
||||||
|
}
|
||||||
|
|
||||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||||
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
|
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
|
||||||
|
Binary file not shown.
@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
|
|||||||
allowed_domains=['news.163.com']
|
allowed_domains=['news.163.com']
|
||||||
|
|
||||||
base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
|
base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
|
||||||
year = ['2016','2015']
|
# year = ['2016','2015']
|
||||||
month = ['12','11','10','09','08','07','06','05','04','03','02','01']
|
# month = ['12','11','10','09','08','07','06','05','04','03','02','01']
|
||||||
day = ['31','30','29','28','27','26','25','24','23','22','21',
|
# day = ['31','30','29','28','27','26','25','24','23','22','21',
|
||||||
'20','19','18','17','16','15','14','13','12','11','10',
|
# '20','19','18','17','16','15','14','13','12','11','10',
|
||||||
'09','08','07','06','05','04','03','02','01']
|
# '09','08','07','06','05','04','03','02','01']
|
||||||
# day = ['31']
|
day = ['31']
|
||||||
# year = ['2016']
|
year = ['2016']
|
||||||
# month = ['03']
|
month = ['03']
|
||||||
|
|
||||||
def parse(self,response):
|
def parse(self,response):
|
||||||
for y in self.year:
|
for y in self.year:
|
||||||
@ -37,14 +37,16 @@ class NetEaseSpider(scrapy.Spider):
|
|||||||
def parseNews(self,response):
|
def parseNews(self,response):
|
||||||
data = response.xpath("//div[@class='post_content_main']")
|
data = response.xpath("//div[@class='post_content_main']")
|
||||||
item = NewsSpiderItem()
|
item = NewsSpiderItem()
|
||||||
time = data.xpath("//div[@class='post_time_source']/text()").extract()
|
timee = data.xpath("//div[@class='post_time_source']/text()").extract()
|
||||||
title = data.xpath("//h1/text()").extract()
|
title = data.xpath("//h1/text()").extract()
|
||||||
content = data.xpath("//div[@class='post_text']/p/text()").extract()
|
content = data.xpath("//div[@class='post_text']/p/text()").extract()
|
||||||
|
|
||||||
time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
|
time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
|
||||||
if(len(time)!=0 and len(title)!=0 and len(content)!=0):
|
if(len(timee)!=0 and len(title)!=0 and len(content)!=0):
|
||||||
item['time'] = time_pattern.findall(time[0])[0]
|
tm = time_pattern.findall(timee[0])[0]
|
||||||
|
item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
|
||||||
item['title'] = title[0]
|
item['title'] = title[0]
|
||||||
|
item['url'] = response.url
|
||||||
cc=''
|
cc=''
|
||||||
if(len(content)!=0):
|
if(len(content)!=0):
|
||||||
for c in content:
|
for c in content:
|
||||||
|
Binary file not shown.
@ -12,13 +12,17 @@ class TencentSpider(scrapy.Spider):
|
|||||||
allowed_domains=['news.qq.com']
|
allowed_domains=['news.qq.com']
|
||||||
|
|
||||||
base_url = 'http://news.qq.com/b/history/index'
|
base_url = 'http://news.qq.com/b/history/index'
|
||||||
year = ['2016','2015','2014']
|
# year = ['2016','2015','2014']
|
||||||
month = ['12','11','10','09','08','07','06','05','04','03','02','01']
|
# month = ['12','11','10','09','08','07','06','05','04','03','02','01']
|
||||||
day = ['31','30','29','28','27','26','25','24','23','22','21',
|
# day = ['31','30','29','28','27','26','25','24','23','22','21',
|
||||||
'20','19','18','17','16','15','14','13','12','11','10',
|
# '20','19','18','17','16','15','14','13','12','11','10',
|
||||||
'09','08','07','06','05','04','03','02','01']
|
# '09','08','07','06','05','04','03','02','01']
|
||||||
tp = ['am','pm']
|
tp = ['am','pm']
|
||||||
|
|
||||||
|
day = ['31']
|
||||||
|
year = ['2016']
|
||||||
|
month = ['03']
|
||||||
|
|
||||||
def parse(self,response):
|
def parse(self,response):
|
||||||
for y in self.year:
|
for y in self.year:
|
||||||
for m in self.month:
|
for m in self.month:
|
||||||
@ -37,14 +41,16 @@ class TencentSpider(scrapy.Spider):
|
|||||||
def parseNews(self,response):
|
def parseNews(self,response):
|
||||||
data = response.xpath("//div[@id='C-Main-Article-QQ']")
|
data = response.xpath("//div[@id='C-Main-Article-QQ']")
|
||||||
item = NewsSpiderItem()
|
item = NewsSpiderItem()
|
||||||
time = data.xpath("//span[@class='article-time']/text()").extract()
|
timee = data.xpath("//span[@class='article-time']/text()").extract()
|
||||||
title = data.xpath("//div[@class='hd']//h1/text()").extract()
|
title = data.xpath("//div[@class='hd']//h1/text()").extract()
|
||||||
content = data.xpath("//p/text()").extract()
|
content = data.xpath("//p/text()").extract()
|
||||||
|
|
||||||
time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
|
time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
|
||||||
if(len(time)!=0 and len(title)!=0 and len(content)!=0):
|
if(len(timee)!=0 and len(title)!=0 and len(content)!=0):
|
||||||
item['time'] = time_pattern.findall(time[0])[0]
|
tm = time_pattern.findall(timee[0])[0]
|
||||||
|
item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
|
||||||
item['title'] = title[0]
|
item['title'] = title[0]
|
||||||
|
item['url'] = response.url
|
||||||
cc=''
|
cc=''
|
||||||
if(len(content)!=0):
|
if(len(content)!=0):
|
||||||
for c in content:
|
for c in content:
|
||||||
|
Binary file not shown.
@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
|
|||||||
]
|
]
|
||||||
base_class_url = 'http://toutiao.com/articles_news_society'
|
base_class_url = 'http://toutiao.com/articles_news_society'
|
||||||
base_url = 'http://toutiao.com'
|
base_url = 'http://toutiao.com'
|
||||||
maxpage = 501;#允许爬的最大的页数
|
maxpage = 10;#允许爬的最大的页数
|
||||||
category = ['articles_news_society','articles_news_entertainment',
|
category = ['articles_news_society','articles_news_entertainment',
|
||||||
'articles_movie','articles_news_tech','articles_digital',
|
'articles_movie','articles_news_tech','articles_digital',
|
||||||
'articels_news_sports','articles_news_finance','articles_news_military',
|
'articels_news_sports','articles_news_finance','articles_news_military',
|
||||||
@ -36,12 +36,15 @@ class TouTiaoSpider(scrapy.Spider):
|
|||||||
#解析具体新闻内容
|
#解析具体新闻内容
|
||||||
def parseNews(self,response):
|
def parseNews(self,response):
|
||||||
articles = response.xpath("//div[@id='pagelet-article']")
|
articles = response.xpath("//div[@id='pagelet-article']")
|
||||||
for article in articles:
|
item = NewsSpiderItem()
|
||||||
item = NewsSpiderItem()
|
title = articles.xpath("//div[@class='article-header']/h1/text()").extract()[0]
|
||||||
item['title'] = article.xpath("//div[@class='article-header']/h1/text()").extract()[0]
|
tm = articles.xpath("//div[@id='pagelet-article']//span[@class='time']/text()").extract()[0]
|
||||||
item['time'] = article.xpath("//div[@id='pagelet-article']//span[@class='time']/text()").extract()[0]
|
content = articles.xpath("//div[@class='article-content']//p/text()").extract()
|
||||||
content = article.xpath("//div[@class='article-content']//p/text()").extract()
|
|
||||||
#item['content'] = article.xpath("//div[@class='article-content']//p/text()").extract()
|
if(len(title)!=0 and len(tm)!=0 and len(content)!=0):
|
||||||
|
item['title'] = title
|
||||||
|
item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
|
||||||
|
item['url'] = response.url
|
||||||
cc=''
|
cc=''
|
||||||
if(len(content) != 0):
|
if(len(content) != 0):
|
||||||
for c in content:
|
for c in content:
|
||||||
|
Binary file not shown.
@ -7,5 +7,5 @@
|
|||||||
default = news_spider.settings
|
default = news_spider.settings
|
||||||
|
|
||||||
[deploy]
|
[deploy]
|
||||||
#url = http://localhost:6800/
|
url = http://localhost:6800/
|
||||||
project = news_spider
|
project = news_spider
|
||||||
|
@ -5,13 +5,19 @@ reload(sys)
|
|||||||
sys.setdefaultencoding( "utf-8" )
|
sys.setdefaultencoding( "utf-8" )
|
||||||
|
|
||||||
file = open(sys.argv[1])
|
file = open(sys.argv[1])
|
||||||
data = json.load(file)
|
while 1:
|
||||||
|
line = file.readline()
|
||||||
|
if not line:
|
||||||
|
break
|
||||||
|
data = json.loads(line)
|
||||||
|
print data['time'],data['title'],data['url']
|
||||||
|
|
||||||
c = 0
|
#data = json.load(file)
|
||||||
for article in data:
|
#c = 0
|
||||||
c+=1
|
#for article in data:
|
||||||
print article['time'],"--------",article['title']
|
# c+=1
|
||||||
# print "[----Time-----]\n",article['time'],article['title']
|
# print "[----Time-----]\n",article['time'],article['title']
|
||||||
# print "[----Title----]\n",article['title']
|
# print "[----Title----]\n",article['title']
|
||||||
|
# print article['time'],article['title']
|
||||||
# print "[----Article--]\n",article['content'],"\n\n"
|
# print "[----Article--]\n",article['content'],"\n\n"
|
||||||
print c
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import re
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
time = " - -- 2015-06-15 15:34 "
|
timee = " - -- 2015-06-15 15:34 "
|
||||||
|
|
||||||
day = ['31','30','29','28','27','26','25','24','23','22','21',
|
day = ['31','30','29','28','27','26','25','24','23','22','21',
|
||||||
'20','19','18','17','16','15','14','13','12','11','10',
|
'20','19','18','17','16','15','14','13','12','11','10',
|
||||||
@ -8,4 +9,7 @@ day = ['31','30','29','28','27','26','25','24','23','22','21',
|
|||||||
pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
|
pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
|
||||||
#pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
|
#pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
|
||||||
#pattern = re.compile("[0-9]")
|
#pattern = re.compile("[0-9]")
|
||||||
print pattern.findall(time)[0]
|
tm = pattern.findall(timee)[0]
|
||||||
|
|
||||||
|
a = time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M'))
|
||||||
|
print int(a)
|
||||||
|
81691
news_spider/toutiao.data
81691
news_spider/toutiao.data
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user