修改时间为时间戳格式,同时运行三个spider写入一个文件会出现脏数据
This commit is contained in:
parent
3201d09c43
commit
cfed93f3ef
@ -35,5 +35,4 @@ class Command(ScrapyCommand):
|
||||
for spidername in args or spider_loader.list():
|
||||
print "*********cralall spidername************" + spidername
|
||||
self.crawler_process.crawl(spidername, **opts.spargs)
|
||||
|
||||
self.crawler_process.start()
|
||||
|
Binary file not shown.
@ -13,3 +13,10 @@ class NewsSpiderItem(scrapy.Item):
|
||||
title = scrapy.Field()
|
||||
time = scrapy.Field()
|
||||
content = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
|
||||
class TitleSpiderItem(scrapy.Item):
    """Reduced news item carrying only the title, time and url fields."""

    # Headline text of the article.
    title = scrapy.Field()
    # Publication time of the article.
    time = scrapy.Field()
    # Address the article was fetched from.
    url = scrapy.Field()
|
||||
|
||||
|
Binary file not shown.
@ -6,21 +6,34 @@
|
||||
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
import codecs
|
||||
import json
|
||||
from items import TitleSpiderItem
|
||||
import fcntl
|
||||
|
||||
|
||||
class NewsSpiderPipeline(object):
    """Pass-through pipeline stage that hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item
|
||||
|
||||
|
||||
class TouTiaoPipeline(object):
    """Write every scraped item to ``news.json`` as one JSON object per line.

    More than one handle may be writing to the same file at once, so each
    write happens inside an exclusive ``flock`` section.
    """

    def __init__(self):
        # Open the output exactly once. A second assignment to self.file
        # would orphan the first handle without ever closing it.
        self.file = open('news.json', 'wb')

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged.

        Args:
            item: mapping-like scraped item; serialized via ``dict(item)``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipeline stages still see it.
        """
        # Serialize before taking the lock to keep the locked section short.
        line = (json.dumps(dict(item)) + '\n').encode('utf-8')
        fcntl.flock(self.file, fcntl.LOCK_EX)
        try:
            # Another handle may have grown the file since our last write;
            # jump to the real end so existing records are not overwritten.
            self.file.seek(0, 2)  # whence=2 means "relative to end of file"
            self.file.write(line)
            # Flush while still holding the lock; otherwise the bytes stay in
            # this handle's buffer and reach the file outside the lock.
            self.file.flush()
        finally:
            # Always release, even if the write failed.
            fcntl.flock(self.file, fcntl.LOCK_UN)
        return item

    def close_spider(self, spider):
        """Close the output file once the spider has finished."""
        self.file.close()
|
||||
|
||||
|
||||
class TitlePipeline(object):
    """Write the title, time and url of every item to ``title.json``.

    One JSON object is written per line. Each write happens inside an
    exclusive ``flock`` section because more than one handle may be writing
    to the same file at once.
    """

    def __init__(self):
        # Keep the handle on the instance. As a plain local variable it was
        # out of scope in process_item, which then raised NameError.
        self.file_title = open('title.json', 'wb')

    def process_item(self, item, spider):
        """Append the reduced record for ``item`` and return ``item`` unchanged.

        Args:
            item: mapping-like scraped item; must provide ``title``, ``time``
                and ``url``.
            spider: the spider that produced the item (unused).

        Returns:
            The original ``item`` (not the reduced record), so later pipeline
            stages still see every field.

        Raises:
            KeyError: if ``item`` lacks one of the three required fields.
        """
        # Only these three fields are written; everything else is dropped.
        record = {
            'title': item['title'],
            'time': item['time'],
            'url': item['url'],
        }
        line = (json.dumps(record) + '\n').encode('utf-8')
        fcntl.flock(self.file_title, fcntl.LOCK_EX)
        try:
            # Another handle may have grown the file since our last write;
            # jump to the real end so existing records are not overwritten.
            self.file_title.seek(0, 2)  # whence=2 means "relative to end of file"
            self.file_title.write(line)
            # Flush while still holding the lock; otherwise the bytes stay in
            # this handle's buffer and reach the file outside the lock.
            self.file_title.flush()
        finally:
            # Always release, even if the write failed.
            fcntl.flock(self.file_title, fcntl.LOCK_UN)
        return item

    def close_spider(self, spider):
        """Close the output file once the spider has finished."""
        self.file_title.close()
|
||||
|
Binary file not shown.
@ -63,9 +63,10 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML
|
||||
|
||||
# Configure item pipelines
|
||||
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
|
||||
#ITEM_PIPELINES = {
|
||||
ITEM_PIPELINES = {
|
||||
# 'news_spider.pipelines.TouTiaoPipeline': 300,
|
||||
#}
|
||||
'news_spider.pipelines.TitlePipeline': 500,
|
||||
}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
|
||||
|
Binary file not shown.
@ -12,14 +12,14 @@ class NetEaseSpider(scrapy.Spider):
|
||||
allowed_domains=['news.163.com']
|
||||
|
||||
base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'
|
||||
year = ['2016','2015']
|
||||
month = ['12','11','10','09','08','07','06','05','04','03','02','01']
|
||||
day = ['31','30','29','28','27','26','25','24','23','22','21',
|
||||
'20','19','18','17','16','15','14','13','12','11','10',
|
||||
'09','08','07','06','05','04','03','02','01']
|
||||
# day = ['31']
|
||||
# year = ['2016']
|
||||
# month = ['03']
|
||||
# year = ['2016','2015']
|
||||
# month = ['12','11','10','09','08','07','06','05','04','03','02','01']
|
||||
# day = ['31','30','29','28','27','26','25','24','23','22','21',
|
||||
# '20','19','18','17','16','15','14','13','12','11','10',
|
||||
# '09','08','07','06','05','04','03','02','01']
|
||||
day = ['31']
|
||||
year = ['2016']
|
||||
month = ['03']
|
||||
|
||||
def parse(self,response):
|
||||
for y in self.year:
|
||||
@ -37,14 +37,16 @@ class NetEaseSpider(scrapy.Spider):
|
||||
def parseNews(self,response):
|
||||
data = response.xpath("//div[@class='post_content_main']")
|
||||
item = NewsSpiderItem()
|
||||
time = data.xpath("//div[@class='post_time_source']/text()").extract()
|
||||
timee = data.xpath("//div[@class='post_time_source']/text()").extract()
|
||||
title = data.xpath("//h1/text()").extract()
|
||||
content = data.xpath("//div[@class='post_text']/p/text()").extract()
|
||||
|
||||
time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
|
||||
if(len(time)!=0 and len(title)!=0 and len(content)!=0):
|
||||
item['time'] = time_pattern.findall(time[0])[0]
|
||||
if(len(timee)!=0 and len(title)!=0 and len(content)!=0):
|
||||
tm = time_pattern.findall(timee[0])[0]
|
||||
item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
|
||||
item['title'] = title[0]
|
||||
item['url'] = response.url
|
||||
cc=''
|
||||
if(len(content)!=0):
|
||||
for c in content:
|
||||
|
Binary file not shown.
@ -12,13 +12,17 @@ class TencentSpider(scrapy.Spider):
|
||||
allowed_domains=['news.qq.com']
|
||||
|
||||
base_url = 'http://news.qq.com/b/history/index'
|
||||
year = ['2016','2015','2014']
|
||||
month = ['12','11','10','09','08','07','06','05','04','03','02','01']
|
||||
day = ['31','30','29','28','27','26','25','24','23','22','21',
|
||||
'20','19','18','17','16','15','14','13','12','11','10',
|
||||
'09','08','07','06','05','04','03','02','01']
|
||||
# year = ['2016','2015','2014']
|
||||
# month = ['12','11','10','09','08','07','06','05','04','03','02','01']
|
||||
# day = ['31','30','29','28','27','26','25','24','23','22','21',
|
||||
# '20','19','18','17','16','15','14','13','12','11','10',
|
||||
# '09','08','07','06','05','04','03','02','01']
|
||||
tp = ['am','pm']
|
||||
|
||||
day = ['31']
|
||||
year = ['2016']
|
||||
month = ['03']
|
||||
|
||||
def parse(self,response):
|
||||
for y in self.year:
|
||||
for m in self.month:
|
||||
@ -37,14 +41,16 @@ class TencentSpider(scrapy.Spider):
|
||||
def parseNews(self,response):
|
||||
data = response.xpath("//div[@id='C-Main-Article-QQ']")
|
||||
item = NewsSpiderItem()
|
||||
time = data.xpath("//span[@class='article-time']/text()").extract()
|
||||
timee = data.xpath("//span[@class='article-time']/text()").extract()
|
||||
title = data.xpath("//div[@class='hd']//h1/text()").extract()
|
||||
content = data.xpath("//p/text()").extract()
|
||||
|
||||
time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
|
||||
if(len(time)!=0 and len(title)!=0 and len(content)!=0):
|
||||
item['time'] = time_pattern.findall(time[0])[0]
|
||||
if(len(timee)!=0 and len(title)!=0 and len(content)!=0):
|
||||
tm = time_pattern.findall(timee[0])[0]
|
||||
item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
|
||||
item['title'] = title[0]
|
||||
item['url'] = response.url
|
||||
cc=''
|
||||
if(len(content)!=0):
|
||||
for c in content:
|
||||
|
Binary file not shown.
@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
|
||||
]
|
||||
base_class_url = 'http://toutiao.com/articles_news_society'
|
||||
base_url = 'http://toutiao.com'
|
||||
maxpage = 501;#允许爬的最大的页数
|
||||
maxpage = 10;#允许爬的最大的页数
|
||||
category = ['articles_news_society','articles_news_entertainment',
|
||||
'articles_movie','articles_news_tech','articles_digital',
|
||||
'articels_news_sports','articles_news_finance','articles_news_military',
|
||||
@ -36,12 +36,15 @@ class TouTiaoSpider(scrapy.Spider):
|
||||
#解析具体新闻内容
|
||||
def parseNews(self,response):
|
||||
articles = response.xpath("//div[@id='pagelet-article']")
|
||||
for article in articles:
|
||||
item = NewsSpiderItem()
|
||||
item['title'] = article.xpath("//div[@class='article-header']/h1/text()").extract()[0]
|
||||
item['time'] = article.xpath("//div[@id='pagelet-article']//span[@class='time']/text()").extract()[0]
|
||||
content = article.xpath("//div[@class='article-content']//p/text()").extract()
|
||||
#item['content'] = article.xpath("//div[@class='article-content']//p/text()").extract()
|
||||
item = NewsSpiderItem()
|
||||
title = articles.xpath("//div[@class='article-header']/h1/text()").extract()[0]
|
||||
tm = articles.xpath("//div[@id='pagelet-article']//span[@class='time']/text()").extract()[0]
|
||||
content = articles.xpath("//div[@class='article-content']//p/text()").extract()
|
||||
|
||||
if(len(title)!=0 and len(tm)!=0 and len(content)!=0):
|
||||
item['title'] = title
|
||||
item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M')))
|
||||
item['url'] = response.url
|
||||
cc=''
|
||||
if(len(content) != 0):
|
||||
for c in content:
|
||||
|
Binary file not shown.
@ -7,5 +7,5 @@
|
||||
default = news_spider.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
url = http://localhost:6800/
|
||||
project = news_spider
|
||||
|
@ -5,13 +5,19 @@ reload(sys)
|
||||
sys.setdefaultencoding( "utf-8" )
|
||||
|
||||
file = open(sys.argv[1])
|
||||
data = json.load(file)
|
||||
while 1:
|
||||
line = file.readline()
|
||||
if not line:
|
||||
break
|
||||
data = json.loads(line)
|
||||
print data['time'],data['title'],data['url']
|
||||
|
||||
c = 0
|
||||
for article in data:
|
||||
c+=1
|
||||
print article['time'],"--------",article['title']
|
||||
#data = json.load(file)
|
||||
#c = 0
|
||||
#for article in data:
|
||||
# c+=1
|
||||
# print "[----Time-----]\n",article['time'],article['title']
|
||||
# print "[----Title----]\n",article['title']
|
||||
# print article['time'],article['title']
|
||||
# print "[----Article--]\n",article['content'],"\n\n"
|
||||
print c
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
import re
|
||||
import time
|
||||
|
||||
time = " - -- 2015-06-15 15:34 "
|
||||
timee = " - -- 2015-06-15 15:34 "
|
||||
|
||||
day = ['31','30','29','28','27','26','25','24','23','22','21',
|
||||
'20','19','18','17','16','15','14','13','12','11','10',
|
||||
@ -8,4 +9,7 @@ day = ['31','30','29','28','27','26','25','24','23','22','21',
|
||||
pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
|
||||
#pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
|
||||
#pattern = re.compile("[0-9]")
|
||||
print pattern.findall(time)[0]
|
||||
tm = pattern.findall(timee)[0]
|
||||
|
||||
a = time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M'))
|
||||
print int(a)
|
||||
|
81691
news_spider/toutiao.data
81691
news_spider/toutiao.data
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user