Mainly add NetEase news scraping. Also: change how Toutiao article content expresses line breaks, and add a browser User-Agent to fix NetEase pages failing to be scraped.
parent d10ebdec47 · commit 47865e367d
73  news_spider/ne.json  Normal file
File diff suppressed because one or more lines are too long
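The suppressed file, ne.json, is presumably the crawl output committed alongside the code: show.py at the bottom of this commit reads it as a JSON array of articles with time/title/content fields, and Scrapy's JSON feed export serializes each item on one line, so a full article body would easily exceed the diff viewer's line-length limit. A sketch of how such a file could be produced with the new netease spider (the commit itself ships no run script; the older-style feed settings below are an assumption about the Scrapy version in use):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from news_spider.spiders.NetEase import NetEaseSpider

# Roughly what `scrapy crawl netease -o ne.json` does from the command line.
settings = get_project_settings()
settings.set('FEED_FORMAT', 'json')
settings.set('FEED_URI', 'ne.json')

process = CrawlerProcess(settings)
process.crawl(NetEaseSpider)
process.start()  # blocks until the crawl finishes

The hunk that follows is the project's settings.py, carrying the browser User-Agent change mentioned in the commit message.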
@@ -17,6 +17,7 @@ NEWSPIDER_MODULE = 'news_spider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'news_spider (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32
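The uncommented USER_AGENT line above is the "browser identity" change from the commit message: NetEase refuses requests carrying Scrapy's default user agent, so every request is now sent with a desktop Chrome identity instead. The same override could also be scoped to a single spider rather than the whole project; a minimal sketch (not part of this commit, spider name is hypothetical):

import scrapy

class NetEaseUASpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate Scrapy's per-spider settings override.
    name = 'netease_ua_demo'
    custom_settings = {
        'USER_AGENT': ('Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/37.0.2049.0 Safari/537.36'),
    }

    def parse(self, response):
        pass  # parsing logic omitted; only the User-Agent override matters here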
Binary file not shown.
37  news_spider/news_spider/spiders/NetEase.py  Normal file
@@ -0,0 +1,37 @@
#encoding=utf-8
import scrapy
from news_spider.items import NewsSpiderItem
import json
import time

class NetEaseSpider(scrapy.Spider):

    start_urls = ['http://snapshot.news.163.com/wgethtml/http+!!news.163.com!/2016-04/17/12.html']
    name='netease'
    allowed_domains=['news.163.com']

    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'

    def parse(self,response):
        count = 1
        urls = response.xpath("//a/@href").extract()
        for url in urls:
            yield scrapy.Request(url,self.parseNews)

    def parseNews(self,response):
        data = response.xpath("//div[@class='post_content_main']")
        item = NewsSpiderItem()
        time = data.xpath("//div[@class='post_time_source']/text()").extract()
        title = data.xpath("//h1/text()").extract()
        content = data.xpath("//div[@class='post_text']/p/text()").extract()

        if(len(time)!=0 and len(title)!=0 and len(content)!=0):
            item['time'] = time[0][13:-5]
            item['title'] = title[0]
            cc=''
            if(len(content)!=0):
                for c in content:
                    cc = cc+c+'\n'
                item['content'] = cc
            yield item
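NetEase.py fills three fields on NewsSpiderItem, which it imports from news_spider.items; that module is not part of this diff. Assuming a standard Scrapy item definition, it presumably looks roughly like this sketch:

import scrapy

class NewsSpiderItem(scrapy.Item):
    # Assumed definition: the spider above assigns exactly these three keys.
    time = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()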
BIN  news_spider/news_spider/spiders/NetEase.pyc  Normal file
Binary file not shown.
33  news_spider/news_spider/spiders/NetEase.py~  Normal file
@@ -0,0 +1,33 @@
#encoding=utf-8
import scrapy
from news_spider.items import NewsSpiderItem
import json
import time

class NetEaseSpider(scrapy.Spider):

    start_urls = ['http://snapshot.news.163.com/wgethtml/http+!!news.163.com!/2016-04/17/12.html']
    name='netease'
    allowed_domains=['news.163.com']

    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'

    def parse(self,response):
        count = 1
        urls = response.xpath("//a/@href").extract()
        for url in urls:
            yield scrapy.Request(url,self.parseNews)

    def parseNews(self,response):
        content = response.xpath("//div[@class='post_content_main']")
        item = NewsSpiderItem()
        item['time'] = content.xpath("//div[@class='post_time_source']").extract()[0]
        item['title'] = content.xpath("//h1/text()").extract()[0]
        # content = content.xpath("//div[@class='post_text']/p/text()")
        # cc=''
        # if(len(content)!=0):
        #     for cc in content:
        #         cc = cc+content+'\n'
        # item['content'] = cc
        yield item
@@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
    ]
    base_class_url = 'http://toutiao.com/articles_news_society'
    base_url = 'http://toutiao.com'
-    maxpage = 3;#maximum number of pages allowed to crawl
+    maxpage = 2;#maximum number of pages allowed to crawl
    category = ['articles_news_society','articles_news_entertainment',
                'articles_movie','articles_news_tech','articles_digital',
                'articels_news_sports','articles_news_finance','articles_news_military',
@@ -45,9 +45,9 @@ class TouTiaoSpider(scrapy.Spider):
            cc=''
            if(len(content) != 0):
                for c in content:
-                    cc = cc+c
+                    cc = cc+c+'\n'
                item['content'] = cc
-            yield item
+            yield item

    def printC(self,text):
        for t in text:
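The changed loop line in the hunk above is the Toutiao "line break" fix from the commit message: each extracted <p> paragraph now gets a trailing newline instead of being glued to the next one. A small sketch showing that, for a non-empty list, the loop is equivalent to a join plus a trailing newline (the paragraph strings are placeholders):

paragraphs = ['first paragraph', 'second paragraph']  # stand-in for the extracted <p> texts

cc = ''
for c in paragraphs:            # the loop as written in the new version
    cc = cc + c + '\n'

assert cc == '\n'.join(paragraphs) + '\n'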
Binary file not shown.
10  news_spider/show.py  Executable file
@@ -0,0 +1,10 @@
#!/usr/bin/python
import json

file = open('ne.json')
data = json.load(file)

for article in data:
    print "[----Time-----]\n",article['time']
    print "[----Title----]\n",article['title']
    print "[----Article--]\n",article['content'],"\n\n"
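show.py targets Python 2 (print statements, a bare open()). A Python 3 rendition, sketched under the assumption that ne.json is a JSON array of objects with 'time', 'title' and 'content' keys, would read:

#!/usr/bin/python3
import json

# Same output as show.py, using Python 3 print() and a context manager for the file.
with open('ne.json') as f:
    data = json.load(f)

for article in data:
    print("[----Time-----]\n", article['time'])
    print("[----Title----]\n", article['title'])
    print("[----Article--]\n", article['content'], "\n\n")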