Mainly adds NetEase news crawling. Other changes: separate paragraphs of Toutiao news content with line breaks, and add a browser User-Agent string to fix NetEase pages failing to be crawled.

lzjqsdd 2016-04-19 18:18:45 +08:00
parent d10ebdec47
commit 47865e367d
9 changed files with 157 additions and 3 deletions

news_spider/ne.json (new Normal file, 73 lines)

File diff suppressed because one or more lines are too long

View File

@@ -17,6 +17,7 @@ NEWSPIDER_MODULE = 'news_spider.spiders'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'news_spider (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS=32
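The User-Agent change above is the fix named in the commit message: NetEase rejects Scrapy's default agent string, so the crawler now identifies itself as Chrome. A minimal sketch (not part of this commit) of scoping the same fix to a single spider via Scrapy's custom_settings, instead of editing the global settings file:

    import scrapy

    class NetEaseSpider(scrapy.Spider):
        name = 'netease'
        # override only for this spider; the value mirrors the global one above
        custom_settings = {
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/37.0.2049.0 Safari/537.36',
        }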

Binary file not shown.

View File

@@ -0,0 +1,37 @@
#encoding=utf-8
import scrapy
from news_spider.items import NewsSpiderItem
import json
import time

class NetEaseSpider(scrapy.Spider):
    start_urls = ['http://snapshot.news.163.com/wgethtml/http+!!news.163.com!/2016-04/17/12.html']
    name = 'netease'
    allowed_domains = ['news.163.com']
    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'

    def parse(self, response):
        count = 1
        # follow every link on the snapshot index page
        urls = response.xpath("//a/@href").extract()
        for url in urls:
            yield scrapy.Request(url, self.parseNews)

    def parseNews(self, response):
        data = response.xpath("//div[@class='post_content_main']")
        item = NewsSpiderItem()
        time = data.xpath("//div[@class='post_time_source']/text()").extract()
        title = data.xpath("//h1/text()").extract()
        content = data.xpath("//div[@class='post_text']/p/text()").extract()
        # yield only articles where time, title and body were all found
        if len(time) != 0 and len(title) != 0 and len(content) != 0:
            item['time'] = time[0][13:-5]  # trim the label text around the timestamp
            item['title'] = title[0]
            cc = ''
            for c in content:
                cc = cc + c + '\n'  # one paragraph per line
            item['content'] = cc
            yield item
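Two side notes on the spider above. First, parse() yields the raw href values; snapshot pages can contain relative links, so resolving them against the page URL first is safer. A sketch under that assumption (not what the commit does; allowed_domains already filters foreign hosts):

    def parse(self, response):
        for url in response.xpath("//a/@href").extract():
            # resolve relative hrefs before requesting
            yield scrapy.Request(response.urljoin(url), self.parseNews)

Second, the ne.json file added in this commit was presumably produced with Scrapy's JSON feed export, e.g. "scrapy crawl netease -o ne.json" (an assumption; the command is not recorded here), which writes the items as one JSON array that show.py below then loads.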

Binary file not shown.

View File

@@ -0,0 +1,33 @@
#encoding=utf-8
import scrapy
from news_spider.items import NewsSpiderItem
import json
import time

class NetEaseSpider(scrapy.Spider):
    start_urls = ['http://snapshot.news.163.com/wgethtml/http+!!news.163.com!/2016-04/17/12.html']
    name = 'netease'
    allowed_domains = ['news.163.com']
    base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!'

    def parse(self, response):
        count = 1
        urls = response.xpath("//a/@href").extract()
        for url in urls:
            yield scrapy.Request(url, self.parseNews)

    def parseNews(self, response):
        content = response.xpath("//div[@class='post_content_main']")
        item = NewsSpiderItem()
        item['time'] = content.xpath("//div[@class='post_time_source']").extract()[0]
        item['title'] = content.xpath("//h1/text()").extract()[0]
        # content = content.xpath("//div[@class='post_text']/p/text()")
        # cc=''
        # if(len(content)!=0):
        #     for cc in content:
        #         cc = cc+content+'\n'
        # item['content'] = cc
        yield item
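A note on this variant: the unguarded extract()[0] calls raise IndexError on any page that lacks a post_content_main block. A hedged sketch (not part of the commit) using extract_first(), which returns None instead and has been available since Scrapy 1.0:

    item['time'] = content.xpath("//div[@class='post_time_source']").extract_first()
    item['title'] = content.xpath("//h1/text()").extract_first()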

View File

@@ -12,7 +12,7 @@ class TouTiaoSpider(scrapy.Spider):
     ]
     base_class_url = 'http://toutiao.com/articles_news_society'
     base_url = 'http://toutiao.com'
-    maxpage = 3;  # maximum number of pages to crawl
+    maxpage = 2;  # maximum number of pages to crawl
     category = ['articles_news_society','articles_news_entertainment',
                 'articles_movie','articles_news_tech','articles_digital',
                 'articels_news_sports','articles_news_finance','articles_news_military',
@@ -45,9 +45,9 @@ class TouTiaoSpider(scrapy.Spider):
         cc=''
         if(len(content) != 0):
             for c in content:
-                cc = cc+c
+                cc = cc+c+'\n'
             item['content'] = cc
-        yield item
+            yield item
     def printC(self,text):
         for t in text:
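The added '\n' is the line-break change from the commit message: each Toutiao paragraph now ends on its own line. An equivalent, more idiomatic form (a sketch, not what the commit uses; it also drops the trailing newline):

    item['content'] = '\n'.join(content)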

news_spider/show.py (new Executable file, 10 lines)
View File

@@ -0,0 +1,10 @@
#!/usr/bin/python
import json

file = open('ne.json')
data = json.load(file)
for article in data:
    print "[----Time-----]\n", article['time']
    print "[----Title----]\n", article['title']
    print "[----Article--]\n", article['content'], "\n\n"