From 37288e72606c23512330051cdfd1348e282f655e Mon Sep 17 00:00:00 2001 From: lzjqsdd Date: Tue, 19 Apr 2016 23:49:36 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=BD=91=E6=98=93=E5=8E=86?= =?UTF-8?q?=E5=8F=B2=E6=96=B0=E9=97=BB=E6=95=B0=E6=8D=AE=E6=8A=93=E5=8F=96?= =?UTF-8?q?=EF=BC=8C=E7=94=B1=E4=BA=8E=E5=90=84=E4=B8=AA=E9=A1=B5=E9=9D=A2?= =?UTF-8?q?=E7=9A=84=E6=97=B6=E9=97=B4=E5=B8=83=E5=B1=80=E6=9C=89=E5=87=BA?= =?UTF-8?q?=E5=85=A5=EF=BC=8C=E5=BA=94=E9=87=87=E7=94=A8=E6=AD=A3=E5=88=99?= =?UTF-8?q?=E6=8F=90=E5=8F=96=E6=97=B6=E9=97=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- news_spider/news_spider/spiders/NetEase.py | 13 +++++++- news_spider/news_spider/spiders/NetEase.pyc | Bin 1879 -> 2338 bytes news_spider/news_spider/spiders/NetEase.py~ | 33 -------------------- 3 files changed, 12 insertions(+), 34 deletions(-) delete mode 100644 news_spider/news_spider/spiders/NetEase.py~ diff --git a/news_spider/news_spider/spiders/NetEase.py b/news_spider/news_spider/spiders/NetEase.py index 965d091..3bc30ab 100644 --- a/news_spider/news_spider/spiders/NetEase.py +++ b/news_spider/news_spider/spiders/NetEase.py @@ -11,9 +11,20 @@ class NetEaseSpider(scrapy.Spider): allowed_domains=['news.163.com'] base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!' +# year = ['2016','2015'] +# month = ['12','11','10','09','08','07','06','05','04','03','02','01'] + year = ['2016'] + month = ['03'] def parse(self,response): - count = 1 + for y in self.year: + for m in self.month: + for d in range(1,30): + url = self.base_url+'/'+y+'-'+m+'/'+str(d)+'/12.html' + yield scrapy.Request(url,self.parseList) + + + def parseList(self,response): urls = response.xpath("//a/@href").extract() for url in urls: yield scrapy.Request(url,self.parseNews) diff --git a/news_spider/news_spider/spiders/NetEase.pyc b/news_spider/news_spider/spiders/NetEase.pyc index 47372277db12d424e0f49cc05dd89255b7faae86..1467a0f5a6194b6b112cd7d9b44e28ba78f718c0 100644 GIT binary patch delta 752 zcmZ8eL2DC17=1IFon&LvmR6g(HBtp(Rcz7}JbJO7lzQ84@SacT+xnvKodMiOak9q^*g>VVe#HNog5 z#y|#a@8y_Cj54PM>3vK*jCMf%i9jN8D(US#ObTv`hr~m5EKyAgv=V1cr;ZCa6_{3O z*y_ZG@sTNveb-cElO1u~!0vHKRmN@tv?HFY)fI%@%LIm5-Qlgg#C)^4ab)_vh=se^ zNg5By8t6YAm?OuA@!&8uyyJ14SJM6G!=CX-IH^H{>YN%&h>fw zLp@s1^Q70Pa*hoqC?g}x@QdbGwReR|mV&DIqlGwz~vTM@54tWEc>6*RnXZ|YD zb^F%8CeGO}{{1!0(dZ%rr>-wB@Bn(%*3^^Q5>X`sp+q2+7Z3)ez>i$RoPDPv5!$cn F%|A?oezpJr delta 413 zcmX9(OG*Pl6s(?4&qR_8DlQZR4G4pq0aq>svlD`JAc7EN%yckgOf=oWEGD>!dkK95 zPoTJP;SoH5M-bOuz<%T6Ro8p3sCxap_EqQZZ*3*T zI_=0#teS70`+=n!rW5S3&=kSRW_iwKs^o)E!7x diff --git a/news_spider/news_spider/spiders/NetEase.py~ b/news_spider/news_spider/spiders/NetEase.py~ deleted file mode 100644 index e12c6a4..0000000 --- a/news_spider/news_spider/spiders/NetEase.py~ +++ /dev/null @@ -1,33 +0,0 @@ -#encoding=utf-8 -import scrapy -from news_spider.items import NewsSpiderItem -import json -import time - -class NetEaseSpider(scrapy.Spider): - - start_urls = ['http://snapshot.news.163.com/wgethtml/http+!!news.163.com!/2016-04/17/12.html'] - name='netease' - allowed_domains=['news.163.com'] - - base_url = 'http://snapshot.news.163.com/wgethtml/http+!!news.163.com!' - - def parse(self,response): - count = 1 - urls = response.xpath("//a/@href").extract() - for url in urls: - yield scrapy.Request(url,self.parseNews) - - def parseNews(self,response): - content = response.xpath("//div[@class='post_content_main']") - item = NewsSpiderItem() - item['time'] = content.xpath("//div[@class='post_time_source']").extract()[0] - item['title'] = content.xpath("//h1/text()").extract()[0] -# content = content.xpath("//div[@class='post_text']/p/text()") -# cc='' -# if(len(content)!=0): -# for cc in content: -# cc = cc+content+'\n' -# item['content'] = cc - yield item -