From d7a3e28f595435382754ed92fe967856f1c12278 Mon Sep 17 00:00:00 2001 From: lzjqsdd Date: Sat, 23 Apr 2016 12:34:35 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0UserAgent=E6=B1=A0=E9=98=B2?= =?UTF-8?q?=E6=AD=A2=E7=88=AC=E8=99=AB=E8=A2=AB=E7=A6=81=EF=BC=8C=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E8=BF=9B=E7=A8=8B=E9=94=81=E9=98=B2=E6=AD=A2=E5=90=8C?= =?UTF-8?q?=E6=97=B6=E5=BC=80=E5=90=AF=E5=A4=9A=E4=B8=AA=E7=88=AC=E8=99=AB?= =?UTF-8?q?=E5=86=99=E5=85=A5=E5=90=8C=E4=B8=80=E4=B8=AA=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E5=87=BA=E7=8E=B0=E9=94=99=E8=AF=AF=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 + news_spider/news_spider/commands/crawlall.py | 2 +- news_spider/news_spider/commands/crawlall.pyc | Bin 2489 -> 2490 bytes news_spider/news_spider/pipelines.py | 38 +++++++--- news_spider/news_spider/pipelines.pyc | Bin 1966 -> 2411 bytes news_spider/news_spider/rotateuseragent.py | 66 ++++++++++++++++++ news_spider/news_spider/rotateuseragent.pyc | Bin 0 -> 3303 bytes news_spider/news_spider/settings.py | 12 ++-- news_spider/news_spider/settings.pyc | Bin 546 -> 647 bytes news_spider/news_spider/spiders/NetEase.pyc | Bin 2587 -> 2587 bytes news_spider/news_spider/spiders/Tencent.py | 2 +- news_spider/news_spider/spiders/Tencent.pyc | Bin 2564 -> 2512 bytes .../news_spider/spiders/TouTiaoSpider.pyc | Bin 3042 -> 3042 bytes news_spider/show.py | 3 + 14 files changed, 108 insertions(+), 17 deletions(-) create mode 100644 news_spider/news_spider/rotateuseragent.py create mode 100644 news_spider/news_spider/rotateuseragent.pyc diff --git a/.gitignore b/.gitignore index fa6a5d5..0d243de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ news_spider/tt.json news_spider/ne.json news_spider/te.json +news_spider/title.json +news_spider/news.json diff --git a/news_spider/news_spider/commands/crawlall.py b/news_spider/news_spider/commands/crawlall.py index 767df19..446c44e 100644 --- a/news_spider/news_spider/commands/crawlall.py +++ b/news_spider/news_spider/commands/crawlall.py @@ -33,6 +33,6 @@ class Command(ScrapyCommand): spider_loader = self.crawler_process.spider_loader for spidername in args or spider_loader.list(): - print "*********cralall spidername************" + spidername + print "*********crawlall spidername************" + spidername self.crawler_process.crawl(spidername, **opts.spargs) self.crawler_process.start() diff --git a/news_spider/news_spider/commands/crawlall.pyc b/news_spider/news_spider/commands/crawlall.pyc index 8437e3cce421a8881afb84f437443c90b52de808..40e640b98fb290a810cd14c499fe1465d2c2a712 100644 GIT binary patch delta 39 ocmdlfyi1s!`7G6h)k6rPUIbaBS4dotVKdV)GtOhP-AhP_rwCsC zubw<I^6*Hz!>9}C@YKR0({nte2Xzu@H(7Q7NgInnS^S2R>& z4=4|)&@|MfW_}GCHYm|#=jec)8uXRuoan=_Ny*%WwRvf6A^0F{aia7WfBKslkHd4y5dU$UCY}fCtXz2ek=y z_nJlTuqtdXKh2I@lJq{M{&2eAyE;`52_q+mzM+q2$8}rd6W``QQIwXckD~jC9Izl2 zYnI51CDnao#S>+Ud_ZT4&gGvGU;m=%v_~gT$ZN_Pl+6Jqg-w&P7M(F{iqvk|BBfNR zXm2;+P_=JceD^rsAVlWA10>xLfO+H=z<^;gEyhkvNg8_tH`(P?{MqBqNAb}#omep* zq`CF@)$t_tRvu4mZb#0B2x@RTapb3kZ}P6jsgB64a2HU|bDHs_imh`I@~$8)I+4_d z(xlabT2WfHgGChs3oIN{;78dE+l}UcgB-;yvUV3GFL2x%nJCUHXOqV;ATOz~%W~E~ z<*2YxUOmM_bofsD z!RBe_Y2~@!;KAmvZR~=27t}jl?=)>Q)nx%@y(R(XfuzBN(YPQCVGE`Sj;pvOm@Tq4 zoi+YQ7`1Qb8eyQqzKppCqS8}7EiC6FnojZ=mq>@Pg$#aRFzp6}Vb&qgjYc{ZejtuW zqM3VmfSDBvujdSnBEu%FE`Bn~1}~YrxERa?;<`;1q&jL*{x*XDk}qU>pQGGs6k$Md z<{<TA#*LHwin$Av%T<*v)OSq+RyCl h1tcAGb?u_v+E5C$Zr5bGgMO8&=b5#Z)x2I?{SBOJ>m2|9 literal 1966 zcmcIlO>fgc5S_J?G$~0b6j}u&&OSANAVP@3fe!_=0YP)IsJ)>!{s`HPDo}a~|CS>^ z0OpN@({d$Evg4Wc?9R;Fc^m&%Z};WT{)dRxAD^F3c-WW)k62MgG^uq(6D9K=Wgg`| zO?*;ozXnYj6#HbFG-am-{UBNqolaU5H~-GI>e(G^J`Q#`Q1Y8Uqd>9hZnJooxXg@a zNo?jXoXOo?o*S79TTQ)9W+qDtQzwmePkGn}EDWbBDRT6QhZsQkI;+9cParJ|Mm7^I_1f9;s=(2^Y$JN~o0Ez5aT1?9*B z$K3$9%%Kx*0_ZM>na#_{SQ{cPdeH8&Aa$tvYM-TKw?2{LQ*Of0!66Lu zGG1hCYUHZ1a+Be9`$6LfdOR$UYIz;s!4d-fKTptGbweGnag`^R*GPiG9cU{o(O4;D z&{!{Qsg4nTU`i;?8(6l+5W)N*N4d%jbWmN$ijAtZnaogXTJRgtfD2R9XARHIlv;ETo%Vk{gFi7xQdQ zFCeDDBHf0-b)`W$&4aYOy#);ot8w^xk41pp1#*pRLhrMX?vz;4(fSVjcx?HOl7@f8 z+gfx!sRPwfd-Cn89$OWu(N6jv#LbiBK8L}7Lq!tR@dH39H%U~+Cb9yf90jOuD`^azs=yAWf*C4{L3T3zke*>mpQ)y@6AxxVw)cg+w+R}P=Q;w_(I!SQcE z4A3*6Wq?8NCd4MROx>1)I0wBvwDQ=N2a$(U1NaJj1h@ov(^{eK4ZDC-MgQPs8)zwl zh|_j``1Szb{*WSj(iVw2ia3wOd5{V9E!KOf3Y55^^=i4>*y}gE%i{<-vGV|{43!1fNuf64}2T=1K?jndz+|v4f`9M zUEvL(iN&Y;0ByO6#f1SE0KOKip=piCtz+W=$@u}XvuU8p(1G#lIhpq{i2KpNh^Vn)0H{IPG>q);Ki>5ev5vgLevd0|De(~(p(aRsLI64#7p$N~?9qXWzQRtP+ zti+shiMlm{(;$n6V`P6C?X`mt2ibAMirkuOmCJ68IgZ0h(}=j1`H&@obSh<5-5X=l zs3j@7%$-Y72)|)o=j7m5VSnYYatj!n}q4ihgr^V&OY1QW0hT29fP3{B_4^B zL89IEGD-&TtamlnpA8TfrXqxYu4=PRXzNH^{jh@E6DY zLNHTXtFrV)UJmVcNIBP!M2x4~B{hRKpVYFE_=PYx9D)nE2tNzgcX36wrvv`8=TeoG zcNf^gRoskWsrm-eYrt*IrxJ%>NI(MKf#rKn-CwyOL7{1?2`5?wukCGSfJ{4apZyE*f7Kb^7??^TjsxFl~#iQ literal 0 HcmV?d00001 diff --git a/news_spider/news_spider/settings.py b/news_spider/news_spider/settings.py index dad51e0..3a12d0e 100644 --- a/news_spider/news_spider/settings.py +++ b/news_spider/news_spider/settings.py @@ -18,7 +18,8 @@ COMMANDS_MODULE='news_spider.commands' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'news_spider (+http://www.yourdomain.com)' -USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36' +#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36' + # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS=32 @@ -51,9 +52,10 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'news_spider.middlewares.MyCustomDownloaderMiddleware': 543, -#} +DOWNLOADER_MIDDLEWARES = { + 'news_spider.middlewares.MyCustomDownloaderMiddleware': None, + 'news_spider.rotateuseragent.RotateUserAgentMiddleware':400 +} # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html @@ -64,7 +66,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { -# 'news_spider.pipelines.TouTiaoPipeline': 300, + 'news_spider.pipelines.NewsSpiderPipeline': 300, 'news_spider.pipelines.TitlePipeline': 500, } diff --git a/news_spider/news_spider/settings.pyc b/news_spider/news_spider/settings.pyc index 663d5514bd1c86f1351a4a3c62bf6d73affef0db..31c6588568336b65473c6acaaa72fba8c67a391a 100644 GIT binary patch delta 329 zcmZ3)(#|T){F#^Q;617EWCkc;2GR~dTSAs4iw{HNZ|yE0eM^u!5Z8XgIw!PfO_*%%ZuZS3o=tui}Z3cQ&MtL%M*)Ii}idf zolA>L@^fAC%ky&b6M@pcaM{cWj6h3G(R3H(mn4>?mKLWLC8nq5mFNY5`Jq6*BZ!Zr zT?eFHgFy?`_JYiU)SS#bpxu5zHwA;86aW>Q%+09J$UfPc(N~!TXsus6PE~MEP0r5ORB+BH%Fj*J2P!boGcqu- z1R5Khn3hsti0Qr(N^#A|> diff --git a/news_spider/news_spider/spiders/NetEase.pyc b/news_spider/news_spider/spiders/NetEase.pyc index 976b66e6e1fd7fbd763b2f5681c0155606632c1b..42dc16d4930b954d6ac9ed1764980be96b210866 100644 GIT binary patch delta 16 XcmbO&GFyb5`7>^wMC!qu~ diff --git a/news_spider/news_spider/spiders/Tencent.py b/news_spider/news_spider/spiders/Tencent.py index de35f9f..549f8c7 100644 --- a/news_spider/news_spider/spiders/Tencent.py +++ b/news_spider/news_spider/spiders/Tencent.py @@ -11,7 +11,7 @@ class TencentSpider(scrapy.Spider): name='tencent' allowed_domains=['news.qq.com'] - base_url = 'http://news.qq.com/b/history/index' +# base_url = 'http://news.qq.com/b/history/index' # year = ['2016','2015','2014'] # month = ['12','11','10','09','08','07','06','05','04','03','02','01'] # day = ['31','30','29','28','27','26','25','24','23','22','21', diff --git a/news_spider/news_spider/spiders/Tencent.pyc b/news_spider/news_spider/spiders/Tencent.pyc index 8b1f9598b664a5119c0ad841ee9bd7b0caa9e550..64884377eaa5af53bf461de10dcc1cd5e0bf9868 100644 GIT binary patch delta 124 zcmZn>xggBW{F#^Q_7|y%?7JD0CLUICPGMn4XJUwAWk_LVNM~e-Vq-{Q12fneQrN)^ z4u%vCh86~fC{BhHPB4RuA%zRf;ARL4*5ICOz-YF)mvJ{6BhO?{4xh=hIJ$U57#JAX Qm^c~P7&$p68*yF&0L8EwYybcN delta 144 zcmca0+#sGNdyxM6ofXurZ`FGDNX6q_Be- z91JNOK*nSNMls$LZiW^HhA19}pkNK2$y$tNOvOr*^%+G4^po^6GK)*{iz@Xq^HNeP rHYYIdW@F@?EW_c$!o$G8Fgcy0n@_N$m?6