From b3d12c4e6b6facb355ef33eec85dd167a7ee3224 Mon Sep 17 00:00:00 2001
From: rugantio
Date: Mon, 18 Feb 2019 02:12:52 +0100
Subject: [PATCH] refactoring comments spider

---
 fbcrawl/__pycache__/items.cpython-37.pyc     | Bin 7325 -> 8458 bytes
 fbcrawl/items.py                             |  65 +++++-
 .../__pycache__/comments.cpython-37.pyc      | Bin 3103 -> 4348 bytes
 .../__pycache__/fbcrawl.cpython-37.pyc       | Bin 8331 -> 8337 bytes
 fbcrawl/spiders/comments.py                  | 220 ++++++++++--------
 fbcrawl/spiders/fbcrawl.py                   |   7 +-
 6 files changed, 188 insertions(+), 104 deletions(-)

diff --git a/fbcrawl/__pycache__/items.cpython-37.pyc b/fbcrawl/__pycache__/items.cpython-37.pyc
index 825ad8e19cd7530dbb754713661d4c3e39475b42..7536f6f49b4f97ee343723b4ceb24444a396e02c 100644
Binary files a/fbcrawl/__pycache__/items.cpython-37.pyc and b/fbcrawl/__pycache__/items.cpython-37.pyc differ
diff --git a/fbcrawl/items.py b/fbcrawl/items.py
index ecafbd9..4d1bdb5 100644
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@@ -127,7 +127,40 @@ def parse_date(init_date,loader_context):
         day = int(date[0])
         month = months[date[1]]
         year = int(date[2])
-        return datetime(year,month,day).date()
+        return datetime(year,month,day).date()
+        #9 ore fa
+        elif date[0].isdigit() and date[1] == 'ore':
+            if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                return datetime(year,month,day).date()
+            #9 ore fa (ieri)
+            else:
+                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                return datetime(year,month,day).date()
+        #ieri alle 20:45
+        elif date[0].lower() == 'ieri' and date[1] == 'alle':
+            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+            return datetime(year,month,day).date()
+        #oggi alle 11:11
+        elif date[0].lower() == 'oggi' and date[1] == 'alle':
+            return datetime(year,month,day).date()
+        #lunedì alle 12:34
+        elif date[0].isalpha() and date[1] == 'alle':
+            today = datetime.now().weekday() #today as a weekday
+            weekday = giorni[date[0].lower()] #day to be match as number weekday
+            #weekday is chronologically always lower than day
+            delta = today - weekday
+            if delta >= 0:
+                day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                return datetime(year,month,day).date()
+            #lunedì = 0 sabato = 6, mar 1 ven 5
+            else:
+                delta += 8
+                day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                return datetime(year,month,day).date()
         #parsing failed
         else:
             return date
@@ -427,9 +460,7 @@ def url_strip(url):
 
 
 class FbcrawlItem(scrapy.Item):
-    source = scrapy.Field(
-        output_processor=TakeFirst()
-    )
+    source = scrapy.Field()
     date = scrapy.Field(      # when was the post published
         input_processor=TakeFirst(),
         output_processor=parse_date
@@ -456,3 +487,29 @@ class FbcrawlItem(scrapy.Item):
         output_processor=url_strip
     )
     shared_from = scrapy.Field()
+
+class CommentsItem(scrapy.Item):
+    source = scrapy.Field()
+    reply_to=scrapy.Field()
+    date = scrapy.Field(      # when was the post published
+        output_processor=parse_date
+    )
+    text = scrapy.Field(
+        output_processor=Join(separator=u'')
+    )                         # full text of the post
+    reactions = scrapy.Field(
+        output_processor=reactions_strip
+    )                         # num of reactions
+    likes = scrapy.Field(
+        output_processor=reactions_strip
+    )
+    ahah = scrapy.Field()
+    love = scrapy.Field()
+    wow = scrapy.Field()
+    sigh = scrapy.Field()
+    grrr = scrapy.Field()
+    share = scrapy.Field()    # num of shares
+    url = scrapy.Field(
+        output_processor=url_strip
+    )
+    shared_from = scrapy.Field()
diff --git a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc
index 633f30c7c336d10b85a8b7439d72e60ef754b6b9..4d5fcb9529ff66c6c018706ae293151610af891b 100644
Binary files a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc
index beb2aa0841fc6f45987ca5a90adc293f2a0db687..098523eeec4eb43473f5d381a1d7356438e87900 100644
Binary files a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/comments.py b/fbcrawl/spiders/comments.py
index c9da10b..728010c 100644
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
@@ -2,106 +2,134 @@ import scrapy
 
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
-from fbcrawl.items import FbcrawlItem
+from fbcrawl.spiders.fbcrawl import FacebookSpider
+from fbcrawl.items import CommentsItem
 
-class FacebookSpider(scrapy.Spider):
+
+class CommentsSpider(FacebookSpider):
     """
-    Parse FB comments, given a page (needs credentials)
+    Parse FB comments, given a post (needs credentials)
     """
     name = "comments"
+    custom_settings = {
+        'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
+                               'reactions','likes','ahah','love','wow', \
+                               'sigh','grrr','url']
+    }
 
-    def __init__(self, email='', password='', page='', **kwargs):
-        super(FacebookSpider, self).__init__(**kwargs)
-
-        if not email or not password:
-            raise ValueError("You need to provide valid email and password!")
-        else:
-            self.email = email
-            self.password = password
-
-        if not page:
-            raise ValueError("You need to provide a valid page name to crawl!")
-        else:
-            self.page = page
-
-        self.start_urls = ['https://mbasic.facebook.com']
-
-
-    def parse(self, response):
-        return FormRequest.from_response(
-            response,
-            formxpath='//form[contains(@action, "login")]',
-            formdata={'email': self.email,'pass': self.password},
-            callback=self.parse_home
-        )
-
-    def parse_home(self, response):
-        '''Parse user news feed page'''
-        if response.css('#approvals_code'):
-            # Handle 'Approvals Code' checkpoint (ask user to enter code).
-            if not self.code:
-                # Show facebook messages via logs
-                # and request user for approval code.
-                message = response.css('._50f4::text').extract()[0]
-                self.log(message)
-                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
-                self.log(message)
-                self.code = input('Enter the code: ')
-            self.code = str(self.code)
-            if not (self.code and self.code.isdigit()):
-                self.log('Bad approvals code detected.')
-                return
-            return FormRequest.from_response(
-                response,
-                formdata={'approvals_code': self.code},
-                callback=self.parse_home,
-            )
-        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
-            # Handle 'Save Browser' checkpoint.
-            return FormRequest.from_response(
-                response,
-                formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home,
-                dont_filter=True,
-            )
-        elif response.css('button#checkpointSubmitButton'):
-            # Handle 'Someone tried to log into your account' warning.
-            return FormRequest.from_response(
-                response, callback=self.parse_home, dont_filter=True,)
-        # Else go to the user profile.
- href = response.urljoin(self.page) - self.logger.info('Parse function called on %s', href) - return scrapy.Request( - url=href, - callback=self.parse_page, - ) + def __init__(self, *args, **kwargs): + super().__init__(*args,**kwargs) def parse_page(self, response): - #answer from page - for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): -# resp = ItemLoader(item=FbcrawlItem(),selector=risposta) - rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href') - risp = response.urljoin(rispostina[0].extract()) - yield scrapy.Request(risp, callback=self.parse_rispostina) - - -# for i in range(len(rispostina)): -# risp = response.urljoin(rispostina[i].extract()) -# -# for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts -# new = ItemLoader(item=FbcrawlItem(),selector=post) -# new.add_xpath('source', "./div/h3/a/text()") -# new.add_xpath('text',"./div[1]/div[1]/text()") -# yield new.load_item() -# -# next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href") -# if len(next_page) > 0: -# next_page = response.urljoin(next_page[0].extract()) -# yield scrapy.Request(next_page, callback=self.parse_page) - - def parse_rispostina(self,response): - for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all posts - new = ItemLoader(item=FbcrawlItem(),selector=daje) - new.add_xpath('source', ".//h3/a/text()")#| ./div/div/h3/a/text()") - new.add_xpath('text',".//span[not(contains(text(),' · ')) and not(contains(text(),'Visualizza'))]/text() | .//div/text()") - yield new.load_item() + ''' + parse page does multiple things: + 1) loads replied-to-comments page one-by-one (for DFS) + 2) gets common not-replied-to comments + ''' + #loads replied-to comments pages + path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']' + for reply in response.xpath(path): + source = reply.xpath('.//h3/a/text()').extract() + answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract() + ans = response.urljoin(answer[::-1][0]) + self.logger.info('Nested comment at page {}'.format(ans)) + yield scrapy.Request(ans, + callback=self.parse_reply, + meta={'reply_to':source, + 'url':response.url, + 'index':response.meta['index'], + 'flag':'init'}) + #loads regular comments + if not response.xpath(path): + path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]' + for reply in response.xpath(path2): + new = ItemLoader(item=CommentsItem(),selector=reply) + new.context['lang'] = self.lang + new.add_xpath('source','.//h3/a/text()') + new.add_xpath('text','.//div[h3]/div[1]//text()') + new.add_xpath('date','.//abbr/text()') + yield new.load_item() +# +# #previous comments + if not response.xpath(path) and not response.xpath(path2): + for next_page in response.xpath('.//div[contains(@id,"see_next")]'): + new_page = next_page.xpath('.//@href').extract() + new_page = response.urljoin(new_page[0]) + self.logger.info('New page to be crawled {}'.format(new_page)) + yield scrapy.Request(new_page, + callback=self.parse_page, + meta={'index':1}) +# + def parse_reply(self,response): + ''' + parse reply to comments, root comment is added if flag + ''' + if response.meta['flag'] == 'init': + #parse 
root comment + for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'): + new = ItemLoader(item=CommentsItem(),selector=root) + new.context['lang'] = self.lang + new.add_xpath('source', './/h3/a/text()') + new.add_value('reply_to','ROOT') + new.add_xpath('text','.//div[1]//text()') + new.add_xpath('date','.//abbr/text()') + new.add_value('url',response.url) + yield new.load_item() + #parse all replies in the page + for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): + new = ItemLoader(item=CommentsItem(),selector=reply) + new.context['lang'] = self.lang + new.add_xpath('source', './/h3/a/text()') + new.add_value('reply_to',response.meta['reply_to']) + new.add_xpath('text','.//div[h3]/div[1]//text()') + new.add_xpath('date','.//abbr/text()') + new.add_value('url',response.url) + yield new.load_item() + + back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract() + if back: + self.logger.info('Back found, trying to go back') + back_page = response.urljoin(back[0]) + yield scrapy.Request(back_page, + callback=self.parse_reply, + priority=100, + meta={'reply_to':response.meta['reply_to'], + 'flag':'back', + 'url':response.meta['url'], + 'index':response.meta['index']}) + else: + next_reply = response.meta['url'] + self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url'])) + yield scrapy.Request(next_reply, dont_filter=True, + callback=self.parse_page, + meta={'index':response.meta['index']+1}) + + elif response.meta['flag'] == 'back': + #parse all comments + for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): + new = ItemLoader(item=CommentsItem(),selector=reply) + new.context['lang'] = self.lang + new.add_xpath('source', './/h3/a/text()') + new.add_value('reply_to',response.meta['reply_to']) + new.add_xpath('text','.//div[h3]/div[1]//text()') + new.add_xpath('date','.//abbr/text()') + new.add_value('url',response.url) + yield new.load_item() + #keep going backwards + back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract() + self.logger.info('Back found, trying to go back') + if back: + back_page = response.urljoin(back[0]) + yield scrapy.Request(back_page, + callback=self.parse_reply, + priority=100, + meta={'reply_to':response.meta['reply_to'], + 'flag':'back', + 'url':response.meta['url'], + 'index':response.meta['index']}) + else: + next_reply = response.meta['url'] + self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url'])) + yield scrapy.Request(next_reply, dont_filter=True, + callback=self.parse_page, + meta={'index':response.meta['index']+1}) \ No newline at end of file diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py index 024273b..11832b6 100644 --- a/fbcrawl/spiders/fbcrawl.py +++ b/fbcrawl/spiders/fbcrawl.py @@ -10,7 +10,6 @@ class FacebookSpider(scrapy.Spider): Parse FB pages (needs credentials) """ name = "fb" - is_debug = True custom_settings = { 'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \ 'reactions','likes','ahah','love','wow', \ @@ -21,7 +20,7 @@ class FacebookSpider(scrapy.Spider): #turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs logger = logging.getLogger('scrapy.middleware') logger.setLevel(logging.WARNING) - 
super().__init__(**kwargs) + super().__init__(*args,**kwargs) #email & pass need to be passed as attributes! if 'email' not in kwargs or 'password' not in kwargs: @@ -121,7 +120,7 @@ class FacebookSpider(scrapy.Spider): self.lang = 'it' elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"): self.logger.info('Language recognized: lang="pt"') - self.lang = 'pt' + self.lang = 'pt' else: raise AttributeError('Language not recognized\n' 'Change your interface lang from facebook ' @@ -130,7 +129,7 @@ class FacebookSpider(scrapy.Spider): #navigate to provided page href = response.urljoin(self.page) self.logger.info('Scraping facebook page {}'.format(href)) - return scrapy.Request(url=href,callback=self.parse_page) + return scrapy.Request(url=href,callback=self.parse_page,meta={'index':1}) def parse_page(self, response): '''
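
Usage note: the refactored spider registers itself under the name "comments", inherits the login and checkpoint handling from FacebookSpider, and exports the comment-specific fields listed in its custom_settings. The sketch below is a minimal, illustrative way to launch it programmatically with Scrapy's standard CrawlerProcess API; the script name, credentials and target value are placeholders, and the assumption that `page` should point at the post whose comment thread is wanted (it is resolved against https://mbasic.facebook.com via response.urljoin in the parent spider) is inferred from the code above rather than stated in the patch.

    # run_comments.py -- hypothetical launcher, assumed to sit in the fbcrawl project root
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # Load the project settings so the 'comments' spider can be located by name.
    process = CrawlerProcess(get_project_settings())

    # email/password/page are the keyword arguments FacebookSpider.__init__ expects;
    # the values below are placeholders, not real credentials or a real post.
    process.crawl('comments',
                  email='you@example.com',
                  password='yourpassword',
                  page='PAGE_OR_POST_PATH')
    process.start()   # blocks until the crawl finishes

The equivalent command-line run would look something like:

    scrapy crawl comments -a email="you@example.com" -a password="yourpassword" -a page="PAGE_OR_POST_PATH" -o comments.csv

which mirrors how the base "fb" spider is normally invoked.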