From d875e89c520b4ff5475888c9418fb2199def10e6 Mon Sep 17 00:00:00 2001
From: rugantio
Date: Sat, 11 May 2019 15:22:56 +0200
Subject: [PATCH] Adding features: crawl comments from page, crawl posts and comments from groups

---
 fbcrawl/__pycache__/items.cpython-37.pyc    | Bin 3264 -> 11163 bytes
 fbcrawl/__pycache__/settings.cpython-37.pyc | Bin 523 -> 551 bytes
 fbcrawl/items.py                            | 510 +++++++++++++++++-
 fbcrawl/settings.py                         |   1 +
 .../__pycache__/comments.cpython-37.pyc     | Bin 5148 -> 7765 bytes
 .../__pycache__/fbcrawl.cpython-37.pyc      | Bin 8497 -> 8791 bytes
 fbcrawl/spiders/comments.py                 | 130 ++++-
 fbcrawl/spiders/fbcrawl.py                  |  62 ++-
 8 files changed, 648 insertions(+), 55 deletions(-)

diff --git a/fbcrawl/__pycache__/items.cpython-37.pyc b/fbcrawl/__pycache__/items.cpython-37.pyc
index 314db6b5e83b9ac6d34b5f7281c53c617dd86757..75fdef82aa21b82775c087260974235101b53dfc 100644
Binary files a/fbcrawl/__pycache__/items.cpython-37.pyc and b/fbcrawl/__pycache__/items.cpython-37.pyc differ
diff --git a/fbcrawl/__pycache__/settings.cpython-37.pyc b/fbcrawl/__pycache__/settings.cpython-37.pyc
index eef5a3f2ed047812174f9ab3c333df46205fae36..b18e5e5b05fe32edd9054e54cadf56abc4198ff3 100644
Binary files a/fbcrawl/__pycache__/settings.cpython-37.pyc and b/fbcrawl/__pycache__/settings.cpython-37.pyc differ
diff --git a/fbcrawl/items.py b/fbcrawl/items.py
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
+# l = 2
+        elif l == 2:
+            #22 min (oggi)
+            if date[1] == 'min':
+                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #22 min (ieri)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #4 h (oggi)
+            elif date[1] == 'h':
+                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #4 h (ieri)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #2 gen
+            elif len(date[1]) == 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months_abbr[date[1].lower()]
+                return datetime(year,month,day).date()
+            #2 gennaio
+            elif len(date[1]) > 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months[date[1]]
+                return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# l = 3
+        elif l == 3:
+            #21 giu 2017
+            if len(date[1]) == 3 and date[2].isdigit():
+                day = int(date[0])
+                month = months_abbr[date[1]]
+                year = int(date[2])
+                return datetime(year,month,day).date()
+            #21 giugno 2017
+            elif len(date[1]) > 3 and date[2].isdigit():
+                day = int(date[0])
+                month = months[date[1]]
+                year = int(date[2])
+                return datetime(year,month,day).date()
+            #9 ore fa
+            elif date[0].isdigit() and date[1][:2] == 'or':
+                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #9 ore fa (ieri)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #7 minuti fa
+            elif date[0].isdigit() and date[1][:3] == 'min':
+                return
datetime(year,month,day).date() + + #ieri alle 20:45 + elif date[0].lower() == 'ieri' and date[1] == 'alle': + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #oggi alle 11:11 + elif date[0].lower() == 'oggi' and date[1] == 'alle': + return datetime(year,month,day).date() + #lunedì alle 12:34 + elif date[0].isalpha() and date[1] == 'alle': + today = datetime.now().weekday() #today as a weekday + weekday = giorni[date[0].lower()] #day to be match as number weekday + #weekday is chronologically always lower than day + delta = today - weekday + if delta >= 0: + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #lunedì = 0 sabato = 6, mar 1 ven 5 + else: + delta += 8 + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 4 + elif l == 4: + #Ieri alle ore 23:32 + if date[0].lower() == 'ieri' and date[1] == 'alle': + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #domenica alle ore 19:29 + elif date[0].isalpha() and date[1] == 'alle': + today = datetime.now().weekday() #today as a weekday + weekday = giorni[date[0].lower()] #day to be match as number weekday + #weekday is chronologically always lower than day + delta = today - weekday + if delta >= 0: + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #lunedì = 0 sabato = 6, mar 1 ven 5 + else: + delta += 8 + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 5 + elif l == 5: + if date[2] == 'alle': + #29 feb alle ore 21:49 + if len(date[1]) == 3: + day = int(date[0]) + month = months_abbr[date[1].lower()] + return datetime(year,month,day).date() + #29 febbraio alle ore 21:49 + else: + day = int(date[0]) + month = months[date[1].lower()] + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 6 + elif l == 6: + if date[3] == 'alle': + #29 feb 2016 alle ore 21:49 + if len(date[1]) == 3: + day = int(date[0]) + month = months_abbr[date[1].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #29 febbraio 2016 alle ore 21:49 + else: + day = int(date[0]) + month = months[date[1].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# ============================================================================= +# English - status:beta +# ============================================================================= + elif lang == 'en': + months = { + 'january':1, + 'february':2, + 'march':3, + 'april':4, + 'may':5, + 'june':6, + 'july':7, + 'august':8, + 'september':9, + 'october':10, + 'november':11, + 'december':12 + } + + months_abbr = { + 'jan':1, + 'feb':2, + 'mar':3, + 'apr':4, + 'may':5, + 'jun':6, + 'jul':7, + 'aug':8, 
+ 'sep':9, + 'oct':10, + 'nov':11, + 'dec':12 + } + + days = { + 'monday':0, + 'tuesday':1, + 'wednesday':2, + 'thursday':3, + 'friday':4, + 'saturday':5, + 'sunday':6 + } + + date = init_date[0].split() + year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today + + l = len(date) + + #sanity check + if l == 0: + return 'Error: no data' + + #Yesterday, Now, 4hr, 50mins + elif l == 1: + if date[0].isalpha(): + if date[0].lower() == 'yesterday': + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + #check that yesterday was not in another month + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + elif date[0].lower() == 'now': + return datetime(year,month,day).date() #return today + else: #not recognized, (return date or init_date) + return date + else: + #4h, 50min (exploit future parsing) + l = 2 + new_date = [x for x in date[0] if x.isdigit()] + date[0] = ''.join(new_date) + new_date = [x for x in date[0] if not(x.isdigit())] + date[1] = ''.join(new_date) +# l = 2 + elif l == 2: + if date[1] == 'now': + return datetime(year,month,day).date() + #22 min (ieri) + if date[1] == 'min' or date[1] == 'mins': + if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #22 min (oggi) + else: + return datetime(year,month,day).date() + + #4 h (ieri) + elif date[1] == 'hr' or date[1] == 'hrs': + if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #4 h (oggi) + else: + return datetime(year,month,day).date() + + #2 jan + elif len(date[1]) == 3 and date[1].isalpha(): + day = int(date[0]) + month = months_abbr[date[1].lower()] + return datetime(year,month,day).date() + #2 january + elif len(date[1]) > 3 and date[1].isalpha(): + day = int(date[0]) + month = months[date[1]] + return datetime(year,month,day).date() + #jan 2 + elif len(date[0]) == 3 and date[0].isalpha(): + day = int(date[1]) + month = months_abbr[date[0].lower()] + return datetime(year,month,day).date() + #january 2 + elif len(date[0]) > 3 and date[0].isalpha(): + day = int(date[1]) + month = months[date[0]] + return datetime(year,month,day).date() + #parsing failed + else: + return date + return date +# l = 3 + elif l == 3: + #5 hours ago + if date[2] == 'ago': + if date[1] == 'hour' or date[1] == 'hours' or date[1] == 'hr' or date[1] == 'hrs': + # 5 hours ago (yesterday) + if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + # 5 hours ago (today) + else: + return datetime(year,month,day).date() + #10 minutes ago + elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins': + #22 minutes ago (yesterday) + if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = 
int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #22 minutes ago (today) + else: + return datetime(year,month,day).date() + else: + return date + else: + #21 Jun 2017 + if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit(): + day = int(date[0]) + month = months_abbr[date[1].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #21 June 2017 + elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit(): + day = int(date[0]) + month = months[date[1].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #Jul 11, 2016 + elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha(): + day = int(date[1][:-1]) + month = months_abbr[date[0].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 4 + elif l == 4: + #yesterday at 23:32 PM + if date[0].lower() == 'yesterday' and date[1] == 'at': + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #Thursday at 4:27 PM + elif date[1] == 'at': + today = datetime.now().weekday() #today as a weekday + weekday = days[date[0].lower()] #day to be match as number weekday + #weekday is chronologically always lower than day + delta = today - weekday + if delta >= 0: + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #monday = 0 saturday = 6 + else: + delta += 8 + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 5 + elif l == 5: + if date[2] == 'at': + #Jan 29 at 10:00 PM + if len(date[0]) == 3: + day = int(date[1]) + month = months_abbr[date[0].lower()] + return datetime(year,month,day).date() + #29 febbraio alle ore 21:49 + else: + day = int(date[1]) + month = months[date[0].lower()] + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 6 + elif l == 6: + if date[3] == 'at': + date[1] + #Aug 25, 2016 at 7:00 PM + if len(date[0]) == 3: + day = int(date[1][:-1]) + month = months_abbr[date[0].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #August 25, 2016 at 7:00 PM + else: + day = int(date[1][:-1]) + month = months[date[0].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l > 6 + #parsing failed - l too big + else: + return date + #parsing failed - language not supported + else: + return init_date def id_strip(post_id): import json @@ -122,11 +578,21 @@ class FbcrawlItem(scrapy.Item): likes = scrapy.Field( output_processor=reactions_strip ) - ahah = scrapy.Field() - love = scrapy.Field() - wow = scrapy.Field() - sigh = scrapy.Field() - grrr = scrapy.Field() + ahah = scrapy.Field( + output_processor=reactions_strip + ) + love = scrapy.Field( + output_processor=reactions_strip + ) + wow = scrapy.Field( + output_processor=reactions_strip + ) + sigh = scrapy.Field( + output_processor=reactions_strip + ) + grrr = scrapy.Field( + output_processor=reactions_strip + ) share = scrapy.Field() # num of shares url = scrapy.Field( output_processor=url_strip @@ -140,7 +606,7 @@ class CommentsItem(scrapy.Item): 
     source = scrapy.Field()
     reply_to=scrapy.Field()
     date = scrapy.Field(       # when was the post published
-        output_processor=parse_date
+        output_processor=parse_date2
     )
     text = scrapy.Field(
         output_processor=Join(separator=u'')
     )
@@ -153,9 +619,9 @@ class CommentsItem(scrapy.Item):
     )
     source_url = scrapy.Field()
     url = scrapy.Field()
-    #ahah = scrapy.Field()
-    #love = scrapy.Field()
-    #wow = scrapy.Field()
-    #sigh = scrapy.Field()
-    #grrr = scrapy.Field()
-    #share = scrapy.Field() # num of shares
+    ahah = scrapy.Field()
+    love = scrapy.Field()
+    wow = scrapy.Field()
+    sigh = scrapy.Field()
+    grrr = scrapy.Field()
+    share = scrapy.Field() # num of shares
diff --git a/fbcrawl/settings.py b/fbcrawl/settings.py
index fafad9b..40d3a15 100644
--- a/fbcrawl/settings.py
+++ b/fbcrawl/settings.py
@@ -88,6 +88,7 @@ DOWNLOAD_DELAY = 3
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 #FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
+URLLENGTH_LIMIT = 99999
 FEED_EXPORT_ENCODING = 'utf-8'
 DUPEFILTER_DEBUG = True
 LOG_LEVEL = 'INFO'
diff --git a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc
index dd5952830ec94452dba7337c9e63d8f98d513bc6..2e928fc634fa5d414c079633f59f4a789970a4a6 100644
Binary files a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc differ
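The parse_date2 helper added to fbcrawl/items.py turns Facebook's relative, localized timestamps ("4 h", "21 giu 2017", "Yesterday at 11:32 PM") into datetime.date objects, and the patch wires it into the date field of CommentsItem through the loader context. Below is a minimal sketch (not part of the patch) of how it can be exercised; the sample strings are hypothetical inputs, and the 'giu' lookup assumes the Italian abbreviation table added earlier in this diff contains that key.

# Sketch only: exercises parse_date2 as added by this patch.
from scrapy.loader import ItemLoader
from fbcrawl.items import CommentsItem, parse_date2

# Called directly, as fbcrawl.py now does when the data-ft timestamp is missing:
parse_date2(['Yesterday at 11:32 PM'], {'lang': 'en'})   # -> yesterday's date
parse_date2(['21 giu 2017'], {'lang': 'it'})             # -> datetime.date(2017, 6, 21), assuming 'giu' is in months_abbr

# Through an ItemLoader, the same 'lang' context key drives the date field,
# exactly as the spiders set it with new.context['lang'] = self.lang:
loader = ItemLoader(item=CommentsItem())
loader.context['lang'] = 'en'
loader.add_value('date', '4 hrs')
item = loader.load_item()                                # item['date'] is a datetime.date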
diff --git a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc
index 74cf76fdebd679a6e616ff49888baa2cab27beb4..826194ac13de169561c469c4042a307b5fcb4625 100644
Binary files a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/comments.py b/fbcrawl/spiders/comments.py
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
+            if abs(self.count) + 1 > self.max:
+                raise CloseSpider('Reached max num of post: {}.
Crawling finished'.format(abs(self.count))) + self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date)) + + #returns full post-link in a list + post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() + temp_post = response.urljoin(post[0]) + self.count -= 1 + yield scrapy.Request(temp_post, + self.parse_post, + priority = self.count, + meta={'index':1}) + + #load following page, try to click on "more" + #after few pages have been scraped, the "more" link might disappears + #if not present look for the highest year not parsed yet + #click once on the year and go back to clicking "more" + + #new_page is different for groups + if self.group == 1: + new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract() + else: + new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract() + #this is why lang is needed + + if not new_page: + self.logger.info('[!] "more" link not found, will look for a "year" link') + #self.k is the year link that we look for + if response.meta['flag'] == self.k and self.k >= self.year: + xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" + new_page = response.xpath(xpath).extract() + if new_page: + new_page = response.urljoin(new_page[0]) + self.k -= 1 + self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) + yield scrapy.Request(new_page, + callback=self.parse_page, + priority = -1000, + meta={'flag':self.k}) + else: + while not new_page: #sometimes the years are skipped this handles small year gaps + self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1)) + self.k -= 1 + if self.k < self.year: + raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date)) + xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" + new_page = response.xpath(xpath).extract() + self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) + new_page = response.urljoin(new_page[0]) + self.k -= 1 + yield scrapy.Request(new_page, + callback=self.parse_page, + priority = -1000, + meta={'flag':self.k}) + else: + self.logger.info('Crawling has finished with no errors!') + else: + new_page = response.urljoin(new_page[0]) + if 'flag' in response.meta: + self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page)) + yield scrapy.Request(new_page, + callback=self.parse_page, + priority = -1000, + meta={'flag':response.meta['flag']}) + else: + self.logger.info('First page scraped, clicking on "more"! 
new_page = {}'.format(new_page)) + yield scrapy.Request(new_page, + callback=self.parse_page, + priority = -1000, + meta={'flag':self.k}) + + def parse_post(self, response): + ''' + parse post does multiple things: 1) loads replied-to-comments page one-by-one (for DFS) 2) call parse_reply on the nested comments 3) adds simple (not-replied-to) comments @@ -37,9 +141,10 @@ class CommentsSpider(FacebookSpider): source = reply.xpath('.//h3/a/text()').extract() answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract() ans = response.urljoin(answer[::-1][0]) - self.logger.info('{} nested comment @ page {}'.format(str(response.meta['index']),ans)) + self.logger.info('{} nested comment'.format(str(response.meta['index']))) yield scrapy.Request(ans, callback=self.parse_reply, + priority=1000, meta={'reply_to':source, 'url':response.url, 'index':response.meta['index'], @@ -49,7 +154,7 @@ class CommentsSpider(FacebookSpider): if not response.xpath(path): #prevents from exec path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]' for i,reply in enumerate(response.xpath(path2)): - self.logger.info('{} regular comment @ page {}'.format(i,response.url)) + self.logger.info('{} regular comment'.format(i+1)) new = ItemLoader(item=CommentsItem(),selector=reply) new.context['lang'] = self.lang new.add_xpath('source','.//h3/a/text()') @@ -71,7 +176,7 @@ class CommentsSpider(FacebookSpider): new_page = response.urljoin(new_page[0]) self.logger.info('New page to be crawled {}'.format(new_page)) yield scrapy.Request(new_page, - callback=self.parse_page, + callback=self.parse_post, meta={'index':1, 'group':1}) else: @@ -80,7 +185,7 @@ class CommentsSpider(FacebookSpider): new_page = response.urljoin(new_page[0]) self.logger.info('New page to be crawled {}'.format(new_page)) yield scrapy.Request(new_page, - callback=self.parse_page, + callback=self.parse_post, meta={'index':1, 'group':group_flag}) @@ -88,6 +193,9 @@ class CommentsSpider(FacebookSpider): ''' parse reply to comments, root comment is added if flag ''' +# from scrapy.utils.response import open_in_browser +# open_in_browser(response) + if response.meta['flag'] == 'init': #parse root comment for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'): @@ -120,7 +228,7 @@ class CommentsSpider(FacebookSpider): back_page = response.urljoin(back[0]) yield scrapy.Request(back_page, callback=self.parse_reply, - priority=100, + priority = 1000, meta={'reply_to':response.meta['reply_to'], 'flag':'back', 'url':response.meta['url'], @@ -131,7 +239,7 @@ class CommentsSpider(FacebookSpider): next_reply = response.meta['url'] self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url'])) yield scrapy.Request(next_reply, - callback=self.parse_page, + callback=self.parse_post, meta={'index':response.meta['index']+1, 'group':response.meta['group']}) @@ -155,7 +263,7 @@ class CommentsSpider(FacebookSpider): back_page = response.urljoin(back[0]) yield scrapy.Request(back_page, callback=self.parse_reply, - priority=100, + priority=1000, meta={'reply_to':response.meta['reply_to'], 'flag':'back', 'url':response.meta['url'], @@ -166,7 +274,7 @@ class CommentsSpider(FacebookSpider): next_reply = response.meta['url'] self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url'])) yield scrapy.Request(next_reply, 
- callback=self.parse_page, + callback=self.parse_post, meta={'index':response.meta['index']+1, 'group':response.meta['group']}) diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py index c455308..f4f07ad 100644 --- a/fbcrawl/spiders/fbcrawl.py +++ b/fbcrawl/spiders/fbcrawl.py @@ -4,7 +4,7 @@ import logging from scrapy.loader import ItemLoader from scrapy.http import FormRequest from scrapy.exceptions import CloseSpider -from fbcrawl.items import FbcrawlItem, parse_date +from fbcrawl.items import FbcrawlItem, parse_date, parse_date2 from datetime import datetime class FacebookSpider(scrapy.Spider): @@ -15,7 +15,8 @@ class FacebookSpider(scrapy.Spider): custom_settings = { 'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \ 'reactions','likes','ahah','love','wow', \ - 'sigh','grrr','comments','post_id','url'] + 'sigh','grrr','comments','post_id','url'], + 'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter', } def __init__(self, *args, **kwargs): @@ -33,16 +34,19 @@ class FacebookSpider(scrapy.Spider): self.logger.info('Email and password provided, will be used to log in') #page name parsing (added support for full urls) - if 'page' not in kwargs: - raise AttributeError('You need to provide a valid page name to crawl!' - 'scrapy fb -a page="PAGENAME"') - elif self.page.find('https://www.facebook.com/') != -1: - self.page = self.page[25:] - elif self.page.find('https://mbasic.facebook.com/') != -1: - self.page = self.page[28:] - elif self.page.find('https://m.facebook.com/') != -1: - self.page = self.page[23:] - + if 'page' in kwargs: + if self.page.find('/groups/') != -1: + self.group = 1 + else: + self.group = 0 + if self.page.find('https://www.facebook.com/') != -1: + self.page = self.page[25:] + elif self.page.find('https://mbasic.facebook.com/') != -1: + self.page = self.page[28:] + elif self.page.find('https://m.facebook.com/') != -1: + self.page = self.page[23:] + + #parse date if 'date' not in kwargs: self.logger.info('Date attribute not provided, scraping date set to 2004-02-04 (fb launch date)') @@ -148,11 +152,19 @@ class FacebookSpider(scrapy.Spider): many_features = post.xpath('./@data-ft').get() date = [] date.append(many_features) - date = parse_date(date) - current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') - + date = parse_date(date,{'lang':self.lang}) + current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date + + if current_date is None: + date_string = post.xpath('.//abbr/text()').get() + date = parse_date2([date_string],{'lang':self.lang}) + current_date = datetime(date.year,date.month,date.day) if date is not None else date + date = str(date) + + #if 'date' argument is reached stop crawling if self.date > current_date: raise CloseSpider('Reached date: {}'.format(self.date)) + new = ItemLoader(item=FbcrawlItem(),selector=post) if abs(self.count) + 1 > self.max: raise CloseSpider('Reached max num of post: {}. 
Crawling finished'.format(abs(self.count))) @@ -161,8 +173,8 @@ class FacebookSpider(scrapy.Spider): new.add_value('date',date) new.add_xpath('post_id','./@data-ft') new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") - #page_url #new.add_value('url',response.url) + #returns full post-link in a list post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() temp_post = response.urljoin(post[0]) @@ -173,18 +185,24 @@ class FacebookSpider(scrapy.Spider): #after few pages have been scraped, the "more" link might disappears #if not present look for the highest year not parsed yet #click once on the year and go back to clicking "more" - new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract() - #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^ + + #new_page is different for groups + if self.group == 1: + new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract() + else: + new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract() + #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^ + if not new_page: - self.logger.info('[!] "more" link not found, will look for a year') - #self.k is the year that we look for in the link. + self.logger.info('[!] "more" link not found, will look for a "year" link') + #self.k is the year link that we look for if response.meta['flag'] == self.k and self.k >= self.year: xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" new_page = response.xpath(xpath).extract() if new_page: new_page = response.urljoin(new_page[0]) self.k -= 1 - self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page)) + self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) else: while not new_page: #sometimes the years are skipped this handles small year gaps @@ -194,7 +212,7 @@ class FacebookSpider(scrapy.Spider): raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date)) xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" new_page = response.xpath(xpath).extract() - self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page)) + self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) new_page = response.urljoin(new_page[0]) self.k -= 1 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
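With these changes FacebookSpider sets a self.group flag whenever the page argument contains '/groups/', switches to the stories_container pagination XPath for groups, and CommentsSpider walks posts through its own parse_page/parse_post chain. Below is a minimal sketch (not part of the patch) of launching the patched spider against a group from a script; every argument value is a placeholder, and the email, password, date and lang keyword names are assumed from how the spider reads them rather than spelled out in this diff.

# Sketch only: drives the patched spider with Scrapy's CrawlerProcess.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from fbcrawl.spiders.fbcrawl import FacebookSpider
# from fbcrawl.spiders.comments import CommentsSpider  # same arguments, crawls comments instead

process = CrawlerProcess(get_project_settings())
process.crawl(FacebookSpider,
              email='user@example.com',        # assumed kwarg, used for the login step
              password='secret',               # assumed kwarg
              page='https://mbasic.facebook.com/groups/123456789',  # '/groups/' sets self.group = 1
              date='2019-01-01',               # stop once posts are older than this
              lang='en')                       # selects the parse_date2 language tables
process.start()

The command-line form suggested by the error message in __init__ ('scrapy fb -a page="PAGENAME"', i.e. scrapy crawl with -a key=value pairs) passes the same keyword arguments.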