From dafd01c8bd22c7b623e0e500dbbe27f1270b88f9 Mon Sep 17 00:00:00 2001
From: rugantio
Date: Mon, 4 Feb 2019 19:26:00 +0100
Subject: [PATCH] Fix recursion on pages

---
 fbcrawl/items.py            |  42 +++---
 fbcrawl/settings.py         |   5 +-
 fbcrawl/spiders/comments.py |  34 +++--
 fbcrawl/spiders/fbcrawl.py  | 140 ++++++++++--------
 4 files changed, 126 insertions(+), 95 deletions(-)

diff --git a/fbcrawl/items.py b/fbcrawl/items.py
index 4f48bae..ecafbd9 100644
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@@ -413,35 +413,38 @@ def url_strip(url):
     #catching '&id=' is enough to identify the post
     i = fullurl.find('&id=')
     if i != -1:
-        j = fullurl[:i+4] + fullurl[i+4:].split('&')[0]
-        return j
-    else:
-        return fullurl
+        return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
+    else: #catch photos
+        i = fullurl.find('/photos/')
+        if i != -1:
+            return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
+        else: #catch albums
+            i = fullurl.find('/albums/')
+            if i != -1:
+                return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
+            else:
+                return fullurl
+
 class FbcrawlItem(scrapy.Item):
-    source = scrapy.Field(
-        output_processor=TakeFirst()
-    )                # page that published the post
-
+    source = scrapy.Field(
+        output_processor=TakeFirst()
+    )
     date = scrapy.Field( # when was the post published
-        input_processor=TakeFirst(),
-        output_processor=parse_date
+        input_processor=TakeFirst(),
+        output_processor=parse_date
     )
-
     text = scrapy.Field(
-        output_processor=Join(separator=u'')
+        output_processor=Join(separator=u'')
     ) # full text of the post
-
     comments = scrapy.Field(
-        output_processor=comments_strip
+        output_processor=comments_strip
     )
-
     reactions = scrapy.Field(
-        output_processor=reactions_strip
+        output_processor=reactions_strip
     ) # num of reactions
-
     likes = scrapy.Field(
-        output_processor=reactions_strip
+        output_processor=reactions_strip
     )
     ahah = scrapy.Field()
     love = scrapy.Field()
@@ -451,4 +454,5 @@ class FbcrawlItem(scrapy.Item):
     share = scrapy.Field() # num of shares
     url = scrapy.Field(
         output_processor=url_strip
-    )
+    )
+    shared_from = scrapy.Field()
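The url_strip rewrite above now normalizes three URL shapes instead of one: regular posts (identified by '&id='), photo posts, and album posts. A quick sanity check of the three branches, rewritten as a standalone sketch; the wrapper name and the sample URLs are made up for illustration, since in the item loader the function receives whatever value scrapy extracted:

    # standalone sketch of the new url_strip branches (hypothetical wrapper)
    def strip_post_url(fullurl):
        i = fullurl.find('&id=')
        if i != -1:                   # regular post: keep everything up to &id=<num>
            return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
        i = fullurl.find('/photos/')
        if i != -1:                   # photo post: drop the /?query tail
            return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
        i = fullurl.find('/albums/')
        if i != -1:                   # album post: same treatment as photos
            return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
        return fullurl

    # made-up examples:
    print(strip_post_url('https://mbasic.facebook.com/story.php?story_fbid=1&id=42&ref=page'))
    # -> https://mbasic.facebook.com/story.php?story_fbid=1&id=42
    print(strip_post_url('https://mbasic.facebook.com/page/photos/a.1/2/?type=3'))
    # -> https://mbasic.facebook.com/page/photos/a.1/2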
diff --git a/fbcrawl/settings.py b/fbcrawl/settings.py
index ee82e25..0d6f667 100644
--- a/fbcrawl/settings.py
+++ b/fbcrawl/settings.py
@@ -14,7 +14,6 @@
 BOT_NAME = 'fbcrawl'
 
 SPIDER_MODULES = ['fbcrawl.spiders']
 NEWSPIDER_MODULE = 'fbcrawl.spiders'
-
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
 
@@ -22,7 +21,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTM
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+CONCURRENT_REQUESTS = 1
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
@@ -88,7 +87,7 @@ ROBOTSTXT_OBEY = False
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the columns to export as CSV
+#FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the columns to export as CSV
 FEED_EXPORT_ENCODING = 'utf-8'
 DUPEFILTER_DEBUG = True
 LOG_LEVEL = 'INFO'
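Two things happen in this settings.py change. First, CONCURRENT_REQUESTS = 1 serializes downloads, which appears to pair with the request priorities introduced in fbcrawl.py below to keep posts in order in the output. Second, FEED_EXPORT_FIELDS is commented out because the fb spider now declares its own column order via custom_settings (see the fbcrawl.py diff), and per-spider custom_settings take precedence over project-wide values in settings.py. A minimal sketch of that mechanism, with a hypothetical spider name:

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'                 # hypothetical spider
        # overrides the project-wide value from settings.py for this spider only
        custom_settings = {
            'FEED_EXPORT_FIELDS': ['source', 'date', 'text'],
        }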
znS|4qib5M#rjZtXmpL1HZ1;5*R`Q8jt>oaIyRa9Q2$B~t3z2-8 zi=nwyWahTctTj`7J~UZ@X}>n+HE@=~>#DUlYE95unrJ<(TJ2Hm@6lSGXe|y+8Y8xc z)#1bE6UTk|X1%_#v9a758P9Sv>eYAthu_XTHa$5xX6mha(r@nJJ4q}@bCx}=N73s4 z02}iEiXwUKGwuneH;}Qj${o)M2fbA;oR!zlU39KobvCILZ+!nmlxg5MKJvo$fHe2w zSoo`h7;}i?@mjDY=R;@14+19^n>4RB{82i7oc#30KnU1(u<6J_zaNPhix`Xsla43& zo6Z6c7nV>;DqDg|Kc=!DFQf(!@yOI@iPZ4p)ab{_DbmJS`0~MmQlhzEL~`jy2Rz)_ zj0VE-!g=6TTyD!YNo{&aL|hdK}1@Su_t18Ab=^Q)RH`Ci6dwbWCntaNszgSmrt2;1AbaAF;6s566^KCP$8tViZ;!exsUfY!fN zoQM$O3M&CAt?*-2gQy^Gva+gqTh&xh^97qyHA||diWGXKqiI~&zKn{Y~ z?_;m~1JOq^Am7Yc&(+3u&;o`NE|ED%hYb`q=LoSVy>)(;@pZq+7ahcFUvRbi`A91@ z=QfXXT-yblCC6_m0PzCBJ+_&fS>E-0&AI8V`)v=1O2Fx_^H607PR^3NTCX#Iz3xqz zUq|rP(qoF%?xu$*thkwFy6}2v$yr)vFj|-907I%_m&I#TC|qBx*Zr_Ri0@qM2f)Wp z6d+T&x_BEf)9g57<5|2bX3!-$mUAQa4fxf?>jT;J1e(d&laqU!Z}Daai5DKQ%n{4c zy(~A-?xFPnwVv49`Z1UH2EJ4x4yyh+JX7?+Hbm{vPdH1CQFi4cD&)JS;_NH3W~3&7 zP|dkEofecwD&G@7#)x8x3If6mI!oZ~Mt+!~k zUxs?O!lvgnHls4LDn^)MvuirCG@`3p(`-)VWrRNm#$$|H4pQEhX)xj`jK3cgl}3aU zvuv8Z0F4f18hvZtdp(zWSV{eALMqWDRb_o@#hzd4%u}QuRZ{<>q#jdqC7Ptp=W{*& z+;cs=D>eLQKGPFwy}L^0Nysz=(IoT5T;@y9E%WGvOp(i^JvPPbO6JRunb9QE$z`5; zZkZ=0WZqKNII@KYMZ^%;)mjk}k4@UaShIRVrNubv zy8%akmqfTeTcorCc^*!)shuf-ywS$Npog4VCMT9DFx*+WJAQe0&fTrY{C<3-4n7s3}KuUUzG}@;Por?0MN@hZ=uB(E}Rp-kGJCr{m zuBwoF1@sO;SW8HOi~AI~utPbk|2H~<5yz!{9Os`~#Y?%0JG3SK-^W7U*e7J=xrMxu z3-RfGLQUfGzVXjJ_xP9d@o(1V#0usiXvGBWY+CjhbF+lHggp?jSaF_OFHmujSVl=R zc~5EvJQTDe*;ywK<-(ylI+)Z-lW{xi6!MKm8Sd6sE|`|{-EP=uQv1F)xYJM^kNg!1 ztw@I(I?hzB8BixpRpNU_KSJO5dlG$oqW&@3pwS%TDaioYVV(@oDv>#=WI%wRDG^>n=W~}WgN8( zKXoHLMIxFkSdi3 zW_KIampUtF$a`%>XC20a)`tM@sP?ia+9!l=y2fU1icn zML+noyp&cyH$?)$+!XKQ<0+_QP}iJ zr{G$ZF6h+R85eY}+l$yB;JB=-y6(M!7mQj;uFIk(sx7KecU)qb#!yE{iUP`O1Sy(M zkM7c%JP2HO*Sl)EI8B_&$>`)p3P?H|+qfLFjG3A}WiQxo+7)$kSesIwoK9!Q&t+OJ zTq@GSrcUT>O5M_vy+?W(VD3cBoWRcGb%$QZBWxWb5!Y P0g$>^vJC|Sde!_t#6un3 diff --git a/fbcrawl/spiders/comments.py b/fbcrawl/spiders/comments.py index 1ec1239..c9da10b 100644 --- a/fbcrawl/spiders/comments.py +++ b/fbcrawl/spiders/comments.py @@ -4,7 +4,6 @@ from scrapy.loader import ItemLoader from scrapy.http import FormRequest from fbcrawl.items import FbcrawlItem - class FacebookSpider(scrapy.Spider): """ Parse FB comments, given a page (needs credentials) @@ -78,22 +77,27 @@ class FacebookSpider(scrapy.Spider): ) def parse_page(self, response): - for post in response.xpath('//div[count(@class)=1 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts - new = ItemLoader(item=FbcrawlItem(),selector=post) - new.add_xpath('source', "./div/h3/a/text()") - new.add_xpath('text',"//div/div/span[not(contains(text(),' ยท '))]/text() | ./div/div/text()") - yield new.load_item() - - rispostina = response.xpath('//div/a[contains(text(),"rispost")]/@href') - - for i in range(len(rispostina)): - risp = response.urljoin(rispostina[i].extract()) + #answer from page + for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): +# resp = ItemLoader(item=FbcrawlItem(),selector=risposta) + rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href') + risp = response.urljoin(rispostina[0].extract()) yield scrapy.Request(risp, callback=self.parse_rispostina) - next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href") - if len(next_page) > 0: - next_page = response.urljoin(next_page[0].extract()) - yield scrapy.Request(next_page, callback=self.parse_page) + +# for i in range(len(rispostina)): +# risp = response.urljoin(rispostina[i].extract()) +# +# for post in 
diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py
index c2ba592..10e256f 100644
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@@ -1,30 +1,39 @@
 import scrapy
+import logging
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
 from fbcrawl.items import FbcrawlItem
-from scrapy.exceptions import CloseSpider
-
 class FacebookSpider(scrapy.Spider):
     """
     Parse FB pages (needs credentials)
     """
     name = "fb"
+    custom_settings = {
+        'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
+                               'reactions','likes','ahah','love','wow', \
+                               'sigh','grrr','comments','url']
+    }
 
-    def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
-        super(FacebookSpider, self).__init__(**kwargs)
+    def __init__(self,email='',password='',page='',year=2018,lang='_',*args,**kwargs):
+        #turn off annoying logging; set LOG_LEVEL=DEBUG in settings.py to see more logs
+        logger = logging.getLogger('scrapy.middleware')
+        logger.setLevel(logging.WARNING)
+        super().__init__(**kwargs)
 
         #email & pass need to be passed as attributes!
         if not email or not password:
-            raise ValueError("You need to provide valid email and password!")
+            raise AttributeError('You need to provide valid email and password:\n'
+                                 'scrapy crawl fb -a email="EMAIL" -a password="PASSWORD"')
         else:
             self.email = email
             self.password = password
 
         #page name parsing (added support for full urls)
         if not page:
-            raise ValueError("You need to provide a valid page name to crawl!")
+            raise AttributeError('You need to provide a valid page name to crawl!\n'
+                                 'scrapy crawl fb -a page="PAGENAME"')
         elif page.find('https://www.facebook.com/') != -1:
             self.page = page[25:]
         elif page.find('https://mbasic.facebook.com/') != -1:
@@ -35,22 +44,27 @@ class FacebookSpider(scrapy.Spider):
             self.page = page
 
         #parse year
-        assert int(year) <= 2019 and int(year) >= 2015, 'Year must be a number 2015 <= year <= 2019'
+        assert int(year) <= 2019 and int(year) >= 2006, 'Year must be a number 2006 <= year <= 2019'
         self.year = int(year) #arguments are passed as strings
-
+        #parse lang; if not provided (but supported) it will be guessed in parse_home
         if lang=='_':
-            self.logger.info('Language attribute not provided, I will try to guess it')
-            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.logger.info('Language attribute not provided, I will try to guess it from the fb interface')
+            self.logger.info('To specify it, add the lang parameter: scrapy crawl fb -a lang="LANGUAGE"')
+            self.logger.info('Current choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"')
             self.lang=lang
         elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
-            self.lang = lang
+            self.lang = lang.lower()
         else:
             self.logger.info('Lang "{}" not currently supported'.format(lang))
             self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
             self.logger.info('Change your interface lang from facebook and try again')
-            raise CloseSpider('Language provided not currently supported')
+            raise AttributeError('Language provided not currently supported')
 
+        #current year; this variable is needed for parse_page recursion
+        self.k = 2019
+        self.count = 0
+
         self.start_urls = ['https://mbasic.facebook.com']
 
     def parse(self, response):
@@ -73,29 +87,39 @@ class FacebookSpider(scrapy.Spider):
         '''
         #handle 'save-device' redirection
         if response.xpath("//div/a[contains(@href,'save-device')]"):
+            self.logger.info('Got stuck in "save-device" checkpoint')
+            self.logger.info('I will now try to redirect to the correct page')
             return FormRequest.from_response(
                 response,
                 formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home)
+                callback=self.parse_home
+            )
 
         #set language interface
         if self.lang == '_':
             if response.xpath("//input[@placeholder='Search Facebook']"):
+                self.logger.info('Language recognized: lang="en"')
                 self.lang = 'en'
-            elif response.xpath("//input[@value='Buscar']"):
+            elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
+                self.logger.info('Language recognized: lang="es"')
                 self.lang = 'es'
-            elif response.xpath("//input[@value='Rechercher']"):
+            elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"):
+                self.logger.info('Language recognized: lang="fr"')
                 self.lang = 'fr'
-            elif response.xpath("//input[@value='Cerca']"):
+            elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
+                self.logger.info('Language recognized: lang="it"')
                 self.lang = 'it'
-            elif response.xpath("//input[@value='Pesquisar']"):
+            elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
+                self.logger.info('Language recognized: lang="pt"')
                 self.lang = 'pt'
             else:
-                raise CloseSpider('Language not recognized')
-
+                raise AttributeError('Language not recognized\n'
+                                     'Change your interface lang from facebook '
+                                     'and try again')
+
         #navigate to provided page
         href = response.urljoin(self.page)
-        self.logger.info('Parsing facebook page %s', href)
+        self.logger.info('Scraping facebook page {}'.format(href))
         return scrapy.Request(url=href,callback=self.parse_page)
 
     def parse_page(self, response):
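The error strings above double as usage documentation for the CLI (scrapy crawl fb -a email=... -a password=... -a page=... -a year=... -a lang=...). The spider can also be driven from a plain script; here is a minimal sketch using scrapy's CrawlerProcess, where the credentials, page name, and feed settings are placeholders chosen for illustration:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    from fbcrawl.spiders.fbcrawl import FacebookSpider

    # load settings.py from the project, then add a CSV feed for this run
    settings = get_project_settings()
    settings.set('FEED_FORMAT', 'csv')
    settings.set('FEED_URI', 'Trump.csv')

    process = CrawlerProcess(settings)
    process.crawl(FacebookSpider, email='EMAIL', password='PASSWORD',
                  page='PAGENAME', year=2018, lang='en')
    process.start()   # blocks until the crawl finishes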
@@ -106,6 +130,7 @@ class FacebookSpider(scrapy.Spider):
         #select all posts
         for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
             new = ItemLoader(item=FbcrawlItem(),selector=post)
+            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
             new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
             new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
             new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
@@ -113,54 +138,53 @@ class FacebookSpider(scrapy.Spider):
             #page_url #new.add_value('url',response.url)
             #returns full post-link in a list
             post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
-            temp_post = response.urljoin(post[0])
-            yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
+            temp_post = response.urljoin(post[0])
+            self.count -= 1
+            yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
 
         #load following page
-        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
-        if len(next_page) == 0:
-            if response.meta['flag'] == 4 and self.year <= 2015:
-                self.logger.info('2014 reached, flag = 5')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':5})
-            elif response.meta['flag'] == 3 and self.year <= 2015:
-                self.logger.info('2015 reached, flag = 4')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':4})
-            elif response.meta['flag'] == 2 and self.year <= 2016:
-                self.logger.info('2016 reached, flag = 3')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2016')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':3})
-            elif response.meta['flag'] == 1 and self.year <= 2017:
-                self.logger.info('2017 reached, flag = 2')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2017')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':2})
-            elif response.meta['flag'] == 0 and self.year <= 2018:
-                self.logger.info('2018 reached, flag = 1')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2018')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':1})
+        #try to click on "more"; when the button is missing, look for the link
+        #of the appropriate year instead and click through year by year
+        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
+        if not new_page:
+            if response.meta['flag'] == self.k and self.year <= self.k:
+                self.logger.info('There are no more, clicking on year = {}'.format(self.k))
+                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
contains(text(),'" + str(self.k) + "')]/@href" + new_page = response.xpath(xpath).extract() + if new_page: + new_page = response.urljoin(new_page[0]) + self.k -= 1 + self.logger.info('Everything OK, new flag: {}'.format(self.k)) + yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) + else: + while not new_page: #sometimes the years are skipped + self.logger.info('XPATH not found for year {}'.format(self.k-1)) + self.k -= 1 + self.logger.info('Trying with previous year, flag={}'.format(self.k)) + xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" + new_page = response.xpath(xpath).extract() + self.logger.info('New page found with flag {}'.format(self.k)) + new_page = response.urljoin(new_page[0]) + self.k -= 1 + self.logger.info('Now going with flag {}'.format(self.k)) + yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) else: - new_page = response.urljoin(next_page[0]) + new_page = response.urljoin(new_page[0]) if 'flag' in response.meta: + self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag'])) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']}) else: - yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':0}) + self.logger.info('FLAG DOES NOT REPRESENT ACTUAL YEAR') + self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k)) + yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) def parse_post(self,response): new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item']) new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()") - new.add_xpath('date', '//div/div/abbr/text()') + new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()') + new.add_xpath('date','//div/div/abbr/text()') new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()') + new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()") reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href") reactions = response.urljoin(reactions[0].extract()) @@ -175,4 +199,4 @@ class FacebookSpider(scrapy.Spider): new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()") new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()") new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()") - yield new.load_item() + yield new.load_item() \ No newline at end of file