From efda9a956eb5155f622516f284c6b4e2573752ae Mon Sep 17 00:00:00 2001
From: rugantio
Date: Tue, 23 Apr 2019 07:31:23 +0200
Subject: [PATCH] [fb] in items.py refactoring parse_date, introducing "date"
 attribute

---
 fbcrawl/__pycache__/items.cpython-37.pyc |  Bin 8429 -> 9004 bytes
 fbcrawl/items.py                         |   25 ++++++++++-
 .../__pycache__/comments.cpython-37.pyc  |  Bin 5148 -> 5148 bytes
 .../__pycache__/fbcrawl.cpython-37.pyc   |  Bin 8346 -> 8536 bytes
 fbcrawl/spiders/fbcrawl.py               |   40 +++++++++++-------
 5 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/fbcrawl/__pycache__/items.cpython-37.pyc b/fbcrawl/__pycache__/items.cpython-37.pyc
index 2fdfd7d829c2456639c3d8a078a21b594930b262..aec6e3c1fe6d0e524f6dbeeff2e11b1a6b6db459 100644
Binary files a/fbcrawl/__pycache__/items.cpython-37.pyc and b/fbcrawl/__pycache__/items.cpython-37.pyc differ
diff --git a/fbcrawl/items.py b/fbcrawl/items.py
index e16cb48..f424258 100644
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@@ -462,12 +462,33 @@ def url_strip(url):
     else:
         return fullurl
 
+def parse_date2(date):
+    import json
+
+    d = json.loads(date[0]) #nested dict of features from the data-ft attribute
+    flat_d = dict()         #only retain the 'leaves' of the d tree
+
+    def recursive_items(dictionary):
+        '''
+        Yield the most nested key:value pairs of a nested dict
+        '''
+        for key, value in dictionary.items():
+            if type(value) is dict:
+                yield from recursive_items(value)
+            else:
+                yield (key, value)
+
+    for key, value in recursive_items(d):
+        flat_d[key] = value
+
+    return str(datetime.fromtimestamp(flat_d['publish_time']) - timedelta(hours=5)) #publish_time is a unix epoch; -5h timezone correction
+
+
 class FbcrawlItem(scrapy.Item):
     source = scrapy.Field()
     date = scrapy.Field(       # when was the post published
-        input_processor=TakeFirst(),
-        output_processor=parse_date
+        output_processor=parse_date2
     )
     text = scrapy.Field(
         output_processor=Join(separator=u'')
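
A quick check of the new output processor, runnable from the repo root once
the patch is applied. The data-ft blob below is illustrative (real ones carry
many more nested keys); only the 'publish_time' leaf is actually used:

    from fbcrawl.items import parse_date2

    sample = '{"page_insights":{"1234567890":{"post_context":{"publish_time":1556000000}}}}'

    #ItemLoader hands the collected values to the processor as a list,
    #hence the wrapping in [ ]
    print(parse_date2([sample]))
    #-> '2019-04-23 01:13:20' on a UTC machine; the exact hour depends on
    #   the local timezone, since datetime.fromtimestamp() returns local time
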
diff --git a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc
index 4976234e18aa16d9da8edafffddab4dc8b21a176..dd5952830ec94452dba7337c9e63d8f98d513bc6 100644
Binary files a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc
Binary files a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@@ -36,7 +38,6 @@ class FacebookSpider(scrapy.Spider):
-            assert int(kwargs['year']) <= 2019 and int(kwargs['year']) >= 2006,\
-            'Year must be an int number 2006 <= year <= 2019'
-            self.year = int(self.year) #arguments are passed as strings
-            self.logger.info('Year attribute found, set scraping back to {}'.format(self.year))
+            print(type(kwargs['date']))
+            self.date = datetime.strptime(kwargs['date'],'%Y-%m-%d')
+            self.year = datetime.now().year - 1
 
         #parse lang, if not provided (but is supported) it will be guessed in parse_home
         if 'lang' not in kwargs:
@@ -68,7 +69,7 @@ class FacebookSpider(scrapy.Spider):
             self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
             self.logger.info('Change your interface lang from facebook settings and try again')
             raise AttributeError('Language provided not currently supported')
-
+
         #current year, this variable is needed for parse_page recursion
         self.k = 2019
         #count number of posts, used to prioritize parsing and correctly insert in the csv
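
With this hunk the crawl horizon is set through a "date" argument (a
'%Y-%m-%d' string) rather than the old "year" one. Assuming the spider is
still registered under the name "fb" used in the project README, the
intended invocation becomes something like:

    scrapy crawl fb -a email="EMAIL" -a password="PASSWORD" -a page="PAGENAME" -a date="2018-01-01" -o output.csv
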
@@ -137,10 +138,19 @@ class FacebookSpider(scrapy.Spider):
         Then ask recursively for another page.
         '''
         #select all posts
-        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
+        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
+            many_features = post.xpath('./@data-ft').get()
+            date = []
+            date.append(many_features)
+            date = parse_date2(date)
+            current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S')
+
+            if self.date > current_date:
+                raise CloseSpider('Reached date: {}'.format(self.date))
             new = ItemLoader(item=FbcrawlItem(),selector=post)
             self.logger.info('Parsing post n = {}'.format(abs(self.count)))
-            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
+            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
+            new.add_xpath('date','./@data-ft')
             new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
             #page_url
             #new.add_value('url',response.url)
@@ -151,7 +161,7 @@ class FacebookSpider(scrapy.Spider):
             yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
 
         #load following page, try to click on "more"
-        #after few pages have gone scraped, the "more" link disappears
+        #after a few pages have been scraped, the "more" link might disappear
         #if not present look for the highest year not parsed yet, click once
         #and keep looking for "more"
         new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
@@ -197,7 +207,7 @@ class FacebookSpider(scrapy.Spider):
         new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
         new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
         new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
-        new.add_xpath('date','//div/div/abbr/text()')
+        # new.add_xpath('date','//div/div/abbr/text()')
         new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
         new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
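
The early-stop added to parse_page is, at its core, a plain datetime
comparison plus Scrapy's CloseSpider exception (note that this relies on
fbcrawl.py importing parse_date2 from fbcrawl.items and CloseSpider from
scrapy.exceptions, which is not visible in the hunks above). Stripped of the
crawling context, a sketch of the logic:

    from datetime import datetime
    from scrapy.exceptions import CloseSpider

    target = datetime.strptime('2018-01-01', '%Y-%m-%d')                     #self.date
    current = datetime.strptime('2017-12-31 23:10:02', '%Y-%m-%d %H:%M:%S')  #a post's parsed date

    #posts are served newest-first, so the first post older than the target
    #date means every later post is older as well
    if target > current:
        raise CloseSpider('Reached date: {}'.format(target))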