diff --git a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc
index b43a3a1..c5b192c 100644
Binary files a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc
index 9eb4ab4..4b03814 100644
Binary files a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/comments.py b/fbcrawl/spiders/comments.py
index 76af529..138cb70 100644
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
@@ -24,10 +24,15 @@ class CommentsSpider(FacebookSpider):
         '''
         parse page does multiple things:
             1) loads replied-to-comments page one-by-one (for DFS)
-            2) retrieves not-replied-to comments
+            2) calls parse_reply on the nested comments
+            3) adds simple (not-replied-to) comments
+            4) follows the link to the next comment page
         '''
-        #loads replied-to comments pages
+        #load replied-to comments pages
+        #select nested comments one-by-one, matching the index in response.meta['index']
         path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
+        group_flag = response.meta['group'] if 'group' in response.meta else None
+
         for reply in response.xpath(path):
             source = reply.xpath('.//h3/a/text()').extract()
             answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
@@ -38,9 +43,10 @@ class CommentsSpider(FacebookSpider):
                                  meta={'reply_to':source,
                                        'url':response.url,
                                        'index':response.meta['index'],
-                                       'flag':'init'})
-        #loads regular comments
-        if not response.xpath(path):
+                                       'flag':'init',
+                                       'group':group_flag})
+        #load regular comments
+        if not response.xpath(path): #run only when no nested comments are left
             path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
             for i,reply in enumerate(response.xpath(path2)):
                 self.logger.info('{} regular comment @ page {}'.format(i,response.url))
@@ -53,15 +59,29 @@ class CommentsSpider(FacebookSpider):
                 new.add_value('url',response.url)
                 yield new.load_item()
 
-        #previous comments
+        #new comment page
         if not response.xpath(path):
-            for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
-                new_page = next_page.xpath('.//@href').extract()
-                new_page = response.urljoin(new_page[0])
-                self.logger.info('New page to be crawled {}'.format(new_page))
-                yield scrapy.Request(new_page,
-                                     callback=self.parse_page,
-                                     meta={'index':1})
+            #groups are paginated backwards, via the "see_prev" link
+            next_xpath = './/div[contains(@id,"see_next")]'
+            prev_xpath = './/div[contains(@id,"see_prev")]'
+            if not response.xpath(next_xpath) or group_flag == 1:
+                for next_page in response.xpath(prev_xpath):
+                    new_page = next_page.xpath('.//@href').extract()
+                    new_page = response.urljoin(new_page[0])
+                    self.logger.info('New page to be crawled {}'.format(new_page))
+                    yield scrapy.Request(new_page,
+                                         callback=self.parse_page,
+                                         meta={'index':1,
+                                               'group':1})
+            else:
+                for next_page in response.xpath(next_xpath):
+                    new_page = next_page.xpath('.//@href').extract()
+                    new_page = response.urljoin(new_page[0])
+                    self.logger.info('New page to be crawled {}'.format(new_page))
+                    yield scrapy.Request(new_page,
+                                         callback=self.parse_page,
+                                         meta={'index':1,
+                                               'group':group_flag})
 
     def parse_reply(self,response):
         '''
@@ -101,13 +121,16 @@ class CommentsSpider(FacebookSpider):
                                  meta={'reply_to':response.meta['reply_to'],
                                        'flag':'back',
                                        'url':response.meta['url'],
-                                       'index':response.meta['index']})
+                                       'index':response.meta['index'],
+                                       'group':response.meta['group']})
+
             else:
                 next_reply = response.meta['url']
                 self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
                 yield scrapy.Request(next_reply,
                                      callback=self.parse_page,
-                                     meta={'index':response.meta['index']+1})
+                                     meta={'index':response.meta['index']+1,
+                                           'group':response.meta['group']})
 
         elif response.meta['flag'] == 'back':
             #parse all comments
@@ -132,10 +155,39 @@ class CommentsSpider(FacebookSpider):
                                  meta={'reply_to':response.meta['reply_to'],
                                        'flag':'back',
                                        'url':response.meta['url'],
-                                       'index':response.meta['index']})
+                                       'index':response.meta['index'],
+                                       'group':response.meta['group']})
+
             else:
                 next_reply = response.meta['url']
                 self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                 yield scrapy.Request(next_reply,
                                      callback=self.parse_page,
-                                     meta={'index':response.meta['index']+1})
+                                     meta={'index':response.meta['index']+1,
+                                           'group':response.meta['group']})
+
+# =============================================================================
+# CRAWL REACTIONS
+# =============================================================================
+#    def parse_reactions(self,response):
+#        new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
+#        new.context['lang'] = self.lang
+#        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
+#        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
+#        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
+#        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
+#        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
+#        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
+#        yield new.load_item()
+#
+#    #substitute the plain yield with a request for the reactions page:
+#        yield new.load_item()
+#        ‾‾‾‾‾‾‾‾‾|‾‾‾‾‾‾‾‾‾‾‾
+#        _________v___
+#        #response --> reply/root
+#        reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
+#        reactions = response.urljoin(reactions[0].extract())
+#        if reactions:
+#            yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
+#        else:
+#            yield new.load_item()
\ No newline at end of file
diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py
index 9e7f0db..eaae19f 100644
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@@ -66,7 +66,7 @@ class FacebookSpider(scrapy.Spider):
         else:
             self.logger.info('Lang "{}" not currently supported'.format(self.lang))
             self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
-            self.logger.info('Change your interface lang from facebook and try again')
+            self.logger.info('Change your interface language in Facebook settings and try again')
             raise AttributeError('Language provided not currently supported')
 
         #current year, this variable is needed for parse_page recursion
@@ -85,7 +85,7 @@ class FacebookSpider(scrapy.Spider):
                 formxpath='//form[contains(@action, "login")]',
                 formdata={'email': self.email,'pass': self.password},
                 callback=self.parse_home
-                )
+        )
 
     def parse_home(self, response):
         '''
@@ -140,7 +140,7 @@ class FacebookSpider(scrapy.Spider):
         for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
             new = ItemLoader(item=FbcrawlItem(),selector=post)
             self.logger.info('Parsing post n = {}'.format(abs(self.count)))
-            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
+            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
             new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
 
             #page_url #new.add_value('url',response.url)
@@ -150,9 +150,10 @@ class FacebookSpider(scrapy.Spider):
             self.count -= 1
             yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
 
-        #load following page
-        #tries to click on "more", otherwise it looks for the appropriate
-        #year for 1-click only and proceeds to click on others
+        #load the following page by trying to click on "more"
+        #after a few pages have been scraped, the "more" link disappears
+        #if it is not present, look for the highest year not yet parsed, click it once
+        #and keep looking for "more"
         new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
         if not new_page:
             if response.meta['flag'] == self.k and self.k >= self.year:
@@ -165,7 +166,7 @@ class FacebookSpider(scrapy.Spider):
                 self.logger.info('Everything OK, new flag: {}'.format(self.k))
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
             else:
-                while not new_page: #sometimes the years are skipped
+                while not new_page: #sometimes years are skipped; this handles small gaps
                     self.logger.info('XPATH not found for year {}'.format(self.k-1))
                     self.k -= 1
                     self.logger.info('Trying with previous year, flag={}'.format(self.k))
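
Note: the core of this patch is threading a 'group' flag through response.meta so that every callback in the crawl chain (parse_page -> parse_reply -> parse_page) knows whether it is paginating a group (which exposes "see_prev" links) or a page (which exposes "see_next"). Below is a minimal, self-contained sketch of that meta-passing pattern, assuming only Scrapy's standard Request/meta API; the spider name and URL are hypothetical placeholders, not fbcrawl code.

import scrapy

class MetaChainSpider(scrapy.Spider):
    """Hypothetical demo: propagate a flag through response.meta across callbacks."""
    name = 'meta_chain_demo'                       # illustrative name, not part of fbcrawl
    start_urls = ['https://example.com/comments']  # placeholder URL

    def parse(self, response):
        # Read the flag defensively, mirroring the patch's
        # response.meta['group'] if 'group' in response.meta else None
        group_flag = response.meta.get('group')
        for href in response.xpath('//a[contains(@href,"repl")]/@href').getall():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_reply,
                                 meta={'index': 1,
                                       'group': group_flag})  # propagate, never recompute

    def parse_reply(self, response):
        # The flag survives the hop: Scrapy copies meta from the request
        # onto the response handed to the callback
        self.logger.info('group flag is %s', response.meta['group'])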