[comments.py] Added support for groups

rugantio 2019-04-23 03:41:52 +02:00
parent 2d404a7667
commit 462cb0eff1
4 changed files with 77 additions and 24 deletions

comments.py

@@ -24,10 +24,15 @@ class CommentsSpider(FacebookSpider):
         '''
         parse page does multiple things:
             1) loads replied-to-comments page one-by-one (for DFS)
-            2) retrieves not-replied-to comments
-            3) adds simple (not-replied-to) comments
-            4) follows to new comment page
+            2) call parse_reply on the nested comments
         '''
-        #loads replied-to comments pages
+        #load replied-to comments pages
+        #select nested comment one-by-one matching with the index: response.meta['index']
         path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
+        group_flag = response.meta['group'] if 'group' in response.meta else None
         for reply in response.xpath(path):
             source = reply.xpath('.//h3/a/text()').extract()
             answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
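
The new group_flag is read once from response.meta and then threaded through every later request, so the fact that the crawl entered via a group survives each hop. A minimal sketch of that pattern, assuming nothing beyond standard Scrapy meta semantics (spider name and URL are placeholders):

    import scrapy

    class MetaFlagSpider(scrapy.Spider):
        name = 'meta_flag_demo'                   # placeholder
        start_urls = ['https://example.com/']     # placeholder

        def parse(self, response):
            # default to None when no upstream request ever set the flag
            group_flag = response.meta['group'] if 'group' in response.meta else None
            for href in response.xpath('//a/@href').extract():
                yield scrapy.Request(response.urljoin(href),
                                     callback=self.parse,
                                     meta={'group': group_flag})  # flag survives the hop

response.meta.get('group') would be the more idiomatic spelling of the same default-to-None lookup.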
@@ -38,9 +43,10 @@ class CommentsSpider(FacebookSpider):
                                  meta={'reply_to':source,
                                        'url':response.url,
                                        'index':response.meta['index'],
-                                       'flag':'init'})
-        #loads regular comments
-        if not response.xpath(path):
+                                       'flag':'init',
+                                       'group':group_flag})
+        #load regular comments
+        if not response.xpath(path): #prevents from exec
             path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
             for i,reply in enumerate(response.xpath(path2)):
                 self.logger.info('{} regular comment @ page {}'.format(i,response.url))
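
Both selectors lean on mbasic Facebook markup, where every comment is a div with a two-character class and an id starting with a digit, and a nested comment_replies div is what separates threads from simple comments. The predicate can be sanity-checked with parsel (Scrapy's selector library) on toy HTML; the snippet below is purely illustrative:

    from parsel import Selector

    # toy markup: the first comment has replies, the second does not
    html = '''
    <div class="ab" id="1234"><h3><a>alice</a></h3>
      <div id="comment_replies_1"></div></div>
    <div class="cd" id="5678"><h3><a>bob</a></h3></div>
    '''
    sel = Selector(text=html)
    simple = sel.xpath('.//div[string-length(@class) = 2 and count(@id)=1 '
                       'and contains("0123456789", substring(@id,1,1)) '
                       'and not(.//div[contains(@id,"comment_replies")])]')
    print([d.xpath('./@id').extract_first() for d in simple])   # ['5678']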
@@ -53,15 +59,29 @@ class CommentsSpider(FacebookSpider):
                 new.add_value('url',response.url)
                 yield new.load_item()
-        #previous comments
+        #new comment page
         if not response.xpath(path):
-            for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
-                new_page = next_page.xpath('.//@href').extract()
-                new_page = response.urljoin(new_page[0])
-                self.logger.info('New page to be crawled {}'.format(new_page))
-                yield scrapy.Request(new_page,
-                                     callback=self.parse_page,
-                                     meta={'index':1})
+            #for groups
+            next_xpath = './/div[contains(@id,"see_next")]'
+            prev_xpath = './/div[contains(@id,"see_prev")]'
+            if not response.xpath(next_xpath) or group_flag == 1:
+                for next_page in response.xpath(prev_xpath):
+                    new_page = next_page.xpath('.//@href').extract()
+                    new_page = response.urljoin(new_page[0])
+                    self.logger.info('New page to be crawled {}'.format(new_page))
+                    yield scrapy.Request(new_page,
+                                         callback=self.parse_page,
+                                         meta={'index':1,
+                                               'group':1})
+            else:
+                for next_page in response.xpath(next_xpath):
+                    new_page = next_page.xpath('.//@href').extract()
+                    new_page = response.urljoin(new_page[0])
+                    self.logger.info('New page to be crawled {}'.format(new_page))
+                    yield scrapy.Request(new_page,
+                                         callback=self.parse_page,
+                                         meta={'index':1,
+                                               'group':group_flag})
 
     def parse_reply(self,response):
         '''
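
This branch is the heart of the group support: group comment pages expose no forward "see_next" link, so when it is missing (or the group flag is already set) the spider walks the backward "see_prev" links instead and pins group=1 on every follow-up request. Condensed into a hypothetical helper, same logic:

    import scrapy

    def follow_comment_pages(self, response, group_flag):
        # hypothetical helper mirroring the branch added above
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            # group pages paginate backwards: follow "see_prev", remember the group
            pages, group = response.xpath(prev_xpath), 1
        else:
            pages, group = response.xpath(next_xpath), group_flag
        for page in pages:
            url = response.urljoin(page.xpath('.//@href').extract_first())
            yield scrapy.Request(url, callback=self.parse_page,
                                 meta={'index': 1, 'group': group})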
@@ -101,13 +121,16 @@ class CommentsSpider(FacebookSpider):
                                  meta={'reply_to':response.meta['reply_to'],
                                        'flag':'back',
                                        'url':response.meta['url'],
-                                       'index':response.meta['index']})
+                                       'index':response.meta['index'],
+                                       'group':response.meta['group']})
             else:
                 next_reply = response.meta['url']
                 self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
                 yield scrapy.Request(next_reply,
                                      callback=self.parse_page,
-                                     meta={'index':response.meta['index']+1})
+                                     meta={'index':response.meta['index']+1,
+                                           'group':response.meta['group']})
 
         elif response.meta['flag'] == 'back':
             #parse all comments
@@ -132,10 +155,39 @@ class CommentsSpider(FacebookSpider):
                                  meta={'reply_to':response.meta['reply_to'],
                                        'flag':'back',
                                        'url':response.meta['url'],
-                                       'index':response.meta['index']})
+                                       'index':response.meta['index'],
+                                       'group':response.meta['group']})
             else:
                 next_reply = response.meta['url']
                 self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                 yield scrapy.Request(next_reply,
                                      callback=self.parse_page,
-                                     meta={'index':response.meta['index']+1})
+                                     meta={'index':response.meta['index']+1,
+                                           'group':response.meta['group']})
+# =============================================================================
+# CRAWL REACTIONS
+# =============================================================================
+# def parse_reactions(self,response):
+#     new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
+#     new.context['lang'] = self.lang
+#     new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
+#     new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
+#     new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
+#     new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
+#     new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
+#     new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
+#     yield new.load_item()
+#
+# #substitute the 'yield new.load_item()' above with the following:
+# #response --> reply/root
+# reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
+# reactions = response.urljoin(reactions[0].extract())
+# if reactions:
+#     yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
+# else:
+#     yield new.load_item()
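
The draft above also shows a standard Scrapy hand-off: instead of yielding the item straight away, the partially-filled ItemLoader rides along in meta so a second callback can enrich it before load_item(). One caveat: the draft joins reactions[0] before testing if reactions:, which would raise IndexError on comments without a reactions link (and after urljoin the truthiness test runs on a string anyway), so the check must come first. A distilled sketch, with a hypothetical caller name and imports assuming the project's own CommentsItem:

    import scrapy
    from scrapy.loader import ItemLoader
    from fbcrawl.items import CommentsItem

    def parse_comment(self, response):   # hypothetical caller
        new = ItemLoader(item=CommentsItem(), response=response)
        reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
        if reactions:
            # hand the half-built loader to a second callback via meta
            url = response.urljoin(reactions[0].extract())
            yield scrapy.Request(url, callback=self.parse_reactions, meta={'item': new})
        else:
            yield new.load_item()

    def parse_reactions(self, response):
        # parent= chains this loader onto the one built by the first callback
        new = ItemLoader(item=CommentsItem(), response=response, parent=response.meta['item'])
        new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()")
        yield new.load_item()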

fbcrawl.py

@@ -66,7 +66,7 @@ class FacebookSpider(scrapy.Spider):
         else:
             self.logger.info('Lang "{}" not currently supported'.format(self.lang))
             self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
-            self.logger.info('Change your interface lang from facebook and try again')
+            self.logger.info('Change your interface lang from facebook settings and try again')
             raise AttributeError('Language provided not currently supported')
 
         #current year, this variable is needed for parse_page recursion
@@ -85,7 +85,7 @@ class FacebookSpider(scrapy.Spider):
                 formxpath='//form[contains(@action, "login")]',
                 formdata={'email': self.email,'pass': self.password},
                 callback=self.parse_home
-        )
+                )
 
     def parse_home(self, response):
         '''
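
For context, the touched line closes the login step, which relies on Scrapy's stock form-submission helper. The general shape, with placeholder credentials:

    from scrapy import FormRequest

    # general shape of the login step (credentials are placeholders)
    def parse(self, response):
        return FormRequest.from_response(
                response,
                formxpath='//form[contains(@action, "login")]',
                formdata={'email': 'user@example.com', 'pass': 'secret'},
                callback=self.parse_home
                )

FormRequest.from_response pre-populates the matched form's remaining (hidden) fields from the response, which is why only email and pass need to be supplied.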
@@ -140,7 +140,7 @@ class FacebookSpider(scrapy.Spider):
         for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
             new = ItemLoader(item=FbcrawlItem(),selector=post)
             self.logger.info('Parsing post n = {}'.format(abs(self.count)))
-            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
+            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
             new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
 
             #page_url #new.add_value('url',response.url)
@@ -150,9 +150,10 @@ class FacebookSpider(scrapy.Spider):
             self.count -= 1
             yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
 
-        #load following page
-        #tries to click on "more", otherwise it looks for the appropriate
-        #year for 1-click only and proceeds to click on others
+        #load following page, try to click on "more"
+        #after a few pages have been scraped, the "more" link disappears
+        #if it is not present, look for the highest year not yet parsed, click it once
+        #and keep looking for "more"
         new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
         if not new_page:
             if response.meta['flag'] == self.k and self.k >= self.year:
@@ -165,7 +166,7 @@ class FacebookSpider(scrapy.Spider):
                 self.logger.info('Everything OK, new flag: {}'.format(self.k))
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
             else:
-                while not new_page: #sometimes the years are skipped
+                while not new_page: #sometimes years are skipped; this handles small year gaps
                     self.logger.info('XPATH not found for year {}'.format(self.k-1))
                     self.k -= 1
                     self.logger.info('Trying with previous year, flag={}'.format(self.k))
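
The rewritten comments describe parse_page's two-stage pagination: follow "more" while it exists, and once it disappears fall back to the year sections, decrementing self.k past years that published nothing. A minimal sketch of that fallback under the same assumptions (self.k is the year currently tried, self.year the lower bound; the selector is illustrative, not the spider's exact one):

    def next_year_link(self, response):   # hypothetical helper
        new_page = []
        # walk backwards over the year sections until one yields a link
        while not new_page and self.k >= self.year:
            xpath = ("//div/a[contains(@href,'time') and contains(text(),'"
                     + str(self.k) + "')]/@href")   # illustrative selector
            new_page = response.xpath(xpath).extract()
            if not new_page:
                self.logger.info('XPATH not found for year {}'.format(self.k))
                self.k -= 1   # year gap: try the previous one
        return response.urljoin(new_page[0]) if new_page else None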