refactoring comments spider

rugantio 2019-02-18 02:12:52 +01:00
parent bdeae9f4b5
commit b3d12c4e6b
6 changed files with 188 additions and 104 deletions

View File fbcrawl/items.py

@ -127,7 +127,40 @@ def parse_date(init_date,loader_context):
day = int(date[0])
month = months[date[1]]
year = int(date[2])
        return datetime(year,month,day).date()
    #'9 ore fa' (9 hours ago)
elif date[0].isdigit() and date[1] == 'ore':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
        #'9 ore fa' (9 hours ago, crossing midnight into yesterday)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
    #'ieri alle 20:45' (yesterday at 20:45)
elif date[0].lower() == 'ieri' and date[1] == 'alle':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
    #'oggi alle 11:11' (today at 11:11)
elif date[0].lower() == 'oggi' and date[1] == 'alle':
return datetime(year,month,day).date()
    #'lunedì alle 12:34' (Monday at 12:34)
elif date[0].isalpha() and date[1] == 'alle':
        today = datetime.now().weekday() #today as a weekday number
        weekday = giorni[date[0].lower()] #named day as a weekday number
        #if the named weekday already occurred this week, delta is non-negative
        delta = today - weekday
if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
        #weekday(): Monday = 0 ... Sunday = 6 (e.g. Tue = 1, Fri = 5)
else:
delta += 8
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
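The weekday branch above has to turn a bare Italian weekday name into the most recent calendar date falling on that weekday. A minimal standalone sketch of that computation, assuming the same giorni name-to-number mapping used above (hypothetical helper; it wraps with the usual +7 rather than the +8 used in this hunk):

from datetime import datetime, timedelta

giorni = {'lunedì': 0, 'martedì': 1, 'mercoledì': 2, 'giovedì': 3,
          'venerdì': 4, 'sabato': 5, 'domenica': 6}

def last_weekday(name):
    today = datetime.now().weekday()      #weekday(): Monday = 0 ... Sunday = 6
    delta = today - giorni[name.lower()]  #days since that weekday
    if delta < 0:                         #the named day belongs to last week
        delta += 7
    return datetime.now().date() - timedelta(delta)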
@ -427,9 +460,7 @@ def url_strip(url):
class FbcrawlItem(scrapy.Item):
    source = scrapy.Field()
date = scrapy.Field( # when was the post published
input_processor=TakeFirst(),
output_processor=parse_date
@ -456,3 +487,29 @@ class FbcrawlItem(scrapy.Item):
output_processor=url_strip
)
shared_from = scrapy.Field()
class CommentsItem(scrapy.Item):
    source = scrapy.Field()
    reply_to = scrapy.Field()
    date = scrapy.Field( # when the comment was published
        output_processor=parse_date
    )
    text = scrapy.Field(
        output_processor=Join(separator=u'')
    ) # full text of the comment
reactions = scrapy.Field(
output_processor=reactions_strip
) # num of reactions
likes = scrapy.Field(
output_processor=reactions_strip
)
ahah = scrapy.Field()
love = scrapy.Field()
wow = scrapy.Field()
sigh = scrapy.Field()
grrr = scrapy.Field()
share = scrapy.Field() # num of shares
url = scrapy.Field(
output_processor=url_strip
)
shared_from = scrapy.Field()
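For reference, this is how the declared ItemLoader processors behave at load time. A minimal sketch with a toy item (hypothetical; uses the scrapy.loader.processors module of the Scrapy 1.x API current when this commit was made):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst

class ToyItem(scrapy.Item):
    text = scrapy.Field(output_processor=Join(separator=u''))
    source = scrapy.Field(output_processor=TakeFirst())

loader = ItemLoader(item=ToyItem())
loader.add_value('text', ['a ', 'comment'])    #collected values accumulate
loader.add_value('source', ['user1', 'user2'])
item = loader.load_item()                      #output processors run here
print(item)   #{'source': 'user1', 'text': 'a comment'}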

View File fbcrawl/spiders/comments.py

@ -2,106 +2,134 @@ import scrapy
from scrapy.loader import ItemLoader
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import CommentsItem
class CommentsSpider(FacebookSpider):
    """
    Parse FB comments, given a post (needs credentials)
    """
name = "comments"
custom_settings = {
'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
'reactions','likes','ahah','love','wow', \
'sigh','grrr','url']
}
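    #FEED_EXPORT_FIELDS also fixes the column order of the exported feed (e.g. CSV)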
def __init__(self, *args, **kwargs):
super().__init__(*args,**kwargs)
def parse_page(self, response):
        '''
        parse_page does multiple things:
        1) loads replied-to comments pages one by one (depth-first)
        2) retrieves common comments that have no replies
        '''
#loads replied-to comments pages
path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
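        #the XPath keeps divs whose @id starts with a digit (mbasic comment containers)
        #and which nest a "comment_replies" div, i.e. comments that have replies;
        #the appended [index] selects a single thread, enabling the depth-first visit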
for reply in response.xpath(path):
source = reply.xpath('.//h3/a/text()').extract()
answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
            ans = response.urljoin(answer[::-1][0]) #follow the last matching link
self.logger.info('Nested comment at page {}'.format(ans))
yield scrapy.Request(ans,
callback=self.parse_reply,
meta={'reply_to':source,
'url':response.url,
'index':response.meta['index'],
'flag':'init'})
#loads regular comments
if not response.xpath(path):
path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
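            #same digit-leading-@id filter, restricted to comments with no reply thread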
for reply in response.xpath(path2):
new = ItemLoader(item=CommentsItem(),selector=reply)
new.context['lang'] = self.lang
new.add_xpath('source','.//h3/a/text()')
new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()')
yield new.load_item()
        #previous comments page
if not response.xpath(path) and not response.xpath(path2):
for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
new_page = next_page.xpath('.//@href').extract()
new_page = response.urljoin(new_page[0])
self.logger.info('New page to be crawled {}'.format(new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
meta={'index':1})
#
def parse_reply(self,response):
        '''
        parse replies to a comment; the root comment itself is parsed only on
        the first visit to the thread (meta['flag'] == 'init')
        '''
if response.meta['flag'] == 'init':
#parse root comment
for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
new = ItemLoader(item=CommentsItem(),selector=root)
new.context['lang'] = self.lang
new.add_xpath('source', './/h3/a/text()')
new.add_value('reply_to','ROOT')
new.add_xpath('text','.//div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_value('url',response.url)
yield new.load_item()
#parse all replies in the page
for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
new = ItemLoader(item=CommentsItem(),selector=reply)
new.context['lang'] = self.lang
new.add_xpath('source', './/h3/a/text()')
new.add_value('reply_to',response.meta['reply_to'])
new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_value('url',response.url)
yield new.load_item()
back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
if back:
self.logger.info('Back found, trying to go back')
back_page = response.urljoin(back[0])
yield scrapy.Request(back_page,
callback=self.parse_reply,
priority=100,
meta={'reply_to':response.meta['reply_to'],
'flag':'back',
'url':response.meta['url'],
'index':response.meta['index']})
else:
next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading back to {}'.format(response.meta['url']))
yield scrapy.Request(next_reply, dont_filter=True,
callback=self.parse_page,
meta={'index':response.meta['index']+1})
elif response.meta['flag'] == 'back':
#parse all comments
for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
new = ItemLoader(item=CommentsItem(),selector=reply)
new.context['lang'] = self.lang
new.add_xpath('source', './/h3/a/text()')
new.add_value('reply_to',response.meta['reply_to'])
new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_value('url',response.url)
yield new.load_item()
#keep going backwards
            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            if back:
                self.logger.info('Back found, trying to go back')
back_page = response.urljoin(back[0])
yield scrapy.Request(back_page,
callback=self.parse_reply,
priority=100,
meta={'reply_to':response.meta['reply_to'],
'flag':'back',
'url':response.meta['url'],
'index':response.meta['index']})
else:
next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading back to {}'.format(response.meta['url']))
yield scrapy.Request(next_reply, dont_filter=True,
callback=self.parse_page,
meta={'index':response.meta['index']+1})
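parse_page and parse_reply cooperate through response.meta: 'index' is a 1-based cursor over replied-to threads, bumped only once a thread is exhausted, while 'flag' distinguishes the first visit to a thread ('init') from backward pagination ('back'). A minimal sketch of this carry-state-in-meta pattern (hypothetical spider, not part of this commit):

import scrapy

class MetaDemoSpider(scrapy.Spider):
    name = 'meta_demo'
    start_urls = ['https://example.com/']

    def parse(self, response):
        #seed the cursor, as parse_page is seeded with meta={'index':1}
        yield scrapy.Request(response.url, callback=self.parse_item,
                             dont_filter=True, meta={'index': 1})

    def parse_item(self, response):
        i = response.meta['index']    #state carried across requests
        self.logger.info('visiting item %d', i)
        if i < 3:                     #advance the cursor and recurse
            yield scrapy.Request(response.url, callback=self.parse_item,
                                 dont_filter=True, meta={'index': i + 1})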

View File fbcrawl/spiders/fbcrawl.py

@ -10,7 +10,6 @@ class FacebookSpider(scrapy.Spider):
Parse FB pages (needs credentials)
"""
name = "fb"
custom_settings = {
'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
'reactions','likes','ahah','love','wow', \
@ -21,7 +20,7 @@ class FacebookSpider(scrapy.Spider):
#turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
logger = logging.getLogger('scrapy.middleware')
logger.setLevel(logging.WARNING)
        super().__init__(*args,**kwargs)
#email & pass need to be passed as attributes!
if 'email' not in kwargs or 'password' not in kwargs:
@ -121,7 +120,7 @@ class FacebookSpider(scrapy.Spider):
self.lang = 'it'
elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
self.logger.info('Language recognized: lang="pt"')
        self.lang = 'pt'
else:
raise AttributeError('Language not recognized\n'
'Change your interface lang from facebook '
@ -130,7 +129,7 @@ class FacebookSpider(scrapy.Spider):
#navigate to provided page
href = response.urljoin(self.page)
self.logger.info('Scraping facebook page {}'.format(href))
        return scrapy.Request(url=href,callback=self.parse_page,meta={'index':1})
def parse_page(self, response):
'''