refactoring comments spider

rugantio 2019-02-18 02:12:52 +01:00
parent bdeae9f4b5
commit b3d12c4e6b
6 changed files with 188 additions and 104 deletions

fbcrawl/items.py

@@ -128,6 +128,39 @@ def parse_date(init_date,loader_context):
         month = months[date[1]]
         year = int(date[2])
         return datetime(year,month,day).date()
+    #9 ore fa
+    elif date[0].isdigit() and date[1] == 'ore':
+        if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+            return datetime(year,month,day).date()
+        #9 ore fa (ieri)
+        else:
+            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+            return datetime(year,month,day).date()
+    #ieri alle 20:45
+    elif date[0].lower() == 'ieri' and date[1] == 'alle':
+        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+        month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+        return datetime(year,month,day).date()
+    #oggi alle 11:11
+    elif date[0].lower() == 'oggi' and date[1] == 'alle':
+        return datetime(year,month,day).date()
+    #lunedì alle 12:34
+    elif date[0].isalpha() and date[1] == 'alle':
+        today = datetime.now().weekday()    #today as a weekday number
+        weekday = giorni[date[0].lower()]   #named day as a weekday number
+        #the named weekday already occurred this week (delta >= 0)
+        delta = today - weekday
+        if delta >= 0:
+            day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+            return datetime(year,month,day).date()
+        #lunedì = 0, sabato = 6; a negative delta means last week
+        else:
+            delta += 8
+            day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+            return datetime(year,month,day).date()
     #parsing failed
     else:
         return date
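The new relative-date branches patch negative weekday deltas up by hand in the else branch. The same bookkeeping can be written with modular arithmetic; a minimal standalone sketch, not part of the commit, assuming a giorni mapping like the one the code already uses and that mbasic never labels today's posts with a weekday name:

from datetime import datetime, timedelta

#Italian weekday names mapped to Python's weekday() numbering (Monday = 0)
giorni = {'lunedì':0,'martedì':1,'mercoledì':2,'giovedì':3,
          'venerdì':4,'sabato':5,'domenica':6}

def last_weekday(name):
    '''Date of the most recent past occurrence of the named weekday.'''
    today = datetime.now().date()
    delta = (today.weekday() - giorni[name.lower()]) % 7
    return today - timedelta(delta or 7)   #same name as today -> a week ago

The modulo folds both the delta >= 0 and the negative case into one expression, and also sidesteps splitting the ISO date string to recover day and month.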
@@ -427,9 +460,7 @@ def url_strip(url):
 class FbcrawlItem(scrapy.Item):
-    source = scrapy.Field(
-        output_processor=TakeFirst()
-    )
+    source = scrapy.Field()
     date = scrapy.Field( # when was the post published
         input_processor=TakeFirst(),
         output_processor=parse_date
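Dropping TakeFirst() from source means the field now keeps whatever list the selector extracts rather than only the first match. As a quick reminder of what these loader processors do (a sketch, using the processors module this era of Scrapy ships):

from scrapy.loader.processors import TakeFirst, Join

TakeFirst()(['rugantio', 'anon'])   #-> 'rugantio' (first non-null value)
Join(separator=u'')(['ci', 'ao'])   #-> 'ciao' (used below for comment text)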
@@ -456,3 +487,29 @@ class FbcrawlItem(scrapy.Item):
         output_processor=url_strip
     )
     shared_from = scrapy.Field()
+
+class CommentsItem(scrapy.Item):
+    source = scrapy.Field()
+    reply_to = scrapy.Field()
+    date = scrapy.Field( # when was the comment published
+        output_processor=parse_date
+    )
+    text = scrapy.Field(
+        output_processor=Join(separator=u'')
+    ) # full text of the comment
+    reactions = scrapy.Field(
+        output_processor=reactions_strip
+    ) # num of reactions
+    likes = scrapy.Field(
+        output_processor=reactions_strip
+    )
+    ahah = scrapy.Field()
+    love = scrapy.Field()
+    wow = scrapy.Field()
+    sigh = scrapy.Field()
+    grrr = scrapy.Field()
+    share = scrapy.Field() # num of shares
+    url = scrapy.Field(
+        output_processor=url_strip
+    )
+    shared_from = scrapy.Field()
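CommentsItem is filled through an ItemLoader in the refactored spider below: each add_xpath call collects raw strings, and the field's output_processor condenses them when load_item() runs. A hypothetical, self-contained usage sketch (the HTML snippet stands in for a real comment node):

from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from fbcrawl.items import CommentsItem

#stand-in for a comment node the spider would select from the page
comment_node = Selector(text='<div><h3><a href="#">rugantio</a></h3><div>ciao!</div></div>')

loader = ItemLoader(item=CommentsItem(), selector=comment_node)
loader.context['lang'] = 'it'   #the spiders pass self.lang the same way
loader.add_xpath('source', './/h3/a/text()')
loader.add_xpath('text', './/div[h3]/div[1]//text()')  #Join('') glues the text nodes
loader.add_xpath('date', './/abbr/text()')   #parse_date condenses e.g. 'ieri alle 20:45'
item = loader.load_item()   #-> {'source': ['rugantio'], 'text': 'ciao!'} roughly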

fbcrawl/spiders/comments.py

@@ -2,106 +2,134 @@ import scrapy
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
-from fbcrawl.items import FbcrawlItem
+from fbcrawl.spiders.fbcrawl import FacebookSpider
+from fbcrawl.items import CommentsItem
 
-class FacebookSpider(scrapy.Spider):
+class CommentsSpider(FacebookSpider):
     """
-    Parse FB comments, given a page (needs credentials)
+    Parse FB comments, given a post (needs credentials)
     """
     name = "comments"
+    custom_settings = {
+        'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
+                               'reactions','likes','ahah','love','wow', \
+                               'sigh','grrr','url']
+    }
 
-    def __init__(self, email='', password='', page='', **kwargs):
-        super(FacebookSpider, self).__init__(**kwargs)
-
-        if not email or not password:
-            raise ValueError("You need to provide valid email and password!")
-        else:
-            self.email = email
-            self.password = password
-
-        if not page:
-            raise ValueError("You need to provide a valid page name to crawl!")
-        else:
-            self.page = page
-
-        self.start_urls = ['https://mbasic.facebook.com']
-
-    def parse(self, response):
-        return FormRequest.from_response(
-            response,
-            formxpath='//form[contains(@action, "login")]',
-            formdata={'email': self.email,'pass': self.password},
-            callback=self.parse_home
-        )
-
-    def parse_home(self, response):
-        '''Parse user news feed page'''
-        if response.css('#approvals_code'):
-            # Handle 'Approvals Code' checkpoint (ask user to enter code).
-            if not self.code:
-                # Show facebook messages via logs
-                # and request user for approval code.
-                message = response.css('._50f4::text').extract()[0]
-                self.log(message)
-                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
-                self.log(message)
-                self.code = input('Enter the code: ')
-            self.code = str(self.code)
-            if not (self.code and self.code.isdigit()):
-                self.log('Bad approvals code detected.')
-                return
-            return FormRequest.from_response(
-                response,
-                formdata={'approvals_code': self.code},
-                callback=self.parse_home,
-            )
-        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
-            # Handle 'Save Browser' checkpoint.
-            return FormRequest.from_response(
-                response,
-                formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home,
-                dont_filter=True,
-            )
-        elif response.css('button#checkpointSubmitButton'):
-            # Handle 'Someone tried to log into your account' warning.
-            return FormRequest.from_response(
-                response, callback=self.parse_home, dont_filter=True,)
-        # Else go to the user profile.
-        href = response.urljoin(self.page)
-        self.logger.info('Parse function called on %s', href)
-        return scrapy.Request(
-            url=href,
-            callback=self.parse_page,
-        )
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args,**kwargs)
 
     def parse_page(self, response):
-        #answer from page
-        for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
-            # resp = ItemLoader(item=FbcrawlItem(),selector=risposta)
-            rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href')
-            risp = response.urljoin(rispostina[0].extract())
-            yield scrapy.Request(risp, callback=self.parse_rispostina)
-
-#        for i in range(len(rispostina)):
-#            risp = response.urljoin(rispostina[i].extract())
-#
-#        for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
-#            new = ItemLoader(item=FbcrawlItem(),selector=post)
-#            new.add_xpath('source', "./div/h3/a/text()")
-#            new.add_xpath('text',"./div[1]/div[1]/text()")
-#            yield new.load_item()
-#
-#        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
-#        if len(next_page) > 0:
-#            next_page = response.urljoin(next_page[0].extract())
-#            yield scrapy.Request(next_page, callback=self.parse_page)
-
-    def parse_rispostina(self,response):
-        for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all posts
-            new = ItemLoader(item=FbcrawlItem(),selector=daje)
-            new.add_xpath('source', ".//h3/a/text()")#| ./div/div/h3/a/text()")
-            new.add_xpath('text',".//span[not(contains(text(),' · ')) and not(contains(text(),'Visualizza'))]/text() | .//div/text()")
-            yield new.load_item()
+        '''
+        parse_page does multiple things:
+        1) loads replied-to comments pages one by one (for DFS)
+        2) gets common (not-replied-to) comments
+        '''
+        #loads replied-to comments pages
+        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(response.meta['index']) + ']'
+        for reply in response.xpath(path):
+            source = reply.xpath('.//h3/a/text()').extract()
+            answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
+            ans = response.urljoin(answer[::-1][0])
+            self.logger.info('Nested comment at page {}'.format(ans))
+            yield scrapy.Request(ans,
+                                 callback=self.parse_reply,
+                                 meta={'reply_to':source,
+                                       'url':response.url,
+                                       'index':response.meta['index'],
+                                       'flag':'init'})
+        #loads regular comments
+        if not response.xpath(path):
+            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
+            for reply in response.xpath(path2):
+                new = ItemLoader(item=CommentsItem(),selector=reply)
+                new.context['lang'] = self.lang
+                new.add_xpath('source','.//h3/a/text()')
+                new.add_xpath('text','.//div[h3]/div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                yield new.load_item()
+
+        #previous comments
+        if not response.xpath(path) and not response.xpath(path2):
+            for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
+                new_page = next_page.xpath('.//@href').extract()
+                new_page = response.urljoin(new_page[0])
+                self.logger.info('New page to be crawled {}'.format(new_page))
+                yield scrapy.Request(new_page,
+                                     callback=self.parse_page,
+                                     meta={'index':1})
+
+    def parse_reply(self,response):
+        '''
+        parse replies to a comment; the root comment is added when flag == 'init'
+        '''
+        if response.meta['flag'] == 'init':
+            #parse root comment
+            for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
+                new = ItemLoader(item=CommentsItem(),selector=root)
+                new.context['lang'] = self.lang
+                new.add_xpath('source', './/h3/a/text()')
+                new.add_value('reply_to','ROOT')
+                new.add_xpath('text','.//div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
+                yield new.load_item()
+            #parse all replies in the page
+            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
+                new = ItemLoader(item=CommentsItem(),selector=reply)
+                new.context['lang'] = self.lang
+                new.add_xpath('source', './/h3/a/text()')
+                new.add_value('reply_to',response.meta['reply_to'])
+                new.add_xpath('text','.//div[h3]/div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
+                yield new.load_item()
+
+            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
+            if back:
+                self.logger.info('Back found, trying to go back')
+                back_page = response.urljoin(back[0])
+                yield scrapy.Request(back_page,
+                                     callback=self.parse_reply,
+                                     priority=100,
+                                     meta={'reply_to':response.meta['reply_to'],
+                                           'flag':'back',
+                                           'url':response.meta['url'],
+                                           'index':response.meta['index']})
+            else:
+                next_reply = response.meta['url']
+                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
+                yield scrapy.Request(next_reply, dont_filter=True,
+                                     callback=self.parse_page,
+                                     meta={'index':response.meta['index']+1})
+
+        elif response.meta['flag'] == 'back':
+            #parse all comments
+            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
+                new = ItemLoader(item=CommentsItem(),selector=reply)
+                new.context['lang'] = self.lang
+                new.add_xpath('source', './/h3/a/text()')
+                new.add_value('reply_to',response.meta['reply_to'])
+                new.add_xpath('text','.//div[h3]/div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
+                yield new.load_item()
+
+            #keep going backwards
+            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
+            if back:
+                self.logger.info('Back found, trying to go back')
+                back_page = response.urljoin(back[0])
+                yield scrapy.Request(back_page,
+                                     callback=self.parse_reply,
+                                     priority=100,
+                                     meta={'reply_to':response.meta['reply_to'],
+                                           'flag':'back',
+                                           'url':response.meta['url'],
+                                           'index':response.meta['index']})
+            else:
+                next_reply = response.meta['url']
+                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
+                yield scrapy.Request(next_reply, dont_filter=True,
+                                     callback=self.parse_page,
+                                     meta={'index':response.meta['index']+1})
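The request meta dict is the entire traversal state for this depth-first walk. A descriptive sketch of the contract the two callbacks pass back and forth (key names from the commit; placeholder values are illustrative):

#parse_page  --flag='init'--> parse_reply   open the index-th replied-to thread
#parse_reply --flag='back'--> parse_reply   keep following 'comment_replies_more_1'
#parse_reply --index+1------> parse_page    thread exhausted, back to the post page
meta = {
    'reply_to': ['comment author'],   #copied onto every reply item
    'url': 'https://mbasic.facebook.com/<post>',   #page to return to afterwards
    'index': 1,       #1-based position of the replied-to thread being walked
    'flag': 'init',   #'init' on first visit, 'back' while paging backwards
}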

fbcrawl/spiders/fbcrawl.py

@@ -10,7 +10,6 @@ class FacebookSpider(scrapy.Spider):
     Parse FB pages (needs credentials)
     """
     name = "fb"
-    is_debug = True
     custom_settings = {
         'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
                                'reactions','likes','ahah','love','wow', \
@@ -21,7 +20,7 @@ class FacebookSpider(scrapy.Spider):
         #turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
         logger = logging.getLogger('scrapy.middleware')
         logger.setLevel(logging.WARNING)
-        super().__init__(**kwargs)
+        super().__init__(*args,**kwargs)
 
         #email & pass need to be passed as attributes!
         if 'email' not in kwargs or 'password' not in kwargs:
@@ -130,7 +129,7 @@ class FacebookSpider(scrapy.Spider):
         #navigate to provided page
         href = response.urljoin(self.page)
         self.logger.info('Scraping facebook page {}'.format(href))
-        return scrapy.Request(url=href,callback=self.parse_page)
+        return scrapy.Request(url=href,callback=self.parse_page,meta={'index':1})
 
     def parse_page(self, response):
         '''
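Since CommentsSpider now inherits the login flow and page navigation from FacebookSpider, it is launched like the fb spider, pointing page at a post. A typical invocation might look like this (the -a attribute names match the parent spider's kwargs; -o is standard Scrapy feed export):

scrapy crawl comments -a email="EMAIL" -a password="PASSWORD" -a page="PAGE_OR_POST_LINK" -o comments.csv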