added experimental support for languages en, es, fr, it, pt

2019-01-30 20:34:25 +01:00 · 2019-01-30 20:34:25 +01:00 · fb32a4213e
commit fb32a4213e
parent 9de51e0ce8
5 changed files with 134 additions and 117 deletions
--- a/fbcrawl/pycache/items.cpython-37.pyc
+++ b/fbcrawl/pycache/items.cpython-37.pyc
--- a/fbcrawl/pycache/settings.cpython-37.pyc
+++ b/fbcrawl/pycache/settings.cpython-37.pyc
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@ -9,102 +9,106 @@ import scrapy
 from scrapy.loader.processors import TakeFirst, Join, MapCompose
 from datetime import datetime, timedelta

-def parse_date(init_date):
-    mesi = {
-    'gennaio':1,
-    'febbraio':2,
-    'marzo':3,
-    'aprile':4,
-    'maggio':5,
-    'giugno':6,
-    'luglio':7,
-    'agosto':8,
-    'settembre':9,
-    'ottobre':10,
-    'novembre':11,
-    'dicembre':12
-    }
-
-    mesi_abbr = {
-    'gen':1,
-    'feb':2,
-    'mar':3,
-    'apr':4,
-    'mag':5,
-    'giu':6,
-    'lug':7,
-    'ago':8,
-    'set':9,
-    'ott':10,
-    'nov':11,
-    'dic':12
-    }    
+def parse_date(init_date,loader_context):
+    lang = loader_context['lang']
+    if lang == 'it':
+        mesi = {
+        'gennaio':1,
+        'febbraio':2,
+        'marzo':3,
+        'aprile':4,
+        'maggio':5,
+        'giugno':6,
+        'luglio':7,
+        'agosto':8,
+        'settembre':9,
+        'ottobre':10,
+        'novembre':11,
+        'dicembre':12
+        }
    
-    giorni = {
-    'domenica':0,
-    'lunedì':1,
-    'martedì':2,
-    'mercoledì':3,
-    'giovedì':4,
-    'venerdì':5,
-    'sabato':6
-    }    
-    date = init_date
-    date = date[0].split()
-    year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
-    
-    #sanity check
-    if len(date) == 0:
-        return 'Error: no data'
-    
-    #yesterday 
-    elif len(date) == 1:
-        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+        mesi_abbr = {
+        'gen':1,
+        'feb':2,
+        'mar':3,
+        'apr':4,
+        'mag':5,
+        'giu':6,
+        'lug':7,
+        'ago':8,
+        'set':9,
+        'ott':10,
+        'nov':11,
+        'dic':12
+        }    
        
-    #4h
-    elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
-        pass
-    
-    #22h (yesterday)
-    elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
-        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
-    
-    #yesterday
-    elif date[0].isdigit() == False and date[1].isdigit() == False:
-        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+        giorni = {
+        'domenica':0,
+        'lunedì':1,
+        'martedì':2,
+        'mercoledì':3,
+        'giovedì':4,
+        'venerdì':5,
+        'sabato':6
+        }    
+        date = init_date
+        date = date[0].split()
+        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
        
-    #day with 3 month length of this year
-    elif len(date[1]) == 3 and not(date[2].isdigit()):
-        day = int(date[0])
-        month = mesi_abbr[date[1]]
-
-    elif len(date[1]) > 3 and not(date[2].isdigit()):
-        day = int(date[0])
-        month = mesi[date[1]]
-
-    elif len(date[1]) == 3 and date[2].isdigit():
-        day = int(date[0])
-        month = mesi_abbr[date[1]]
-        year = int(date[2])
-
-    #usual dates, with regular length month 
-    elif date[0].isdigit() and date[2].isdigit():
-        day = int(date[0])
-        month = mesi[date[1]]
-        year = int(date[2])
-
-    #dates with weekdays (this function assumes that the month is the same)
-    elif date[0].isdigit() == False and date[1].isdigit() == False:
-        today = datetime.now().weekday() #today as a weekday
-        weekday = giorni[date[0]]   #day to be match as number weekday
-        #weekday is chronologically always lower than day
-        if weekday < today:
-            day -= today - weekday 
-        elif weekday > today:
-            weekday += 7 
-            day -= today - weekday
+        #sanity check
+        if len(date) == 0:
+            return 'Error: no data'
+        
+        #yesterday 
+        elif len(date) == 1:
+            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+            
+        #4h
+        elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+            pass
+        
+        #22h (yesterday)
+        elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
+            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+        
+        #yesterday
+        elif date[0].isdigit() == False and date[1].isdigit() == False:
+            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+            
+        #day and 3-letter abbreviated month, current year
+        elif len(date[1]) == 3 and not(date[2].isdigit()):
+            day = int(date[0])
+            month = mesi_abbr[date[1]]
+    
+        elif len(date[1]) > 3 and not(date[2].isdigit()):
+            day = int(date[0])
+            month = mesi[date[1]]
+    
+        elif len(date[1]) == 3 and date[2].isdigit():
+            day = int(date[0])
+            month = mesi_abbr[date[1]]
+            year = int(date[2])
+    
+        #full dates: day, full-length month name and year
+        elif date[0].isdigit() and date[2].isdigit():
+            day = int(date[0])
+            month = mesi[date[1]]
+            year = int(date[2])
+    
+        #dates with weekdays (this function assumes that the month is the same)
+        elif date[0].isdigit() == False and date[1].isdigit() == False:
+            today = datetime.now().weekday() #today as a weekday
+            weekday = giorni[date[0]]   #day to be match as number weekday
+            #weekday is chronologically always lower than day
+            if weekday < today:
+                day -= today - weekday 
+            elif weekday > today:
+                weekday += 7 
+                day -= today - weekday
+        else:
+        #date parsing failed: unknown datetime format; check the xpath selector or change the language of the interface
+            return init_date
    else:
-    #date item parser fail. datetime format unknown, check xpath selector or change the language of the interface'
        return init_date
    date = datetime(year,month,day)
    return date.date()
--- a/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@ -11,10 +11,10 @@ class FacebookSpider(scrapy.Spider):
    """    
    name = "fb"

-    def __init__(self, email='', password='', page='', year=2018, **kwargs):
+    def __init__(self, email='', password='', page='', year=2018, lang='', **kwargs):
        super(FacebookSpider, self).__init__(**kwargs)
        
-        self.year = int(year)        #arguments are passed as strings
+        self.year = int(year)    #arguments are passed as strings
    
        if not email or not password:
            raise ValueError("You need to provide valid email and password!")
@ -27,8 +27,19 @@ class FacebookSpider(scrapy.Spider):
        else:
            self.page = page
            
-        self.start_urls = ['https://mbasic.facebook.com']    
+        if not(lang):
+            self.logger.info('Language attribute not provided, assuming "en"')
+            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')                             
+            self.lang = 'en'
+        elif lang == 'en'  or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
+            self.lang = lang
+        else:
+            self.logger.info('Lang:{} not currently supported'.format(lang))                             
+            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')                             
+            self.logger.info('Change your interface lang from facebook and try again')     
+            return                        

+        self.start_urls = ['https://mbasic.facebook.com']    

    def parse(self, response):
        return FormRequest.from_response(
@ -59,29 +70,24 @@ class FacebookSpider(scrapy.Spider):
            return FormRequest.from_response(
                response,
                formdata={'approvals_code': self.code},
-                callback=self.parse_home,
-            )
-        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
+                callback=self.parse_home)
+        elif response.xpath("//div/a[contains(@href,'save-device')]"):
+#        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
            # Handle 'Save Browser' checkpoint.
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home,
-                dont_filter=True,
-            )
+                callback=self.parse_home)
        elif response.css('button#checkpointSubmitButton'):
            # Handle 'Someone tried to log into your account' warning.
            return FormRequest.from_response(
-                response, callback=self.parse_home, dont_filter=True,)
+                response, callback=self.parse_home)
        # Else go to the page requested.
-        if self.page.find('.facebook.com/') != -1:
-            self.page = self.page[28:]
+        if self.page.find('https://www.facebook.com/') != -1:
+            self.page = self.page[25:]
        href = response.urljoin(self.page)
        self.logger.info('Parse function called on %s', href)
-        return scrapy.Request(
-            url=href,
-            callback=self.parse_page,
-        )
+        return scrapy.Request(url=href,callback=self.parse_page)

    def parse_page(self, response):
        #select all posts
@ -97,29 +103,35 @@ class FacebookSpider(scrapy.Spider):
            yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})       

        #load following page
-#        next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
-        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ece')) and not(contains(text(),number()))]/@href").extract()      
-        if len(next_page) == 0:
-            if response.meta['flag'] == 3 and self.year <= 2015:
-                self.logger.info('2015 reached, flag = {}'.format(response.meta['flag']))
+        #next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
+        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()      
+        if len(next_page) == 0: 
+            if response.meta['flag'] == 4 and self.year <= 2015:
+                self.logger.info('2014 reached, flag = 5')
+                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
+                self.logger.info('next_page = {}'.format(next_page[0]))
+                new_page = response.urljoin(next_page[0])
+                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':5}) 
+            elif response.meta['flag'] == 3 and self.year <= 2015:
+                self.logger.info('2015 reached, flag = 4')
                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
                self.logger.info('next_page = {}'.format(next_page[0]))
                new_page = response.urljoin(next_page[0])
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':4}) 
            elif response.meta['flag'] == 2 and self.year <= 2016:
-                self.logger.info('2016 reached, flag = {}'.format(response.meta['flag']))                
+                self.logger.info('2016 reached, flag = 3')                
                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2016')]/@href").extract()
                self.logger.info('next_page = {}'.format(next_page[0]))
                new_page = response.urljoin(next_page[0])
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':3}) 
            elif response.meta['flag'] == 1 and self.year <= 2017:            
-                self.logger.info('2017 reached, flag = {}'.format(response.meta['flag']))                
+                self.logger.info('2017 reached, flag = 2')          
                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2017')]/@href").extract()
                self.logger.info('next_page = {}'.format(next_page[0]))
                new_page = response.urljoin(next_page[0])
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':2})      
            elif response.meta['flag'] == 0 and self.year <= 2018:                      
-                self.logger.info('2018 reached, flag = {}'.format(response.meta['flag']))
+                self.logger.info('2018 reached, flag = 1')
                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2018')]/@href").extract()
                self.logger.info('next_page = {}'.format(next_page[0]))
                new_page = response.urljoin(next_page[0])
@ -132,7 +144,7 @@ class FacebookSpider(scrapy.Spider):
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':0})
                
    def parse_post(self,response):
-        new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])            
+        new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
@ -144,6 +156,7 @@ class FacebookSpider(scrapy.Spider):
        
    def parse_reactions(self,response):
        new = ItemLoader(item=FbcrawlItem(),response=response, parent=response.meta['item'])
+        new.context['lang'] = self.lang           
        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")