improved support for languages en, es, fr, it, pt

2019-01-31 06:54:31 +01:00 · 2019-01-31 06:54:31 +01:00 · a9982865d9
commit a9982865d9
parent fb32a4213e
6 changed files with 423 additions and 132 deletions
--- a/fbcrawl/pycache/items.cpython-37.pyc
+++ b/fbcrawl/pycache/items.cpython-37.pyc
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@ -11,8 +11,11 @@ from datetime import datetime, timedelta

 def parse_date(init_date,loader_context):
    lang = loader_context['lang']
+# =============================================================================
+# Italian - status:final
+# =============================================================================
    if lang == 'it':
-        mesi = {
+        months = {
        'gennaio':1,
        'febbraio':2,
        'marzo':3,
@ -27,7 +30,7 @@ def parse_date(init_date,loader_context):
        'dicembre':12
        }
    
-        mesi_abbr = {
+        months_abbr = {
        'gen':1,
        'feb':2,
        'mar':3,
@ -43,101 +46,379 @@ def parse_date(init_date,loader_context):
        }    
        
        giorni = {
-        'domenica':0,
-        'lunedì':1,
-        'martedì':2,
-        'mercoledì':3,
-        'giovedì':4,
-        'venerdì':5,
-        'sabato':6
+        'lunedì':0,
+        'martedì':1,
+        'mercoledì':2,
+        'giovedì':3,
+        'venerdì':4,
+        'sabato':5,
+        'domenica':6
        }    
-        date = init_date
-        date = date[0].split()
+        date = init_date[0].split()
        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
+
+        l = len(date)
        
        #sanity check
-        if len(date) == 0:
+        if l == 0:
            return 'Error: no data'
        
-        #yesterday 
-        elif len(date) == 1:
-            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
-            
-        #4h
-        elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
-            pass
+        #adesso, ieri, 4h, 50min
+        elif l == 1:
+            if date[0].isalpha():   
+                if date[0].lower() == 'ieri':
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    #check that yesterday was not in another month
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                elif date[0].lower() == 'adesso':
+                        return datetime(year,month,day).date()    #return today
+                else:  #not recognized, (return date or init_date)
+                    return date 
+            else: 
+                #4h, 50min (exploit future parsing)
+                l = 2
+                new_date = [x for x in date[0] if x.isdigit()]
+                date[0] = ''.join(new_date)
+                new_date = [x for x in date[0] if not(x.isdigit())]
+                date[1] = ''.join(new_date) 
+# l = 2        
+        elif l == 2:
+            #22 min (today)
+            if date[1] == 'min':
+                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #22 min (yesterday)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()   
+            #4 h (today)
+            elif date[1] == 'h':
+                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #4 h (yesterday)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()   
+            #2 gen
+            elif len(date[1]) == 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months_abbr[date[1].lower()]  
+                return datetime(year,month,day).date()  
+            #2 gennaio
+            elif len(date[1]) > 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months[date[1]]
+                return datetime(year,month,day).date()  
+            #parsing failed
+            else:
+                return date
+# l = 3
+        elif l == 3:
+            #21 giu 2017        
+            if len(date[1]) == 3 and date[2].isdigit():
+                day = int(date[0])
+                month = months_abbr[date[1]]
+                year = int(date[2])
+                return datetime(year,month,day).date()   
+            #21 giugno 2017        
+            elif len(date[1]) > 3 and date[2].isdigit():
+                day = int(date[0])
+                month = months[date[1]]
+                year = int(date[2])
+                return datetime(year,month,day).date()                  
+            #parsing failed
+            else:
+                return date
+# l = 4
+        elif l == 4:
+            #Ieri alle ore 23:32
+            if date[0].lower() == 'ieri' and date[1] == 'alle':
+                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                return datetime(year,month,day).date()   
+            #domenica alle ore 19:29
+            elif date[0].isalpha() and date[1] == 'alle':
+                today = datetime.now().weekday() #today as a weekday
+                weekday = giorni[date[0].lower()]   #day to be match as number weekday
+                #the named weekday always refers to a day in the past, never a future one
+                delta = today - weekday   
+                if delta >= 0:
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
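+                #lunedì = 0 ... domenica = 6; a negative delta means the weekday fell in the previous week (NOTE(review): one week back is +7, confirm the +8 below)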
+                else:
+                    delta += 8
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()            
+            #parsing failed
+            else:
+                return date
+# l = 5
+        elif l == 5:
+           if date[2] == 'alle':
+               #29 feb alle ore 21:49
+               if len(date[1]) == 3:
+                   day = int(date[0])
+                   month = months_abbr[date[1].lower()]
+                   return datetime(year,month,day).date()   
+               #29 febbraio alle ore 21:49        
+               else:
+                   day = int(date[0])
+                   month = months[date[1].lower()]
+                   return datetime(year,month,day).date()   
+           #parsing failed
+           else:
+               return date
+# l = 6              
+        elif l == 6:
+           if date[3] == 'alle':
+               #29 feb 2016 alle ore 21:49
+               if len(date[1]) == 3:
+                   day = int(date[0])
+                   month = months_abbr[date[1].lower()]
+                   year = int(date[2])
+                   return datetime(year,month,day).date()   
+               #29 febbraio 2016 alle ore 21:49        
+               else:
+                   day = int(date[0])
+                   month = months[date[1].lower()]
+                   year = int(date[2])
+                   return datetime(year,month,day).date()   
+           #parsing failed    
+           else:
+               return date
+# =============================================================================
+# English - status:beta
+# =============================================================================
+    elif lang == 'en':
+        months = {
+        'january':1,
+        'february':2,
+        'march':3,
+        'april':4,
+        'may':5,
+        'june':6,
+        'july':7,
+        'august':8,
+        'september':9,
+        'october':10,
+        'november':11,
+        'december':12
+        }
+    
+        months_abbr = {
+        'jan':1,
+        'feb':2,
+        'mar':3,
+        'apr':4,
+        'may':5,
+        'jun':6,
+        'jul':7,
+        'aug':8,
+        'sep':9,
+        'oct':10,
+        'nov':11,
+        'dec':12
+        }    
        
-        #22h (yesterday)
-        elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
-            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+        date = init_date[0].split()
+        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
+
+        l = len(date)
        
-        #yesterday
-        elif date[0].isdigit() == False and date[1].isdigit() == False:
-            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
-            
-        #day with 3 month length of this year
-        elif len(date[1]) == 3 and not(date[2].isdigit()):
-            day = int(date[0])
-            month = mesi_abbr[date[1]]
-    
-        elif len(date[1]) > 3 and not(date[2].isdigit()):
-            day = int(date[0])
-            month = mesi[date[1]]
-    
-        elif len(date[1]) == 3 and date[2].isdigit():
-            day = int(date[0])
-            month = mesi_abbr[date[1]]
-            year = int(date[2])
-    
-        #usual dates, with regular length month 
-        elif date[0].isdigit() and date[2].isdigit():
-            day = int(date[0])
-            month = mesi[date[1]]
-            year = int(date[2])
-    
-        #dates with weekdays (this function assumes that the month is the same)
-        elif date[0].isdigit() == False and date[1].isdigit() == False:
-            today = datetime.now().weekday() #today as a weekday
-            weekday = giorni[date[0]]   #day to be match as number weekday
-            #weekday is chronologically always lower than day
-            if weekday < today:
-                day -= today - weekday 
-            elif weekday > today:
-                weekday += 7 
-                day -= today - weekday
+        #sanity check
+        if l == 0:
+            return 'Error: no data'
+        
+        #Yesterday, Now, 4hr, 50mins
+        elif l == 1:
+            if date[0].isalpha():   
+                if date[0].lower() == 'yesterday':
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    #check that yesterday was not in another month
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                elif date[0].lower() == 'now':
+                        return datetime(year,month,day).date()    #return today
+                else:  #not recognized, (return date or init_date)
+                    return date 
+            else: 
+                #4h, 50min (exploit future parsing)
+                l = 2
+                new_date = [x for x in date[0] if x.isdigit()]
+                date[0] = ''.join(new_date)
+                new_date = [x for x in date[0] if not(x.isdigit())]
+                date[1] = ''.join(new_date) 
+# l = 2        
+        elif l == 2:
+            #22 mins (today)
+            if date[1] == 'min' or date[1] == 'mins':
+                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #22 mins (yesterday)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()   
+            #4 hrs (today)
+            elif date[1] == 'hr' or date[1] == 'hrs':
+                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #4 hrs (yesterday)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()   
+            #2 jan
+            elif len(date[1]) == 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months_abbr[date[1].lower()]  
+                return datetime(year,month,day).date()  
+            #2 january
+            elif len(date[1]) > 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months[date[1]]
+                return datetime(year,month,day).date()  
+            #parsing failed
+            else:
+                return date
+# l = 3
+        elif l == 3:
+#            #21 Jun 2017
+#            if len(date[1] == 3) and date[2].isdigit():
+#                day = int(date[0])
+#                month = months_abbr[date[1].lower()]
+#                year = int(date[2])
+#                return datetime(year,month,day).date()   
+#            #21 June 2017        
+#            elif len(date[1] > 3) and date[2].isdigit():
+#                day = int(date[0])
+#                month = months[date[1].lower()]
+#                year = int(date[2])
+#                return datetime(year,month,day).date()                  
+#            #parsing failed
+#            else:
+                return date
+# l = 4
+        elif l == 4:
+            #Yesterday at 11:32 PM (NOTE(review): the literal compared below is spelled 'yesteday' - confirm)
+            if date[0].lower() == 'yesteday' and date[1] == 'at':
+                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                return datetime(year,month,day).date()   
+            #parsing failed
+            else:
+                return date
+# l = 5
+        elif l == 5:
+           if date[2] == 'at':
+               #Jan 29 at 10:00 PM 
+               if len(date[0]) == 3:
+                   day = int(date[1])
+                   month = months_abbr[date[0].lower()]
+                   return datetime(year,month,day).date()   
+               #January 29 at 10:00 PM
+               else:
+                   day = int(date[1])
+                   month = months[date[0].lower()]
+                   return datetime(year,month,day).date()   
+           #parsing failed
+           else:
+               return date
+# l = 6              
+        elif l == 6:
+           if date[3] == 'at':
+               date[1]
+               #Aug 25, 2016 at 7:00 PM 
+               if len(date[0]) == 3:
+                   day = int(date[1][:-1])
+                   month = months_abbr[date[0].lower()]
+                   year = int(date[2])
+                   return datetime(year,month,day).date()    
+               #August 25, 2016 at 7:00 PM      
+               else:
+                   day = int(date[1][:-1])
+                   month = months[date[0].lower()]
+                   year = int(date[2])
+                   return datetime(year,month,day).date()   
+           #parsing failed    
+           else:
+               return date
+# l > 6           
+        #parsing failed - l too big
        else:
-        #date item parser fail. datetime format unknown, check xpath selector or change the language of the interface'
-            return init_date
+            return date
+    #parsing failed - language not supported
    else:
        return init_date
-    date = datetime(year,month,day)
-    return date.date()
-
-def comments_strip(string):
-    return string[0].rstrip(' commenti')
-
-def reactions_strip(string):
-    friends = 1 + string[0].count(',')
-    e = 1 + string[0].count(' e ')
-    string = string[0].split()[::-1]
-    if len(string) == 1:
-        string = string[0]
-        while string.rfind('.') != -1:
-            string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
+    
+def comments_strip(string,loader_context):
+    lang = loader_context['lang']
+    if lang == 'it':
+        if string[0].rfind('Commenta') != -1:
+            return
+        else:
+            return string[0].rstrip(' commenti')
+        
+    elif lang == 'en':
+        new_string = string[0].rstrip(' Comments')
+        while new_string.rfind(',') != -1:
+            new_string = new_string[0:new_string.rfind(',')] + new_string[new_string.rfind(',')+1:]
+        return new_string
+    else:
        return string

-    string = string[0]
-    while string.rfind('.') != -1:
-        string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
-    
-    if not string.isdigit():
-        return e
+def reactions_strip(string,loader_context):
+    lang = loader_context['lang']
+    if lang == 'it':
+        newstring = string[0]
+        #19.298.873       
+        if len(newstring.split()) == 1:
+            while newstring.rfind('.') != -1:
+                newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
+            return newstring
+        #Pamela, Luigi e altri 4
+        else:   
+            return string
+#            friends = newstring.count(' e ') + newstring.count(',')
+#            newstring = newstring.split()[::-1][0]
+#            while newstring.rfind('.') != -1:
+#                newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
+#            return int(newstring) + friends
+    elif lang == 'en':
+        newstring = string[0]
+        #19,298,873       
+        if len(newstring.split()) == 1:
+            while newstring.rfind(',') != -1:
+                newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
+            return newstring
+#        #Mark and other 254,134 
+#        elif newstring.split()[::-1][1].isdigit(): 
+#            friends = newstring.count(' and ') + newstring.count(',')
+#            newstring = newstring.split()[::-1][1]
+#            while newstring.rfind(',') != -1:
+#                newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
+#            return int(newstring) + friends
+#        #Philip and 1K others
+        else:
+            return newstring
    else:
-        return int(string) + friends
+        return string
+
+def url_strip(url):
+    fullurl = url[0]
+    #catching '&id=' is enough to identify the post
+    i = fullurl.find('&id=')
+    if i != -1:
+        j = fullurl[:i+4] + fullurl[i+4:].split('&')[0]
+        return j
+    else:
+        return fullurl

 class FbcrawlItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
    source = scrapy.Field(
            output_processor=TakeFirst()
    )                     # page that published the post
@ -153,10 +434,7 @@ class FbcrawlItem(scrapy.Item):

    comments = scrapy.Field(
            output_processor=comments_strip
-    )                       
-    commentators = scrapy.Field(
-            output_processor=Join(separator=u'\n')
-    )                    
+    )                                       

    reactions = scrapy.Field(
            output_processor=reactions_strip
@ -171,4 +449,6 @@ class FbcrawlItem(scrapy.Item):
    sigh = scrapy.Field()                      
    grrr = scrapy.Field()                      
    share = scrapy.Field()                      # num of shares
-    url = scrapy.Field()
+    url = scrapy.Field(
+        output_processor=url_strip
+        )
--- a/fbcrawl/spiders/pycache/comments.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/comments.cpython-37.pyc
--- a/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
@ -89,7 +89,6 @@ class FacebookSpider(scrapy.Spider):
        for i in range(len(rispostina)):
            risp = response.urljoin(rispostina[i].extract())
            yield scrapy.Request(risp, callback=self.parse_rispostina)
-
        
        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
        if len(next_page) > 0:
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@ -3,6 +3,7 @@ import scrapy
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
 from fbcrawl.items import FbcrawlItem
+from scrapy.exceptions import CloseSpider


 class FacebookSpider(scrapy.Spider):
@ -11,37 +12,51 @@ class FacebookSpider(scrapy.Spider):
    """    
    name = "fb"

-    def __init__(self, email='', password='', page='', year=2018, lang='', **kwargs):
+    def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
        super(FacebookSpider, self).__init__(**kwargs)
        
-        self.year = int(year)    #arguments are passed as strings
-    
+        #email & pass need to be passed as attributes!
        if not email or not password:
            raise ValueError("You need to provide valid email and password!")
        else:
            self.email = email
            self.password = password
            
+        #page name parsing (added support for full urls)
        if not page:
            raise ValueError("You need to provide a valid page name to crawl!")
+        elif page.find('https://www.facebook.com/') != -1:
+            self.page = page[25:]
+        elif page.find('https://mbasic.facebook.com/') != -1:
+            self.page = page[28:]
+        elif page.find('https://m.facebook.com/') != -1:
+            self.page = page[23:]
        else:
            self.page = page
-            
-        if not(lang):
-            self.logger.info('Language attribute not provided, assuming "en"')
-            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')                             
-            self.lang = 'en'
+        
+        #parse year 
+        assert int(year) <= 2019 and int(year) >= 2015, 'Year must be a number 2015 <= year <= 2019'
+        self.year = int(year)    #arguments are passed as strings
+    
+        #parse lang, if not provided (but is supported) it will be guessed in parse_home
+        if lang=='_':
+            self.logger.info('Language attribute not provided, I will try to guess it')
+            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.lang=lang                            
        elif lang == 'en'  or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
            self.lang = lang
        else:
-            self.logger.info('Lang:{} not currently supported'.format(lang))                             
+            self.logger.info('Lang "{}" not currently supported'.format(lang))                             
            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')                             
-            self.logger.info('Change your interface lang from facebook and try again')     
-            return                        
+            self.logger.info('Change your interface lang from facebook and try again')
+            raise CloseSpider('Language provided not currently supported')

        self.start_urls = ['https://mbasic.facebook.com']    

    def parse(self, response):
+        '''
+        Handle login with provided credentials
+        '''
        return FormRequest.from_response(
                response,
                formxpath='//form[contains(@action, "login")]',
@ -51,59 +66,57 @@ class FacebookSpider(scrapy.Spider):
  
    def parse_home(self, response):
        '''
-        Parse user news feed page. This code is outdate and needs review.
+        This method has multiple purposes:
+        1) Handle failed logins due to facebook 'save-device' redirection
+        2) Set language interface, if not already provided
+        3) Navigate to given page 
        '''
-        if response.css('#approvals_code'):
-            # Handle 'Approvals Code' checkpoint (ask user to enter code).
-            if not self.code:
-                # Show facebook messages via logs
-                # and request user for approval code.
-                message = response.css('._50f4::text').extract()[0]
-                self.log(message)
-                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
-                self.log(message)
-                self.code = input('Enter the code: ')
-            self.code = str(self.code)
-            if not (self.code and self.code.isdigit()):
-                self.log('Bad approvals code detected.')
-                return
-            return FormRequest.from_response(
-                response,
-                formdata={'approvals_code': self.code},
-                callback=self.parse_home)
-        elif response.xpath("//div/a[contains(@href,'save-device')]"):
-#        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
-            # Handle 'Save Browser' checkpoint.
+        #handle 'save-device' redirection
+        if response.xpath("//div/a[contains(@href,'save-device')]"):
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home)
-        elif response.css('button#checkpointSubmitButton'):
-            # Handle 'Someone tried to log into your account' warning.
-            return FormRequest.from_response(
-                response, callback=self.parse_home)
-        # Else go to the page requested.
-        if self.page.find('https://www.facebook.com/') != -1:
-            self.page = self.page[25:]
+            
+        #set language interface
+        if self.lang == '_':
+            if response.xpath("//input[@placeholder='Search Facebook']"):
+                self.lang = 'en'
+            elif response.xpath("//input[@value='Buscar']"):
+                self.lang = 'es'
+            elif response.xpath("//input[@value='Rechercher']"):
+                self.lang = 'fr'
+            elif response.xpath("//input[@value='Cerca']"):
+                self.lang = 'it'
+            elif response.xpath("//input[@value='Pesquisar']"):
+                self.lang = 'pt'                
+            else:
+                raise CloseSpider('Language not recognized')
+          
+        #navigate to provided page
        href = response.urljoin(self.page)
-        self.logger.info('Parse function called on %s', href)
+        self.logger.info('Parsing facebook page %s', href)
        return scrapy.Request(url=href,callback=self.parse_page)

    def parse_page(self, response):
+        '''
+        Parse the given page selecting the posts.
+        Then ask recursively for another page.
+        '''
        #select all posts
        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):            
            new = ItemLoader(item=FbcrawlItem(),selector=post)
            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")        
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
-            #page_url
-            #new.add_value('url',response.url)
+            new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")   
+
+            #page_url #new.add_value('url',response.url)
            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() 
            temp_post = response.urljoin(post[0])        
            yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})       

        #load following page
-        #next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()      
        if len(next_page) == 0: 
            if response.meta['flag'] == 4 and self.year <= 2015:
@ -148,7 +161,6 @@ class FacebookSpider(scrapy.Spider):
        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
-        new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")   
        
        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions = response.urljoin(reactions[0].extract())