improved support for languages en, es, fr, it, pt

2019-01-31 06:54:31 +01:00 · 2019-01-31 06:54:31 +01:00 · a9982865d9
commit a9982865d9
parent fb32a4213e
6 changed files with 423 additions and 132 deletions
--- a/fbcrawl/pycache/items.cpython-37.pyc
+++ b/fbcrawl/pycache/items.cpython-37.pyc
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@ -11,8 +11,11 @@ from datetime import datetime, timedelta
 def parse_date(init_date,loader_context):
    lang = loader_context['lang']
 # =============================================================================
 # Italian - status:final
 # =============================================================================
    if lang == 'it':
-        mesi = {
+        months = {
        'gennaio':1,
        'febbraio':2,
        'marzo':3,
@ -27,7 +30,7 @@ def parse_date(init_date,loader_context):
        'dicembre':12
        }
-        mesi_abbr = {
+        months_abbr = {
        'gen':1,
        'feb':2,
        'mar':3,
@ -43,101 +46,379 @@ def parse_date(init_date,loader_context):
        }    
        giorni = {
-        'domenica':0,
+        'lunedì':0,
-        'lunedì':1,
+        'martedì':1,
-        'martedì':2,
+        'mercoledì':2,
-        'mercoledì':3,
+        'giovedì':3,
-        'giovedì':4,
+        'venerdì':4,
-        'venerdì':5,
+        'sabato':5,
-        'sabato':6
+        'domenica':6
        }    
-        date = init_date
+        date = init_date[0].split()
        date = date[0].split()
        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
        l = len(date)
        #sanity check
-        if len(date) == 0:
+        if l == 0:
            return 'Error: no data'
-        #yesterday 
+        #adesso, ieri, 4h, 50min
-        elif len(date) == 1:
+        elif l == 1:
-            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+            if date[0].isalpha():   
-            
+                if date[0].lower() == 'ieri':
-        #4h
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
-        elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                    #check that yesterday was not in another month
-            pass
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                elif date[0].lower() == 'adesso':
                        return datetime(year,month,day).date()    #return today
                else:  #not recognized, (return date or init_date)
                    return date 
            else: 
                #4h, 50min (exploit future parsing)
                l = 2
                new_date = [x for x in date[0] if x.isdigit()]
                date[0] = ''.join(new_date)
                new_date = [x for x in date[0] if not(x.isdigit())]
                date[1] = ''.join(new_date) 
 # l = 2        
        elif l == 2:
            #22 min (oggi)
            if date[1] == 'min':
                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
                    return datetime(year,month,day).date()
                #22 min (ieri)
                else:
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                    return datetime(year,month,day).date()   
            #4 h (oggi)
            elif date[1] == 'h':
                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
                    return datetime(year,month,day).date()
                #4 h (ieri)
                else:
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                    return datetime(year,month,day).date()   
            #2 gen
            elif len(date[1]) == 3 and date[1].isalpha():
                day = int(date[0])
                month = months_abbr[date[1].lower()]  
                return datetime(year,month,day).date()  
            #2 gennaio
            elif len(date[1]) > 3 and date[1].isalpha():
                day = int(date[0])
                month = months[date[1]]
                return datetime(year,month,day).date()  
            #parsing failed
            else:
                return date
 # l = 3
        elif l == 3:
            #21 giu 2017        
            if len(date[1]) == 3 and date[2].isdigit():
                day = int(date[0])
                month = months_abbr[date[1]]
                year = int(date[2])
                return datetime(year,month,day).date()   
            #21 giugno 2017        
            elif len(date[1]) > 3 and date[2].isdigit():
                day = int(date[0])
                month = months[date[1]]
                year = int(date[2])
                return datetime(year,month,day).date()                  
            #parsing failed
            else:
                return date
 # l = 4
        elif l == 4:
            #Ieri alle ore 23:32
            if date[0].lower() == 'ieri' and date[1] == 'alle':
                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                return datetime(year,month,day).date()   
            #domenica alle ore 19:29
            elif date[0].isalpha() and date[1] == 'alle':
                today = datetime.now().weekday() #today as a weekday
                weekday = giorni[date[0].lower()]   #day to be match as number weekday
                #weekday is chronologically always lower than day
                delta = today - weekday   
                if delta >= 0:
                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
                    return datetime(year,month,day).date()
                #lunedì = 0 sabato = 6, mar 1 ven 5
                else:
                    delta += 8
                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
                    return datetime(year,month,day).date()            
            #parsing failed
            else:
                return date
 # l = 5
        elif l == 5:
           if date[2] == 'alle':
               #29 feb alle ore 21:49
               if len(date[1]) == 3:
                   day = int(date[0])
                   month = months_abbr[date[1].lower()]
                   return datetime(year,month,day).date()   
               #29 febbraio alle ore 21:49        
               else:
                   day = int(date[0])
                   month = months[date[1].lower()]
                   return datetime(year,month,day).date()   
           #parsing failed
           else:
               return date
 # l = 6              
        elif l == 6:
           if date[3] == 'alle':
               #29 feb 2016 alle ore 21:49
               if len(date[1]) == 3:
                   day = int(date[0])
                   month = months_abbr[date[1].lower()]
                   year = int(date[2])
                   return datetime(year,month,day).date()   
               #29 febbraio 2016 alle ore 21:49        
               else:
                   day = int(date[0])
                   month = months[date[1].lower()]
                   year = int(date[2])
                   return datetime(year,month,day).date()   
           #parsing failed    
           else:
               return date
 # =============================================================================
 # English - status:beta
 # =============================================================================
    elif lang == 'en':
        months = {
        'january':1,
        'february':2,
        'march':3,
        'april':4,
        'may':5,
        'june':6,
        'july':7,
        'august':8,
        'september':9,
        'october':10,
        'november':11,
        'december':12
        }
        months_abbr = {
        'jan':1,
        'feb':2,
        'mar':3,
        'apr':4,
        'may':5,
        'jun':6,
        'jul':7,
        'aug':8,
        'sep':9,
        'oct':10,
        'nov':11,
        'dec':12
        }    
-        #22h (yesterday)
+        date = init_date[0].split()
-        elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
+        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
-            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+
        l = len(date)
-        #yesterday
+        #sanity check
-        elif date[0].isdigit() == False and date[1].isdigit() == False:
+        if l == 0:
-            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+            return 'Error: no data'
-            
+        
-        #day with 3 month length of this year
+        #Yesterday, Now, 4hr, 50mins
-        elif len(date[1]) == 3 and not(date[2].isdigit()):
+        elif l == 1:
-            day = int(date[0])
+            if date[0].isalpha():   
-            month = mesi_abbr[date[1]]
+                if date[0].lower() == 'yesterday':
-    
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
-        elif len(date[1]) > 3 and not(date[2].isdigit()):
+                    #check that yesterday was not in another month
-            day = int(date[0])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
-            month = mesi[date[1]]
+                elif date[0].lower() == 'now':
-    
+                        return datetime(year,month,day).date()    #return today
-        elif len(date[1]) == 3 and date[2].isdigit():
+                else:  #not recognized, (return date or init_date)
-            day = int(date[0])
+                    return date 
-            month = mesi_abbr[date[1]]
+            else: 
-            year = int(date[2])
+                #4h, 50min (exploit future parsing)
-    
+                l = 2
-        #usual dates, with regular length month 
+                new_date = [x for x in date[0] if x.isdigit()]
-        elif date[0].isdigit() and date[2].isdigit():
+                date[0] = ''.join(new_date)
-            day = int(date[0])
+                new_date = [x for x in date[0] if not(x.isdigit())]
-            month = mesi[date[1]]
+                date[1] = ''.join(new_date) 
-            year = int(date[2])
+# l = 2        
-    
+        elif l == 2:
-        #dates with weekdays (this function assumes that the month is the same)
+            #22 min (oggi)
-        elif date[0].isdigit() == False and date[1].isdigit() == False:
+            if date[1] == 'min' or date[1] == 'mins':
-            today = datetime.now().weekday() #today as a weekday
+                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
-            weekday = giorni[date[0]]   #day to be match as number weekday
+                    return datetime(year,month,day).date()
-            #weekday is chronologically always lower than day
+                #22 min (ieri)
-            if weekday < today:
+                else:
-                day -= today - weekday 
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
-            elif weekday > today:
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
-                weekday += 7 
+                    return datetime(year,month,day).date()   
-                day -= today - weekday
+            #4 h (oggi)
            elif date[1] == 'hr' or date[1] == 'hrs':
                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
                    return datetime(year,month,day).date()
                #4 hrs (yesterday)
                else:
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                    return datetime(year,month,day).date()   
            #2 gen
            elif len(date[1]) == 3 and date[1].isalpha():
                day = int(date[0])
                month = months_abbr[date[1].lower()]  
                return datetime(year,month,day).date()  
            #2 gennaio
            elif len(date[1]) > 3 and date[1].isalpha():
                day = int(date[0])
                month = months[date[1]]
                return datetime(year,month,day).date()  
            #parsing failed
            else:
                return date
 # l = 3
        elif l == 3:
 #            #21 Jun 2017
 #            if len(date[1] == 3) and date[2].isdigit():
 #                day = int(date[0])
 #                month = months_abbr[date[1].lower()]
 #                year = int(date[2])
 #                return datetime(year,month,day).date()   
 #            #21 June 2017        
 #            elif len(date[1] > 3) and date[2].isdigit():
 #                day = int(date[0])
 #                month = months[date[1].lower()]
 #                year = int(date[2])
 #                return datetime(year,month,day).date()                  
 #            #parsing failed
 #            else:
                return date
 # l = 4
        elif l == 4:
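            #Yesterday at 11:32 PM -- NOTE(review): the check below compares against 'yesteday' (sic), which never equals 'yesterday'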
            if date[0].lower() == 'yesteday' and date[1] == 'at':
                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                return datetime(year,month,day).date()   
            #parsing failed
            else:
                return date
 # l = 5
        elif l == 5:
           if date[2] == 'at':
               #Jan 29 at 10:00 PM 
               if len(date[0]) == 3:
                   day = int(date[1])
                   month = months_abbr[date[0].lower()]
                   return datetime(year,month,day).date()   
               #January 29 at 10:00 PM
               else:
                   day = int(date[1])
                   month = months[date[0].lower()]
                   return datetime(year,month,day).date()   
           #parsing failed
           else:
               return date
 # l = 6              
        elif l == 6:
           if date[3] == 'at':
               date[1]
               #Aug 25, 2016 at 7:00 PM 
               if len(date[0]) == 3:
                   day = int(date[1][:-1])
                   month = months_abbr[date[0].lower()]
                   year = int(date[2])
                   return datetime(year,month,day).date()    
               #August 25, 2016 at 7:00 PM      
               else:
                   day = int(date[1][:-1])
                   month = months[date[0].lower()]
                   year = int(date[2])
                   return datetime(year,month,day).date()   
           #parsing failed    
           else:
               return date
 # l > 6           
        #parsing failed - l too big
        else:
-        #date item parser fail. datetime format unknown, check xpath selector or change the language of the interface'
+            return date
-            return init_date
+    #parsing failed - language not supported
    else:
        return init_date
-    date = datetime(year,month,day)
+    
-    return date.date()
+def comments_strip(string,loader_context):
-
+    lang = loader_context['lang']
-def comments_strip(string):
+    if lang == 'it':
-    return string[0].rstrip(' commenti')
+        if string[0].rfind('Commenta') != -1:
-
+            return
-def reactions_strip(string):
+        else:
-    friends = 1 + string[0].count(',')
+            return string[0].rstrip(' commenti')
-    e = 1 + string[0].count(' e ')
+        
-    string = string[0].split()[::-1]
+    elif lang == 'en':
-    if len(string) == 1:
+        new_string = string[0].rstrip(' Comments')
-        string = string[0]
+        while new_string.rfind(',') != -1:
-        while string.rfind('.') != -1:
+            new_string = new_string[0:new_string.rfind(',')] + new_string[new_string.rfind(',')+1:]
-            string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
+        return new_string
    else:
        return string
-    string = string[0]
+def reactions_strip(string,loader_context):
-    while string.rfind('.') != -1:
+    lang = loader_context['lang']
-        string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
+    if lang == 'it':
-    
+        newstring = string[0]
-    if not string.isdigit():
+        #19.298.873       
-        return e
+        if len(newstring.split()) == 1:
            while newstring.rfind('.') != -1:
                newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
            return newstring
        #Pamela, Luigi e altri 4
        else:   
            return string
 #            friends = newstring.count(' e ') + newstring.count(',')
 #            newstring = newstring.split()[::-1][0]
 #            while newstring.rfind('.') != -1:
 #                newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
 #            return int(newstring) + friends
    elif lang == 'en':
        newstring = string[0]
        #19,298,873       
        if len(newstring.split()) == 1:
            while newstring.rfind(',') != -1:
                newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
            return newstring
 #        #Mark and other 254,134 
 #        elif newstring.split()[::-1][1].isdigit(): 
 #            friends = newstring.count(' and ') + newstring.count(',')
 #            newstring = newstring.split()[::-1][1]
 #            while newstring.rfind(',') != -1:
 #                newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
 #            return int(newstring) + friends
 #        #Philip and 1K others
        else:
            return newstring
    else:
-        return int(string) + friends
+        return string
 def url_strip(url):
    """Shorten a post link so that it ends right after its ``&id=`` value.

    Only the first element of ``url`` is examined. When that string
    contains ``'&id='``, everything up to and including the value that
    follows it is kept and any later ``&``-separated parameters are
    dropped. A string without ``'&id='`` is returned unchanged.
    """
    link = url[0]
    # '&id=' together with its value is enough to identify the post
    head, marker, tail = link.partition('&id=')
    if not marker:
        return link
    id_value = tail.split('&')[0]
    return head + marker + id_value
 class FbcrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    source = scrapy.Field(
            output_processor=TakeFirst()
    )                     # page that published the post
@ -153,10 +434,7 @@ class FbcrawlItem(scrapy.Item):
    comments = scrapy.Field(
            output_processor=comments_strip
-    )                       
+    )                                       
    commentators = scrapy.Field(
            output_processor=Join(separator=u'\n')
    )                    
    reactions = scrapy.Field(
            output_processor=reactions_strip
@ -171,4 +449,6 @@ class FbcrawlItem(scrapy.Item):
    sigh = scrapy.Field()                      
    grrr = scrapy.Field()                      
    share = scrapy.Field()                      # num of shares
-    url = scrapy.Field()
+    url = scrapy.Field(
        output_processor=url_strip
        )
--- a/fbcrawl/spiders/pycache/comments.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/comments.cpython-37.pyc
--- a/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
@ -89,7 +89,6 @@ class FacebookSpider(scrapy.Spider):
        for i in range(len(rispostina)):
            risp = response.urljoin(rispostina[i].extract())
            yield scrapy.Request(risp, callback=self.parse_rispostina)
        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
        if len(next_page) > 0:
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@ -3,6 +3,7 @@ import scrapy
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
 from fbcrawl.items import FbcrawlItem
 from scrapy.exceptions import CloseSpider
 class FacebookSpider(scrapy.Spider):
@ -11,37 +12,51 @@ class FacebookSpider(scrapy.Spider):
    """    
    name = "fb"
-    def __init__(self, email='', password='', page='', year=2018, lang='', **kwargs):
+    def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
        super(FacebookSpider, self).__init__(**kwargs)
-        self.year = int(year)    #arguments are passed as strings
+        #email & pass need to be passed as attributes!
        if not email or not password:
            raise ValueError("You need to provide valid email and password!")
        else:
            self.email = email
            self.password = password
        #page name parsing (added support for full urls)
        if not page:
            raise ValueError("You need to provide a valid page name to crawl!")
        elif page.find('https://www.facebook.com/') != -1:
            self.page = page[25:]
        elif page.find('https://mbasic.facebook.com/') != -1:
            self.page = page[28:]
        elif page.find('https://m.facebook.com/') != -1:
            self.page = page[23:]
        else:
            self.page = page
-            
+        
-        if not(lang):
+        #parse year 
-            self.logger.info('Language attribute not provided, assuming "en"')
+        assert int(year) <= 2019 and int(year) >= 2015, 'Year must be a number 2015 <= year <= 2019'
-            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')                             
+        self.year = int(year)    #arguments are passed as strings
-            self.lang = 'en'
+    
        #parse lang, if not provided (but is supported) it will be guessed in parse_home
        if lang=='_':
            self.logger.info('Language attribute not provided, I will try to guess it')
            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
            self.lang=lang                            
        elif lang == 'en'  or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
            self.lang = lang
        else:
-            self.logger.info('Lang:{} not currently supported'.format(lang))                             
+            self.logger.info('Lang "{}" not currently supported'.format(lang))                             
            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')                             
-            self.logger.info('Change your interface lang from facebook and try again')     
+            self.logger.info('Change your interface lang from facebook and try again')
-            return                        
+            raise CloseSpider('Language provided not currently supported')
        self.start_urls = ['https://mbasic.facebook.com']    
    def parse(self, response):
        '''
        Handle login with provided credentials
        '''
        return FormRequest.from_response(
                response,
                formxpath='//form[contains(@action, "login")]',
@ -51,59 +66,57 @@ class FacebookSpider(scrapy.Spider):
    def parse_home(self, response):
        '''
-        Parse user news feed page. This code is outdate and needs review.
+        This method has multiple purposes:
        1) Handle failed logins due to facebook 'save-device' redirection
        2) Set language interface, if not already provided
        3) Navigate to given page 
        '''
-        if response.css('#approvals_code'):
+        #handle 'save-device' redirection
-            # Handle 'Approvals Code' checkpoint (ask user to enter code).
+        if response.xpath("//div/a[contains(@href,'save-device')]"):
            if not self.code:
                # Show facebook messages via logs
                # and request user for approval code.
                message = response.css('._50f4::text').extract()[0]
                self.log(message)
                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
                self.log(message)
                self.code = input('Enter the code: ')
            self.code = str(self.code)
            if not (self.code and self.code.isdigit()):
                self.log('Bad approvals code detected.')
                return
            return FormRequest.from_response(
                response,
                formdata={'approvals_code': self.code},
                callback=self.parse_home)
        elif response.xpath("//div/a[contains(@href,'save-device')]"):
 #        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
            # Handle 'Save Browser' checkpoint.
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home)
-        elif response.css('button#checkpointSubmitButton'):
+            
-            # Handle 'Someone tried to log into your account' warning.
+        #set language interface
-            return FormRequest.from_response(
+        if self.lang == '_':
-                response, callback=self.parse_home)
+            if response.xpath("//input[@placeholder='Search Facebook']"):
-        # Else go to the page requested.
+                self.lang = 'en'
-        if self.page.find('https://www.facebook.com/') != -1:
+            elif response.xpath("//input[@value='Buscar']"):
-            self.page = self.page[25:]
+                self.lang = 'es'
            elif response.xpath("//input[@value='Rechercher']"):
                self.lang = 'fr'
            elif response.xpath("//input[@value='Cerca']"):
                self.lang = 'it'
            elif response.xpath("//input[@value='Pesquisar']"):
                self.lang = 'pt'                
            else:
                raise CloseSpider('Language not recognized')
        #navigate to provided page
        href = response.urljoin(self.page)
-        self.logger.info('Parse function called on %s', href)
+        self.logger.info('Parsing facebook page %s', href)
        return scrapy.Request(url=href,callback=self.parse_page)
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #select all posts
        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):            
            new = ItemLoader(item=FbcrawlItem(),selector=post)
            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")        
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
-            #page_url
+            new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")   
-            #new.add_value('url',response.url)
+
            #page_url #new.add_value('url',response.url)
            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() 
            temp_post = response.urljoin(post[0])        
            yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})       
        #load following page
        #next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()      
        if len(next_page) == 0: 
            if response.meta['flag'] == 4 and self.year <= 2015:
@ -148,7 +161,6 @@ class FacebookSpider(scrapy.Spider):
        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
        new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")   
        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions = response.urljoin(reactions[0].extract())