improved support for languages en, es, fr, it, pt

This commit is contained in:
rugantio 2019-01-31 06:54:31 +01:00
parent fb32a4213e
commit a9982865d9
6 changed files with 423 additions and 132 deletions

View File

@ -11,8 +11,11 @@ from datetime import datetime, timedelta
def parse_date(init_date,loader_context): def parse_date(init_date,loader_context):
lang = loader_context['lang'] lang = loader_context['lang']
# =============================================================================
# Italian - status:final
# =============================================================================
if lang == 'it': if lang == 'it':
mesi = { months = {
'gennaio':1, 'gennaio':1,
'febbraio':2, 'febbraio':2,
'marzo':3, 'marzo':3,
@ -27,7 +30,7 @@ def parse_date(init_date,loader_context):
'dicembre':12 'dicembre':12
} }
mesi_abbr = { months_abbr = {
'gen':1, 'gen':1,
'feb':2, 'feb':2,
'mar':3, 'mar':3,
@ -43,101 +46,379 @@ def parse_date(init_date,loader_context):
} }
giorni = { giorni = {
'domenica':0, 'lunedì':0,
'lunedì':1, 'martedì':1,
'martedì':2, 'mercoledì':2,
'mercoledì':3, 'giovedì':3,
'giovedì':4, 'vener':4,
'venerdì':5, 'sabato':5,
'sabato':6 'domenica':6
} }
date = init_date date = init_date[0].split()
date = date[0].split()
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
l = len(date)
#sanity check #sanity check
if len(date) == 0: if l == 0:
return 'Error: no data' return 'Error: no data'
#yesterday #adesso, ieri, 4h, 50min
elif len(date) == 1: elif l == 1:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) if date[0].isalpha():
if date[0].lower() == 'ieri':
#4h day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0: #check that yesterday was not in another month
pass month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
elif date[0].lower() == 'adesso':
return datetime(year,month,day).date() #return today
else: #not recognized, (return date or init_date)
return date
else:
#4h, 50min (exploit future parsing)
l = 2
new_date = [x for x in date[0] if x.isdigit()]
date[0] = ''.join(new_date)
new_date = [x for x in date[0] if not(x.isdigit())]
date[1] = ''.join(new_date)
# l = 2
elif l == 2:
#22 min (oggi)
if date[1] == 'min':
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#22 min (ieri)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#4 h (oggi)
elif date[1] == 'h':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#4 h (ieri)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#2 gen
elif len(date[1]) == 3 and date[1].isalpha():
day = int(date[0])
month = months_abbr[date[1].lower()]
return datetime(year,month,day).date()
#2 gennaio
elif len(date[1]) > 3 and date[1].isalpha():
day = int(date[0])
month = months[date[1]]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 3
elif l == 3:
#21 giu 2017
if len(date[1]) == 3 and date[2].isdigit():
day = int(date[0])
month = months_abbr[date[1]]
year = int(date[2])
return datetime(year,month,day).date()
#21 giugno 2017
elif len(date[1]) > 3 and date[2].isdigit():
day = int(date[0])
month = months[date[1]]
year = int(date[2])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 4
elif l == 4:
#Ieri alle ore 23:32
if date[0].lower() == 'ieri' and date[1] == 'alle':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#domenica alle ore 19:29
elif date[0].isalpha() and date[1] == 'alle':
today = datetime.now().weekday() #today as a weekday
weekday = giorni[date[0].lower()] #day to be match as number weekday
#weekday is chronologically always lower than day
delta = today - weekday
if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#lunedì = 0 sabato = 6, mar 1 ven 5
else:
delta += 8
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 5
elif l == 5:
if date[2] == 'alle':
#29 feb alle ore 21:49
if len(date[1]) == 3:
day = int(date[0])
month = months_abbr[date[1].lower()]
return datetime(year,month,day).date()
#29 febbraio alle ore 21:49
else:
day = int(date[0])
month = months[date[1].lower()]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 6
elif l == 6:
if date[3] == 'alle':
#29 feb 2016 alle ore 21:49
if len(date[1]) == 3:
day = int(date[0])
month = months_abbr[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#29 febbraio 2016 alle ore 21:49
else:
day = int(date[0])
month = months[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# =============================================================================
# English - status:beta
# =============================================================================
elif lang == 'en':
months = {
'january':1,
'february':2,
'march':3,
'april':4,
'may':5,
'june':6,
'july':7,
'august':8,
'september':9,
'october':10,
'november':11,
'december':12
}
months_abbr = {
'jan':1,
'feb':2,
'mar':3,
'apr':4,
'may':5,
'jun':6,
'jul':7,
'aug':8,
'sep':9,
'oct':10,
'nov':11,
'dec':12
}
#22h (yesterday) date = init_date[0].split()
elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
l = len(date)
#yesterday #sanity check
elif date[0].isdigit() == False and date[1].isdigit() == False: if l == 0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) return 'Error: no data'
#day with 3 month length of this year #Yesterday, Now, 4hr, 50mins
elif len(date[1]) == 3 and not(date[2].isdigit()): elif l == 1:
day = int(date[0]) if date[0].isalpha():
month = mesi_abbr[date[1]] if date[0].lower() == 'yesterday':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
elif len(date[1]) > 3 and not(date[2].isdigit()): #check that yesterday was not in another month
day = int(date[0]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
month = mesi[date[1]] elif date[0].lower() == 'now':
return datetime(year,month,day).date() #return today
elif len(date[1]) == 3 and date[2].isdigit(): else: #not recognized, (return date or init_date)
day = int(date[0]) return date
month = mesi_abbr[date[1]] else:
year = int(date[2]) #4h, 50min (exploit future parsing)
l = 2
#usual dates, with regular length month new_date = [x for x in date[0] if x.isdigit()]
elif date[0].isdigit() and date[2].isdigit(): date[0] = ''.join(new_date)
day = int(date[0]) new_date = [x for x in date[0] if not(x.isdigit())]
month = mesi[date[1]] date[1] = ''.join(new_date)
year = int(date[2]) # l = 2
elif l == 2:
#dates with weekdays (this function assumes that the month is the same) #22 min (oggi)
elif date[0].isdigit() == False and date[1].isdigit() == False: if date[1] == 'min' or date[1] == 'mins':
today = datetime.now().weekday() #today as a weekday if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
weekday = giorni[date[0]] #day to be match as number weekday return datetime(year,month,day).date()
#weekday is chronologically always lower than day #22 min (ieri)
if weekday < today: else:
day -= today - weekday day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
elif weekday > today: month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
weekday += 7 return datetime(year,month,day).date()
day -= today - weekday #4 h (oggi)
elif date[1] == 'hr' or date[1] == 'hrs':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#4 h (ieri)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#2 gen
elif len(date[1]) == 3 and date[1].isalpha():
day = int(date[0])
month = months_abbr[date[1].lower()]
return datetime(year,month,day).date()
#2 gennaio
elif len(date[1]) > 3 and date[1].isalpha():
day = int(date[0])
month = months[date[1]]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 3
elif l == 3:
# #21 Jun 2017
# if len(date[1] == 3) and date[2].isdigit():
# day = int(date[0])
# month = months_abbr[date[1].lower()]
# year = int(date[2])
# return datetime(year,month,day).date()
# #21 June 2017
# elif len(date[1] > 3) and date[2].isdigit():
# day = int(date[0])
# month = months[date[1].lower()]
# year = int(date[2])
# return datetime(year,month,day).date()
# #parsing failed
# else:
return date
# l = 4
elif l == 4:
#Ieri alle ore 23:32
if date[0].lower() == 'yesteday' and date[1] == 'at':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 5
elif l == 5:
if date[2] == 'at':
#Jan 29 at 10:00 PM
if len(date[0]) == 3:
day = int(date[1])
month = months_abbr[date[0].lower()]
return datetime(year,month,day).date()
#29 febbraio alle ore 21:49
else:
day = int(date[1])
month = months[date[0].lower()]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 6
elif l == 6:
if date[3] == 'at':
date[1]
#Aug 25, 2016 at 7:00 PM
if len(date[0]) == 3:
day = int(date[1][:-1])
month = months_abbr[date[0].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#August 25, 2016 at 7:00 PM
else:
day = int(date[1][:-1])
month = months[date[0].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l > 6
#parsing failed - l too big
else: else:
#date item parser fail. datetime format unknown, check xpath selector or change the language of the interface' return date
return init_date #parsing failed - language not supported
else: else:
return init_date return init_date
date = datetime(year,month,day)
return date.date() def comments_strip(string,loader_context):
lang = loader_context['lang']
def comments_strip(string): if lang == 'it':
return string[0].rstrip(' commenti') if string[0].rfind('Commenta') != -1:
return
def reactions_strip(string): else:
friends = 1 + string[0].count(',') return string[0].rstrip(' commenti')
e = 1 + string[0].count(' e ')
string = string[0].split()[::-1] elif lang == 'en':
if len(string) == 1: new_string = string[0].rstrip(' Comments')
string = string[0] while new_string.rfind(',') != -1:
while string.rfind('.') != -1: new_string = new_string[0:new_string.rfind(',')] + new_string[new_string.rfind(',')+1:]
string = string[0:string.rfind('.')] + string[string.rfind('.')+1:] return new_string
else:
return string return string
string = string[0] def reactions_strip(string,loader_context):
while string.rfind('.') != -1: lang = loader_context['lang']
string = string[0:string.rfind('.')] + string[string.rfind('.')+1:] if lang == 'it':
newstring = string[0]
if not string.isdigit(): #19.298.873
return e if len(newstring.split()) == 1:
while newstring.rfind('.') != -1:
newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
return newstring
#Pamela, Luigi e altri 4
else:
return string
# friends = newstring.count(' e ') + newstring.count(',')
# newstring = newstring.split()[::-1][0]
# while newstring.rfind('.') != -1:
# newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
# return int(newstring) + friends
elif lang == 'en':
newstring = string[0]
#19,298,873
if len(newstring.split()) == 1:
while newstring.rfind(',') != -1:
newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
return newstring
# #Mark and other 254,134
# elif newstring.split()[::-1][1].isdigit():
# friends = newstring.count(' and ') + newstring.count(',')
# newstring = newstring.split()[::-1][1]
# while newstring.rfind(',') != -1:
# newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
# return int(newstring) + friends
# #Philip and 1K others
else:
return newstring
else: else:
return int(string) + friends return string
def url_strip(url):
    """Shorten a post link to the part that identifies the post.

    ``url`` is an indexable whose first element is the link string. If the
    link contains ``&id=``, the result is the link up to and including the
    value of that parameter (everything from the next ``&`` onwards is
    dropped). If ``&id=`` is absent the link is returned unchanged.
    """
    link = url[0]
    # catching '&id=' is enough to identify the post
    prefix, marker, remainder = link.partition('&id=')
    if not marker:
        # no '&id=' parameter present: nothing to trim
        return link
    # keep only the id value, discarding any parameters that follow it
    id_value = remainder.partition('&')[0]
    return prefix + marker + id_value
class FbcrawlItem(scrapy.Item): class FbcrawlItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
source = scrapy.Field( source = scrapy.Field(
output_processor=TakeFirst() output_processor=TakeFirst()
) # page that published the post ) # page that published the post
@ -153,10 +434,7 @@ class FbcrawlItem(scrapy.Item):
comments = scrapy.Field( comments = scrapy.Field(
output_processor=comments_strip output_processor=comments_strip
) )
commentators = scrapy.Field(
output_processor=Join(separator=u'\n')
)
reactions = scrapy.Field( reactions = scrapy.Field(
output_processor=reactions_strip output_processor=reactions_strip
@ -171,4 +449,6 @@ class FbcrawlItem(scrapy.Item):
sigh = scrapy.Field() sigh = scrapy.Field()
grrr = scrapy.Field() grrr = scrapy.Field()
share = scrapy.Field() # num of shares share = scrapy.Field() # num of shares
url = scrapy.Field() url = scrapy.Field(
output_processor=url_strip
)

View File

@ -89,7 +89,6 @@ class FacebookSpider(scrapy.Spider):
for i in range(len(rispostina)): for i in range(len(rispostina)):
risp = response.urljoin(rispostina[i].extract()) risp = response.urljoin(rispostina[i].extract())
yield scrapy.Request(risp, callback=self.parse_rispostina) yield scrapy.Request(risp, callback=self.parse_rispostina)
next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href") next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
if len(next_page) > 0: if len(next_page) > 0:

View File

@ -3,6 +3,7 @@ import scrapy
from scrapy.loader import ItemLoader from scrapy.loader import ItemLoader
from scrapy.http import FormRequest from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem from fbcrawl.items import FbcrawlItem
from scrapy.exceptions import CloseSpider
class FacebookSpider(scrapy.Spider): class FacebookSpider(scrapy.Spider):
@ -11,37 +12,51 @@ class FacebookSpider(scrapy.Spider):
""" """
name = "fb" name = "fb"
def __init__(self, email='', password='', page='', year=2018, lang='', **kwargs): def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
super(FacebookSpider, self).__init__(**kwargs) super(FacebookSpider, self).__init__(**kwargs)
self.year = int(year) #arguments are passed as strings #email & pass need to be passed as attributes!
if not email or not password: if not email or not password:
raise ValueError("You need to provide valid email and password!") raise ValueError("You need to provide valid email and password!")
else: else:
self.email = email self.email = email
self.password = password self.password = password
#page name parsing (added support for full urls)
if not page: if not page:
raise ValueError("You need to provide a valid page name to crawl!") raise ValueError("You need to provide a valid page name to crawl!")
elif page.find('https://www.facebook.com/') != -1:
self.page = page[25:]
elif page.find('https://mbasic.facebook.com/') != -1:
self.page = page[28:]
elif page.find('https://m.facebook.com/') != -1:
self.page = page[23:]
else: else:
self.page = page self.page = page
if not(lang): #parse year
self.logger.info('Language attribute not provided, assuming "en"') assert int(year) <= 2019 and int(year) >= 2015, 'Year must be a number 2015 <= year <= 2019'
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"') self.year = int(year) #arguments are passed as strings
self.lang = 'en'
#parse lang, if not provided (but is supported) it will be guessed in parse_home
if lang=='_':
self.logger.info('Language attribute not provided, I will try to guess it')
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
self.lang=lang
elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt': elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
self.lang = lang self.lang = lang
else: else:
self.logger.info('Lang:{} not currently supported'.format(lang)) self.logger.info('Lang "{}" not currently supported'.format(lang))
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"') self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
self.logger.info('Change your interface lang from facebook and try again') self.logger.info('Change your interface lang from facebook and try again')
return raise CloseSpider('Language provided not currently supported')
self.start_urls = ['https://mbasic.facebook.com'] self.start_urls = ['https://mbasic.facebook.com']
def parse(self, response): def parse(self, response):
'''
Handle login with provided credentials
'''
return FormRequest.from_response( return FormRequest.from_response(
response, response,
formxpath='//form[contains(@action, "login")]', formxpath='//form[contains(@action, "login")]',
@ -51,59 +66,57 @@ class FacebookSpider(scrapy.Spider):
def parse_home(self, response): def parse_home(self, response):
''' '''
Parse user news feed page. This code is outdate and needs review. This method has multiple purposes:
1) Handle failed logins due to facebook 'save-device' redirection
2) Set language interface, if not already provided
3) Navigate to given page
''' '''
if response.css('#approvals_code'): #handle 'save-device' redirection
# Handle 'Approvals Code' checkpoint (ask user to enter code). if response.xpath("//div/a[contains(@href,'save-device')]"):
if not self.code:
# Show facebook messages via logs
# and request user for approval code.
message = response.css('._50f4::text').extract()[0]
self.log(message)
message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
self.log(message)
self.code = input('Enter the code: ')
self.code = str(self.code)
if not (self.code and self.code.isdigit()):
self.log('Bad approvals code detected.')
return
return FormRequest.from_response(
response,
formdata={'approvals_code': self.code},
callback=self.parse_home)
elif response.xpath("//div/a[contains(@href,'save-device')]"):
# elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
# Handle 'Save Browser' checkpoint.
return FormRequest.from_response( return FormRequest.from_response(
response, response,
formdata={'name_action_selected': 'dont_save'}, formdata={'name_action_selected': 'dont_save'},
callback=self.parse_home) callback=self.parse_home)
elif response.css('button#checkpointSubmitButton'):
# Handle 'Someone tried to log into your account' warning. #set language interface
return FormRequest.from_response( if self.lang == '_':
response, callback=self.parse_home) if response.xpath("//input[@placeholder='Search Facebook']"):
# Else go to the page requested. self.lang = 'en'
if self.page.find('https://www.facebook.com/') != -1: elif response.xpath("//input[@value='Buscar']"):
self.page = self.page[25:] self.lang = 'es'
elif response.xpath("//input[@value='Rechercher']"):
self.lang = 'fr'
elif response.xpath("//input[@value='Cerca']"):
self.lang = 'it'
elif response.xpath("//input[@value='Pesquisar']"):
self.lang = 'pt'
else:
raise CloseSpider('Language not recognized')
#navigate to provided page
href = response.urljoin(self.page) href = response.urljoin(self.page)
self.logger.info('Parse function called on %s', href) self.logger.info('Parsing facebook page %s', href)
return scrapy.Request(url=href,callback=self.parse_page) return scrapy.Request(url=href,callback=self.parse_page)
def parse_page(self, response): def parse_page(self, response):
'''
Parse the given page selecting the posts.
Then ask recursively for another page.
'''
#select all posts #select all posts
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
new = ItemLoader(item=FbcrawlItem(),selector=post) new = ItemLoader(item=FbcrawlItem(),selector=post)
new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()") new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
#page_url new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
#new.add_value('url',response.url)
#page_url #new.add_value('url',response.url)
#returns full post-link in a list #returns full post-link in a list
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
temp_post = response.urljoin(post[0]) temp_post = response.urljoin(post[0])
yield scrapy.Request(temp_post, self.parse_post, meta={'item':new}) yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
#load following page #load following page
#next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract() next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
if len(next_page) == 0: if len(next_page) == 0:
if response.meta['flag'] == 4 and self.year <= 2015: if response.meta['flag'] == 4 and self.year <= 2015:
@ -148,7 +161,6 @@ class FacebookSpider(scrapy.Spider):
new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()") new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
new.add_xpath('date', '//div/div/abbr/text()') new.add_xpath('date', '//div/div/abbr/text()')
new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()') new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href") reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
reactions = response.urljoin(reactions[0].extract()) reactions = response.urljoin(reactions[0].extract())