improved support for languages en, es, fr, it, pt

This commit is contained in:
rugantio 2019-01-31 06:54:31 +01:00
parent fb32a4213e
commit a9982865d9
6 changed files with 423 additions and 132 deletions

View File

@ -11,8 +11,11 @@ from datetime import datetime, timedelta
# Per-language vocabulary used by parse_date.
#   weekdays  -> numbering of datetime.weekday() (Monday == 0)
#   day_first -> True when long dates read "29 feb 2016 alle ore 21:49",
#                False when they read "Feb 29, 2016 at 9:49 PM"
_DATE_VOCABULARY = {
    'it': {
        'months': {
            'gennaio': 1, 'febbraio': 2, 'marzo': 3, 'aprile': 4,
            'maggio': 5, 'giugno': 6, 'luglio': 7, 'agosto': 8,
            'settembre': 9, 'ottobre': 10, 'novembre': 11, 'dicembre': 12,
        },
        'months_abbr': {
            'gen': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'mag': 5, 'giu': 6,
            'lug': 7, 'ago': 8, 'set': 9, 'ott': 10, 'nov': 11, 'dic': 12,
        },
        'weekdays': {
            'lunedì': 0, 'martedì': 1, 'mercoledì': 2, 'giovedì': 3,
            'venerdì': 4, 'sabato': 5, 'domenica': 6,
        },
        'yesterday': 'ieri',
        'now': 'adesso',
        'at': 'alle',
        'minutes': ('min',),
        'hours': ('h',),
        'day_first': True,
    },
    'en': {
        'months': {
            'january': 1, 'february': 2, 'march': 3, 'april': 4,
            'may': 5, 'june': 6, 'july': 7, 'august': 8,
            'september': 9, 'october': 10, 'november': 11, 'december': 12,
        },
        'months_abbr': {
            'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
            'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
        },
        'weekdays': {},
        'yesterday': 'yesterday',
        'now': 'now',
        'at': 'at',
        'minutes': ('min', 'mins'),
        'hours': ('hr', 'hrs'),
        'day_first': False,
    },
}


def _month_number(name, vocab):
    """Return the month number for a full or three-letter month name.

    Raises KeyError when the name is not in the vocabulary.
    """
    key = name.lower()
    table = vocab['months_abbr'] if len(key) == 3 else vocab['months']
    return table[key]


def _split_glued_amount(token):
    """Split a glued amount such as '4h' or '50min' into ['4', 'h'] / ['50', 'min']."""
    digits = ''.join(ch for ch in token if ch.isdigit())
    unit = ''.join(ch for ch in token if not ch.isdigit())
    return [digits, unit]


def _resolve_date(tokens, vocab, now):
    """Map a token list to a datetime.date, or None when the shape is unknown.

    May raise KeyError / ValueError / OverflowError for unknown month names,
    non-numeric fields or impossible dates; the caller treats those as a
    parsing failure.
    """
    today = now.date()
    count = len(tokens)
    head = tokens[0].lower()

    # "ieri" / "adesso" / "Yesterday" / "Now"
    if count == 1:
        if head == vocab['yesterday']:
            return today - timedelta(days=1)
        if head == vocab['now']:
            return today
        return None

    # "22 min", "4 h", "2 gen", "2 gennaio"
    if count == 2:
        amount, unit = tokens
        if unit in vocab['minutes']:
            # real clock arithmetic, so crossing midnight (or not) is exact
            return (now - timedelta(minutes=int(amount))).date()
        if unit in vocab['hours']:
            return (now - timedelta(hours=int(amount))).date()
        if unit.isalpha():
            return datetime(today.year, _month_number(unit, vocab), int(amount)).date()
        return None

    # "21 giu 2017", "21 giugno 2017"
    # The English branch of the original code had this case commented out,
    # so it is only handled for day-first languages.
    if count == 3:
        if vocab['day_first'] and tokens[2].isdigit():
            return datetime(int(tokens[2]), _month_number(tokens[1], vocab), int(tokens[0])).date()
        return None

    # "Ieri alle ore 23:32", "domenica alle ore 19:29", "Yesterday at 7:00 PM"
    if count == 4:
        if tokens[1] != vocab['at']:
            return None
        if head == vocab['yesterday']:
            return today - timedelta(days=1)
        if head in vocab['weekdays']:
            # days to go back to reach the most recent such weekday (0..6)
            back = (today.weekday() - vocab['weekdays'][head]) % 7
            return today - timedelta(days=back)
        return None

    # "29 feb alle ore 21:49", "29 feb 2016 alle ore 21:49",
    # "Jan 29 at 10:00 PM",    "Aug 25, 2016 at 7:00 PM"
    if count in (5, 6):
        has_year = count == 6
        if tokens[3 if has_year else 2] != vocab['at']:
            return None
        if vocab['day_first']:
            day_token, month_token = tokens[0], tokens[1]
        else:
            month_token, day_token = tokens[0], tokens[1]
        year = int(tokens[2]) if has_year else today.year
        day = int(day_token.rstrip(','))  # English writes "25," before the year
        return datetime(year, _month_number(month_token, vocab), day).date()

    return None


def parse_date(init_date, loader_context):
    """Convert a human-readable post date into a datetime.date.

    init_date      -- list whose first element is the date text
                      (e.g. ['29 feb 2016 alle ore 21:49'])
    loader_context -- mapping that provides the interface language under 'lang'

    Returns:
      * a datetime.date when the text is recognised;
      * the string 'Error: no data' when there is no text;
      * the whitespace-split token list when the language is supported but the
        text cannot be parsed (unknown shape, unknown month, impossible date);
      * init_date unchanged when the language is not 'it' or 'en'.
    """
    lang = loader_context['lang']
    if lang not in _DATE_VOCABULARY:
        #parsing failed - language not supported
        return init_date
    vocab = _DATE_VOCABULARY[lang]

    #sanity check
    if not init_date:
        return 'Error: no data'
    tokens = init_date[0].split()
    if not tokens:
        return 'Error: no data'

    #4h, 50min: split the glued amount so it is parsed like "4 h" / "50 min"
    work = tokens
    if len(tokens) == 1 and not tokens[0].isalpha():
        work = _split_glued_amount(tokens[0])

    try:
        parsed = _resolve_date(work, vocab, datetime.now())
    except (KeyError, ValueError, OverflowError):
        parsed = None

    #parsing failed
    if parsed is None:
        return tokens
    return parsed
def comments_strip(string, loader_context):
    """Reduce the comment-counter label of a post to its bare number.

    string         -- list whose first element is the label text
    loader_context -- mapping that provides the interface language under 'lang'

    Italian: returns None when the label contains 'Commenta', otherwise the
    label with its trailing ' commenti' characters stripped.
    English: strips the trailing ' Comments' characters and removes every ','.
    Any other language: returns the input list untouched.

    Note: str.rstrip treats its argument as a set of characters, not as a
    literal suffix.
    """
    language = loader_context['lang']

    if language == 'it':
        label = string[0]
        if 'Commenta' in label:
            return None
        return label.rstrip(' commenti')

    if language == 'en':
        return string[0].rstrip(' Comments').replace(',', '')

    return string
def reactions_strip(string, loader_context):
    """Normalise the reactions counter of a post.

    string         -- list whose first element is the counter text
    loader_context -- mapping that provides the interface language under 'lang'

    A counter made of a single word has its thousands separators removed
    ('.' for Italian, ',' for English) and is returned as a string.
    A multi-word counter (e.g. "Pamela, Luigi e altri 4") is not converted to
    a number: Italian returns the input list as is, English returns the
    unmodified text. Any other language returns the input list untouched.
    """
    language = loader_context['lang']

    if language == 'it':
        text = string[0]
        if len(text.split()) != 1:
            #Pamela, Luigi e altri 4
            return string
        #19.298.873
        return text.replace('.', '')

    if language == 'en':
        text = string[0]
        #19,298,873
        if len(text.split()) == 1:
            return text.replace(',', '')
        return text

    return string
def url_strip(url):
    """Shorten a post link to the part that identifies the post.

    url -- list whose first element is the full link

    Everything after the value of the '&id=' parameter is dropped; a link
    without '&id=' is returned unchanged.
    """
    address = url[0]
    marker = '&id='

    start = address.find(marker)
    if start == -1:
        return address

    # the id value runs up to the next '&' (or to the end of the link)
    value_start = start + len(marker)
    value_end = address.find('&', value_start)
    if value_end == -1:
        return address
    return address[:value_end]
class FbcrawlItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
source = scrapy.Field(
output_processor=TakeFirst()
) # page that published the post
@ -154,9 +435,6 @@ class FbcrawlItem(scrapy.Item):
comments = scrapy.Field(
output_processor=comments_strip
)
commentators = scrapy.Field(
output_processor=Join(separator=u'\n')
)
reactions = scrapy.Field(
output_processor=reactions_strip
@ -171,4 +449,6 @@ class FbcrawlItem(scrapy.Item):
sigh = scrapy.Field()
grrr = scrapy.Field()
share = scrapy.Field() # num of shares
url = scrapy.Field()
url = scrapy.Field(
output_processor=url_strip
)

View File

@ -90,7 +90,6 @@ class FacebookSpider(scrapy.Spider):
risp = response.urljoin(rispostina[i].extract())
yield scrapy.Request(risp, callback=self.parse_rispostina)
next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
if len(next_page) > 0:
next_page = response.urljoin(next_page[0].extract())

View File

@ -3,6 +3,7 @@ import scrapy
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem
from scrapy.exceptions import CloseSpider
class FacebookSpider(scrapy.Spider):
@ -11,37 +12,51 @@ class FacebookSpider(scrapy.Spider):
"""
name = "fb"
def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
    """Validate the command-line arguments and store them on the spider.

    email, password -- login credentials, both required
    page            -- page name, or a full www/mbasic/m.facebook.com url
    year            -- number between 2015 and 2019 (arguments arrive as strings)
    lang            -- 'en', 'es', 'fr', 'it' or 'pt'; the default '_' means
                       "not provided" and is stored as is

    Raises ValueError for missing credentials, a missing page or a year out
    of range, and CloseSpider for an unsupported language.
    """
    super(FacebookSpider, self).__init__(**kwargs)

    #email & pass need to be passed as attributes!
    if not email or not password:
        raise ValueError("You need to provide valid email and password!")
    self.email = email
    self.password = password

    #page name parsing (added support for full urls)
    if not page:
        raise ValueError("You need to provide a valid page name to crawl!")
    self.page = page
    known_prefixes = (
        'https://www.facebook.com/',
        'https://mbasic.facebook.com/',
        'https://m.facebook.com/',
    )
    for prefix in known_prefixes:
        # only strip the prefix when the url really starts with it
        if page.startswith(prefix):
            self.page = page[len(prefix):]
            break

    #parse year (arguments are passed as strings)
    self.year = int(year)
    if not 2015 <= self.year <= 2019:
        raise ValueError('Year must be a number 2015 <= year <= 2019')

    #parse lang; '_' marks "not provided"
    supported_langs = ('en', 'es', 'fr', 'it', 'pt')
    if lang == '_':
        self.logger.info('Language attribute not provided, I will try to guess it')
        self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
        self.lang = lang
    elif lang in supported_langs:
        self.lang = lang
    else:
        self.logger.info('Lang "{}" not currently supported'.format(lang))
        self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
        self.logger.info('Change your interface lang from facebook and try again')
        raise CloseSpider('Language provided not currently supported')

    self.start_urls = ['https://mbasic.facebook.com']
def parse(self, response):
'''
Handle login with provided credentials
'''
return FormRequest.from_response(
response,
formxpath='//form[contains(@action, "login")]',
@ -51,59 +66,57 @@ class FacebookSpider(scrapy.Spider):
def parse_home(self, response):
    '''
    This method has multiple purposes:
    1) Handle failed logins due to facebook 'save-device' redirection
    2) Set language interface, if not already provided
    3) Navigate to given page
    '''
    #handle 'save-device' redirection: submit the form choosing 'dont_save'
    #and come back to this same callback
    save_device_link = response.xpath("//div/a[contains(@href,'save-device')]")
    if save_device_link:
        return FormRequest.from_response(
            response,
            formdata={'name_action_selected': 'dont_save'},
            callback=self.parse_home)

    #set language interface: the first probe that matches the page wins
    if self.lang == '_':
        language_probes = (
            ("//input[@placeholder='Search Facebook']", 'en'),
            ("//input[@value='Buscar']", 'es'),
            ("//input[@value='Rechercher']", 'fr'),
            ("//input[@value='Cerca']", 'it'),
            ("//input[@value='Pesquisar']", 'pt'),
        )
        for query, code in language_probes:
            if response.xpath(query):
                self.lang = code
                break
        else:
            raise CloseSpider('Language not recognized')

    #navigate to provided page
    href = response.urljoin(self.page)
    self.logger.info('Parsing facebook page %s', href)
    return scrapy.Request(url=href, callback=self.parse_page)
def parse_page(self, response):
'''
Parse the given page selecting the posts.
Then ask recursively for another page.
'''
#select all posts
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
new = ItemLoader(item=FbcrawlItem(),selector=post)
new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
#page_url
#new.add_value('url',response.url)
new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
#page_url #new.add_value('url',response.url)
#returns full post-link in a list
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
temp_post = response.urljoin(post[0])
yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
#load following page
#next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
if len(next_page) == 0:
if response.meta['flag'] == 4 and self.year <= 2015:
@ -148,7 +161,6 @@ class FacebookSpider(scrapy.Spider):
new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
new.add_xpath('date', '//div/div/abbr/text()')
new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
reactions = response.urljoin(reactions[0].extract())