added experimental support for languages en, es, fr, it, pt

commit fb32a4213e (parent 9de51e0ce8)
Author: rugantio
Date:   2019-01-30 20:34:25 +01:00

5 changed files with 134 additions and 117 deletions

fbcrawl/items.py
parse_date now receives the scraping language through the loader context: the signature and the outer language check are new, and the whole Italian parser is kept unchanged, nested one level deeper under the lang == 'it' branch.

@@ -9,102 +9,106 @@ import scrapy
 from scrapy.loader.processors import TakeFirst, Join, MapCompose
 from datetime import datetime, timedelta
 
-def parse_date(init_date):
+def parse_date(init_date,loader_context):
+    lang = loader_context['lang']
+    if lang == 'it':
         mesi = {
             'gennaio':1,
             'febbraio':2,
             'marzo':3,
             'aprile':4,
             'maggio':5,
             'giugno':6,
             'luglio':7,
             'agosto':8,
             'settembre':9,
             'ottobre':10,
             'novembre':11,
             'dicembre':12
         }
         mesi_abbr = {
             'gen':1,
             'feb':2,
             'mar':3,
             'apr':4,
             'mag':5,
             'giu':6,
             'lug':7,
             'ago':8,
             'set':9,
             'ott':10,
             'nov':11,
             'dic':12
         }
         giorni = {
             'domenica':0,
             'lunedì':1,
             'martedì':2,
             'mercoledì':3,
             'giovedì':4,
             'venerdì':5,
             'sabato':6
         }
         date = init_date
         date = date[0].split()
         year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
         #sanity check
         if len(date) == 0:
             return 'Error: no data'
         #yesterday
         elif len(date) == 1:
             day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
         #4h
         elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
             pass
         #22h (yesterday)
         elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
             day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
         #yesterday
         elif date[0].isdigit() == False and date[1].isdigit() == False:
             day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
         #day with three-letter month abbreviation, current year
         elif len(date[1]) == 3 and not(date[2].isdigit()):
             day = int(date[0])
             month = mesi_abbr[date[1]]
         elif len(date[1]) > 3 and not(date[2].isdigit()):
             day = int(date[0])
             month = mesi[date[1]]
         elif len(date[1]) == 3 and date[2].isdigit():
             day = int(date[0])
             month = mesi_abbr[date[1]]
             year = int(date[2])
         #usual dates, with full month name
         elif date[0].isdigit() and date[2].isdigit():
             day = int(date[0])
             month = mesi[date[1]]
             year = int(date[2])
         #dates with weekdays (this function assumes the month is the current one)
         elif date[0].isdigit() == False and date[1].isdigit() == False:
             today = datetime.now().weekday() #today as a weekday number
             weekday = giorni[date[0]] #day to be matched, as a weekday number
             #the matched weekday is chronologically always earlier than today
             if weekday < today:
                 day -= today - weekday
             elif weekday > today:
                 weekday += 7
                 day -= today - weekday
         else:
             #date item parser failed: datetime format unknown, check the xpath selector or change the language of the interface
             return init_date
+    else:
+        return init_date
     date = datetime(year,month,day)
     return date.date()
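Outside of Scrapy, the new signature can be exercised by passing the context dict by hand, the way the loader machinery would. A quick sketch (the date strings are invented examples, and the import assumes the function lives in fbcrawl/items.py):

    from fbcrawl.items import parse_date

    # 'it' goes through the Italian parser
    print(parse_date(['22 gennaio 2019'], {'lang': 'it'}))    # -> 2019-01-22
    # any other language currently falls through and returns the raw input
    print(parse_date(['January 22, 2019'], {'lang': 'en'}))   # -> ['January 22, 2019']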

fbcrawl/spiders/fbcrawl.py
The spider gains a lang argument with validation, a reworked 'save device' checkpoint handler, a corrected URL prefix slice, one more year step in the pagination cascade, and forwards lang to the reactions loader context.

@@ -11,10 +11,10 @@ class FacebookSpider(scrapy.Spider):
     """
     name = "fb"
 
-    def __init__(self, email='', password='', page='', year=2018, **kwargs):
+    def __init__(self, email='', password='', page='', year=2018, lang='', **kwargs):
         super(FacebookSpider, self).__init__(**kwargs)
         self.year = int(year) #arguments are passed as strings
         if not email or not password:
             raise ValueError("You need to provide valid email and password!")
@@ -27,8 +27,19 @@ class FacebookSpider(scrapy.Spider):
         else:
             self.page = page
 
+        if not(lang):
+            self.logger.info('Language attribute not provided, assuming "en"')
+            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.lang = 'en'
+        elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
+            self.lang = lang
+        else:
+            self.logger.info('Lang:{} not currently supported'.format(lang))
+            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.logger.info('Change your interface lang from facebook and try again')
+            return
         self.start_urls = ['https://mbasic.facebook.com']
 
     def parse(self, response):
         return FormRequest.from_response(
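Scrapy hands every -a key=value launch option to the spider's __init__ as a keyword argument, so the new parameter rides along with the existing ones; a hypothetical invocation (all values are placeholders):

    scrapy crawl fb -a email="EMAIL" -a password="PASSWORD" -a page="PAGENAME" -a lang="it"

One design note on the bare return in the unsupported-language branch: scrapy.Spider.__init__ has already run at that point and leaves start_urls as an empty list when nothing else sets it, so the spider starts up and finishes immediately without issuing any request instead of crashing.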
@@ -59,29 +70,24 @@ class FacebookSpider(scrapy.Spider):
             return FormRequest.from_response(
                 response,
                 formdata={'approvals_code': self.code},
-                callback=self.parse_home,
-            )
-        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
+                callback=self.parse_home)
+        elif response.xpath("//div/a[contains(@href,'save-device')]"):
+        # elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
             # Handle 'Save Browser' checkpoint.
             return FormRequest.from_response(
                 response,
                 formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home,
-                dont_filter=True,
-            )
+                callback=self.parse_home)
         elif response.css('button#checkpointSubmitButton'):
             # Handle 'Someone tried to log into your account' warning.
             return FormRequest.from_response(
-                response, callback=self.parse_home, dont_filter=True,)
+                response, callback=self.parse_home)
         # Else go to the page requested.
-        if self.page.find('.facebook.com/') != -1:
-            self.page = self.page[28:]
+        if self.page.find('https://www.facebook.com/') != -1:
+            self.page = self.page[25:]
         href = response.urljoin(self.page)
         self.logger.info('Parse function called on %s', href)
-        return scrapy.Request(
-            url=href,
-            callback=self.parse_page,
-        )
+        return scrapy.Request(url=href,callback=self.parse_page)
 
     def parse_page(self, response):
         #select all posts
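The two slice widths are not arbitrary: the old code stripped 28 characters, the length of 'https://mbasic.facebook.com/', so a page passed as a www URL lost the first three characters of its name; the new check pins the www prefix and strips exactly its 25 characters. A one-line check (the page name is an invented example):

    page = 'https://www.facebook.com/somepage'
    # len('https://www.facebook.com/') == 25, so the slice leaves just the page name
    if page.find('https://www.facebook.com/') != -1:
        print(page[25:])    # -> somepage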
@@ -97,29 +103,35 @@ class FacebookSpider(scrapy.Spider):
             yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
 
         #load following page
-        # next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
-        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ece')) and not(contains(text(),number()))]/@href").extract()
+        #next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
+        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
         if len(next_page) == 0:
-            if response.meta['flag'] == 3 and self.year <= 2015:
-                self.logger.info('2015 reached, flag = {}'.format(response.meta['flag']))
+            if response.meta['flag'] == 4 and self.year <= 2015:
+                self.logger.info('2014 reached, flag = 5')
+                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
+                self.logger.info('next_page = {}'.format(next_page[0]))
+                new_page = response.urljoin(next_page[0])
+                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':5})
+            elif response.meta['flag'] == 3 and self.year <= 2015:
+                self.logger.info('2015 reached, flag = 4')
                 next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
                 self.logger.info('next_page = {}'.format(next_page[0]))
                 new_page = response.urljoin(next_page[0])
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':4})
             elif response.meta['flag'] == 2 and self.year <= 2016:
-                self.logger.info('2016 reached, flag = {}'.format(response.meta['flag']))
+                self.logger.info('2016 reached, flag = 3')
                 next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2016')]/@href").extract()
                 self.logger.info('next_page = {}'.format(next_page[0]))
                 new_page = response.urljoin(next_page[0])
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':3})
             elif response.meta['flag'] == 1 and self.year <= 2017:
-                self.logger.info('2017 reached, flag = {}'.format(response.meta['flag']))
+                self.logger.info('2017 reached, flag = 2')
                 next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2017')]/@href").extract()
                 self.logger.info('next_page = {}'.format(next_page[0]))
                 new_page = response.urljoin(next_page[0])
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':2})
             elif response.meta['flag'] == 0 and self.year <= 2018:
-                self.logger.info('2018 reached, flag = {}'.format(response.meta['flag']))
+                self.logger.info('2018 reached, flag = 1')
                 next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2018')]/@href").extract()
                 self.logger.info('next_page = {}'.format(next_page[0]))
                 new_page = response.urljoin(next_page[0])
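The flag in the request meta counts how many year boundaries the crawl has already stepped back across, and every branch of the cascade differs only in the year text searched in the pager link and the flag set on the follow-up request. A table-driven sketch of the same logic (for illustration only, not part of the commit; it reproduces the quirk above where the flag == 4 branch logs 2014 but still looks for a '2015' link):

    import scrapy

    # flag -> (year text to find in the pager link, flag for the follow-up request)
    YEAR_STEPS = {4: ('2015', 5), 3: ('2015', 4), 2: ('2016', 3), 1: ('2017', 2), 0: ('2018', 1)}

    def step_back_one_year(self, response):
        flag = response.meta['flag']
        if flag in YEAR_STEPS and self.year <= int(YEAR_STEPS[flag][0]):
            year_text, next_flag = YEAR_STEPS[flag]
            xpath = "//div/a[contains(@href,'time') and contains(text(),'{}')]/@href".format(year_text)
            links = response.xpath(xpath).extract()
            if links:
                return scrapy.Request(response.urljoin(links[0]),
                                      callback=self.parse_page, meta={'flag': next_flag})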
@@ -144,6 +156,7 @@ class FacebookSpider(scrapy.Spider):
     def parse_reactions(self,response):
         new = ItemLoader(item=FbcrawlItem(),response=response, parent=response.meta['item'])
+        new.context['lang'] = self.lang
         new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
         new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
         new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
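The single line added here is what ties the commit together: an ItemLoader hands its context dict to any processor whose signature declares a loader_context parameter, which is how the lang set on the loader in the spider reaches parse_date over in items.py. A minimal self-contained sketch of that mechanism, using a toy item and processor rather than the project's real ones:

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose

    # like parse_date, this processor asks for the loader context by naming the argument
    def tag_with_lang(value, loader_context):
        return '{} [{}]'.format(value, loader_context['lang'])

    class ToyItem(scrapy.Item):
        text = scrapy.Field(input_processor=MapCompose(tag_with_lang))

    loader = ItemLoader(item=ToyItem())
    loader.context['lang'] = 'it'      # the same move as new.context['lang'] = self.lang above
    loader.add_value('text', 'ciao')
    print(loader.load_item())          # -> {'text': ['ciao [it]']}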