added experimental support for languages en, es, fr, it, pt
commit fb32a4213e
parent 9de51e0ce8
fbcrawl/items.py

(this commit threads the interface language into the date parser: parse_date gains a loader_context parameter, and its existing body moves one indentation level deeper under the new "if lang == 'it':" guard; the body is otherwise unchanged, so it is shown once below, as context at its new indentation)

@@ -9,102 +9,106 @@ import scrapy
 from scrapy.loader.processors import TakeFirst, Join, MapCompose
 from datetime import datetime, timedelta
 
-def parse_date(init_date):
+def parse_date(init_date,loader_context):
+    lang = loader_context['lang']
+    if lang == 'it':
         mesi = {
             'gennaio':1,
             'febbraio':2,
             'marzo':3,
             'aprile':4,
             'maggio':5,
             'giugno':6,
             'luglio':7,
             'agosto':8,
             'settembre':9,
             'ottobre':10,
             'novembre':11,
             'dicembre':12
         }
 
         mesi_abbr = {
             'gen':1,
             'feb':2,
             'mar':3,
             'apr':4,
             'mag':5,
             'giu':6,
             'lug':7,
             'ago':8,
             'set':9,
             'ott':10,
             'nov':11,
             'dic':12
         }
 
         giorni = {
             'domenica':0,
             'lunedì':1,
             'martedì':2,
             'mercoledì':3,
             'giovedì':4,
             'venerdì':5,
             'sabato':6
         }
         date = init_date
         date = date[0].split()
         year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
 
         #sanity check
         if len(date) == 0:
             return 'Error: no data'
 
         #yesterday
         elif len(date) == 1:
             day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
 
         #4h
         elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
             pass
 
         #22h (yesterday)
         elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
             day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
 
         #yesterday
         elif date[0].isdigit() == False and date[1].isdigit() == False:
             day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
 
         #day with 3 month length of this year
         elif len(date[1]) == 3 and not(date[2].isdigit()):
             day = int(date[0])
             month = mesi_abbr[date[1]]
 
         elif len(date[1]) > 3 and not(date[2].isdigit()):
             day = int(date[0])
             month = mesi[date[1]]
 
         elif len(date[1]) == 3 and date[2].isdigit():
             day = int(date[0])
             month = mesi_abbr[date[1]]
             year = int(date[2])
 
         #usual dates, with regular length month
         elif date[0].isdigit() and date[2].isdigit():
             day = int(date[0])
             month = mesi[date[1]]
             year = int(date[2])
 
         #dates with weekdays (this function assumes that the month is the same)
         elif date[0].isdigit() == False and date[1].isdigit() == False:
             today = datetime.now().weekday() #today as a weekday
             weekday = giorni[date[0]] #day to be match as number weekday
             #weekday is chronologically always lower than day
             if weekday < today:
                 day -= today - weekday
             elif weekday > today:
                 weekday += 7
                 day -= today - weekday
         else:
             #date item parser fail. datetime format unknown, check xpath selector or change the language of the interface'
             return init_date
+    else:
+        return init_date
     date = datetime(year,month,day)
     return date.date()
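A quick sketch of how the reworked processor behaves when called directly, assuming the reconstruction above; the list argument mirrors how the ItemLoader hands values to input processors, and the plain dict stands in for the loader context:

    from fbcrawl.items import parse_date

    print(parse_date(['21 dic 2017'], {'lang': 'it'}))       # -> 2017-12-21 (abbreviated month, explicit year)
    print(parse_date(['21 dicembre 2017'], {'lang': 'it'}))  # -> 2017-12-21 (full month name)
    print(parse_date(['21 dic 2017'], {'lang': 'en'}))       # -> ['21 dic 2017'], handed back unparsed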
@@ -11,10 +11,10 @@ class FacebookSpider(scrapy.Spider):
     """
     name = "fb"
 
-    def __init__(self, email='', password='', page='', year=2018, **kwargs):
+    def __init__(self, email='', password='', page='', year=2018, lang='', **kwargs):
         super(FacebookSpider, self).__init__(**kwargs)
 
         self.year = int(year) #arguments are passed as strings
 
         if not email or not password:
             raise ValueError("You need to provide valid email and password!")
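Spider arguments arrive as strings from the command line, so the new lang option rides along with the existing ones, e.g. scrapy crawl fb -a email=... -a password=... -a page=... -a lang=it. A scripted equivalent, as a sketch only; the import path, page name and credentials below are placeholders, not taken from this diff:

    from scrapy.crawler import CrawlerProcess
    from fbcrawl.spiders.fbcrawl import FacebookSpider  # import path assumed

    process = CrawlerProcess()
    process.crawl(FacebookSpider, email='you@example.com', password='secret',
                  page='SomePublicPage', lang='it')
    process.start()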
@@ -27,8 +27,19 @@ class FacebookSpider(scrapy.Spider):
         else:
             self.page = page
 
+        if not(lang):
+            self.logger.info('Language attribute not provided, assuming "en"')
+            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.lang = 'en'
+        elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
+            self.lang = lang
+        else:
+            self.logger.info('Lang:{} not currently supported'.format(lang))
+            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.logger.info('Change your interface lang from facebook and try again')
+            return
         self.start_urls = ['https://mbasic.facebook.com']
 
     def parse(self, response):
         return FormRequest.from_response(
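One possible tightening, not what this commit ships: the chained or comparisons could be a membership test against a single whitelist, which would also keep the two log messages and the check in sync:

    SUPPORTED_LANGS = ('en', 'es', 'fr', 'it', 'pt')  # hypothetical constant

    if lang in SUPPORTED_LANGS:
        self.lang = lang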
@@ -59,29 +70,24 @@ class FacebookSpider(scrapy.Spider):
             return FormRequest.from_response(
                 response,
                 formdata={'approvals_code': self.code},
-                callback=self.parse_home,
-            )
-        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
+                callback=self.parse_home)
+        elif response.xpath("//div/a[contains(@href,'save-device')]"):
+        # elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
             # Handle 'Save Browser' checkpoint.
             return FormRequest.from_response(
                 response,
                 formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home,
-                dont_filter=True,
-            )
+                callback=self.parse_home)
         elif response.css('button#checkpointSubmitButton'):
             # Handle 'Someone tried to log into your account' warning.
             return FormRequest.from_response(
-                response, callback=self.parse_home, dont_filter=True,)
+                response, callback=self.parse_home)
         # Else go to the page requested.
-        if self.page.find('.facebook.com/') != -1:
-            self.page = self.page[28:]
+        if self.page.find('https://www.facebook.com/') != -1:
+            self.page = self.page[25:]
         href = response.urljoin(self.page)
         self.logger.info('Parse function called on %s', href)
-        return scrapy.Request(
-            url=href,
-            callback=self.parse_page,
-        )
+        return scrapy.Request(url=href,callback=self.parse_page)
 
     def parse_page(self, response):
         #select all posts
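The two slice constants line up with concrete prefixes: len('https://mbasic.facebook.com/') is 28, which is what the old [28:] slice implicitly assumed, while len('https://www.facebook.com/') is 25, which is what the new, stricter find() test actually matches. A quick check of the arithmetic:

    print(len('https://mbasic.facebook.com/'))  # 28, the prefix the old slice assumed
    print(len('https://www.facebook.com/'))     # 25, the prefix the new test matches

    page = 'https://www.facebook.com/SomePublicPage'
    print(page[25:])                            # 'SomePublicPage'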
@@ -97,29 +103,35 @@ class FacebookSpider(scrapy.Spider):
             yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
 
         #load following page
-        # next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
-        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ece')) and not(contains(text(),number()))]/@href").extract()
+        #next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
+        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
         if len(next_page) == 0:
-            if response.meta['flag'] == 3 and self.year <= 2015:
-                self.logger.info('2015 reached, flag = {}'.format(response.meta['flag']))
+            if response.meta['flag'] == 4 and self.year <= 2015:
+                self.logger.info('2014 reached, flag = 5')
+                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
+                self.logger.info('next_page = {}'.format(next_page[0]))
+                new_page = response.urljoin(next_page[0])
+                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':5})
+            elif response.meta['flag'] == 3 and self.year <= 2015:
+                self.logger.info('2015 reached, flag = 4')
                 next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
                 self.logger.info('next_page = {}'.format(next_page[0]))
                 new_page = response.urljoin(next_page[0])
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':4})
             elif response.meta['flag'] == 2 and self.year <= 2016:
-                self.logger.info('2016 reached, flag = {}'.format(response.meta['flag']))
+                self.logger.info('2016 reached, flag = 3')
                 next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2016')]/@href").extract()
                 self.logger.info('next_page = {}'.format(next_page[0]))
                 new_page = response.urljoin(next_page[0])
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':3})
             elif response.meta['flag'] == 1 and self.year <= 2017:
-                self.logger.info('2017 reached, flag = {}'.format(response.meta['flag']))
+                self.logger.info('2017 reached, flag = 2')
                 next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2017')]/@href").extract()
                 self.logger.info('next_page = {}'.format(next_page[0]))
                 new_page = response.urljoin(next_page[0])
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':2})
             elif response.meta['flag'] == 0 and self.year <= 2018:
-                self.logger.info('2018 reached, flag = {}'.format(response.meta['flag']))
+                self.logger.info('2018 reached, flag = 1')
                 next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2018')]/@href").extract()
                 self.logger.info('next_page = {}'.format(next_page[0]))
                 new_page = response.urljoin(next_page[0])
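The year-pagination branches now form a five-step ladder: each flag value maps to a year threshold, the link text to look for, and the next flag to carry in meta. A table-driven sketch of the same logic, meant as a drop-in for the body of parse_page rather than as committed code; note that the flag 4 row follows the '2015' link text, exactly as the new branch above does:

    # flag -> (year threshold, link text, next flag), mirroring the elif chain
    LADDER = {
        0: (2018, '2018', 1),
        1: (2017, '2017', 2),
        2: (2016, '2016', 3),
        3: (2015, '2015', 4),
        4: (2015, '2015', 5),
    }

    flag = response.meta['flag']
    if flag in LADDER:
        year_threshold, link_text, next_flag = LADDER[flag]
        if self.year <= year_threshold:
            next_page = response.xpath("//div/a[contains(@href,'time') and "
                                       "contains(text(),'%s')]/@href" % link_text).extract()
            new_page = response.urljoin(next_page[0])
            yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': next_flag})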
@@ -144,6 +156,7 @@ class FacebookSpider(scrapy.Spider):
 
     def parse_reactions(self,response):
         new = ItemLoader(item=FbcrawlItem(),response=response, parent=response.meta['item'])
+        new.context['lang'] = self.lang
         new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
         new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
         new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
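new.context['lang'] is what carries the language to parse_date: the ItemLoader passes its context dict to any processor that declares a loader_context parameter. A self-contained sketch of the mechanism, with a made-up item and processor; the scrapy.loader.processors import matches the one items.py already uses (newer Scrapy versions move these helpers to the itemloaders package):

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import TakeFirst, MapCompose

    def tag_lang(value, loader_context):
        # processors that declare loader_context receive the loader's context dict
        return '%s [%s]' % (value, loader_context.get('lang', '?'))

    class DemoItem(scrapy.Item):
        text = scrapy.Field(input_processor=MapCompose(tag_lang),
                            output_processor=TakeFirst())

    loader = ItemLoader(item=DemoItem())
    loader.context['lang'] = 'it'
    loader.add_value('text', 'ciao')
    print(loader.load_item())  # {'text': 'ciao [it]'}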