improved support for languages en, es, fr, it, pt
This commit is contained in:
parent
fb32a4213e
commit
a9982865d9
Binary file not shown.
452
fbcrawl/items.py
452
fbcrawl/items.py
@ -11,8 +11,11 @@ from datetime import datetime, timedelta
|
||||
|
||||
def parse_date(init_date,loader_context):
|
||||
lang = loader_context['lang']
|
||||
# =============================================================================
|
||||
# Italian - status:final
|
||||
# =============================================================================
|
||||
if lang == 'it':
|
||||
mesi = {
|
||||
months = {
|
||||
'gennaio':1,
|
||||
'febbraio':2,
|
||||
'marzo':3,
|
||||
@ -27,7 +30,7 @@ def parse_date(init_date,loader_context):
|
||||
'dicembre':12
|
||||
}
|
||||
|
||||
mesi_abbr = {
|
||||
months_abbr = {
|
||||
'gen':1,
|
||||
'feb':2,
|
||||
'mar':3,
|
||||
@ -43,101 +46,379 @@ def parse_date(init_date,loader_context):
|
||||
}
|
||||
|
||||
giorni = {
|
||||
'domenica':0,
|
||||
'lunedì':1,
|
||||
'martedì':2,
|
||||
'mercoledì':3,
|
||||
'giovedì':4,
|
||||
'venerdì':5,
|
||||
'sabato':6
|
||||
'lunedì':0,
|
||||
'martedì':1,
|
||||
'mercoledì':2,
|
||||
'giovedì':3,
|
||||
'venerdì':4,
|
||||
'sabato':5,
|
||||
'domenica':6
|
||||
}
|
||||
date = init_date
|
||||
date = date[0].split()
|
||||
date = init_date[0].split()
|
||||
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
||||
|
||||
l = len(date)
|
||||
|
||||
#sanity check
|
||||
if len(date) == 0:
|
||||
if l == 0:
|
||||
return 'Error: no data'
|
||||
|
||||
#yesterday
|
||||
elif len(date) == 1:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
|
||||
#4h
|
||||
elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
||||
pass
|
||||
#adesso, ieri, 4h, 50min
|
||||
elif l == 1:
|
||||
if date[0].isalpha():
|
||||
if date[0].lower() == 'ieri':
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
#check that yesterday was not in another month
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
elif date[0].lower() == 'adesso':
|
||||
return datetime(year,month,day).date() #return today
|
||||
else: #not recognized, (return date or init_date)
|
||||
return date
|
||||
else:
|
||||
#4h, 50min (exploit future parsing)
|
||||
l = 2
|
||||
new_date = [x for x in date[0] if x.isdigit()]
|
||||
date[0] = ''.join(new_date)
|
||||
new_date = [x for x in date[0] if not(x.isdigit())]
|
||||
date[1] = ''.join(new_date)
|
||||
# l = 2
|
||||
elif l == 2:
|
||||
#22 min (oggi)
|
||||
if date[1] == 'min':
|
||||
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
|
||||
return datetime(year,month,day).date()
|
||||
#22 min (ieri)
|
||||
else:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
#4 h (oggi)
|
||||
elif date[1] == 'h':
|
||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
||||
return datetime(year,month,day).date()
|
||||
#4 h (ieri)
|
||||
else:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
#2 gen
|
||||
elif len(date[1]) == 3 and date[1].isalpha():
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#2 gennaio
|
||||
elif len(date[1]) > 3 and date[1].isalpha():
|
||||
day = int(date[0])
|
||||
month = months[date[1]]
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 3
|
||||
elif l == 3:
|
||||
#21 giu 2017
|
||||
if len(date[1]) == 3 and date[2].isdigit():
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1]]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#21 giugno 2017
|
||||
elif len(date[1]) > 3 and date[2].isdigit():
|
||||
day = int(date[0])
|
||||
month = months[date[1]]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 4
|
||||
elif l == 4:
|
||||
#Ieri alle ore 23:32
|
||||
if date[0].lower() == 'ieri' and date[1] == 'alle':
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
#domenica alle ore 19:29
|
||||
elif date[0].isalpha() and date[1] == 'alle':
|
||||
today = datetime.now().weekday() #today as a weekday
|
||||
weekday = giorni[date[0].lower()] #day to be match as number weekday
|
||||
#weekday is chronologically always lower than day
|
||||
delta = today - weekday
|
||||
if delta >= 0:
|
||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
#lunedì = 0 sabato = 6, mar 1 ven 5
|
||||
else:
|
||||
delta += 8
|
||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 5
|
||||
elif l == 5:
|
||||
if date[2] == 'alle':
|
||||
#29 feb alle ore 21:49
|
||||
if len(date[1]) == 3:
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#29 febbraio alle ore 21:49
|
||||
else:
|
||||
day = int(date[0])
|
||||
month = months[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 6
|
||||
elif l == 6:
|
||||
if date[3] == 'alle':
|
||||
#29 feb 2016 alle ore 21:49
|
||||
if len(date[1]) == 3:
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#29 febbraio 2016 alle ore 21:49
|
||||
else:
|
||||
day = int(date[0])
|
||||
month = months[date[1].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# =============================================================================
|
||||
# English - status:beta
|
||||
# =============================================================================
|
||||
elif lang == 'en':
|
||||
months = {
|
||||
'january':1,
|
||||
'february':2,
|
||||
'march':3,
|
||||
'april':4,
|
||||
'may':5,
|
||||
'june':6,
|
||||
'july':7,
|
||||
'august':8,
|
||||
'september':9,
|
||||
'october':10,
|
||||
'november':11,
|
||||
'december':12
|
||||
}
|
||||
|
||||
months_abbr = {
|
||||
'jan':1,
|
||||
'feb':2,
|
||||
'mar':3,
|
||||
'apr':4,
|
||||
'may':5,
|
||||
'jun':6,
|
||||
'jul':7,
|
||||
'aug':8,
|
||||
'sep':9,
|
||||
'oct':10,
|
||||
'nov':11,
|
||||
'dec':12
|
||||
}
|
||||
|
||||
#22h (yesterday)
|
||||
elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
date = init_date[0].split()
|
||||
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
||||
|
||||
l = len(date)
|
||||
|
||||
#yesterday
|
||||
elif date[0].isdigit() == False and date[1].isdigit() == False:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
|
||||
#day with 3 month length of this year
|
||||
elif len(date[1]) == 3 and not(date[2].isdigit()):
|
||||
day = int(date[0])
|
||||
month = mesi_abbr[date[1]]
|
||||
|
||||
elif len(date[1]) > 3 and not(date[2].isdigit()):
|
||||
day = int(date[0])
|
||||
month = mesi[date[1]]
|
||||
|
||||
elif len(date[1]) == 3 and date[2].isdigit():
|
||||
day = int(date[0])
|
||||
month = mesi_abbr[date[1]]
|
||||
year = int(date[2])
|
||||
|
||||
#usual dates, with regular length month
|
||||
elif date[0].isdigit() and date[2].isdigit():
|
||||
day = int(date[0])
|
||||
month = mesi[date[1]]
|
||||
year = int(date[2])
|
||||
|
||||
#dates with weekdays (this function assumes that the month is the same)
|
||||
elif date[0].isdigit() == False and date[1].isdigit() == False:
|
||||
today = datetime.now().weekday() #today as a weekday
|
||||
weekday = giorni[date[0]] #day to be match as number weekday
|
||||
#weekday is chronologically always lower than day
|
||||
if weekday < today:
|
||||
day -= today - weekday
|
||||
elif weekday > today:
|
||||
weekday += 7
|
||||
day -= today - weekday
|
||||
#sanity check
|
||||
if l == 0:
|
||||
return 'Error: no data'
|
||||
|
||||
#Yesterday, Now, 4hr, 50mins
|
||||
elif l == 1:
|
||||
if date[0].isalpha():
|
||||
if date[0].lower() == 'yesterday':
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
#check that yesterday was not in another month
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
elif date[0].lower() == 'now':
|
||||
return datetime(year,month,day).date() #return today
|
||||
else: #not recognized, (return date or init_date)
|
||||
return date
|
||||
else:
|
||||
#4h, 50min (exploit future parsing)
|
||||
l = 2
|
||||
new_date = [x for x in date[0] if x.isdigit()]
|
||||
date[0] = ''.join(new_date)
|
||||
new_date = [x for x in date[0] if not(x.isdigit())]
|
||||
date[1] = ''.join(new_date)
|
||||
# l = 2
|
||||
elif l == 2:
|
||||
#22 min (oggi)
|
||||
if date[1] == 'min' or date[1] == 'mins':
|
||||
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
|
||||
return datetime(year,month,day).date()
|
||||
#22 min (ieri)
|
||||
else:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
#4 h (oggi)
|
||||
elif date[1] == 'hr' or date[1] == 'hrs':
|
||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
||||
return datetime(year,month,day).date()
|
||||
#4 h (ieri)
|
||||
else:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
#2 gen
|
||||
elif len(date[1]) == 3 and date[1].isalpha():
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#2 gennaio
|
||||
elif len(date[1]) > 3 and date[1].isalpha():
|
||||
day = int(date[0])
|
||||
month = months[date[1]]
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 3
|
||||
elif l == 3:
|
||||
# #21 Jun 2017
|
||||
# if len(date[1] == 3) and date[2].isdigit():
|
||||
# day = int(date[0])
|
||||
# month = months_abbr[date[1].lower()]
|
||||
# year = int(date[2])
|
||||
# return datetime(year,month,day).date()
|
||||
# #21 June 2017
|
||||
# elif len(date[1] > 3) and date[2].isdigit():
|
||||
# day = int(date[0])
|
||||
# month = months[date[1].lower()]
|
||||
# year = int(date[2])
|
||||
# return datetime(year,month,day).date()
|
||||
# #parsing failed
|
||||
# else:
|
||||
return date
|
||||
# l = 4
|
||||
elif l == 4:
|
||||
#Ieri alle ore 23:32
|
||||
if date[0].lower() == 'yesteday' and date[1] == 'at':
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 5
|
||||
elif l == 5:
|
||||
if date[2] == 'at':
|
||||
#Jan 29 at 10:00 PM
|
||||
if len(date[0]) == 3:
|
||||
day = int(date[1])
|
||||
month = months_abbr[date[0].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#29 febbraio alle ore 21:49
|
||||
else:
|
||||
day = int(date[1])
|
||||
month = months[date[0].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 6
|
||||
elif l == 6:
|
||||
if date[3] == 'at':
|
||||
date[1]
|
||||
#Aug 25, 2016 at 7:00 PM
|
||||
if len(date[0]) == 3:
|
||||
day = int(date[1][:-1])
|
||||
month = months_abbr[date[0].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#August 25, 2016 at 7:00 PM
|
||||
else:
|
||||
day = int(date[1][:-1])
|
||||
month = months[date[0].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l > 6
|
||||
#parsing failed - l too big
|
||||
else:
|
||||
#date item parser fail. datetime format unknown, check xpath selector or change the language of the interface'
|
||||
return init_date
|
||||
return date
|
||||
#parsing failed - language not supported
|
||||
else:
|
||||
return init_date
|
||||
date = datetime(year,month,day)
|
||||
return date.date()
|
||||
|
||||
def comments_strip(string):
|
||||
return string[0].rstrip(' commenti')
|
||||
|
||||
def reactions_strip(string):
|
||||
friends = 1 + string[0].count(',')
|
||||
e = 1 + string[0].count(' e ')
|
||||
string = string[0].split()[::-1]
|
||||
if len(string) == 1:
|
||||
string = string[0]
|
||||
while string.rfind('.') != -1:
|
||||
string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
|
||||
|
||||
def comments_strip(string, loader_context):
    """Normalize the scraped comment-counter text of a post.

    Used as the output processor of the ``comments`` item field.

    Args:
        string: list of extracted text values; only the first one is read.
        loader_context: loader context mapping; its ``'lang'`` entry selects
            the interface language the text was scraped in.

    Returns:
        * ``'it'``: ``None`` when the text contains ``'Commenta'``,
          otherwise the text with its trailing label characters stripped.
        * ``'en'``: the text with its trailing label characters stripped and
          every ``','`` removed.
        * any other language: the input list, untouched.
    """
    lang = loader_context['lang']

    if lang == 'it':
        text = string[0]
        if 'Commenta' in text:
            return None
        # str.rstrip takes a *set of characters*, not a suffix: every trailing
        # character that occurs in ' commenti' is removed. Digits are not in
        # that set, so stripping stops at the number.
        return text.rstrip(' commenti')

    if lang == 'en':
        # Same character-set stripping as above; since every letter of
        # 'Comment' is in the set, the singular label is stripped as well.
        count = string[0].rstrip(' Comments')
        # Drop the ',' separators: '1,234' -> '1234'.
        return count.replace(',', '')

    # Language not handled here: hand the raw list back.
    return string
|
||||
|
||||
string = string[0]
|
||||
while string.rfind('.') != -1:
|
||||
string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
|
||||
|
||||
if not string.isdigit():
|
||||
return e
|
||||
def reactions_strip(string, loader_context):
    """Normalize the scraped reactions-counter text of a post.

    Used as the output processor of the ``reactions`` item field.

    Args:
        string: list of extracted text values; only the first one is read.
        loader_context: loader context mapping; its ``'lang'`` entry selects
            the interface language the text was scraped in.

    Returns:
        * ``'it'``: for a single-token counter such as ``'19.298.873'`` the
          text with every ``'.'`` removed; otherwise the input list.
        * ``'en'``: for a single-token counter such as ``'19,298,873'`` the
          text with every ``','`` removed; otherwise the first text value
          unchanged.
        * any other language: the input list, untouched.
    """
    lang = loader_context['lang']

    if lang == 'it':
        newstring = string[0]
        # Plain counter, e.g. "19.298.873": drop the '.' separators.
        if len(newstring.split()) == 1:
            return newstring.replace('.', '')
        # Sentence form, e.g. "Pamela, Luigi e altri 4", is not parsed.
        # NOTE(review): this branch returns the original *list*, while the
        # matching 'en' branch returns a *str* -- confirm which is intended.
        return string

    if lang == 'en':
        newstring = string[0]
        # Plain counter, e.g. "19,298,873": drop the ',' separators.
        if len(newstring.split()) == 1:
            return newstring.replace(',', '')
        # Sentence forms, e.g. "Mark and other 254,134" or
        # "Philip and 1K others", are passed through unparsed.
        return newstring

    # Language not handled here: hand the raw list back.
    return string
|
||||
|
||||
def url_strip(url):
    """Trim a scraped post link down to the part that identifies the post.

    Used as the output processor of the ``url`` item field.

    Args:
        url: list of extracted href values; only the first one is read.

    Returns:
        The link cut right after the value of its first ``&id=`` query
        parameter, or the whole link when it has no ``&id=`` parameter.
    """
    fullurl = url[0]
    # Catching '&id=' is enough to identify the post: keep everything up to
    # and including its value, and drop whatever parameters follow.
    head, marker, tail = fullurl.partition('&id=')
    if not marker:
        return fullurl
    return head + marker + tail.split('&', 1)[0]
|
||||
|
||||
class FbcrawlItem(scrapy.Item):
|
||||
# define the fields for your item here like:
|
||||
# name = scrapy.Field()
|
||||
source = scrapy.Field(
|
||||
output_processor=TakeFirst()
|
||||
) # page that published the post
|
||||
@ -153,10 +434,7 @@ class FbcrawlItem(scrapy.Item):
|
||||
|
||||
comments = scrapy.Field(
|
||||
output_processor=comments_strip
|
||||
)
|
||||
commentators = scrapy.Field(
|
||||
output_processor=Join(separator=u'\n')
|
||||
)
|
||||
)
|
||||
|
||||
reactions = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
@ -171,4 +449,6 @@ class FbcrawlItem(scrapy.Item):
|
||||
sigh = scrapy.Field()
|
||||
grrr = scrapy.Field()
|
||||
share = scrapy.Field() # num of shares
|
||||
url = scrapy.Field()
|
||||
url = scrapy.Field(
|
||||
output_processor=url_strip
|
||||
)
|
||||
|
Binary file not shown.
Binary file not shown.
@ -89,7 +89,6 @@ class FacebookSpider(scrapy.Spider):
|
||||
for i in range(len(rispostina)):
|
||||
risp = response.urljoin(rispostina[i].extract())
|
||||
yield scrapy.Request(risp, callback=self.parse_rispostina)
|
||||
|
||||
|
||||
next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
|
||||
if len(next_page) > 0:
|
||||
|
@ -3,6 +3,7 @@ import scrapy
|
||||
from scrapy.loader import ItemLoader
|
||||
from scrapy.http import FormRequest
|
||||
from fbcrawl.items import FbcrawlItem
|
||||
from scrapy.exceptions import CloseSpider
|
||||
|
||||
|
||||
class FacebookSpider(scrapy.Spider):
|
||||
@ -11,37 +12,51 @@ class FacebookSpider(scrapy.Spider):
|
||||
"""
|
||||
name = "fb"
|
||||
|
||||
def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
    """Validate the spider arguments and store them on the instance.

    Args:
        email: login e-mail (required).
        password: login password (required).
        page: page name, or a full www/mbasic/m facebook URL from which
            the leading site prefix is removed (required).
        year: stored as ``self.year`` after conversion with ``int``
            (arguments are passed as strings); must be within 2015..2019.
        lang: interface language code; the default ``'_'`` means "not
            provided" and is resolved later in ``parse_home``.
        **kwargs: forwarded to the parent spider constructor.

    Raises:
        ValueError: if credentials or page are missing, or if the year is
            not a number in the accepted range.
        CloseSpider: if ``lang`` is neither ``'_'`` nor a supported code.
    """
    super(FacebookSpider, self).__init__(**kwargs)

    # email & password need to be passed as attributes!
    if not email or not password:
        raise ValueError("You need to provide valid email and password!")
    self.email = email
    self.password = password

    # Page name parsing (full urls are supported). The prefix is matched at
    # the start of the string and removed by its own length, so a prefix
    # that merely occurs somewhere inside the value cannot mis-slice it.
    if not page:
        raise ValueError("You need to provide a valid page name to crawl!")
    for prefix in ('https://www.facebook.com/',
                   'https://mbasic.facebook.com/',
                   'https://m.facebook.com/'):
        if page.startswith(prefix):
            page = page[len(prefix):]
            break
    self.page = page

    # Parse year. An explicit raise is used because assert statements are
    # removed when Python runs with -O, which would skip the validation.
    year = int(year)
    if not 2015 <= year <= 2019:
        raise ValueError('Year must be a number 2015 <= year <= 2019')
    self.year = year

    # Parse lang; if not provided it will be guessed in parse_home.
    if lang == '_':
        self.logger.info('Language attribute not provided, I will try to guess it')
        self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
        self.lang = lang
    elif lang in ('en', 'es', 'fr', 'it', 'pt'):
        self.lang = lang
    else:
        self.logger.info('Lang "{}" not currently supported'.format(lang))
        self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
        self.logger.info('Change your interface lang from facebook and try again')
        # NOTE(review): CloseSpider is raised here from the constructor,
        # not from a callback -- confirm it is handled as intended.
        raise CloseSpider('Language provided not currently supported')

    self.start_urls = ['https://mbasic.facebook.com']
|
||||
|
||||
def parse(self, response):
|
||||
'''
|
||||
Handle login with provided credentials
|
||||
'''
|
||||
return FormRequest.from_response(
|
||||
response,
|
||||
formxpath='//form[contains(@action, "login")]',
|
||||
@ -51,59 +66,57 @@ class FacebookSpider(scrapy.Spider):
|
||||
|
||||
def parse_home(self, response):
    '''
    Callback for the page received after login. It does three things:
    1) Answers facebook's 'save-device' redirection with 'dont_save'
       and comes back to this same callback
    2) Sets the interface language, when it was not provided
    3) Requests the page to crawl
    '''
    # 1) 'save-device' redirection: submit the form and re-enter parse_home
    save_device_link = response.xpath("//div/a[contains(@href,'save-device')]")
    if save_device_link:
        return FormRequest.from_response(
            response,
            formdata={'name_action_selected': 'dont_save'},
            callback=self.parse_home)

    # 2) language still unknown: probe the page, first matching input wins
    if self.lang == '_':
        language_probes = (
            ("//input[@placeholder='Search Facebook']", 'en'),
            ("//input[@value='Buscar']", 'es'),
            ("//input[@value='Rechercher']", 'fr'),
            ("//input[@value='Cerca']", 'it'),
            ("//input[@value='Pesquisar']", 'pt'),
        )
        for probe, code in language_probes:
            if response.xpath(probe):
                self.lang = code
                break
        else:
            # none of the probes matched
            raise CloseSpider('Language not recognized')

    # 3) navigate to the provided page
    href = response.urljoin(self.page)
    self.logger.info('Parsing facebook page %s', href)
    return scrapy.Request(url=href, callback=self.parse_page)
|
||||
|
||||
def parse_page(self, response):
|
||||
'''
|
||||
Parse the given page selecting the posts.
|
||||
Then ask recursively for another page.
|
||||
'''
|
||||
#select all posts
|
||||
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
|
||||
new = ItemLoader(item=FbcrawlItem(),selector=post)
|
||||
new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
|
||||
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
|
||||
#page_url
|
||||
#new.add_value('url',response.url)
|
||||
new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
|
||||
|
||||
#page_url #new.add_value('url',response.url)
|
||||
#returns full post-link in a list
|
||||
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
|
||||
temp_post = response.urljoin(post[0])
|
||||
yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
|
||||
|
||||
#load following page
|
||||
#next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
|
||||
next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
|
||||
if len(next_page) == 0:
|
||||
if response.meta['flag'] == 4 and self.year <= 2015:
|
||||
@ -148,7 +161,6 @@ class FacebookSpider(scrapy.Spider):
|
||||
new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
|
||||
new.add_xpath('date', '//div/div/abbr/text()')
|
||||
new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
|
||||
new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
|
||||
|
||||
reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
|
||||
reactions = response.urljoin(reactions[0].extract())
|
||||
|
Loading…
Reference in New Issue
Block a user