improved support for languages en, es, fr, it, pt
This commit is contained in:
parent
fb32a4213e
commit
a9982865d9
Binary file not shown.
452
fbcrawl/items.py
452
fbcrawl/items.py
@ -11,8 +11,11 @@ from datetime import datetime, timedelta
|
|||||||
|
|
||||||
def parse_date(init_date,loader_context):
|
def parse_date(init_date,loader_context):
|
||||||
lang = loader_context['lang']
|
lang = loader_context['lang']
|
||||||
|
# =============================================================================
|
||||||
|
# Italian - status:final
|
||||||
|
# =============================================================================
|
||||||
if lang == 'it':
|
if lang == 'it':
|
||||||
mesi = {
|
months = {
|
||||||
'gennaio':1,
|
'gennaio':1,
|
||||||
'febbraio':2,
|
'febbraio':2,
|
||||||
'marzo':3,
|
'marzo':3,
|
||||||
@ -27,7 +30,7 @@ def parse_date(init_date,loader_context):
|
|||||||
'dicembre':12
|
'dicembre':12
|
||||||
}
|
}
|
||||||
|
|
||||||
mesi_abbr = {
|
months_abbr = {
|
||||||
'gen':1,
|
'gen':1,
|
||||||
'feb':2,
|
'feb':2,
|
||||||
'mar':3,
|
'mar':3,
|
||||||
@ -43,101 +46,379 @@ def parse_date(init_date,loader_context):
|
|||||||
}
|
}
|
||||||
|
|
||||||
giorni = {
|
giorni = {
|
||||||
'domenica':0,
|
'lunedì':0,
|
||||||
'lunedì':1,
|
'martedì':1,
|
||||||
'martedì':2,
|
'mercoledì':2,
|
||||||
'mercoledì':3,
|
'giovedì':3,
|
||||||
'giovedì':4,
|
'venerdì':4,
|
||||||
'venerdì':5,
|
'sabato':5,
|
||||||
'sabato':6
|
'domenica':6
|
||||||
}
|
}
|
||||||
date = init_date
|
date = init_date[0].split()
|
||||||
date = date[0].split()
|
|
||||||
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
||||||
|
|
||||||
|
l = len(date)
|
||||||
|
|
||||||
#sanity check
|
#sanity check
|
||||||
if len(date) == 0:
|
if l == 0:
|
||||||
return 'Error: no data'
|
return 'Error: no data'
|
||||||
|
|
||||||
#yesterday
|
#adesso, ieri, 4h, 50min
|
||||||
elif len(date) == 1:
|
elif l == 1:
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
if date[0].isalpha():
|
||||||
|
if date[0].lower() == 'ieri':
|
||||||
#4h
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
#check that yesterday was not in another month
|
||||||
pass
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
|
elif date[0].lower() == 'adesso':
|
||||||
|
return datetime(year,month,day).date() #return today
|
||||||
|
else: #not recognized, (return date or init_date)
|
||||||
|
return date
|
||||||
|
else:
|
||||||
|
#4h, 50min (exploit future parsing)
|
||||||
|
l = 2
|
||||||
|
new_date = [x for x in date[0] if x.isdigit()]
|
||||||
|
date[0] = ''.join(new_date)
|
||||||
|
new_date = [x for x in date[0] if not(x.isdigit())]
|
||||||
|
date[1] = ''.join(new_date)
|
||||||
|
# l = 2
|
||||||
|
elif l == 2:
|
||||||
|
#22 min (oggi)
|
||||||
|
if date[1] == 'min':
|
||||||
|
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#22 min (ieri)
|
||||||
|
else:
|
||||||
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#4 h (oggi)
|
||||||
|
elif date[1] == 'h':
|
||||||
|
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#4 h (ieri)
|
||||||
|
else:
|
||||||
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#2 gen
|
||||||
|
elif len(date[1]) == 3 and date[1].isalpha():
|
||||||
|
day = int(date[0])
|
||||||
|
month = months_abbr[date[1].lower()]
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#2 gennaio
|
||||||
|
elif len(date[1]) > 3 and date[1].isalpha():
|
||||||
|
day = int(date[0])
|
||||||
|
month = months[date[1]]
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#parsing failed
|
||||||
|
else:
|
||||||
|
return date
|
||||||
|
# l = 3
|
||||||
|
elif l == 3:
|
||||||
|
#21 giu 2017
|
||||||
|
if len(date[1]) == 3 and date[2].isdigit():
|
||||||
|
day = int(date[0])
|
||||||
|
month = months_abbr[date[1]]
|
||||||
|
year = int(date[2])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#21 giugno 2017
|
||||||
|
elif len(date[1]) > 3 and date[2].isdigit():
|
||||||
|
day = int(date[0])
|
||||||
|
month = months[date[1]]
|
||||||
|
year = int(date[2])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#parsing failed
|
||||||
|
else:
|
||||||
|
return date
|
||||||
|
# l = 4
|
||||||
|
elif l == 4:
|
||||||
|
#Ieri alle ore 23:32
|
||||||
|
if date[0].lower() == 'ieri' and date[1] == 'alle':
|
||||||
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#domenica alle ore 19:29
|
||||||
|
elif date[0].isalpha() and date[1] == 'alle':
|
||||||
|
today = datetime.now().weekday() #today as a weekday
|
||||||
|
weekday = giorni[date[0].lower()] #day to be match as number weekday
|
||||||
|
#weekday is chronologically always lower than day
|
||||||
|
delta = today - weekday
|
||||||
|
if delta >= 0:
|
||||||
|
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||||
|
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#lunedì = 0 sabato = 6, mar 1 ven 5
|
||||||
|
else:
|
||||||
|
delta += 8
|
||||||
|
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||||
|
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#parsing failed
|
||||||
|
else:
|
||||||
|
return date
|
||||||
|
# l = 5
|
||||||
|
elif l == 5:
|
||||||
|
if date[2] == 'alle':
|
||||||
|
#29 feb alle ore 21:49
|
||||||
|
if len(date[1]) == 3:
|
||||||
|
day = int(date[0])
|
||||||
|
month = months_abbr[date[1].lower()]
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#29 febbraio alle ore 21:49
|
||||||
|
else:
|
||||||
|
day = int(date[0])
|
||||||
|
month = months[date[1].lower()]
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#parsing failed
|
||||||
|
else:
|
||||||
|
return date
|
||||||
|
# l = 6
|
||||||
|
elif l == 6:
|
||||||
|
if date[3] == 'alle':
|
||||||
|
#29 feb 2016 alle ore 21:49
|
||||||
|
if len(date[1]) == 3:
|
||||||
|
day = int(date[0])
|
||||||
|
month = months_abbr[date[1].lower()]
|
||||||
|
year = int(date[2])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#29 febbraio 2016 alle ore 21:49
|
||||||
|
else:
|
||||||
|
day = int(date[0])
|
||||||
|
month = months[date[1].lower()]
|
||||||
|
year = int(date[2])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#parsing failed
|
||||||
|
else:
|
||||||
|
return date
|
||||||
|
# =============================================================================
|
||||||
|
# English - status:beta
|
||||||
|
# =============================================================================
|
||||||
|
elif lang == 'en':
|
||||||
|
months = {
|
||||||
|
'january':1,
|
||||||
|
'february':2,
|
||||||
|
'march':3,
|
||||||
|
'april':4,
|
||||||
|
'may':5,
|
||||||
|
'june':6,
|
||||||
|
'july':7,
|
||||||
|
'august':8,
|
||||||
|
'september':9,
|
||||||
|
'october':10,
|
||||||
|
'november':11,
|
||||||
|
'december':12
|
||||||
|
}
|
||||||
|
|
||||||
|
months_abbr = {
|
||||||
|
'jan':1,
|
||||||
|
'feb':2,
|
||||||
|
'mar':3,
|
||||||
|
'apr':4,
|
||||||
|
'may':5,
|
||||||
|
'jun':6,
|
||||||
|
'jul':7,
|
||||||
|
'aug':8,
|
||||||
|
'sep':9,
|
||||||
|
'oct':10,
|
||||||
|
'nov':11,
|
||||||
|
'dec':12
|
||||||
|
}
|
||||||
|
|
||||||
#22h (yesterday)
|
date = init_date[0].split()
|
||||||
elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
|
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
|
||||||
|
l = len(date)
|
||||||
|
|
||||||
#yesterday
|
#sanity check
|
||||||
elif date[0].isdigit() == False and date[1].isdigit() == False:
|
if l == 0:
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
return 'Error: no data'
|
||||||
|
|
||||||
#day with 3 month length of this year
|
#Yesterday, Now, 4hr, 50mins
|
||||||
elif len(date[1]) == 3 and not(date[2].isdigit()):
|
elif l == 1:
|
||||||
day = int(date[0])
|
if date[0].isalpha():
|
||||||
month = mesi_abbr[date[1]]
|
if date[0].lower() == 'yesterday':
|
||||||
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
elif len(date[1]) > 3 and not(date[2].isdigit()):
|
#check that yesterday was not in another month
|
||||||
day = int(date[0])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
month = mesi[date[1]]
|
elif date[0].lower() == 'now':
|
||||||
|
return datetime(year,month,day).date() #return today
|
||||||
elif len(date[1]) == 3 and date[2].isdigit():
|
else: #not recognized, (return date or init_date)
|
||||||
day = int(date[0])
|
return date
|
||||||
month = mesi_abbr[date[1]]
|
else:
|
||||||
year = int(date[2])
|
#4h, 50min (exploit future parsing)
|
||||||
|
l = 2
|
||||||
#usual dates, with regular length month
|
new_date = [x for x in date[0] if x.isdigit()]
|
||||||
elif date[0].isdigit() and date[2].isdigit():
|
date[0] = ''.join(new_date)
|
||||||
day = int(date[0])
|
new_date = [x for x in date[0] if not(x.isdigit())]
|
||||||
month = mesi[date[1]]
|
date[1] = ''.join(new_date)
|
||||||
year = int(date[2])
|
# l = 2
|
||||||
|
elif l == 2:
|
||||||
#dates with weekdays (this function assumes that the month is the same)
|
#22 min (oggi)
|
||||||
elif date[0].isdigit() == False and date[1].isdigit() == False:
|
if date[1] == 'min' or date[1] == 'mins':
|
||||||
today = datetime.now().weekday() #today as a weekday
|
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
|
||||||
weekday = giorni[date[0]] #day to be match as number weekday
|
return datetime(year,month,day).date()
|
||||||
#weekday is chronologically always lower than day
|
#22 min (ieri)
|
||||||
if weekday < today:
|
else:
|
||||||
day -= today - weekday
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
elif weekday > today:
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
weekday += 7
|
return datetime(year,month,day).date()
|
||||||
day -= today - weekday
|
#4 h (oggi)
|
||||||
|
elif date[1] == 'hr' or date[1] == 'hrs':
|
||||||
|
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#4 h (ieri)
|
||||||
|
else:
|
||||||
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#2 gen
|
||||||
|
elif len(date[1]) == 3 and date[1].isalpha():
|
||||||
|
day = int(date[0])
|
||||||
|
month = months_abbr[date[1].lower()]
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#2 gennaio
|
||||||
|
elif len(date[1]) > 3 and date[1].isalpha():
|
||||||
|
day = int(date[0])
|
||||||
|
month = months[date[1]]
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#parsing failed
|
||||||
|
else:
|
||||||
|
return date
|
||||||
|
# l = 3
|
||||||
|
elif l == 3:
|
||||||
|
# #21 Jun 2017
|
||||||
|
# if len(date[1] == 3) and date[2].isdigit():
|
||||||
|
# day = int(date[0])
|
||||||
|
# month = months_abbr[date[1].lower()]
|
||||||
|
# year = int(date[2])
|
||||||
|
# return datetime(year,month,day).date()
|
||||||
|
# #21 June 2017
|
||||||
|
# elif len(date[1] > 3) and date[2].isdigit():
|
||||||
|
# day = int(date[0])
|
||||||
|
# month = months[date[1].lower()]
|
||||||
|
# year = int(date[2])
|
||||||
|
# return datetime(year,month,day).date()
|
||||||
|
# #parsing failed
|
||||||
|
# else:
|
||||||
|
return date
|
||||||
|
# l = 4
|
||||||
|
elif l == 4:
|
||||||
|
#Ieri alle ore 23:32
|
||||||
|
if date[0].lower() == 'yesteday' and date[1] == 'at':
|
||||||
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#parsing failed
|
||||||
|
else:
|
||||||
|
return date
|
||||||
|
# l = 5
|
||||||
|
elif l == 5:
|
||||||
|
if date[2] == 'at':
|
||||||
|
#Jan 29 at 10:00 PM
|
||||||
|
if len(date[0]) == 3:
|
||||||
|
day = int(date[1])
|
||||||
|
month = months_abbr[date[0].lower()]
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#29 febbraio alle ore 21:49
|
||||||
|
else:
|
||||||
|
day = int(date[1])
|
||||||
|
month = months[date[0].lower()]
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#parsing failed
|
||||||
|
else:
|
||||||
|
return date
|
||||||
|
# l = 6
|
||||||
|
elif l == 6:
|
||||||
|
if date[3] == 'at':
|
||||||
|
date[1]
|
||||||
|
#Aug 25, 2016 at 7:00 PM
|
||||||
|
if len(date[0]) == 3:
|
||||||
|
day = int(date[1][:-1])
|
||||||
|
month = months_abbr[date[0].lower()]
|
||||||
|
year = int(date[2])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#August 25, 2016 at 7:00 PM
|
||||||
|
else:
|
||||||
|
day = int(date[1][:-1])
|
||||||
|
month = months[date[0].lower()]
|
||||||
|
year = int(date[2])
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#parsing failed
|
||||||
|
else:
|
||||||
|
return date
|
||||||
|
# l > 6
|
||||||
|
#parsing failed - l too big
|
||||||
else:
|
else:
|
||||||
#date item parser fail. datetime format unknown, check xpath selector or change the language of the interface'
|
return date
|
||||||
return init_date
|
#parsing failed - language not supported
|
||||||
else:
|
else:
|
||||||
return init_date
|
return init_date
|
||||||
date = datetime(year,month,day)
|
|
||||||
return date.date()
|
def comments_strip(string,loader_context):
|
||||||
|
lang = loader_context['lang']
|
||||||
def comments_strip(string):
|
if lang == 'it':
|
||||||
return string[0].rstrip(' commenti')
|
if string[0].rfind('Commenta') != -1:
|
||||||
|
return
|
||||||
def reactions_strip(string):
|
else:
|
||||||
friends = 1 + string[0].count(',')
|
return string[0].rstrip(' commenti')
|
||||||
e = 1 + string[0].count(' e ')
|
|
||||||
string = string[0].split()[::-1]
|
elif lang == 'en':
|
||||||
if len(string) == 1:
|
new_string = string[0].rstrip(' Comments')
|
||||||
string = string[0]
|
while new_string.rfind(',') != -1:
|
||||||
while string.rfind('.') != -1:
|
new_string = new_string[0:new_string.rfind(',')] + new_string[new_string.rfind(',')+1:]
|
||||||
string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
|
return new_string
|
||||||
|
else:
|
||||||
return string
|
return string
|
||||||
|
|
||||||
string = string[0]
|
def reactions_strip(string,loader_context):
|
||||||
while string.rfind('.') != -1:
|
lang = loader_context['lang']
|
||||||
string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
|
if lang == 'it':
|
||||||
|
newstring = string[0]
|
||||||
if not string.isdigit():
|
#19.298.873
|
||||||
return e
|
if len(newstring.split()) == 1:
|
||||||
|
while newstring.rfind('.') != -1:
|
||||||
|
newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
|
||||||
|
return newstring
|
||||||
|
#Pamela, Luigi e altri 4
|
||||||
|
else:
|
||||||
|
return string
|
||||||
|
# friends = newstring.count(' e ') + newstring.count(',')
|
||||||
|
# newstring = newstring.split()[::-1][0]
|
||||||
|
# while newstring.rfind('.') != -1:
|
||||||
|
# newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
|
||||||
|
# return int(newstring) + friends
|
||||||
|
elif lang == 'en':
|
||||||
|
newstring = string[0]
|
||||||
|
#19,298,873
|
||||||
|
if len(newstring.split()) == 1:
|
||||||
|
while newstring.rfind(',') != -1:
|
||||||
|
newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
|
||||||
|
return newstring
|
||||||
|
# #Mark and other 254,134
|
||||||
|
# elif newstring.split()[::-1][1].isdigit():
|
||||||
|
# friends = newstring.count(' and ') + newstring.count(',')
|
||||||
|
# newstring = newstring.split()[::-1][1]
|
||||||
|
# while newstring.rfind(',') != -1:
|
||||||
|
# newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
|
||||||
|
# return int(newstring) + friends
|
||||||
|
# #Philip and 1K others
|
||||||
|
else:
|
||||||
|
return newstring
|
||||||
else:
|
else:
|
||||||
return int(string) + friends
|
return string
|
||||||
|
|
||||||
|
def url_strip(url):
|
||||||
|
fullurl = url[0]
|
||||||
|
#catchin '&id=' is enough to identify the post
|
||||||
|
i = fullurl.find('&id=')
|
||||||
|
if i != -1:
|
||||||
|
j = fullurl[:i+4] + fullurl[i+4:].split('&')[0]
|
||||||
|
return j
|
||||||
|
else:
|
||||||
|
return fullurl
|
||||||
|
|
||||||
class FbcrawlItem(scrapy.Item):
|
class FbcrawlItem(scrapy.Item):
|
||||||
# define the fields for your item here like:
|
|
||||||
# name = scrapy.Field()
|
|
||||||
source = scrapy.Field(
|
source = scrapy.Field(
|
||||||
output_processor=TakeFirst()
|
output_processor=TakeFirst()
|
||||||
) # page that published the post
|
) # page that published the post
|
||||||
@ -153,10 +434,7 @@ class FbcrawlItem(scrapy.Item):
|
|||||||
|
|
||||||
comments = scrapy.Field(
|
comments = scrapy.Field(
|
||||||
output_processor=comments_strip
|
output_processor=comments_strip
|
||||||
)
|
)
|
||||||
commentators = scrapy.Field(
|
|
||||||
output_processor=Join(separator=u'\n')
|
|
||||||
)
|
|
||||||
|
|
||||||
reactions = scrapy.Field(
|
reactions = scrapy.Field(
|
||||||
output_processor=reactions_strip
|
output_processor=reactions_strip
|
||||||
@ -171,4 +449,6 @@ class FbcrawlItem(scrapy.Item):
|
|||||||
sigh = scrapy.Field()
|
sigh = scrapy.Field()
|
||||||
grrr = scrapy.Field()
|
grrr = scrapy.Field()
|
||||||
share = scrapy.Field() # num of shares
|
share = scrapy.Field() # num of shares
|
||||||
url = scrapy.Field()
|
url = scrapy.Field(
|
||||||
|
output_processor=url_strip
|
||||||
|
)
|
||||||
|
Binary file not shown.
Binary file not shown.
@ -89,7 +89,6 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
for i in range(len(rispostina)):
|
for i in range(len(rispostina)):
|
||||||
risp = response.urljoin(rispostina[i].extract())
|
risp = response.urljoin(rispostina[i].extract())
|
||||||
yield scrapy.Request(risp, callback=self.parse_rispostina)
|
yield scrapy.Request(risp, callback=self.parse_rispostina)
|
||||||
|
|
||||||
|
|
||||||
next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
|
next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
|
||||||
if len(next_page) > 0:
|
if len(next_page) > 0:
|
||||||
|
@ -3,6 +3,7 @@ import scrapy
|
|||||||
from scrapy.loader import ItemLoader
|
from scrapy.loader import ItemLoader
|
||||||
from scrapy.http import FormRequest
|
from scrapy.http import FormRequest
|
||||||
from fbcrawl.items import FbcrawlItem
|
from fbcrawl.items import FbcrawlItem
|
||||||
|
from scrapy.exceptions import CloseSpider
|
||||||
|
|
||||||
|
|
||||||
class FacebookSpider(scrapy.Spider):
|
class FacebookSpider(scrapy.Spider):
|
||||||
@ -11,37 +12,51 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
"""
|
"""
|
||||||
name = "fb"
|
name = "fb"
|
||||||
|
|
||||||
def __init__(self, email='', password='', page='', year=2018, lang='', **kwargs):
|
def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
|
||||||
super(FacebookSpider, self).__init__(**kwargs)
|
super(FacebookSpider, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.year = int(year) #arguments are passed as strings
|
#email & pass need to be passed as attributes!
|
||||||
|
|
||||||
if not email or not password:
|
if not email or not password:
|
||||||
raise ValueError("You need to provide valid email and password!")
|
raise ValueError("You need to provide valid email and password!")
|
||||||
else:
|
else:
|
||||||
self.email = email
|
self.email = email
|
||||||
self.password = password
|
self.password = password
|
||||||
|
|
||||||
|
#page name parsing (added support for full urls)
|
||||||
if not page:
|
if not page:
|
||||||
raise ValueError("You need to provide a valid page name to crawl!")
|
raise ValueError("You need to provide a valid page name to crawl!")
|
||||||
|
elif page.find('https://www.facebook.com/') != -1:
|
||||||
|
self.page = page[25:]
|
||||||
|
elif page.find('https://mbasic.facebook.com/') != -1:
|
||||||
|
self.page = page[28:]
|
||||||
|
elif page.find('https://m.facebook.com/') != -1:
|
||||||
|
self.page = page[23:]
|
||||||
else:
|
else:
|
||||||
self.page = page
|
self.page = page
|
||||||
|
|
||||||
if not(lang):
|
#parse year
|
||||||
self.logger.info('Language attribute not provided, assuming "en"')
|
assert int(year) <= 2019 and int(year) >= 2015, 'Year must be a number 2015 <= year <= 2019'
|
||||||
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
|
self.year = int(year) #arguments are passed as strings
|
||||||
self.lang = 'en'
|
|
||||||
|
#parse lang, if not provided (but is supported) it will be guessed in parse_home
|
||||||
|
if lang=='_':
|
||||||
|
self.logger.info('Language attribute not provided, I will try to guess it')
|
||||||
|
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
|
||||||
|
self.lang=lang
|
||||||
elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
|
elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
else:
|
else:
|
||||||
self.logger.info('Lang:{} not currently supported'.format(lang))
|
self.logger.info('Lang "{}" not currently supported'.format(lang))
|
||||||
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
|
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
|
||||||
self.logger.info('Change your interface lang from facebook and try again')
|
self.logger.info('Change your interface lang from facebook and try again')
|
||||||
return
|
raise CloseSpider('Language provided not currently supported')
|
||||||
|
|
||||||
self.start_urls = ['https://mbasic.facebook.com']
|
self.start_urls = ['https://mbasic.facebook.com']
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
|
'''
|
||||||
|
Handle login with provided credentials
|
||||||
|
'''
|
||||||
return FormRequest.from_response(
|
return FormRequest.from_response(
|
||||||
response,
|
response,
|
||||||
formxpath='//form[contains(@action, "login")]',
|
formxpath='//form[contains(@action, "login")]',
|
||||||
@ -51,59 +66,57 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
|
|
||||||
def parse_home(self, response):
|
def parse_home(self, response):
|
||||||
'''
|
'''
|
||||||
Parse user news feed page. This code is outdate and needs review.
|
This method has multiple purposes:
|
||||||
|
1) Handle failed logins due to facebook 'save-device' redirection
|
||||||
|
2) Set language interface, if not already provided
|
||||||
|
3) Navigate to given page
|
||||||
'''
|
'''
|
||||||
if response.css('#approvals_code'):
|
#handle 'save-device' redirection
|
||||||
# Handle 'Approvals Code' checkpoint (ask user to enter code).
|
if response.xpath("//div/a[contains(@href,'save-device')]"):
|
||||||
if not self.code:
|
|
||||||
# Show facebook messages via logs
|
|
||||||
# and request user for approval code.
|
|
||||||
message = response.css('._50f4::text').extract()[0]
|
|
||||||
self.log(message)
|
|
||||||
message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
|
|
||||||
self.log(message)
|
|
||||||
self.code = input('Enter the code: ')
|
|
||||||
self.code = str(self.code)
|
|
||||||
if not (self.code and self.code.isdigit()):
|
|
||||||
self.log('Bad approvals code detected.')
|
|
||||||
return
|
|
||||||
return FormRequest.from_response(
|
|
||||||
response,
|
|
||||||
formdata={'approvals_code': self.code},
|
|
||||||
callback=self.parse_home)
|
|
||||||
elif response.xpath("//div/a[contains(@href,'save-device')]"):
|
|
||||||
# elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
|
|
||||||
# Handle 'Save Browser' checkpoint.
|
|
||||||
return FormRequest.from_response(
|
return FormRequest.from_response(
|
||||||
response,
|
response,
|
||||||
formdata={'name_action_selected': 'dont_save'},
|
formdata={'name_action_selected': 'dont_save'},
|
||||||
callback=self.parse_home)
|
callback=self.parse_home)
|
||||||
elif response.css('button#checkpointSubmitButton'):
|
|
||||||
# Handle 'Someone tried to log into your account' warning.
|
#set language interface
|
||||||
return FormRequest.from_response(
|
if self.lang == '_':
|
||||||
response, callback=self.parse_home)
|
if response.xpath("//input[@placeholder='Search Facebook']"):
|
||||||
# Else go to the page requested.
|
self.lang = 'en'
|
||||||
if self.page.find('https://www.facebook.com/') != -1:
|
elif response.xpath("//input[@value='Buscar']"):
|
||||||
self.page = self.page[25:]
|
self.lang = 'es'
|
||||||
|
elif response.xpath("//input[@value='Rechercher']"):
|
||||||
|
self.lang = 'fr'
|
||||||
|
elif response.xpath("//input[@value='Cerca']"):
|
||||||
|
self.lang = 'it'
|
||||||
|
elif response.xpath("//input[@value='Pesquisar']"):
|
||||||
|
self.lang = 'pt'
|
||||||
|
else:
|
||||||
|
raise CloseSpider('Language not recognized')
|
||||||
|
|
||||||
|
#navigate to provided page
|
||||||
href = response.urljoin(self.page)
|
href = response.urljoin(self.page)
|
||||||
self.logger.info('Parse function called on %s', href)
|
self.logger.info('Parsing facebook page %s', href)
|
||||||
return scrapy.Request(url=href,callback=self.parse_page)
|
return scrapy.Request(url=href,callback=self.parse_page)
|
||||||
|
|
||||||
def parse_page(self, response):
|
def parse_page(self, response):
|
||||||
|
'''
|
||||||
|
Parse the given page selecting the posts.
|
||||||
|
Then ask recursively for another page.
|
||||||
|
'''
|
||||||
#select all posts
|
#select all posts
|
||||||
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
|
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
|
||||||
new = ItemLoader(item=FbcrawlItem(),selector=post)
|
new = ItemLoader(item=FbcrawlItem(),selector=post)
|
||||||
new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
|
new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
|
||||||
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
|
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
|
||||||
#page_url
|
new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
|
||||||
#new.add_value('url',response.url)
|
|
||||||
|
#page_url #new.add_value('url',response.url)
|
||||||
#returns full post-link in a list
|
#returns full post-link in a list
|
||||||
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
|
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
|
||||||
temp_post = response.urljoin(post[0])
|
temp_post = response.urljoin(post[0])
|
||||||
yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
|
yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
|
||||||
|
|
||||||
#load following page
|
#load following page
|
||||||
#next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
|
|
||||||
next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
|
next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
|
||||||
if len(next_page) == 0:
|
if len(next_page) == 0:
|
||||||
if response.meta['flag'] == 4 and self.year <= 2015:
|
if response.meta['flag'] == 4 and self.year <= 2015:
|
||||||
@ -148,7 +161,6 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
|
new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
|
||||||
new.add_xpath('date', '//div/div/abbr/text()')
|
new.add_xpath('date', '//div/div/abbr/text()')
|
||||||
new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
|
new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
|
||||||
new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
|
|
||||||
|
|
||||||
reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
|
reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
|
||||||
reactions = response.urljoin(reactions[0].extract())
|
reactions = response.urljoin(reactions[0].extract())
|
||||||
|
Loading…
Reference in New Issue
Block a user