Adding events crawler

This commit is contained in:
StefanYohansson 2019-06-29 22:01:04 -03:00
parent be8a9c2f5f
commit 5c3128cfef
2 changed files with 171 additions and 104 deletions

View File

@ -8,7 +8,7 @@
import scrapy import scrapy
from scrapy.loader.processors import TakeFirst, Join, MapCompose from scrapy.loader.processors import TakeFirst, Join, MapCompose
from datetime import datetime, timedelta from datetime import datetime, timedelta
def comments_strip(string,loader_context): def comments_strip(string,loader_context):
lang = loader_context['lang'] lang = loader_context['lang']
if lang == 'it': if lang == 'it':
@ -16,7 +16,7 @@ def comments_strip(string,loader_context):
return return
else: else:
return string[0].rstrip(' commenti') return string[0].rstrip(' commenti')
elif lang == 'en': elif lang == 'en':
if(string[0] == 'Share'): if(string[0] == 'Share'):
return '0' return '0'
@ -31,13 +31,13 @@ def reactions_strip(string,loader_context):
lang = loader_context['lang'] lang = loader_context['lang']
if lang == 'it': if lang == 'it':
newstring = string[0] newstring = string[0]
#19.298.873 #19.298.873
if len(newstring.split()) == 1: if len(newstring.split()) == 1:
while newstring.rfind('.') != -1: while newstring.rfind('.') != -1:
newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:] newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
return newstring return newstring
#Pamela, Luigi e altri 4 #Pamela, Luigi e altri 4
else: else:
return string return string
friends = newstring.count(' e ') + newstring.count(',') friends = newstring.count(' e ') + newstring.count(',')
newstring = newstring.split()[::-1][0] newstring = newstring.split()[::-1][0]
@ -46,13 +46,13 @@ def reactions_strip(string,loader_context):
return int(newstring) + friends return int(newstring) + friends
elif lang == 'en': elif lang == 'en':
newstring = string[0] newstring = string[0]
#19,298,873 #19,298,873
if len(newstring.split()) == 1: if len(newstring.split()) == 1:
while newstring.rfind(',') != -1: while newstring.rfind(',') != -1:
newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:] newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
return newstring return newstring
#Mark and other 254,134 #Mark and other 254,134
elif newstring.split()[::-1][1].isdigit(): elif newstring.split()[::-1][1].isdigit():
friends = newstring.count(' and ') + newstring.count(',') friends = newstring.count(' and ') + newstring.count(',')
newstring = newstring.split()[::-1][1] newstring = newstring.split()[::-1][1]
while newstring.rfind(',') != -1: while newstring.rfind(',') != -1:
@ -70,7 +70,7 @@ def url_strip(url):
i = fullurl.find('&id=') i = fullurl.find('&id=')
if i != -1: if i != -1:
return fullurl[:i+4] + fullurl[i+4:].split('&')[0] return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
else: #catch photos else: #catch photos
i = fullurl.find('/photos/') i = fullurl.find('/photos/')
if i != -1: if i != -1:
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0] return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
@ -80,13 +80,13 @@ def url_strip(url):
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0] return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
else: else:
return fullurl return fullurl
def parse_date(date,loader_context): def parse_date(date,loader_context):
import json import json
d = json.loads(date[0]) #nested dict of features d = json.loads(date[0]) #nested dict of features
flat_d = dict() #only retain 'leaves' of d tree flat_d = dict() #only retain 'leaves' of d tree
def recursive_items(dictionary): def recursive_items(dictionary):
''' '''
Get most nested key:value pair of nested dict Get most nested key:value pair of nested dict
@ -138,7 +138,7 @@ def parse_date2(init_date,loader_context):
'ott':10, 'ott':10,
'nov':11, 'nov':11,
'dic':12 'dic':12
} }
giorni = { giorni = {
'lunedì':0, 'lunedì':0,
@ -148,8 +148,8 @@ def parse_date2(init_date,loader_context):
'venerdì':4, 'venerdì':4,
'sabato':5, 'sabato':5,
'domenica':6 'domenica':6
} }
date = init_date[0].split() date = init_date[0].split()
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
@ -161,7 +161,7 @@ def parse_date2(init_date,loader_context):
#adesso, ieri, 4h, 50min #adesso, ieri, 4h, 50min
elif l == 1: elif l == 1:
if date[0].isalpha(): if date[0].isalpha():
if date[0].lower() == 'ieri': if date[0].lower() == 'ieri':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
#check that yesterday was not in another month #check that yesterday was not in another month
@ -169,15 +169,15 @@ def parse_date2(init_date,loader_context):
elif date[0].lower() == 'adesso': elif date[0].lower() == 'adesso':
return datetime(year,month,day).date() #return today return datetime(year,month,day).date() #return today
else: #not recognized, (return date or init_date) else: #not recognized, (return date or init_date)
return date return date
else: else:
#4h, 50min (exploit future parsing) #4h, 50min (exploit future parsing)
l = 2 l = 2
new_date = [x for x in date[0] if x.isdigit()] new_date = [x for x in date[0] if x.isdigit()]
date[0] = ''.join(new_date) date[0] = ''.join(new_date)
new_date = [x for x in date[0] if not(x.isdigit())] new_date = [x for x in date[0] if not(x.isdigit())]
date[1] = ''.join(new_date) date[1] = ''.join(new_date)
# l = 2 # l = 2
elif l == 2: elif l == 2:
#22 min (oggi) #22 min (oggi)
if date[1] == 'min': if date[1] == 'min':
@ -187,7 +187,7 @@ def parse_date2(init_date,loader_context):
else: else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#4 h (oggi) #4 h (oggi)
elif date[1] == 'h': elif date[1] == 'h':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0: if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
@ -196,34 +196,34 @@ def parse_date2(init_date,loader_context):
else: else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#2 gen #2 gen
elif len(date[1]) == 3 and date[1].isalpha(): elif len(date[1]) == 3 and date[1].isalpha():
day = int(date[0]) day = int(date[0])
month = months_abbr[date[1].lower()] month = months_abbr[date[1].lower()]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#2 gennaio #2 gennaio
elif len(date[1]) > 3 and date[1].isalpha(): elif len(date[1]) > 3 and date[1].isalpha():
day = int(date[0]) day = int(date[0])
month = months[date[1]] month = months[date[1]]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#parsing failed #parsing failed
else: else:
return date return date
# l = 3 # l = 3
elif l == 3: elif l == 3:
#21 giu 2017 #21 giu 2017
if len(date[1]) == 3 and date[2].isdigit(): if len(date[1]) == 3 and date[2].isdigit():
day = int(date[0]) day = int(date[0])
month = months_abbr[date[1]] month = months_abbr[date[1]]
year = int(date[2]) year = int(date[2])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#21 giugno 2017 #21 giugno 2017
elif len(date[1]) > 3 and date[2].isdigit(): elif len(date[1]) > 3 and date[2].isdigit():
day = int(date[0]) day = int(date[0])
month = months[date[1]] month = months[date[1]]
year = int(date[2]) year = int(date[2])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#9 ore fa #9 ore fa
elif date[0].isdigit() and date[1][:2] == 'or': elif date[0].isdigit() and date[1][:2] == 'or':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0: if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
@ -232,25 +232,25 @@ def parse_date2(init_date,loader_context):
else: else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#7 minuti fa #7 minuti fa
elif date[0].isdigit() and date[1][:3] == 'min': elif date[0].isdigit() and date[1][:3] == 'min':
return datetime(year,month,day).date() return datetime(year,month,day).date()
#ieri alle 20:45 #ieri alle 20:45
elif date[0].lower() == 'ieri' and date[1] == 'alle': elif date[0].lower() == 'ieri' and date[1] == 'alle':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#oggi alle 11:11 #oggi alle 11:11
elif date[0].lower() == 'oggi' and date[1] == 'alle': elif date[0].lower() == 'oggi' and date[1] == 'alle':
return datetime(year,month,day).date() return datetime(year,month,day).date()
#lunedì alle 12:34 #lunedì alle 12:34
elif date[0].isalpha() and date[1] == 'alle': elif date[0].isalpha() and date[1] == 'alle':
today = datetime.now().weekday() #today as a weekday today = datetime.now().weekday() #today as a weekday
weekday = giorni[date[0].lower()] #day to be match as number weekday weekday = giorni[date[0].lower()] #day to be match as number weekday
#weekday is chronologically always lower than day #weekday is chronologically always lower than day
delta = today - weekday delta = today - weekday
if delta >= 0: if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
@ -270,13 +270,13 @@ def parse_date2(init_date,loader_context):
if date[0].lower() == 'ieri' and date[1] == 'alle': if date[0].lower() == 'ieri' and date[1] == 'alle':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#domenica alle ore 19:29 #domenica alle ore 19:29
elif date[0].isalpha() and date[1] == 'alle': elif date[0].isalpha() and date[1] == 'alle':
today = datetime.now().weekday() #today as a weekday today = datetime.now().weekday() #today as a weekday
weekday = giorni[date[0].lower()] #day to be match as number weekday weekday = giorni[date[0].lower()] #day to be match as number weekday
#weekday is chronologically always lower than day #weekday is chronologically always lower than day
delta = today - weekday delta = today - weekday
if delta >= 0: if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
@ -286,7 +286,7 @@ def parse_date2(init_date,loader_context):
delta += 8 delta += 8
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#parsing failed #parsing failed
else: else:
return date return date
@ -297,16 +297,16 @@ def parse_date2(init_date,loader_context):
if len(date[1]) == 3: if len(date[1]) == 3:
day = int(date[0]) day = int(date[0])
month = months_abbr[date[1].lower()] month = months_abbr[date[1].lower()]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#29 febbraio alle ore 21:49 #29 febbraio alle ore 21:49
else: else:
day = int(date[0]) day = int(date[0])
month = months[date[1].lower()] month = months[date[1].lower()]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#parsing failed #parsing failed
else: else:
return date return date
# l = 6 # l = 6
elif l == 6: elif l == 6:
if date[3] == 'alle': if date[3] == 'alle':
#29 feb 2016 alle ore 21:49 #29 feb 2016 alle ore 21:49
@ -314,14 +314,14 @@ def parse_date2(init_date,loader_context):
day = int(date[0]) day = int(date[0])
month = months_abbr[date[1].lower()] month = months_abbr[date[1].lower()]
year = int(date[2]) year = int(date[2])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#29 febbraio 2016 alle ore 21:49 #29 febbraio 2016 alle ore 21:49
else: else:
day = int(date[0]) day = int(date[0])
month = months[date[1].lower()] month = months[date[1].lower()]
year = int(date[2]) year = int(date[2])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#parsing failed #parsing failed
else: else:
return date return date
# ============================================================================= # =============================================================================
@ -356,7 +356,7 @@ def parse_date2(init_date,loader_context):
'oct':10, 'oct':10,
'nov':11, 'nov':11,
'dec':12 'dec':12
} }
days = { days = {
'monday':0, 'monday':0,
@ -366,7 +366,7 @@ def parse_date2(init_date,loader_context):
'friday':4, 'friday':4,
'saturday':5, 'saturday':5,
'sunday':6 'sunday':6
} }
date = init_date[0].split() date = init_date[0].split()
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
@ -379,7 +379,7 @@ def parse_date2(init_date,loader_context):
#Yesterday, Now, 4hr, 50mins #Yesterday, Now, 4hr, 50mins
elif l == 1: elif l == 1:
if date[0].isalpha(): if date[0].isalpha():
if date[0].lower() == 'yesterday': if date[0].lower() == 'yesterday':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
#check that yesterday was not in another month #check that yesterday was not in another month
@ -387,15 +387,15 @@ def parse_date2(init_date,loader_context):
elif date[0].lower() == 'now': elif date[0].lower() == 'now':
return datetime(year,month,day).date() #return today return datetime(year,month,day).date() #return today
else: #not recognized, (return date or init_date) else: #not recognized, (return date or init_date)
return date return date
else: else:
#4h, 50min (exploit future parsing) #4h, 50min (exploit future parsing)
l = 2 l = 2
new_date = [x for x in date[0] if x.isdigit()] new_date = [x for x in date[0] if x.isdigit()]
date[0] = ''.join(new_date) date[0] = ''.join(new_date)
new_date = [x for x in date[0] if not(x.isdigit())] new_date = [x for x in date[0] if not(x.isdigit())]
date[1] = ''.join(new_date) date[1] = ''.join(new_date)
# l = 2 # l = 2
elif l == 2: elif l == 2:
if date[1] == 'now': if date[1] == 'now':
return datetime(year,month,day).date() return datetime(year,month,day).date()
@ -414,31 +414,31 @@ def parse_date2(init_date,loader_context):
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#4 h (oggi) #4 h (oggi)
else: else:
return datetime(year,month,day).date() return datetime(year,month,day).date()
#2 jan #2 jan
elif len(date[1]) == 3 and date[1].isalpha(): elif len(date[1]) == 3 and date[1].isalpha():
day = int(date[0]) day = int(date[0])
month = months_abbr[date[1].lower()] month = months_abbr[date[1].lower()]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#2 january #2 january
elif len(date[1]) > 3 and date[1].isalpha(): elif len(date[1]) > 3 and date[1].isalpha():
day = int(date[0]) day = int(date[0])
month = months[date[1]] month = months[date[1]]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#jan 2 #jan 2
elif len(date[0]) == 3 and date[0].isalpha(): elif len(date[0]) == 3 and date[0].isalpha():
day = int(date[1]) day = int(date[1])
month = months_abbr[date[0].lower()] month = months_abbr[date[0].lower()]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#january 2 #january 2
elif len(date[0]) > 3 and date[0].isalpha(): elif len(date[0]) > 3 and date[0].isalpha():
day = int(date[1]) day = int(date[1])
month = months[date[0]] month = months[date[0]]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#parsing failed #parsing failed
else: else:
return date return date
@ -452,35 +452,35 @@ def parse_date2(init_date,loader_context):
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
# 5 hours ago (today)
else:
return datetime(year,month,day).date() return datetime(year,month,day).date()
#10 minutes ago # 5 hours ago (today)
else:
return datetime(year,month,day).date()
#10 minutes ago
elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins': elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins':
#22 minutes ago (yesterday) #22 minutes ago (yesterday)
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0: if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#22 minutes ago (today) #22 minutes ago (today)
else: else:
return datetime(year,month,day).date() return datetime(year,month,day).date()
else: else:
return date return date
else: else:
#21 Jun 2017 #21 Jun 2017
if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit(): if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit():
day = int(date[0]) day = int(date[0])
month = months_abbr[date[1].lower()] month = months_abbr[date[1].lower()]
year = int(date[2]) year = int(date[2])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#21 June 2017 #21 June 2017
elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit(): elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit():
day = int(date[0]) day = int(date[0])
month = months[date[1].lower()] month = months[date[1].lower()]
year = int(date[2]) year = int(date[2])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#Jul 11, 2016 #Jul 11, 2016
elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha(): elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha():
day = int(date[1][:-1]) day = int(date[1][:-1])
@ -496,13 +496,13 @@ def parse_date2(init_date,loader_context):
if date[0].lower() == 'yesterday' and date[1] == 'at': if date[0].lower() == 'yesterday' and date[1] == 'at':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#Thursday at 4:27 PM #Thursday at 4:27 PM
elif date[1] == 'at': elif date[1] == 'at':
today = datetime.now().weekday() #today as a weekday today = datetime.now().weekday() #today as a weekday
weekday = days[date[0].lower()] #day to be match as number weekday weekday = days[date[0].lower()] #day to be match as number weekday
#weekday is chronologically always lower than day #weekday is chronologically always lower than day
delta = today - weekday delta = today - weekday
if delta >= 0: if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
@ -519,82 +519,82 @@ def parse_date2(init_date,loader_context):
# l = 5 # l = 5
elif l == 5: elif l == 5:
if date[2] == 'at': if date[2] == 'at':
#Jan 29 at 10:00 PM #Jan 29 at 10:00 PM
if len(date[0]) == 3: if len(date[0]) == 3:
day = int(date[1]) day = int(date[1])
month = months_abbr[date[0].lower()] month = months_abbr[date[0].lower()]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#29 febbraio alle ore 21:49 #29 febbraio alle ore 21:49
else: else:
day = int(date[1]) day = int(date[1])
month = months[date[0].lower()] month = months[date[0].lower()]
return datetime(year,month,day).date() return datetime(year,month,day).date()
#parsing failed #parsing failed
else: else:
return date return date
# l = 6 # l = 6
elif l == 6: elif l == 6:
if date[3] == 'at': if date[3] == 'at':
date[1] date[1]
#Aug 25, 2016 at 7:00 PM #Aug 25, 2016 at 7:00 PM
if len(date[0]) == 3: if len(date[0]) == 3:
day = int(date[1][:-1]) day = int(date[1][:-1])
month = months_abbr[date[0].lower()] month = months_abbr[date[0].lower()]
year = int(date[2]) year = int(date[2])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#August 25, 2016 at 7:00 PM #August 25, 2016 at 7:00 PM
else: else:
day = int(date[1][:-1]) day = int(date[1][:-1])
month = months[date[0].lower()] month = months[date[0].lower()]
year = int(date[2]) year = int(date[2])
return datetime(year,month,day).date() return datetime(year,month,day).date()
#parsing failed #parsing failed
else: else:
return date return date
# l > 6 # l > 6
#parsing failed - l too big #parsing failed - l too big
else: else:
return date return date
#parsing failed - language not supported #parsing failed - language not supported
else: else:
return init_date return init_date
def id_strip(post_id):
    """
    Return the top-level post id, as a string, from extracted JSON blobs.

    post_id is a sequence of JSON strings; only the LAST element is parsed,
    and it must decode to a mapping holding a 'top_level_post_id' key.

    Raises IndexError on an empty sequence, ValueError on malformed JSON and
    KeyError when the key is absent.
    """
    import json
    # Index the last element directly rather than reversing the whole
    # sequence just to read its first item.
    d = json.loads(post_id[-1]) #nested dict of features
    return str(d['top_level_post_id'])
class FbcrawlItem(scrapy.Item): class FbcrawlItem(scrapy.Item):
source = scrapy.Field() source = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
text = scrapy.Field( text = scrapy.Field(
output_processor=Join(separator=u'') output_processor=Join(separator=u'')
) # full text of the post ) # full text of the post
comments = scrapy.Field( comments = scrapy.Field(
output_processor=comments_strip output_processor=comments_strip
) )
reactions = scrapy.Field( reactions = scrapy.Field(
output_processor=reactions_strip output_processor=reactions_strip
) # num of reactions ) # num of reactions
likes = scrapy.Field( likes = scrapy.Field(
output_processor=reactions_strip output_processor=reactions_strip
) )
ahah = scrapy.Field( ahah = scrapy.Field(
output_processor=reactions_strip output_processor=reactions_strip
) )
love = scrapy.Field( love = scrapy.Field(
output_processor=reactions_strip output_processor=reactions_strip
) )
wow = scrapy.Field( wow = scrapy.Field(
output_processor=reactions_strip output_processor=reactions_strip
) )
sigh = scrapy.Field( sigh = scrapy.Field(
output_processor=reactions_strip output_processor=reactions_strip
) )
grrr = scrapy.Field( grrr = scrapy.Field(
output_processor=reactions_strip output_processor=reactions_strip
) )
share = scrapy.Field() # num of shares share = scrapy.Field() # num of shares
url = scrapy.Field( url = scrapy.Field(
output_processor=url_strip output_processor=url_strip
@ -605,11 +605,11 @@ class FbcrawlItem(scrapy.Item):
shared_from = scrapy.Field() shared_from = scrapy.Field()
class CommentsItem(scrapy.Item): class CommentsItem(scrapy.Item):
source = scrapy.Field() source = scrapy.Field()
reply_to=scrapy.Field() reply_to=scrapy.Field()
date = scrapy.Field( # when was the post published date = scrapy.Field( # when was the post published
output_processor=parse_date2 output_processor=parse_date2
) )
text = scrapy.Field( text = scrapy.Field(
output_processor=Join(separator=u'') output_processor=Join(separator=u'')
) # full text of the post ) # full text of the post
@ -618,18 +618,18 @@ class CommentsItem(scrapy.Item):
) # num of reactions ) # num of reactions
likes = scrapy.Field( likes = scrapy.Field(
output_processor=reactions_strip output_processor=reactions_strip
) )
source_url = scrapy.Field() source_url = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
ahah = scrapy.Field() ahah = scrapy.Field()
love = scrapy.Field() love = scrapy.Field()
wow = scrapy.Field() wow = scrapy.Field()
sigh = scrapy.Field() sigh = scrapy.Field()
grrr = scrapy.Field() grrr = scrapy.Field()
share = scrapy.Field() # num of shares share = scrapy.Field() # num of shares
class ProfileItem(scrapy.Item): class ProfileItem(scrapy.Item):
name = scrapy.Field() name = scrapy.Field()
gender = scrapy.Field() gender = scrapy.Field()
birthday = scrapy.Field() birthday = scrapy.Field()
current_city = scrapy.Field() current_city = scrapy.Field()
@ -638,3 +638,12 @@ class ProfileItem(scrapy.Item):
education = scrapy.Field() education = scrapy.Field()
interested_in = scrapy.Field() interested_in = scrapy.Field()
page = scrapy.Field() page = scrapy.Field()
class EventsItem(scrapy.Item):
    """
    Container for one scraped Facebook event.

    All fields are plain scrapy.Field() declarations with no input or
    output processors attached, so values are stored exactly as assigned.
    """
    name = scrapy.Field()         # event title
    location = scrapy.Field()     # presumably the address / detail line of the venue -- confirm against the spider
    where = scrapy.Field()        # presumably the venue name -- confirm against the spider
    photo = scrapy.Field()        # cover image URL
    start_date = scrapy.Field()   # start of the event, as scraped text
    end_date = scrapy.Field()     # end of the event, as scraped text
    description = scrapy.Field()  # free-text description of the event

58
fbcrawl/spiders/events.py Normal file
View File

@ -0,0 +1,58 @@
import scrapy
from scrapy.loader import ItemLoader
from scrapy.exceptions import CloseSpider
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import EventsItem, parse_date, parse_date2
from datetime import datetime
class EventsSpider(FacebookSpider):
    """
    Parse FB events, given a page (needs credentials)

    The spider is started with a mandatory ``page`` keyword argument. It
    requests ``<page>/events``, follows the link found in every row of the
    events table and yields one EventsItem per event page.
    """
    name = "events"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['name','where','location','photo','start_date',
                               'end_date','description'],
        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS' : 1
    }

    # Character separating the start and the end inside the scraped date
    # text. NOTE(review): the separator literal in the original code was
    # empty (str.split('') always raises ValueError); an en dash is assumed
    # here -- confirm against the live markup.
    DATE_RANGE_SEPARATOR = '\u2013'

    def __init__(self, *args, **kwargs):
        """
        Store the target page and delegate everything else to the parent.

        Raises KeyError when the ``page`` keyword argument is not supplied.
        """
        self.page = kwargs['page']
        super().__init__(*args,**kwargs)

    def parse_page(self, response):
        """
        Request the events listing of the configured page.

        The URL is built by joining '<page>/events' onto the response URL;
        the listing is handled by parse_events.
        """
        yield scrapy.Request(url=response.urljoin('%s/events' % self.page),
                             callback=self.parse_events,
                             priority=10,
                             meta={'index':1})

    def parse_events(self, response):
        """
        Follow the event link of every row of the events table.

        Rows without a link are skipped instead of aborting the crawl.
        """
        TABLE_XPATH='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div/div/div[2]/div/table/tbody/tr'
        for event in response.xpath(TABLE_XPATH):
            # The leading '.' keeps the query relative to the current row.
            # A bare '//td/...' is evaluated against the whole document and
            # would return the very same first link for every row.
            url = event.xpath('.//td/div/div/span[3]/div/a[1]/@href').extract_first()
            if url is None:
                continue
            yield response.follow(url, callback=self.parse_event)

    def _split_date_range(self, date):
        """
        Split the scraped date text into a (start_date, end_date) pair.

        Each half is stripped of surrounding whitespace; a missing or empty
        half is returned as None. A missing date yields (None, None).
        """
        if not date:
            return None, None
        start, _, end = date.partition(self.DATE_RANGE_SEPARATOR)
        return start.strip() or None, end.strip() or None

    def parse_event(self, response):
        """
        Extract the details of a single event page and yield an EventsItem.

        Every value is the first match of its XPath, or None when the XPath
        matches nothing.
        """
        EVENT_NAME='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[2]/div[1]/h3/text()'
        EVENT_WHERE='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_LOCATION='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dd/div/text()'
        DATE='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[1]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_DESCRIPTION='/html/body/div/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td/div[2]/div[2]/div[2]/div[2]/text()'
        EVENT_COVER='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[1]/a/img/@src'

        date = response.xpath(DATE).extract_first()
        start_date, end_date = self._split_date_range(date)

        name = response.xpath(EVENT_NAME).extract_first()
        self.logger.info('Parsing event %s', name)

        yield EventsItem(
            name=name,
            where=response.xpath(EVENT_WHERE).extract_first(),
            location=response.xpath(EVENT_LOCATION).extract_first(),
            photo=response.xpath(EVENT_COVER).extract_first(),
            start_date=start_date,
            end_date=end_date,
            description=response.xpath(EVENT_DESCRIPTION).extract_first()
        )