Adding events crawler
This commit is contained in:
parent
be8a9c2f5f
commit
5c3128cfef
217
fbcrawl/items.py
217
fbcrawl/items.py
@ -8,7 +8,7 @@
|
|||||||
import scrapy
|
import scrapy
|
||||||
from scrapy.loader.processors import TakeFirst, Join, MapCompose
|
from scrapy.loader.processors import TakeFirst, Join, MapCompose
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
def comments_strip(string,loader_context):
|
def comments_strip(string,loader_context):
|
||||||
lang = loader_context['lang']
|
lang = loader_context['lang']
|
||||||
if lang == 'it':
|
if lang == 'it':
|
||||||
@ -16,7 +16,7 @@ def comments_strip(string,loader_context):
|
|||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
return string[0].rstrip(' commenti')
|
return string[0].rstrip(' commenti')
|
||||||
|
|
||||||
elif lang == 'en':
|
elif lang == 'en':
|
||||||
if(string[0] == 'Share'):
|
if(string[0] == 'Share'):
|
||||||
return '0'
|
return '0'
|
||||||
@ -31,13 +31,13 @@ def reactions_strip(string,loader_context):
|
|||||||
lang = loader_context['lang']
|
lang = loader_context['lang']
|
||||||
if lang == 'it':
|
if lang == 'it':
|
||||||
newstring = string[0]
|
newstring = string[0]
|
||||||
#19.298.873
|
#19.298.873
|
||||||
if len(newstring.split()) == 1:
|
if len(newstring.split()) == 1:
|
||||||
while newstring.rfind('.') != -1:
|
while newstring.rfind('.') != -1:
|
||||||
newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
|
newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
|
||||||
return newstring
|
return newstring
|
||||||
#Pamela, Luigi e altri 4
|
#Pamela, Luigi e altri 4
|
||||||
else:
|
else:
|
||||||
return string
|
return string
|
||||||
friends = newstring.count(' e ') + newstring.count(',')
|
friends = newstring.count(' e ') + newstring.count(',')
|
||||||
newstring = newstring.split()[::-1][0]
|
newstring = newstring.split()[::-1][0]
|
||||||
@ -46,13 +46,13 @@ def reactions_strip(string,loader_context):
|
|||||||
return int(newstring) + friends
|
return int(newstring) + friends
|
||||||
elif lang == 'en':
|
elif lang == 'en':
|
||||||
newstring = string[0]
|
newstring = string[0]
|
||||||
#19,298,873
|
#19,298,873
|
||||||
if len(newstring.split()) == 1:
|
if len(newstring.split()) == 1:
|
||||||
while newstring.rfind(',') != -1:
|
while newstring.rfind(',') != -1:
|
||||||
newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
|
newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
|
||||||
return newstring
|
return newstring
|
||||||
#Mark and other 254,134
|
#Mark and other 254,134
|
||||||
elif newstring.split()[::-1][1].isdigit():
|
elif newstring.split()[::-1][1].isdigit():
|
||||||
friends = newstring.count(' and ') + newstring.count(',')
|
friends = newstring.count(' and ') + newstring.count(',')
|
||||||
newstring = newstring.split()[::-1][1]
|
newstring = newstring.split()[::-1][1]
|
||||||
while newstring.rfind(',') != -1:
|
while newstring.rfind(',') != -1:
|
||||||
@ -70,7 +70,7 @@ def url_strip(url):
|
|||||||
i = fullurl.find('&id=')
|
i = fullurl.find('&id=')
|
||||||
if i != -1:
|
if i != -1:
|
||||||
return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
|
return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
|
||||||
else: #catch photos
|
else: #catch photos
|
||||||
i = fullurl.find('/photos/')
|
i = fullurl.find('/photos/')
|
||||||
if i != -1:
|
if i != -1:
|
||||||
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
|
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
|
||||||
@ -80,13 +80,13 @@ def url_strip(url):
|
|||||||
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
|
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
|
||||||
else:
|
else:
|
||||||
return fullurl
|
return fullurl
|
||||||
|
|
||||||
def parse_date(date,loader_context):
|
def parse_date(date,loader_context):
|
||||||
import json
|
import json
|
||||||
|
|
||||||
d = json.loads(date[0]) #nested dict of features
|
d = json.loads(date[0]) #nested dict of features
|
||||||
flat_d = dict() #only retain 'leaves' of d tree
|
flat_d = dict() #only retain 'leaves' of d tree
|
||||||
|
|
||||||
def recursive_items(dictionary):
|
def recursive_items(dictionary):
|
||||||
'''
|
'''
|
||||||
Get most nested key:value pair of nested dict
|
Get most nested key:value pair of nested dict
|
||||||
@ -138,7 +138,7 @@ def parse_date2(init_date,loader_context):
|
|||||||
'ott':10,
|
'ott':10,
|
||||||
'nov':11,
|
'nov':11,
|
||||||
'dic':12
|
'dic':12
|
||||||
}
|
}
|
||||||
|
|
||||||
giorni = {
|
giorni = {
|
||||||
'lunedì':0,
|
'lunedì':0,
|
||||||
@ -148,8 +148,8 @@ def parse_date2(init_date,loader_context):
|
|||||||
'venerdì':4,
|
'venerdì':4,
|
||||||
'sabato':5,
|
'sabato':5,
|
||||||
'domenica':6
|
'domenica':6
|
||||||
}
|
}
|
||||||
|
|
||||||
date = init_date[0].split()
|
date = init_date[0].split()
|
||||||
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
||||||
|
|
||||||
@ -161,7 +161,7 @@ def parse_date2(init_date,loader_context):
|
|||||||
|
|
||||||
#adesso, ieri, 4h, 50min
|
#adesso, ieri, 4h, 50min
|
||||||
elif l == 1:
|
elif l == 1:
|
||||||
if date[0].isalpha():
|
if date[0].isalpha():
|
||||||
if date[0].lower() == 'ieri':
|
if date[0].lower() == 'ieri':
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
#check that yesterday was not in another month
|
#check that yesterday was not in another month
|
||||||
@ -169,15 +169,15 @@ def parse_date2(init_date,loader_context):
|
|||||||
elif date[0].lower() == 'adesso':
|
elif date[0].lower() == 'adesso':
|
||||||
return datetime(year,month,day).date() #return today
|
return datetime(year,month,day).date() #return today
|
||||||
else: #not recognized, (return date or init_date)
|
else: #not recognized, (return date or init_date)
|
||||||
return date
|
return date
|
||||||
else:
|
else:
|
||||||
#4h, 50min (exploit future parsing)
|
#4h, 50min (exploit future parsing)
|
||||||
l = 2
|
l = 2
|
||||||
new_date = [x for x in date[0] if x.isdigit()]
|
new_date = [x for x in date[0] if x.isdigit()]
|
||||||
date[0] = ''.join(new_date)
|
date[0] = ''.join(new_date)
|
||||||
new_date = [x for x in date[0] if not(x.isdigit())]
|
new_date = [x for x in date[0] if not(x.isdigit())]
|
||||||
date[1] = ''.join(new_date)
|
date[1] = ''.join(new_date)
|
||||||
# l = 2
|
# l = 2
|
||||||
elif l == 2:
|
elif l == 2:
|
||||||
#22 min (oggi)
|
#22 min (oggi)
|
||||||
if date[1] == 'min':
|
if date[1] == 'min':
|
||||||
@ -187,7 +187,7 @@ def parse_date2(init_date,loader_context):
|
|||||||
else:
|
else:
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#4 h (oggi)
|
#4 h (oggi)
|
||||||
elif date[1] == 'h':
|
elif date[1] == 'h':
|
||||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
||||||
@ -196,34 +196,34 @@ def parse_date2(init_date,loader_context):
|
|||||||
else:
|
else:
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#2 gen
|
#2 gen
|
||||||
elif len(date[1]) == 3 and date[1].isalpha():
|
elif len(date[1]) == 3 and date[1].isalpha():
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months_abbr[date[1].lower()]
|
month = months_abbr[date[1].lower()]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#2 gennaio
|
#2 gennaio
|
||||||
elif len(date[1]) > 3 and date[1].isalpha():
|
elif len(date[1]) > 3 and date[1].isalpha():
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months[date[1]]
|
month = months[date[1]]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#parsing failed
|
#parsing failed
|
||||||
else:
|
else:
|
||||||
return date
|
return date
|
||||||
# l = 3
|
# l = 3
|
||||||
elif l == 3:
|
elif l == 3:
|
||||||
#21 giu 2017
|
#21 giu 2017
|
||||||
if len(date[1]) == 3 and date[2].isdigit():
|
if len(date[1]) == 3 and date[2].isdigit():
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months_abbr[date[1]]
|
month = months_abbr[date[1]]
|
||||||
year = int(date[2])
|
year = int(date[2])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#21 giugno 2017
|
#21 giugno 2017
|
||||||
elif len(date[1]) > 3 and date[2].isdigit():
|
elif len(date[1]) > 3 and date[2].isdigit():
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months[date[1]]
|
month = months[date[1]]
|
||||||
year = int(date[2])
|
year = int(date[2])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#9 ore fa
|
#9 ore fa
|
||||||
elif date[0].isdigit() and date[1][:2] == 'or':
|
elif date[0].isdigit() and date[1][:2] == 'or':
|
||||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
||||||
@ -232,25 +232,25 @@ def parse_date2(init_date,loader_context):
|
|||||||
else:
|
else:
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#7 minuti fa
|
#7 minuti fa
|
||||||
elif date[0].isdigit() and date[1][:3] == 'min':
|
elif date[0].isdigit() and date[1][:3] == 'min':
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
|
|
||||||
#ieri alle 20:45
|
#ieri alle 20:45
|
||||||
elif date[0].lower() == 'ieri' and date[1] == 'alle':
|
elif date[0].lower() == 'ieri' and date[1] == 'alle':
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#oggi alle 11:11
|
#oggi alle 11:11
|
||||||
elif date[0].lower() == 'oggi' and date[1] == 'alle':
|
elif date[0].lower() == 'oggi' and date[1] == 'alle':
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#lunedì alle 12:34
|
#lunedì alle 12:34
|
||||||
elif date[0].isalpha() and date[1] == 'alle':
|
elif date[0].isalpha() and date[1] == 'alle':
|
||||||
today = datetime.now().weekday() #today as a weekday
|
today = datetime.now().weekday() #today as a weekday
|
||||||
weekday = giorni[date[0].lower()] #day to be match as number weekday
|
weekday = giorni[date[0].lower()] #day to be match as number weekday
|
||||||
#weekday is chronologically always lower than day
|
#weekday is chronologically always lower than day
|
||||||
delta = today - weekday
|
delta = today - weekday
|
||||||
if delta >= 0:
|
if delta >= 0:
|
||||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||||
@ -270,13 +270,13 @@ def parse_date2(init_date,loader_context):
|
|||||||
if date[0].lower() == 'ieri' and date[1] == 'alle':
|
if date[0].lower() == 'ieri' and date[1] == 'alle':
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#domenica alle ore 19:29
|
#domenica alle ore 19:29
|
||||||
elif date[0].isalpha() and date[1] == 'alle':
|
elif date[0].isalpha() and date[1] == 'alle':
|
||||||
today = datetime.now().weekday() #today as a weekday
|
today = datetime.now().weekday() #today as a weekday
|
||||||
weekday = giorni[date[0].lower()] #day to be match as number weekday
|
weekday = giorni[date[0].lower()] #day to be match as number weekday
|
||||||
#weekday is chronologically always lower than day
|
#weekday is chronologically always lower than day
|
||||||
delta = today - weekday
|
delta = today - weekday
|
||||||
if delta >= 0:
|
if delta >= 0:
|
||||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||||
@ -286,7 +286,7 @@ def parse_date2(init_date,loader_context):
|
|||||||
delta += 8
|
delta += 8
|
||||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#parsing failed
|
#parsing failed
|
||||||
else:
|
else:
|
||||||
return date
|
return date
|
||||||
@ -297,16 +297,16 @@ def parse_date2(init_date,loader_context):
|
|||||||
if len(date[1]) == 3:
|
if len(date[1]) == 3:
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months_abbr[date[1].lower()]
|
month = months_abbr[date[1].lower()]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#29 febbraio alle ore 21:49
|
#29 febbraio alle ore 21:49
|
||||||
else:
|
else:
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months[date[1].lower()]
|
month = months[date[1].lower()]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#parsing failed
|
#parsing failed
|
||||||
else:
|
else:
|
||||||
return date
|
return date
|
||||||
# l = 6
|
# l = 6
|
||||||
elif l == 6:
|
elif l == 6:
|
||||||
if date[3] == 'alle':
|
if date[3] == 'alle':
|
||||||
#29 feb 2016 alle ore 21:49
|
#29 feb 2016 alle ore 21:49
|
||||||
@ -314,14 +314,14 @@ def parse_date2(init_date,loader_context):
|
|||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months_abbr[date[1].lower()]
|
month = months_abbr[date[1].lower()]
|
||||||
year = int(date[2])
|
year = int(date[2])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#29 febbraio 2016 alle ore 21:49
|
#29 febbraio 2016 alle ore 21:49
|
||||||
else:
|
else:
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months[date[1].lower()]
|
month = months[date[1].lower()]
|
||||||
year = int(date[2])
|
year = int(date[2])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#parsing failed
|
#parsing failed
|
||||||
else:
|
else:
|
||||||
return date
|
return date
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@ -356,7 +356,7 @@ def parse_date2(init_date,loader_context):
|
|||||||
'oct':10,
|
'oct':10,
|
||||||
'nov':11,
|
'nov':11,
|
||||||
'dec':12
|
'dec':12
|
||||||
}
|
}
|
||||||
|
|
||||||
days = {
|
days = {
|
||||||
'monday':0,
|
'monday':0,
|
||||||
@ -366,7 +366,7 @@ def parse_date2(init_date,loader_context):
|
|||||||
'friday':4,
|
'friday':4,
|
||||||
'saturday':5,
|
'saturday':5,
|
||||||
'sunday':6
|
'sunday':6
|
||||||
}
|
}
|
||||||
|
|
||||||
date = init_date[0].split()
|
date = init_date[0].split()
|
||||||
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
||||||
@ -379,7 +379,7 @@ def parse_date2(init_date,loader_context):
|
|||||||
|
|
||||||
#Yesterday, Now, 4hr, 50mins
|
#Yesterday, Now, 4hr, 50mins
|
||||||
elif l == 1:
|
elif l == 1:
|
||||||
if date[0].isalpha():
|
if date[0].isalpha():
|
||||||
if date[0].lower() == 'yesterday':
|
if date[0].lower() == 'yesterday':
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
#check that yesterday was not in another month
|
#check that yesterday was not in another month
|
||||||
@ -387,15 +387,15 @@ def parse_date2(init_date,loader_context):
|
|||||||
elif date[0].lower() == 'now':
|
elif date[0].lower() == 'now':
|
||||||
return datetime(year,month,day).date() #return today
|
return datetime(year,month,day).date() #return today
|
||||||
else: #not recognized, (return date or init_date)
|
else: #not recognized, (return date or init_date)
|
||||||
return date
|
return date
|
||||||
else:
|
else:
|
||||||
#4h, 50min (exploit future parsing)
|
#4h, 50min (exploit future parsing)
|
||||||
l = 2
|
l = 2
|
||||||
new_date = [x for x in date[0] if x.isdigit()]
|
new_date = [x for x in date[0] if x.isdigit()]
|
||||||
date[0] = ''.join(new_date)
|
date[0] = ''.join(new_date)
|
||||||
new_date = [x for x in date[0] if not(x.isdigit())]
|
new_date = [x for x in date[0] if not(x.isdigit())]
|
||||||
date[1] = ''.join(new_date)
|
date[1] = ''.join(new_date)
|
||||||
# l = 2
|
# l = 2
|
||||||
elif l == 2:
|
elif l == 2:
|
||||||
if date[1] == 'now':
|
if date[1] == 'now':
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
@ -414,31 +414,31 @@ def parse_date2(init_date,loader_context):
|
|||||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
|
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#4 h (oggi)
|
#4 h (oggi)
|
||||||
else:
|
else:
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
|
|
||||||
#2 jan
|
#2 jan
|
||||||
elif len(date[1]) == 3 and date[1].isalpha():
|
elif len(date[1]) == 3 and date[1].isalpha():
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months_abbr[date[1].lower()]
|
month = months_abbr[date[1].lower()]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#2 january
|
#2 january
|
||||||
elif len(date[1]) > 3 and date[1].isalpha():
|
elif len(date[1]) > 3 and date[1].isalpha():
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months[date[1]]
|
month = months[date[1]]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#jan 2
|
#jan 2
|
||||||
elif len(date[0]) == 3 and date[0].isalpha():
|
elif len(date[0]) == 3 and date[0].isalpha():
|
||||||
day = int(date[1])
|
day = int(date[1])
|
||||||
month = months_abbr[date[0].lower()]
|
month = months_abbr[date[0].lower()]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#january 2
|
#january 2
|
||||||
elif len(date[0]) > 3 and date[0].isalpha():
|
elif len(date[0]) > 3 and date[0].isalpha():
|
||||||
day = int(date[1])
|
day = int(date[1])
|
||||||
month = months[date[0]]
|
month = months[date[0]]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#parsing failed
|
#parsing failed
|
||||||
else:
|
else:
|
||||||
return date
|
return date
|
||||||
@ -452,35 +452,35 @@ def parse_date2(init_date,loader_context):
|
|||||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
|
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
|
||||||
# 5 hours ago (today)
|
|
||||||
else:
|
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#10 minutes ago
|
# 5 hours ago (today)
|
||||||
|
else:
|
||||||
|
return datetime(year,month,day).date()
|
||||||
|
#10 minutes ago
|
||||||
elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins':
|
elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins':
|
||||||
#22 minutes ago (yesterday)
|
#22 minutes ago (yesterday)
|
||||||
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
|
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#22 minutes ago (today)
|
#22 minutes ago (today)
|
||||||
else:
|
else:
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
else:
|
else:
|
||||||
return date
|
return date
|
||||||
else:
|
else:
|
||||||
#21 Jun 2017
|
#21 Jun 2017
|
||||||
if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit():
|
if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit():
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months_abbr[date[1].lower()]
|
month = months_abbr[date[1].lower()]
|
||||||
year = int(date[2])
|
year = int(date[2])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#21 June 2017
|
#21 June 2017
|
||||||
elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit():
|
elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit():
|
||||||
day = int(date[0])
|
day = int(date[0])
|
||||||
month = months[date[1].lower()]
|
month = months[date[1].lower()]
|
||||||
year = int(date[2])
|
year = int(date[2])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#Jul 11, 2016
|
#Jul 11, 2016
|
||||||
elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha():
|
elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha():
|
||||||
day = int(date[1][:-1])
|
day = int(date[1][:-1])
|
||||||
@ -496,13 +496,13 @@ def parse_date2(init_date,loader_context):
|
|||||||
if date[0].lower() == 'yesterday' and date[1] == 'at':
|
if date[0].lower() == 'yesterday' and date[1] == 'at':
|
||||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#Thursday at 4:27 PM
|
#Thursday at 4:27 PM
|
||||||
elif date[1] == 'at':
|
elif date[1] == 'at':
|
||||||
today = datetime.now().weekday() #today as a weekday
|
today = datetime.now().weekday() #today as a weekday
|
||||||
weekday = days[date[0].lower()] #day to be match as number weekday
|
weekday = days[date[0].lower()] #day to be match as number weekday
|
||||||
#weekday is chronologically always lower than day
|
#weekday is chronologically always lower than day
|
||||||
delta = today - weekday
|
delta = today - weekday
|
||||||
if delta >= 0:
|
if delta >= 0:
|
||||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||||
@ -519,82 +519,82 @@ def parse_date2(init_date,loader_context):
|
|||||||
# l = 5
|
# l = 5
|
||||||
elif l == 5:
|
elif l == 5:
|
||||||
if date[2] == 'at':
|
if date[2] == 'at':
|
||||||
#Jan 29 at 10:00 PM
|
#Jan 29 at 10:00 PM
|
||||||
if len(date[0]) == 3:
|
if len(date[0]) == 3:
|
||||||
day = int(date[1])
|
day = int(date[1])
|
||||||
month = months_abbr[date[0].lower()]
|
month = months_abbr[date[0].lower()]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#29 febbraio alle ore 21:49
|
#29 febbraio alle ore 21:49
|
||||||
else:
|
else:
|
||||||
day = int(date[1])
|
day = int(date[1])
|
||||||
month = months[date[0].lower()]
|
month = months[date[0].lower()]
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#parsing failed
|
#parsing failed
|
||||||
else:
|
else:
|
||||||
return date
|
return date
|
||||||
# l = 6
|
# l = 6
|
||||||
elif l == 6:
|
elif l == 6:
|
||||||
if date[3] == 'at':
|
if date[3] == 'at':
|
||||||
date[1]
|
date[1]
|
||||||
#Aug 25, 2016 at 7:00 PM
|
#Aug 25, 2016 at 7:00 PM
|
||||||
if len(date[0]) == 3:
|
if len(date[0]) == 3:
|
||||||
day = int(date[1][:-1])
|
day = int(date[1][:-1])
|
||||||
month = months_abbr[date[0].lower()]
|
month = months_abbr[date[0].lower()]
|
||||||
year = int(date[2])
|
year = int(date[2])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#August 25, 2016 at 7:00 PM
|
#August 25, 2016 at 7:00 PM
|
||||||
else:
|
else:
|
||||||
day = int(date[1][:-1])
|
day = int(date[1][:-1])
|
||||||
month = months[date[0].lower()]
|
month = months[date[0].lower()]
|
||||||
year = int(date[2])
|
year = int(date[2])
|
||||||
return datetime(year,month,day).date()
|
return datetime(year,month,day).date()
|
||||||
#parsing failed
|
#parsing failed
|
||||||
else:
|
else:
|
||||||
return date
|
return date
|
||||||
# l > 6
|
# l > 6
|
||||||
#parsing failed - l too big
|
#parsing failed - l too big
|
||||||
else:
|
else:
|
||||||
return date
|
return date
|
||||||
#parsing failed - language not supported
|
#parsing failed - language not supported
|
||||||
else:
|
else:
|
||||||
return init_date
|
return init_date
|
||||||
|
|
||||||
def id_strip(post_id):
    """
    Extract the top-level post id from a list of JSON strings.

    post_id -- sequence of JSON-encoded strings; only the last element is
               decoded, and it must contain a 'top_level_post_id' key.

    Returns the id converted to str.
    Raises IndexError on an empty sequence and KeyError when the key is
    missing (same as before).
    """
    import json
    # Index the last element directly instead of reversing a copy of the list.
    d = json.loads(post_id[-1])  # nested dict of features
    return str(d['top_level_post_id'])
|
||||||
|
|
||||||
|
|
||||||
class FbcrawlItem(scrapy.Item):
|
class FbcrawlItem(scrapy.Item):
|
||||||
source = scrapy.Field()
|
source = scrapy.Field()
|
||||||
date = scrapy.Field()
|
date = scrapy.Field()
|
||||||
text = scrapy.Field(
|
text = scrapy.Field(
|
||||||
output_processor=Join(separator=u'')
|
output_processor=Join(separator=u'')
|
||||||
) # full text of the post
|
) # full text of the post
|
||||||
comments = scrapy.Field(
|
comments = scrapy.Field(
|
||||||
output_processor=comments_strip
|
output_processor=comments_strip
|
||||||
)
|
)
|
||||||
reactions = scrapy.Field(
|
reactions = scrapy.Field(
|
||||||
output_processor=reactions_strip
|
output_processor=reactions_strip
|
||||||
) # num of reactions
|
) # num of reactions
|
||||||
likes = scrapy.Field(
|
likes = scrapy.Field(
|
||||||
output_processor=reactions_strip
|
output_processor=reactions_strip
|
||||||
)
|
)
|
||||||
ahah = scrapy.Field(
|
ahah = scrapy.Field(
|
||||||
output_processor=reactions_strip
|
output_processor=reactions_strip
|
||||||
)
|
)
|
||||||
love = scrapy.Field(
|
love = scrapy.Field(
|
||||||
output_processor=reactions_strip
|
output_processor=reactions_strip
|
||||||
)
|
)
|
||||||
wow = scrapy.Field(
|
wow = scrapy.Field(
|
||||||
output_processor=reactions_strip
|
output_processor=reactions_strip
|
||||||
)
|
)
|
||||||
sigh = scrapy.Field(
|
sigh = scrapy.Field(
|
||||||
output_processor=reactions_strip
|
output_processor=reactions_strip
|
||||||
)
|
)
|
||||||
grrr = scrapy.Field(
|
grrr = scrapy.Field(
|
||||||
output_processor=reactions_strip
|
output_processor=reactions_strip
|
||||||
)
|
)
|
||||||
share = scrapy.Field() # num of shares
|
share = scrapy.Field() # num of shares
|
||||||
url = scrapy.Field(
|
url = scrapy.Field(
|
||||||
output_processor=url_strip
|
output_processor=url_strip
|
||||||
@ -605,11 +605,11 @@ class FbcrawlItem(scrapy.Item):
|
|||||||
shared_from = scrapy.Field()
|
shared_from = scrapy.Field()
|
||||||
|
|
||||||
class CommentsItem(scrapy.Item):
|
class CommentsItem(scrapy.Item):
|
||||||
source = scrapy.Field()
|
source = scrapy.Field()
|
||||||
reply_to=scrapy.Field()
|
reply_to=scrapy.Field()
|
||||||
date = scrapy.Field( # when was the post published
|
date = scrapy.Field( # when was the post published
|
||||||
output_processor=parse_date2
|
output_processor=parse_date2
|
||||||
)
|
)
|
||||||
text = scrapy.Field(
|
text = scrapy.Field(
|
||||||
output_processor=Join(separator=u'')
|
output_processor=Join(separator=u'')
|
||||||
) # full text of the post
|
) # full text of the post
|
||||||
@ -618,18 +618,18 @@ class CommentsItem(scrapy.Item):
|
|||||||
) # num of reactions
|
) # num of reactions
|
||||||
likes = scrapy.Field(
|
likes = scrapy.Field(
|
||||||
output_processor=reactions_strip
|
output_processor=reactions_strip
|
||||||
)
|
)
|
||||||
source_url = scrapy.Field()
|
source_url = scrapy.Field()
|
||||||
url = scrapy.Field()
|
url = scrapy.Field()
|
||||||
ahah = scrapy.Field()
|
ahah = scrapy.Field()
|
||||||
love = scrapy.Field()
|
love = scrapy.Field()
|
||||||
wow = scrapy.Field()
|
wow = scrapy.Field()
|
||||||
sigh = scrapy.Field()
|
sigh = scrapy.Field()
|
||||||
grrr = scrapy.Field()
|
grrr = scrapy.Field()
|
||||||
share = scrapy.Field() # num of shares
|
share = scrapy.Field() # num of shares
|
||||||
|
|
||||||
class ProfileItem(scrapy.Item):
|
class ProfileItem(scrapy.Item):
|
||||||
name = scrapy.Field()
|
name = scrapy.Field()
|
||||||
gender = scrapy.Field()
|
gender = scrapy.Field()
|
||||||
birthday = scrapy.Field()
|
birthday = scrapy.Field()
|
||||||
current_city = scrapy.Field()
|
current_city = scrapy.Field()
|
||||||
@ -638,3 +638,12 @@ class ProfileItem(scrapy.Item):
|
|||||||
education = scrapy.Field()
|
education = scrapy.Field()
|
||||||
interested_in = scrapy.Field()
|
interested_in = scrapy.Field()
|
||||||
page = scrapy.Field()
|
page = scrapy.Field()
|
||||||
|
|
||||||
|
class EventsItem(scrapy.Item):
    """
    Item describing a single Facebook event.

    Every field is a plain scrapy.Field() with no input/output processors,
    so values are stored exactly as they are assigned.
    """
    name = scrapy.Field()         # event title
    location = scrapy.Field()     # presumably the address line of the venue - TODO confirm
    where = scrapy.Field()        # presumably the venue name - TODO confirm
    photo = scrapy.Field()        # cover image reference
    start_date = scrapy.Field()   # start of the event
    end_date = scrapy.Field()     # end of the event
    description = scrapy.Field()  # free-text description of the event
|
||||||
|
58
fbcrawl/spiders/events.py
Normal file
58
fbcrawl/spiders/events.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
from scrapy.loader import ItemLoader
|
||||||
|
from scrapy.exceptions import CloseSpider
|
||||||
|
from fbcrawl.spiders.fbcrawl import FacebookSpider
|
||||||
|
from fbcrawl.items import EventsItem, parse_date, parse_date2
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
class EventsSpider(FacebookSpider):
    """
    Parse FB events, given a page (needs credentials)

    The spider requests '<page>/events', follows the link found in every
    row of the events table and yields one EventsItem per event page.
    """
    name = "events"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['name', 'where', 'location', 'photo', 'start_date',
                               'end_date', 'description'],
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS': 1
    }

    def __init__(self, *args, **kwargs):
        """Store the mandatory 'page' keyword argument, then defer to the parent.

        Raises KeyError when 'page' is not supplied.
        """
        self.page = kwargs['page']
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Request the events listing of the configured page."""
        yield scrapy.Request(url=response.urljoin('%s/events' % self.page),
                             callback=self.parse_events,
                             priority=10,
                             meta={'index': 1})

    def parse_events(self, response):
        """Follow the event link found in each row of the events table."""
        TABLE_XPATH = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div/div/div[2]/div/table/tbody/tr'
        for event in response.xpath(TABLE_XPATH):
            # The leading '.' anchors the query to the current row. A bare
            # '//' is evaluated from the document root and would return the
            # first event link of the page for every row.
            url = event.xpath('.//td/div/div/span[3]/div/a[1]/@href').extract_first()
            if url is None:
                # Row without an event link: nothing to follow.
                continue
            yield response.follow(url, callback=self.parse_event)

    def parse_event(self, response):
        """Build an EventsItem from a single event page.

        The date text is split on an en dash into start and end parts; a
        missing part (or a missing date node) is stored as None.
        """
        EVENT_NAME = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[2]/div[1]/h3/text()'
        EVENT_WHERE = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_LOCATION = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dd/div/text()'
        DATE = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[1]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_DESCRIPTION = '/html/body/div/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td/div[2]/div[2]/div[2]/div[2]/text()'
        EVENT_COVER = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[1]/a/img/@src'

        # extract_first() returns None when the node is absent; fall back to
        # an empty string so the split below cannot fail.
        date = response.xpath(DATE).extract_first() or ''
        date_parts = date.split('–')
        start_date = date_parts[0] or None
        # Events without an end date have no en dash, hence a single part.
        end_date = (date_parts[1] if len(date_parts) > 1 else '') or None

        name = response.xpath(EVENT_NAME).extract_first()
        self.logger.info('Parsing event %s', name)
        yield EventsItem(
            name=name,
            where=response.xpath(EVENT_WHERE).extract_first(),
            location=response.xpath(EVENT_LOCATION).extract_first(),
            photo=response.xpath(EVENT_COVER).extract_first(),
            start_date=start_date,
            end_date=end_date,
            description=response.xpath(EVENT_DESCRIPTION).extract_first()
        )
|
Loading…
Reference in New Issue
Block a user