Adding events crawler
This commit is contained in:
parent
be8a9c2f5f
commit
5c3128cfef
217
fbcrawl/items.py
217
fbcrawl/items.py
@ -8,7 +8,7 @@
|
||||
import scrapy
|
||||
from scrapy.loader.processors import TakeFirst, Join, MapCompose
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
def comments_strip(string,loader_context):
|
||||
lang = loader_context['lang']
|
||||
if lang == 'it':
|
||||
@ -16,7 +16,7 @@ def comments_strip(string,loader_context):
|
||||
return
|
||||
else:
|
||||
return string[0].rstrip(' commenti')
|
||||
|
||||
|
||||
elif lang == 'en':
|
||||
if(string[0] == 'Share'):
|
||||
return '0'
|
||||
@ -31,13 +31,13 @@ def reactions_strip(string,loader_context):
|
||||
lang = loader_context['lang']
|
||||
if lang == 'it':
|
||||
newstring = string[0]
|
||||
#19.298.873
|
||||
#19.298.873
|
||||
if len(newstring.split()) == 1:
|
||||
while newstring.rfind('.') != -1:
|
||||
newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
|
||||
return newstring
|
||||
#Pamela, Luigi e altri 4
|
||||
else:
|
||||
else:
|
||||
return string
|
||||
friends = newstring.count(' e ') + newstring.count(',')
|
||||
newstring = newstring.split()[::-1][0]
|
||||
@ -46,13 +46,13 @@ def reactions_strip(string,loader_context):
|
||||
return int(newstring) + friends
|
||||
elif lang == 'en':
|
||||
newstring = string[0]
|
||||
#19,298,873
|
||||
#19,298,873
|
||||
if len(newstring.split()) == 1:
|
||||
while newstring.rfind(',') != -1:
|
||||
newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
|
||||
return newstring
|
||||
#Mark and other 254,134
|
||||
elif newstring.split()[::-1][1].isdigit():
|
||||
#Mark and other 254,134
|
||||
elif newstring.split()[::-1][1].isdigit():
|
||||
friends = newstring.count(' and ') + newstring.count(',')
|
||||
newstring = newstring.split()[::-1][1]
|
||||
while newstring.rfind(',') != -1:
|
||||
@ -70,7 +70,7 @@ def url_strip(url):
|
||||
i = fullurl.find('&id=')
|
||||
if i != -1:
|
||||
return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
|
||||
else: #catch photos
|
||||
else: #catch photos
|
||||
i = fullurl.find('/photos/')
|
||||
if i != -1:
|
||||
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
|
||||
@ -80,13 +80,13 @@ def url_strip(url):
|
||||
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
|
||||
else:
|
||||
return fullurl
|
||||
|
||||
|
||||
def parse_date(date,loader_context):
|
||||
import json
|
||||
|
||||
|
||||
d = json.loads(date[0]) #nested dict of features
|
||||
flat_d = dict() #only retain 'leaves' of d tree
|
||||
|
||||
|
||||
def recursive_items(dictionary):
|
||||
'''
|
||||
Get most nested key:value pair of nested dict
|
||||
@ -138,7 +138,7 @@ def parse_date2(init_date,loader_context):
|
||||
'ott':10,
|
||||
'nov':11,
|
||||
'dic':12
|
||||
}
|
||||
}
|
||||
|
||||
giorni = {
|
||||
'lunedì':0,
|
||||
@ -148,8 +148,8 @@ def parse_date2(init_date,loader_context):
|
||||
'venerdì':4,
|
||||
'sabato':5,
|
||||
'domenica':6
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
date = init_date[0].split()
|
||||
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
||||
|
||||
@ -161,7 +161,7 @@ def parse_date2(init_date,loader_context):
|
||||
|
||||
#adesso, ieri, 4h, 50min
|
||||
elif l == 1:
|
||||
if date[0].isalpha():
|
||||
if date[0].isalpha():
|
||||
if date[0].lower() == 'ieri':
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
#check that yesterday was not in another month
|
||||
@ -169,15 +169,15 @@ def parse_date2(init_date,loader_context):
|
||||
elif date[0].lower() == 'adesso':
|
||||
return datetime(year,month,day).date() #return today
|
||||
else: #not recognized, (return date or init_date)
|
||||
return date
|
||||
else:
|
||||
return date
|
||||
else:
|
||||
#4h, 50min (exploit future parsing)
|
||||
l = 2
|
||||
new_date = [x for x in date[0] if x.isdigit()]
|
||||
date[0] = ''.join(new_date)
|
||||
new_date = [x for x in date[0] if not(x.isdigit())]
|
||||
date[1] = ''.join(new_date)
|
||||
# l = 2
|
||||
date[1] = ''.join(new_date)
|
||||
# l = 2
|
||||
elif l == 2:
|
||||
#22 min (oggi)
|
||||
if date[1] == 'min':
|
||||
@ -187,7 +187,7 @@ def parse_date2(init_date,loader_context):
|
||||
else:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#4 h (oggi)
|
||||
elif date[1] == 'h':
|
||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
||||
@ -196,34 +196,34 @@ def parse_date2(init_date,loader_context):
|
||||
else:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#2 gen
|
||||
elif len(date[1]) == 3 and date[1].isalpha():
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
month = months_abbr[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#2 gennaio
|
||||
elif len(date[1]) > 3 and date[1].isalpha():
|
||||
day = int(date[0])
|
||||
month = months[date[1]]
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 3
|
||||
elif l == 3:
|
||||
#21 giu 2017
|
||||
#21 giu 2017
|
||||
if len(date[1]) == 3 and date[2].isdigit():
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1]]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#21 giugno 2017
|
||||
return datetime(year,month,day).date()
|
||||
#21 giugno 2017
|
||||
elif len(date[1]) > 3 and date[2].isdigit():
|
||||
day = int(date[0])
|
||||
month = months[date[1]]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#9 ore fa
|
||||
elif date[0].isdigit() and date[1][:2] == 'or':
|
||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
|
||||
@ -232,25 +232,25 @@ def parse_date2(init_date,loader_context):
|
||||
else:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#7 minuti fa
|
||||
elif date[0].isdigit() and date[1][:3] == 'min':
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
|
||||
#ieri alle 20:45
|
||||
#ieri alle 20:45
|
||||
elif date[0].lower() == 'ieri' and date[1] == 'alle':
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
#oggi alle 11:11
|
||||
return datetime(year,month,day).date()
|
||||
#oggi alle 11:11
|
||||
elif date[0].lower() == 'oggi' and date[1] == 'alle':
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#lunedì alle 12:34
|
||||
elif date[0].isalpha() and date[1] == 'alle':
|
||||
today = datetime.now().weekday() #today as a weekday
|
||||
weekday = giorni[date[0].lower()] #day to be match as number weekday
|
||||
#weekday is chronologically always lower than day
|
||||
delta = today - weekday
|
||||
delta = today - weekday
|
||||
if delta >= 0:
|
||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||
@ -270,13 +270,13 @@ def parse_date2(init_date,loader_context):
|
||||
if date[0].lower() == 'ieri' and date[1] == 'alle':
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#domenica alle ore 19:29
|
||||
elif date[0].isalpha() and date[1] == 'alle':
|
||||
today = datetime.now().weekday() #today as a weekday
|
||||
weekday = giorni[date[0].lower()] #day to be match as number weekday
|
||||
#weekday is chronologically always lower than day
|
||||
delta = today - weekday
|
||||
delta = today - weekday
|
||||
if delta >= 0:
|
||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||
@ -286,7 +286,7 @@ def parse_date2(init_date,loader_context):
|
||||
delta += 8
|
||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
@ -297,16 +297,16 @@ def parse_date2(init_date,loader_context):
|
||||
if len(date[1]) == 3:
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#29 febbraio alle ore 21:49
|
||||
return datetime(year,month,day).date()
|
||||
#29 febbraio alle ore 21:49
|
||||
else:
|
||||
day = int(date[0])
|
||||
month = months[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 6
|
||||
# l = 6
|
||||
elif l == 6:
|
||||
if date[3] == 'alle':
|
||||
#29 feb 2016 alle ore 21:49
|
||||
@ -314,14 +314,14 @@ def parse_date2(init_date,loader_context):
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#29 febbraio 2016 alle ore 21:49
|
||||
return datetime(year,month,day).date()
|
||||
#29 febbraio 2016 alle ore 21:49
|
||||
else:
|
||||
day = int(date[0])
|
||||
month = months[date[1].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# =============================================================================
|
||||
@ -356,7 +356,7 @@ def parse_date2(init_date,loader_context):
|
||||
'oct':10,
|
||||
'nov':11,
|
||||
'dec':12
|
||||
}
|
||||
}
|
||||
|
||||
days = {
|
||||
'monday':0,
|
||||
@ -366,7 +366,7 @@ def parse_date2(init_date,loader_context):
|
||||
'friday':4,
|
||||
'saturday':5,
|
||||
'sunday':6
|
||||
}
|
||||
}
|
||||
|
||||
date = init_date[0].split()
|
||||
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
|
||||
@ -379,7 +379,7 @@ def parse_date2(init_date,loader_context):
|
||||
|
||||
#Yesterday, Now, 4hr, 50mins
|
||||
elif l == 1:
|
||||
if date[0].isalpha():
|
||||
if date[0].isalpha():
|
||||
if date[0].lower() == 'yesterday':
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
#check that yesterday was not in another month
|
||||
@ -387,15 +387,15 @@ def parse_date2(init_date,loader_context):
|
||||
elif date[0].lower() == 'now':
|
||||
return datetime(year,month,day).date() #return today
|
||||
else: #not recognized, (return date or init_date)
|
||||
return date
|
||||
else:
|
||||
return date
|
||||
else:
|
||||
#4h, 50min (exploit future parsing)
|
||||
l = 2
|
||||
new_date = [x for x in date[0] if x.isdigit()]
|
||||
date[0] = ''.join(new_date)
|
||||
new_date = [x for x in date[0] if not(x.isdigit())]
|
||||
date[1] = ''.join(new_date)
|
||||
# l = 2
|
||||
date[1] = ''.join(new_date)
|
||||
# l = 2
|
||||
elif l == 2:
|
||||
if date[1] == 'now':
|
||||
return datetime(year,month,day).date()
|
||||
@ -414,31 +414,31 @@ def parse_date2(init_date,loader_context):
|
||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#4 h (oggi)
|
||||
else:
|
||||
else:
|
||||
return datetime(year,month,day).date()
|
||||
|
||||
#2 jan
|
||||
elif len(date[1]) == 3 and date[1].isalpha():
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
month = months_abbr[date[1].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#2 january
|
||||
elif len(date[1]) > 3 and date[1].isalpha():
|
||||
day = int(date[0])
|
||||
month = months[date[1]]
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#jan 2
|
||||
elif len(date[0]) == 3 and date[0].isalpha():
|
||||
day = int(date[1])
|
||||
month = months_abbr[date[0].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
month = months_abbr[date[0].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#january 2
|
||||
elif len(date[0]) > 3 and date[0].isalpha():
|
||||
day = int(date[1])
|
||||
month = months[date[0]]
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
@ -452,35 +452,35 @@ def parse_date2(init_date,loader_context):
|
||||
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
# 5 hours ago (today)
|
||||
else:
|
||||
return datetime(year,month,day).date()
|
||||
#10 minutes ago
|
||||
# 5 hours ago (today)
|
||||
else:
|
||||
return datetime(year,month,day).date()
|
||||
#10 minutes ago
|
||||
elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins':
|
||||
#22 minutes ago (yesterday)
|
||||
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#22 minutes ago (today)
|
||||
else:
|
||||
return datetime(year,month,day).date()
|
||||
else:
|
||||
return date
|
||||
return date
|
||||
else:
|
||||
#21 Jun 2017
|
||||
if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit():
|
||||
day = int(date[0])
|
||||
month = months_abbr[date[1].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#21 June 2017
|
||||
return datetime(year,month,day).date()
|
||||
#21 June 2017
|
||||
elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit():
|
||||
day = int(date[0])
|
||||
month = months[date[1].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#Jul 11, 2016
|
||||
elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha():
|
||||
day = int(date[1][:-1])
|
||||
@ -496,13 +496,13 @@ def parse_date2(init_date,loader_context):
|
||||
if date[0].lower() == 'yesterday' and date[1] == 'at':
|
||||
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#Thursday at 4:27 PM
|
||||
elif date[1] == 'at':
|
||||
today = datetime.now().weekday() #today as a weekday
|
||||
weekday = days[date[0].lower()] #day to be match as number weekday
|
||||
#weekday is chronologically always lower than day
|
||||
delta = today - weekday
|
||||
delta = today - weekday
|
||||
if delta >= 0:
|
||||
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
|
||||
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
|
||||
@ -519,82 +519,82 @@ def parse_date2(init_date,loader_context):
|
||||
# l = 5
|
||||
elif l == 5:
|
||||
if date[2] == 'at':
|
||||
#Jan 29 at 10:00 PM
|
||||
#Jan 29 at 10:00 PM
|
||||
if len(date[0]) == 3:
|
||||
day = int(date[1])
|
||||
month = months_abbr[date[0].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
#29 febbraio alle ore 21:49
|
||||
return datetime(year,month,day).date()
|
||||
#29 febbraio alle ore 21:49
|
||||
else:
|
||||
day = int(date[1])
|
||||
month = months[date[0].lower()]
|
||||
return datetime(year,month,day).date()
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l = 6
|
||||
# l = 6
|
||||
elif l == 6:
|
||||
if date[3] == 'at':
|
||||
date[1]
|
||||
#Aug 25, 2016 at 7:00 PM
|
||||
#Aug 25, 2016 at 7:00 PM
|
||||
if len(date[0]) == 3:
|
||||
day = int(date[1][:-1])
|
||||
month = months_abbr[date[0].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#August 25, 2016 at 7:00 PM
|
||||
return datetime(year,month,day).date()
|
||||
#August 25, 2016 at 7:00 PM
|
||||
else:
|
||||
day = int(date[1][:-1])
|
||||
month = months[date[0].lower()]
|
||||
year = int(date[2])
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
return datetime(year,month,day).date()
|
||||
#parsing failed
|
||||
else:
|
||||
return date
|
||||
# l > 6
|
||||
# l > 6
|
||||
#parsing failed - l too big
|
||||
else:
|
||||
return date
|
||||
#parsing failed - language not supported
|
||||
else:
|
||||
return init_date
|
||||
|
||||
|
||||
def id_strip(post_id):
    """
    Return the top-level post id, as a string, from the extracted values.

    `post_id` is a sequence of JSON strings; only the last element is
    decoded, and it must contain a 'top_level_post_id' key.
    """
    import json

    # the last extracted value holds the JSON dict of post features
    features = json.loads(post_id[-1])
    top_level_id = features['top_level_post_id']
    return str(top_level_id)
|
||||
|
||||
|
||||
|
||||
class FbcrawlItem(scrapy.Item):
|
||||
source = scrapy.Field()
|
||||
date = scrapy.Field()
|
||||
source = scrapy.Field()
|
||||
date = scrapy.Field()
|
||||
text = scrapy.Field(
|
||||
output_processor=Join(separator=u'')
|
||||
) # full text of the post
|
||||
comments = scrapy.Field(
|
||||
output_processor=comments_strip
|
||||
)
|
||||
)
|
||||
reactions = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
) # num of reactions
|
||||
likes = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
)
|
||||
)
|
||||
ahah = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
)
|
||||
)
|
||||
love = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
)
|
||||
)
|
||||
wow = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
)
|
||||
)
|
||||
sigh = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
)
|
||||
)
|
||||
grrr = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
)
|
||||
)
|
||||
share = scrapy.Field() # num of shares
|
||||
url = scrapy.Field(
|
||||
output_processor=url_strip
|
||||
@ -605,11 +605,11 @@ class FbcrawlItem(scrapy.Item):
|
||||
shared_from = scrapy.Field()
|
||||
|
||||
class CommentsItem(scrapy.Item):
|
||||
source = scrapy.Field()
|
||||
source = scrapy.Field()
|
||||
reply_to=scrapy.Field()
|
||||
date = scrapy.Field( # when was the post published
|
||||
output_processor=parse_date2
|
||||
)
|
||||
)
|
||||
text = scrapy.Field(
|
||||
output_processor=Join(separator=u'')
|
||||
) # full text of the post
|
||||
@ -618,18 +618,18 @@ class CommentsItem(scrapy.Item):
|
||||
) # num of reactions
|
||||
likes = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
)
|
||||
source_url = scrapy.Field()
|
||||
)
|
||||
source_url = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
ahah = scrapy.Field()
|
||||
love = scrapy.Field()
|
||||
wow = scrapy.Field()
|
||||
sigh = scrapy.Field()
|
||||
grrr = scrapy.Field()
|
||||
ahah = scrapy.Field()
|
||||
love = scrapy.Field()
|
||||
wow = scrapy.Field()
|
||||
sigh = scrapy.Field()
|
||||
grrr = scrapy.Field()
|
||||
share = scrapy.Field() # num of shares
|
||||
|
||||
class ProfileItem(scrapy.Item):
|
||||
name = scrapy.Field()
|
||||
name = scrapy.Field()
|
||||
gender = scrapy.Field()
|
||||
birthday = scrapy.Field()
|
||||
current_city = scrapy.Field()
|
||||
@ -638,3 +638,12 @@ class ProfileItem(scrapy.Item):
|
||||
education = scrapy.Field()
|
||||
interested_in = scrapy.Field()
|
||||
page = scrapy.Field()
|
||||
|
||||
class EventsItem(scrapy.Item):
    """
    Container for the fields scraped from a single event page.
    """
    name = scrapy.Field()         # event title
    location = scrapy.Field()     # second line of the venue block
    where = scrapy.Field()        # first line of the venue block
    photo = scrapy.Field()        # URL of the event cover image
    start_date = scrapy.Field()   # date text before the en dash
    end_date = scrapy.Field()     # date text after the en dash
    description = scrapy.Field()  # event description text
|
||||
|
58
fbcrawl/spiders/events.py
Normal file
58
fbcrawl/spiders/events.py
Normal file
@ -0,0 +1,58 @@
|
||||
import scrapy
|
||||
|
||||
from scrapy.loader import ItemLoader
|
||||
from scrapy.exceptions import CloseSpider
|
||||
from fbcrawl.spiders.fbcrawl import FacebookSpider
|
||||
from fbcrawl.items import EventsItem, parse_date, parse_date2
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
class EventsSpider(FacebookSpider):
    """
    Parse FB events, given a page (needs credentials)
    """
    name = "events"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['name','where','location','photo','start_date',
                               'end_date','description'],
        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS' : 1
    }

    def __init__(self, *args, **kwargs):
        """
        Store the target page and delegate the rest to FacebookSpider.

        Raises KeyError when no `page` keyword argument is supplied.
        """
        self.page = kwargs['page']
        super().__init__(*args,**kwargs)

    def parse_page(self, response):
        """
        Request the '<page>/events' listing and hand it to parse_events.
        """
        # NOTE(review): presumably invoked by FacebookSpider after login;
        # the purpose of meta['index'] is not visible here - confirm.
        yield scrapy.Request(url=response.urljoin('%s/events' % self.page),
                             callback=self.parse_events,
                             priority=10,
                             meta={'index':1})

    def parse_events(self, response):
        """
        Follow the link of every event row found in the events listing.
        """
        TABLE_XPATH='/html/body/div/div/div[2]/div/table/tbody/tr'
        TABLE_XPATH='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div/div/div[2]/div/table/tbody/tr'
        # The leading '.' keeps the query relative to the current row; an
        # absolute '//td/...' would search the whole document and return the
        # same (first) link for every row.
        EVENT_LINK_XPATH='.//td/div/div/span[3]/div/a[1]/@href'
        for event in response.xpath(TABLE_XPATH):
            url = event.xpath(EVENT_LINK_XPATH).extract_first()
            if url is None:
                # row without an event link: nothing to follow
                continue
            yield response.follow(url, callback=self.parse_event)

    def parse_event(self, response):
        """
        Build an EventsItem from a single event page.

        The date text is split on the en dash: the part before it becomes
        start_date and the part after it end_date. A missing or empty part
        is stored as None.
        """
        EVENT_NAME='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[2]/div[1]/h3/text()'
        EVENT_WHERE='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_LOCATION='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dd/div/text()'
        DATE='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[1]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_DESCRIPTION='/html/body/div/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td/div[2]/div[2]/div[2]/div[2]/text()'
        EVENT_COVER='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[1]/a/img/@src'

        # extract_first() returns None when the date node is absent
        date = response.xpath(DATE).extract_first() or ''
        date_parts = date.split('–')
        start_date = date_parts[0] or None
        # events without an end date have no en dash, hence a single part
        end_date = (date_parts[1] if len(date_parts) > 1 else None) or None

        name = response.xpath(EVENT_NAME).extract_first()
        self.logger.info('Parsing event %s', name)
        yield EventsItem(
            name=name,
            where=response.xpath(EVENT_WHERE).extract_first(),
            location=response.xpath(EVENT_LOCATION).extract_first(),
            photo=response.xpath(EVENT_COVER).extract_first(),
            start_date=start_date,
            end_date=end_date,
            description=response.xpath(EVENT_DESCRIPTION).extract_first()
        )
|
Loading…
Reference in New Issue
Block a user