Adding features: crawl comments from page, crawl posts and comments from groups
parent f6e8545236 · commit d875e89c52
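A quick usage sketch of the new group support (hedged: the spider name fb is inferred from the 'scrapy fb -a page="PAGENAME"' hint in the error message removed below, and email, password, page, date and lang are the kwargs handled in __init__ in this commit; adjust to however the spider is actually registered):

scrapy crawl fb -a email="EMAIL" -a password="PASSWORD" -a page="https://mbasic.facebook.com/groups/GROUPNAME" -a date="2018-01-01" -a lang="it" -o group_posts.csv

A page URL containing /groups/ now sets self.group = 1, which switches the "next page" XPath to the stories_container variant used by group timelines.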
Binary file not shown.
Binary file not shown.
fbcrawl/items.py (510 lines changed)
@@ -49,14 +49,14 @@ def reactions_strip(string,loader_context):
             while newstring.rfind(',') != -1:
                 newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
             return newstring
-        # #Mark and other 254,134
-        # elif newstring.split()[::-1][1].isdigit():
-        #     friends = newstring.count(' and ') + newstring.count(',')
-        #     newstring = newstring.split()[::-1][1]
-        #     while newstring.rfind(',') != -1:
-        #         newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
-        #     return int(newstring) + friends
-        # #Philip and 1K others
+        #Mark and other 254,134
+        elif newstring.split()[::-1][1].isdigit():
+            friends = newstring.count(' and ') + newstring.count(',')
+            newstring = newstring.split()[::-1][1]
+            while newstring.rfind(',') != -1:
+                newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
+            return int(newstring) + friends
+        #Philip and 1K others
         else:
             return newstring
     else:
@@ -79,7 +79,7 @@ def url_strip(url):
     else:
         return fullurl

-def parse_date(date):
+def parse_date(date,loader_context):
     import json

     d = json.loads(date[0]) #nested dict of features
@@ -99,7 +99,463 @@ def parse_date(date):
             flat_d[key] = value

     #returns timestamp in localtime conversion from linux timestamp UTC
-    return str(datetime.fromtimestamp(flat_d['publish_time']))
+    ret = str(datetime.fromtimestamp(flat_d['publish_time'])) if 'publish_time' in flat_d else None
+    return ret

+def parse_date2(init_date,loader_context):
+    lang = loader_context['lang']
+    # =============================================================================
+    # Italian - status:final
+    # =============================================================================
+    if lang == 'it':
+        months = {
+            'gennaio':1,
+            'febbraio':2,
+            'marzo':3,
+            'aprile':4,
+            'maggio':5,
+            'giugno':6,
+            'luglio':7,
+            'agosto':8,
+            'settembre':9,
+            'ottobre':10,
+            'novembre':11,
+            'dicembre':12
+        }
+
+        months_abbr = {
+            'gen':1,
+            'feb':2,
+            'mar':3,
+            'apr':4,
+            'mag':5,
+            'giu':6,
+            'lug':7,
+            'ago':8,
+            'set':9,
+            'ott':10,
+            'nov':11,
+            'dic':12
+        }
+
+        giorni = {
+            'lunedì':0,
+            'martedì':1,
+            'mercoledì':2,
+            'giovedì':3,
+            'venerdì':4,
+            'sabato':5,
+            'domenica':6
+        }
+
+        date = init_date[0].split()
+        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
+
+        l = len(date)
+
+        #sanity check
+        if l == 0:
+            return 'Error: no data'
+
+        #adesso, ieri, 4h, 50min
+        elif l == 1:
+            if date[0].isalpha():
+                if date[0].lower() == 'ieri':
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    #check that yesterday was not in another month
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                elif date[0].lower() == 'adesso':
+                    return datetime(year,month,day).date() #return today
+                else: #not recognized, (return date or init_date)
+                    return date
+            else:
+                #4h, 50min (exploit future parsing)
+                l = 2
+                new_date = [x for x in date[0] if x.isdigit()]
+                date[0] = ''.join(new_date)
+                new_date = [x for x in date[0] if not(x.isdigit())]
+                date[1] = ''.join(new_date)
+        # l = 2
+        elif l == 2:
+            #22 min (oggi)
+            if date[1] == 'min':
+                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #22 min (ieri)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #4 h (oggi)
+            elif date[1] == 'h':
+                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #4 h (ieri)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #2 gen
+            elif len(date[1]) == 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months_abbr[date[1].lower()]
+                return datetime(year,month,day).date()
+            #2 gennaio
+            elif len(date[1]) > 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months[date[1]]
+                return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+        # l = 3
+        elif l == 3:
+            #21 giu 2017
+            if len(date[1]) == 3 and date[2].isdigit():
+                day = int(date[0])
+                month = months_abbr[date[1]]
+                year = int(date[2])
+                return datetime(year,month,day).date()
+            #21 giugno 2017
+            elif len(date[1]) > 3 and date[2].isdigit():
+                day = int(date[0])
+                month = months[date[1]]
+                year = int(date[2])
+                return datetime(year,month,day).date()
+            #9 ore fa
+            elif date[0].isdigit() and date[1][:2] == 'or':
+                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #9 ore fa (ieri)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #7 minuti fa
+            elif date[0].isdigit() and date[1][:3] == 'min':
+                return datetime(year,month,day).date()
+
+            #ieri alle 20:45
+            elif date[0].lower() == 'ieri' and date[1] == 'alle':
+                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                return datetime(year,month,day).date()
+            #oggi alle 11:11
+            elif date[0].lower() == 'oggi' and date[1] == 'alle':
+                return datetime(year,month,day).date()
+            #lunedì alle 12:34
+            elif date[0].isalpha() and date[1] == 'alle':
+                today = datetime.now().weekday() #today as a weekday
+                weekday = giorni[date[0].lower()] #day to be match as number weekday
+                #weekday is chronologically always lower than day
+                delta = today - weekday
+                if delta >= 0:
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+                #lunedì = 0 sabato = 6, mar 1 ven 5
+                else:
+                    delta += 8
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+        # l = 4
+        elif l == 4:
+            #Ieri alle ore 23:32
+            if date[0].lower() == 'ieri' and date[1] == 'alle':
+                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                return datetime(year,month,day).date()
+            #domenica alle ore 19:29
+            elif date[0].isalpha() and date[1] == 'alle':
+                today = datetime.now().weekday() #today as a weekday
+                weekday = giorni[date[0].lower()] #day to be match as number weekday
+                #weekday is chronologically always lower than day
+                delta = today - weekday
+                if delta >= 0:
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+                #lunedì = 0 sabato = 6, mar 1 ven 5
+                else:
+                    delta += 8
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+        # l = 5
+        elif l == 5:
+            if date[2] == 'alle':
+                #29 feb alle ore 21:49
+                if len(date[1]) == 3:
+                    day = int(date[0])
+                    month = months_abbr[date[1].lower()]
+                    return datetime(year,month,day).date()
+                #29 febbraio alle ore 21:49
+                else:
+                    day = int(date[0])
+                    month = months[date[1].lower()]
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+        # l = 6
+        elif l == 6:
+            if date[3] == 'alle':
+                #29 feb 2016 alle ore 21:49
+                if len(date[1]) == 3:
+                    day = int(date[0])
+                    month = months_abbr[date[1].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+                #29 febbraio 2016 alle ore 21:49
+                else:
+                    day = int(date[0])
+                    month = months[date[1].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+    # =============================================================================
+    # English - status:beta
+    # =============================================================================
+    elif lang == 'en':
+        months = {
+            'january':1,
+            'february':2,
+            'march':3,
+            'april':4,
+            'may':5,
+            'june':6,
+            'july':7,
+            'august':8,
+            'september':9,
+            'october':10,
+            'november':11,
+            'december':12
+        }
+
+        months_abbr = {
+            'jan':1,
+            'feb':2,
+            'mar':3,
+            'apr':4,
+            'may':5,
+            'jun':6,
+            'jul':7,
+            'aug':8,
+            'sep':9,
+            'oct':10,
+            'nov':11,
+            'dec':12
+        }
+
+        days = {
+            'monday':0,
+            'tuesday':1,
+            'wednesday':2,
+            'thursday':3,
+            'friday':4,
+            'saturday':5,
+            'sunday':6
+        }
+
+        date = init_date[0].split()
+        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
+
+        l = len(date)
+
+        #sanity check
+        if l == 0:
+            return 'Error: no data'
+
+        #Yesterday, Now, 4hr, 50mins
+        elif l == 1:
+            if date[0].isalpha():
+                if date[0].lower() == 'yesterday':
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    #check that yesterday was not in another month
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                elif date[0].lower() == 'now':
+                    return datetime(year,month,day).date() #return today
+                else: #not recognized, (return date or init_date)
+                    return date
+            else:
+                #4h, 50min (exploit future parsing)
+                l = 2
+                new_date = [x for x in date[0] if x.isdigit()]
+                date[0] = ''.join(new_date)
+                new_date = [x for x in date[0] if not(x.isdigit())]
+                date[1] = ''.join(new_date)
+        # l = 2
+        elif l == 2:
+            if date[1] == 'now':
+                return datetime(year,month,day).date()
+            #22 min (ieri)
+            if date[1] == 'min' or date[1] == 'mins':
+                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+                #22 min (oggi)
+                else:
+                    return datetime(year,month,day).date()
+
+            #4 h (ieri)
+            elif date[1] == 'hr' or date[1] == 'hrs':
+                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+                #4 h (oggi)
+                else:
+                    return datetime(year,month,day).date()
+
+            #2 jan
+            elif len(date[1]) == 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months_abbr[date[1].lower()]
+                return datetime(year,month,day).date()
+            #2 january
+            elif len(date[1]) > 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months[date[1]]
+                return datetime(year,month,day).date()
+            #jan 2
+            elif len(date[0]) == 3 and date[0].isalpha():
+                day = int(date[1])
+                month = months_abbr[date[0].lower()]
+                return datetime(year,month,day).date()
+            #january 2
+            elif len(date[0]) > 3 and date[0].isalpha():
+                day = int(date[1])
+                month = months[date[0]]
+                return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+            return date
+        # l = 3
+        elif l == 3:
+            #5 hours ago
+            if date[2] == 'ago':
+                if date[1] == 'hour' or date[1] == 'hours' or date[1] == 'hr' or date[1] == 'hrs':
+                    # 5 hours ago (yesterday)
+                    if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
+                        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                        month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                        return datetime(year,month,day).date()
+                    # 5 hours ago (today)
+                    else:
+                        return datetime(year,month,day).date()
+                #10 minutes ago
+                elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins':
+                    #22 minutes ago (yesterday)
+                    if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
+                        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                        month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                        return datetime(year,month,day).date()
+                    #22 minutes ago (today)
+                    else:
+                        return datetime(year,month,day).date()
+                else:
+                    return date
+            else:
+                #21 Jun 2017
+                if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit():
+                    day = int(date[0])
+                    month = months_abbr[date[1].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+                #21 June 2017
+                elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit():
+                    day = int(date[0])
+                    month = months[date[1].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+                #Jul 11, 2016
+                elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha():
+                    day = int(date[1][:-1])
+                    month = months_abbr[date[0].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+                #parsing failed
+                else:
+                    return date
+        # l = 4
+        elif l == 4:
+            #yesterday at 23:32 PM
+            if date[0].lower() == 'yesterday' and date[1] == 'at':
+                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                return datetime(year,month,day).date()
+            #Thursday at 4:27 PM
+            elif date[1] == 'at':
+                today = datetime.now().weekday() #today as a weekday
+                weekday = days[date[0].lower()] #day to be match as number weekday
+                #weekday is chronologically always lower than day
+                delta = today - weekday
+                if delta >= 0:
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+                #monday = 0 saturday = 6
+                else:
+                    delta += 8
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+        # l = 5
+        elif l == 5:
+            if date[2] == 'at':
+                #Jan 29 at 10:00 PM
+                if len(date[0]) == 3:
+                    day = int(date[1])
+                    month = months_abbr[date[0].lower()]
+                    return datetime(year,month,day).date()
+                #29 febbraio alle ore 21:49
+                else:
+                    day = int(date[1])
+                    month = months[date[0].lower()]
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+        # l = 6
+        elif l == 6:
+            if date[3] == 'at':
+                date[1]
+                #Aug 25, 2016 at 7:00 PM
+                if len(date[0]) == 3:
+                    day = int(date[1][:-1])
+                    month = months_abbr[date[0].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+                #August 25, 2016 at 7:00 PM
+                else:
+                    day = int(date[1][:-1])
+                    month = months[date[0].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+        # l > 6
+        #parsing failed - l too big
+        else:
+            return date
+    #parsing failed - language not supported
+    else:
+        return init_date
+
 def id_strip(post_id):
     import json
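The new parse_date2 is a plain function (also usable as an ItemLoader output processor), so it can be sanity-checked from a REPL. A minimal sketch, assuming the project root is on PYTHONPATH and that datetime/timedelta are imported at the top of items.py as the code above requires; the sample strings follow the formats handled above:

from fbcrawl.items import parse_date2

print(parse_date2(['21 giu 2017'], {'lang': 'it'}))      # 2017-06-21
print(parse_date2(['Jul 11, 2016'], {'lang': 'en'}))     # 2016-07-11
print(parse_date2(['ieri alle 20:45'], {'lang': 'it'}))  # yesterday's date
print(parse_date2(['nonsense'], {'lang': 'en'}))         # unrecognized: returns the split input list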
@@ -122,11 +578,21 @@ class FbcrawlItem(scrapy.Item):
     likes = scrapy.Field(
                output_processor=reactions_strip
                )
-    ahah = scrapy.Field()
-    love = scrapy.Field()
-    wow = scrapy.Field()
-    sigh = scrapy.Field()
-    grrr = scrapy.Field()
+    ahah = scrapy.Field(
+               output_processor=reactions_strip
+               )
+    love = scrapy.Field(
+               output_processor=reactions_strip
+               )
+    wow = scrapy.Field(
+               output_processor=reactions_strip
+               )
+    sigh = scrapy.Field(
+               output_processor=reactions_strip
+               )
+    grrr = scrapy.Field(
+               output_processor=reactions_strip
+               )
     share = scrapy.Field() # num of shares
     url = scrapy.Field(
                output_processor=url_strip
@@ -140,7 +606,7 @@ class CommentsItem(scrapy.Item):
     source = scrapy.Field()
     reply_to=scrapy.Field()
     date = scrapy.Field( # when was the post published
-               output_processor=parse_date
+               output_processor=parse_date2
                )
     text = scrapy.Field(
                output_processor=Join(separator=u'')
@@ -153,9 +619,9 @@ class CommentsItem(scrapy.Item):
                )
     source_url = scrapy.Field()
     url = scrapy.Field()
-    #ahah = scrapy.Field()
-    #love = scrapy.Field()
-    #wow = scrapy.Field()
-    #sigh = scrapy.Field()
-    #grrr = scrapy.Field()
-    #share = scrapy.Field() # num of shares
+    ahah = scrapy.Field()
+    love = scrapy.Field()
+    wow = scrapy.Field()
+    sigh = scrapy.Field()
+    grrr = scrapy.Field()
+    share = scrapy.Field() # num of shares
fbcrawl/settings.py
@@ -88,6 +88,7 @@ DOWNLOAD_DELAY = 3
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 #FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
+URLLENGTH_LIMIT = 99999
 FEED_EXPORT_ENCODING = 'utf-8'
 DUPEFILTER_DEBUG = True
 LOG_LEVEL = 'INFO'

Binary file not shown.
Binary file not shown.
fbcrawl/spiders/comments.py
@@ -1,9 +1,11 @@
 import scrapy

 from scrapy.loader import ItemLoader
+from scrapy.exceptions import CloseSpider
 from fbcrawl.spiders.fbcrawl import FacebookSpider
-from fbcrawl.items import CommentsItem
+from fbcrawl.items import CommentsItem, parse_date, parse_date2

+from datetime import datetime

 class CommentsSpider(FacebookSpider):
     """
@@ -14,15 +16,117 @@ class CommentsSpider(FacebookSpider):
         'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
                                'source_url','url'],
         'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
-        'CONCURRENT_REQUESTS':1,
+        'CONCURRENT_REQUESTS' : 1
     }

     def __init__(self, *args, **kwargs):
+        if 'post' in kwargs and 'page' in kwargs:
+            raise AttributeError('You need to specifiy only one between post and page')
+        elif 'post' in kwargs:
+            self.page = kwargs['post']
+            self.type = 'post'
+        elif 'page' in kwargs:
+            self.type = 'page'
+
         super().__init__(*args,**kwargs)

     def parse_page(self, response):
         '''
-        parse page does multiple things:
+        '''
+        if self.type == 'post':
+            yield scrapy.Request(url=response.url,
+                                 callback=self.parse_post,
+                                 priority=10,
+                                 meta={'index':1})
+        elif self.type == 'page':
+            #select all posts
+            for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
+                many_features = post.xpath('./@data-ft').get()
+                date = []
+                date.append(many_features)
+                date = parse_date(date,{'lang':self.lang})
+                current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date
+
+                if current_date is None:
+                    date_string = post.xpath('.//abbr/text()').get()
+                    date = parse_date2([date_string],{'lang':self.lang})
+                    current_date = datetime(date.year,date.month,date.day) if date is not None else date
+                    date = str(date)
+
+                if abs(self.count) + 1 > self.max:
+                    raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
+                self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))
+
+                #returns full post-link in a list
+                post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
+                temp_post = response.urljoin(post[0])
+                self.count -= 1
+                yield scrapy.Request(temp_post,
+                                     self.parse_post,
+                                     priority = self.count,
+                                     meta={'index':1})
+
+            #load following page, try to click on "more"
+            #after few pages have been scraped, the "more" link might disappears
+            #if not present look for the highest year not parsed yet
+            #click once on the year and go back to clicking "more"
+
+            #new_page is different for groups
+            if self.group == 1:
+                new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
+            else:
+                new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
+                #this is why lang is needed
+
+            if not new_page:
+                self.logger.info('[!] "more" link not found, will look for a "year" link')
+                #self.k is the year link that we look for
+                if response.meta['flag'] == self.k and self.k >= self.year:
+                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
+                    new_page = response.xpath(xpath).extract()
+                    if new_page:
+                        new_page = response.urljoin(new_page[0])
+                        self.k -= 1
+                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
+                        yield scrapy.Request(new_page,
+                                             callback=self.parse_page,
+                                             priority = -1000,
+                                             meta={'flag':self.k})
+                    else:
+                        while not new_page: #sometimes the years are skipped this handles small year gaps
+                            self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
+                            self.k -= 1
+                            if self.k < self.year:
+                                raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
+                            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
+                            new_page = response.xpath(xpath).extract()
+                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
+                        new_page = response.urljoin(new_page[0])
+                        self.k -= 1
+                        yield scrapy.Request(new_page,
+                                             callback=self.parse_page,
+                                             priority = -1000,
+                                             meta={'flag':self.k})
+                else:
+                    self.logger.info('Crawling has finished with no errors!')
+            else:
+                new_page = response.urljoin(new_page[0])
+                if 'flag' in response.meta:
+                    self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
+                    yield scrapy.Request(new_page,
+                                         callback=self.parse_page,
+                                         priority = -1000,
+                                         meta={'flag':response.meta['flag']})
+                else:
+                    self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
+                    yield scrapy.Request(new_page,
+                                         callback=self.parse_page,
+                                         priority = -1000,
+                                         meta={'flag':self.k})
+
+    def parse_post(self, response):
+        '''
+        parse post does multiple things:
         1) loads replied-to-comments page one-by-one (for DFS)
         2) call parse_reply on the nested comments
         3) adds simple (not-replied-to) comments
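With the new __init__ above, the comments spider can be pointed either at a single post or at a whole page/group. A hedged sketch (the spider name comments is an assumption, not shown in this diff; the post/page kwargs are the ones handled above):

scrapy crawl comments -a email="EMAIL" -a password="PASSWORD" -a post="https://mbasic.facebook.com/story.php?story_fbid=..." -o comments.csv
scrapy crawl comments -a email="EMAIL" -a password="PASSWORD" -a page="PAGENAME" -a date="2019-01-01" -a lang="it" -o comments.csv

Passing both post and page trips the AttributeError guard added in __init__.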
@@ -37,9 +141,10 @@ class CommentsSpider(FacebookSpider):
             source = reply.xpath('.//h3/a/text()').extract()
             answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
             ans = response.urljoin(answer[::-1][0])
-            self.logger.info('{} nested comment @ page {}'.format(str(response.meta['index']),ans))
+            self.logger.info('{} nested comment'.format(str(response.meta['index'])))
             yield scrapy.Request(ans,
                                  callback=self.parse_reply,
+                                 priority=1000,
                                  meta={'reply_to':source,
                                        'url':response.url,
                                        'index':response.meta['index'],
@@ -49,7 +154,7 @@ class CommentsSpider(FacebookSpider):
         if not response.xpath(path): #prevents from exec
             path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
             for i,reply in enumerate(response.xpath(path2)):
-                self.logger.info('{} regular comment @ page {}'.format(i,response.url))
+                self.logger.info('{} regular comment'.format(i+1))
                 new = ItemLoader(item=CommentsItem(),selector=reply)
                 new.context['lang'] = self.lang
                 new.add_xpath('source','.//h3/a/text()')
@@ -71,7 +176,7 @@ class CommentsSpider(FacebookSpider):
             new_page = response.urljoin(new_page[0])
             self.logger.info('New page to be crawled {}'.format(new_page))
             yield scrapy.Request(new_page,
-                                 callback=self.parse_page,
+                                 callback=self.parse_post,
                                  meta={'index':1,
                                        'group':1})
         else:
@@ -80,7 +185,7 @@ class CommentsSpider(FacebookSpider):
                 new_page = response.urljoin(new_page[0])
                 self.logger.info('New page to be crawled {}'.format(new_page))
                 yield scrapy.Request(new_page,
-                                     callback=self.parse_page,
+                                     callback=self.parse_post,
                                      meta={'index':1,
                                            'group':group_flag})

@@ -88,6 +193,9 @@ class CommentsSpider(FacebookSpider):
         '''
         parse reply to comments, root comment is added if flag
         '''
+        # from scrapy.utils.response import open_in_browser
+        # open_in_browser(response)
+
         if response.meta['flag'] == 'init':
             #parse root comment
             for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
@@ -120,7 +228,7 @@ class CommentsSpider(FacebookSpider):
                 back_page = response.urljoin(back[0])
                 yield scrapy.Request(back_page,
                                      callback=self.parse_reply,
-                                     priority=100,
+                                     priority = 1000,
                                      meta={'reply_to':response.meta['reply_to'],
                                            'flag':'back',
                                            'url':response.meta['url'],
@@ -131,7 +239,7 @@ class CommentsSpider(FacebookSpider):
                 next_reply = response.meta['url']
                 self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
                 yield scrapy.Request(next_reply,
-                                     callback=self.parse_page,
+                                     callback=self.parse_post,
                                      meta={'index':response.meta['index']+1,
                                            'group':response.meta['group']})

@@ -155,7 +263,7 @@ class CommentsSpider(FacebookSpider):
                 back_page = response.urljoin(back[0])
                 yield scrapy.Request(back_page,
                                      callback=self.parse_reply,
-                                     priority=100,
+                                     priority=1000,
                                      meta={'reply_to':response.meta['reply_to'],
                                            'flag':'back',
                                            'url':response.meta['url'],
@@ -166,7 +274,7 @@ class CommentsSpider(FacebookSpider):
                 next_reply = response.meta['url']
                 self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                 yield scrapy.Request(next_reply,
-                                     callback=self.parse_page,
+                                     callback=self.parse_post,
                                      meta={'index':response.meta['index']+1,
                                            'group':response.meta['group']})

fbcrawl/spiders/fbcrawl.py
@@ -4,7 +4,7 @@ import logging
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
 from scrapy.exceptions import CloseSpider
-from fbcrawl.items import FbcrawlItem, parse_date
+from fbcrawl.items import FbcrawlItem, parse_date, parse_date2
 from datetime import datetime

 class FacebookSpider(scrapy.Spider):
@@ -15,7 +15,8 @@ class FacebookSpider(scrapy.Spider):
     custom_settings = {
         'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
                                'reactions','likes','ahah','love','wow', \
-                               'sigh','grrr','comments','post_id','url']
+                               'sigh','grrr','comments','post_id','url'],
+        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
     }

     def __init__(self, *args, **kwargs):
@@ -33,16 +34,19 @@ class FacebookSpider(scrapy.Spider):
             self.logger.info('Email and password provided, will be used to log in')

         #page name parsing (added support for full urls)
-        if 'page' not in kwargs:
-            raise AttributeError('You need to provide a valid page name to crawl!'
-                                 'scrapy fb -a page="PAGENAME"')
-        elif self.page.find('https://www.facebook.com/') != -1:
-            self.page = self.page[25:]
-        elif self.page.find('https://mbasic.facebook.com/') != -1:
-            self.page = self.page[28:]
-        elif self.page.find('https://m.facebook.com/') != -1:
-            self.page = self.page[23:]
+        if 'page' in kwargs:
+            if self.page.find('/groups/') != -1:
+                self.group = 1
+            else:
+                self.group = 0
+            if self.page.find('https://www.facebook.com/') != -1:
+                self.page = self.page[25:]
+            elif self.page.find('https://mbasic.facebook.com/') != -1:
+                self.page = self.page[28:]
+            elif self.page.find('https://m.facebook.com/') != -1:
+                self.page = self.page[23:]
+
+
         #parse date
         if 'date' not in kwargs:
             self.logger.info('Date attribute not provided, scraping date set to 2004-02-04 (fb launch date)')
@@ -148,11 +152,19 @@ class FacebookSpider(scrapy.Spider):
             many_features = post.xpath('./@data-ft').get()
             date = []
             date.append(many_features)
-            date = parse_date(date)
-            current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S')
+            date = parse_date(date,{'lang':self.lang})
+            current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date
+
+            if current_date is None:
+                date_string = post.xpath('.//abbr/text()').get()
+                date = parse_date2([date_string],{'lang':self.lang})
+                current_date = datetime(date.year,date.month,date.day) if date is not None else date
+                date = str(date)

+            #if 'date' argument is reached stop crawling
             if self.date > current_date:
                 raise CloseSpider('Reached date: {}'.format(self.date))

             new = ItemLoader(item=FbcrawlItem(),selector=post)
             if abs(self.count) + 1 > self.max:
                 raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
@@ -161,8 +173,8 @@ class FacebookSpider(scrapy.Spider):
             new.add_value('date',date)
             new.add_xpath('post_id','./@data-ft')
             new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

             #page_url #new.add_value('url',response.url)

             #returns full post-link in a list
             post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
             temp_post = response.urljoin(post[0])
@@ -173,18 +185,24 @@ class FacebookSpider(scrapy.Spider):
         #after few pages have been scraped, the "more" link might disappears
         #if not present look for the highest year not parsed yet
         #click once on the year and go back to clicking "more"
-        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
-        #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+        #new_page is different for groups
+        if self.group == 1:
+            new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
+        else:
+            new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
+            #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^
+
         if not new_page:
-            self.logger.info('[!] "more" link not found, will look for a year')
-            #self.k is the year that we look for in the link.
+            self.logger.info('[!] "more" link not found, will look for a "year" link')
+            #self.k is the year link that we look for
             if response.meta['flag'] == self.k and self.k >= self.year:
                 xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                 new_page = response.xpath(xpath).extract()
                 if new_page:
                     new_page = response.urljoin(new_page[0])
                     self.k -= 1
-                    self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page))
+                    self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                     yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
                 else:
                     while not new_page: #sometimes the years are skipped this handles small year gaps
@@ -194,7 +212,7 @@ class FacebookSpider(scrapy.Spider):
                             raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                         xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                         new_page = response.xpath(xpath).extract()
-                    self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page))
+                    self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                     new_page = response.urljoin(new_page[0])
                     self.k -= 1
                     yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})