Adding features: crawl comments from page, crawl posts and comments from groups

parent f6e8545236
commit d875e89c52

fbcrawl/items.py
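With this change the crawlers can be pointed at groups as well as pages, and the comments spider can start either from a whole page or from a single post. A rough usage sketch (the spider name 'fb' is taken from an error string further down in this diff; the name 'comments' for CommentsSpider and the exact argument spelling are assumptions, not shown here):

    scrapy crawl fb -a email="EMAIL" -a password="PASSWORD" -a page="https://mbasic.facebook.com/groups/GROUPNAME" -a lang="it"
    scrapy crawl comments -a email="EMAIL" -a password="PASSWORD" -a page="PAGENAME" -a lang="it"
    scrapy crawl comments -a email="EMAIL" -a password="PASSWORD" -a post="POST_URL" -a lang="it"

Passing both page and post to the comments spider raises an AttributeError, as enforced in CommentsSpider.__init__ below.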
@@ -49,14 +49,14 @@ def reactions_strip(string,loader_context):
            while newstring.rfind(',') != -1:
                newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
            return newstring
        # #Mark and other 254,134
        # elif newstring.split()[::-1][1].isdigit():
        # friends = newstring.count(' and ') + newstring.count(',')
        # newstring = newstring.split()[::-1][1]
        # while newstring.rfind(',') != -1:
        # newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
        # return int(newstring) + friends
        # #Philip and 1K others
        #Mark and other 254,134
        elif newstring.split()[::-1][1].isdigit():
            friends = newstring.count(' and ') + newstring.count(',')
            newstring = newstring.split()[::-1][1]
            while newstring.rfind(',') != -1:
                newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
            return int(newstring) + friends
        #Philip and 1K others
        else:
            return newstring
        else:
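# Illustrative sketch, not part of the commit (input shape assumed from the item loaders below):
#   reactions_strip(['Mark and 54 others'], {'lang': 'en'})  # -> 55, via the newly
#   uncommented elif branch (1 friend named before ' and ' + int('54'))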
@@ -79,7 +79,7 @@ def url_strip(url):
    else:
        return fullurl

def parse_date(date):
def parse_date(date,loader_context):
    import json

    d = json.loads(date[0]) #nested dict of features
@@ -99,7 +99,463 @@ def parse_date(date):
            flat_d[key] = value

    #returns timestamp in localtime conversion from linux timestamp UTC
    return str(datetime.fromtimestamp(flat_d['publish_time']))
    ret = str(datetime.fromtimestamp(flat_d['publish_time'])) if 'publish_time' in flat_d else None
    return ret

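# A minimal sketch of the new behaviour of parse_date (payload assumed flat for brevity;
# the real data-ft blob is nested, but only the flattened 'publish_time' key matters):
#   parse_date(['{"publish_time": 1546300800}'], {'lang': 'it'})  # -> '2019-01-01 00:00:00' on a UTC machine
#   parse_date(['{"some_other_key": 1}'], {'lang': 'it'})         # -> None instead of a KeyError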
def parse_date2(init_date,loader_context):
    lang = loader_context['lang']
    # =============================================================================
    # Italian - status:final
    # =============================================================================
    if lang == 'it':
        months = {
            'gennaio':1,
            'febbraio':2,
            'marzo':3,
            'aprile':4,
            'maggio':5,
            'giugno':6,
            'luglio':7,
            'agosto':8,
            'settembre':9,
            'ottobre':10,
            'novembre':11,
            'dicembre':12
        }

        months_abbr = {
            'gen':1,
            'feb':2,
            'mar':3,
            'apr':4,
            'mag':5,
            'giu':6,
            'lug':7,
            'ago':8,
            'set':9,
            'ott':10,
            'nov':11,
            'dic':12
        }

        giorni = {
            'lunedì':0,
            'martedì':1,
            'mercoledì':2,
            'giovedì':3,
            'venerdì':4,
            'sabato':5,
            'domenica':6
        }

        date = init_date[0].split()
        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today

        l = len(date)

        #sanity check
        if l == 0:
            return 'Error: no data'

        #adesso, ieri, 4h, 50min
        elif l == 1:
            if date[0].isalpha():
                if date[0].lower() == 'ieri':
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    #check that yesterday was not in another month
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                elif date[0].lower() == 'adesso':
                    return datetime(year,month,day).date() #return today
                else: #not recognized, (return date or init_date)
                    return date
            else:
                #4h, 50min (exploit future parsing)
                l = 2
                new_date = [x for x in date[0] if x.isdigit()]
                date[0] = ''.join(new_date)
                new_date = [x for x in date[0] if not(x.isdigit())]
                date[1] = ''.join(new_date)
        # l = 2
        elif l == 2:
            #22 min (oggi)
            if date[1] == 'min':
                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
                    return datetime(year,month,day).date()
                #22 min (ieri)
                else:
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                    return datetime(year,month,day).date()
            #4 h (oggi)
            elif date[1] == 'h':
                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
                    return datetime(year,month,day).date()
                #4 h (ieri)
                else:
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                    return datetime(year,month,day).date()
            #2 gen
            elif len(date[1]) == 3 and date[1].isalpha():
                day = int(date[0])
                month = months_abbr[date[1].lower()]
                return datetime(year,month,day).date()
            #2 gennaio
            elif len(date[1]) > 3 and date[1].isalpha():
                day = int(date[0])
                month = months[date[1]]
                return datetime(year,month,day).date()
            #parsing failed
            else:
                return date
        # l = 3
        elif l == 3:
            #21 giu 2017
            if len(date[1]) == 3 and date[2].isdigit():
                day = int(date[0])
                month = months_abbr[date[1]]
                year = int(date[2])
                return datetime(year,month,day).date()
            #21 giugno 2017
            elif len(date[1]) > 3 and date[2].isdigit():
                day = int(date[0])
                month = months[date[1]]
                year = int(date[2])
                return datetime(year,month,day).date()
            #9 ore fa
            elif date[0].isdigit() and date[1][:2] == 'or':
                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
                    return datetime(year,month,day).date()
                #9 ore fa (ieri)
                else:
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                    return datetime(year,month,day).date()
            #7 minuti fa
            elif date[0].isdigit() and date[1][:3] == 'min':
                return datetime(year,month,day).date()

            #ieri alle 20:45
            elif date[0].lower() == 'ieri' and date[1] == 'alle':
                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                return datetime(year,month,day).date()
            #oggi alle 11:11
            elif date[0].lower() == 'oggi' and date[1] == 'alle':
                return datetime(year,month,day).date()
            #lunedì alle 12:34
            elif date[0].isalpha() and date[1] == 'alle':
                today = datetime.now().weekday() #today as a weekday
                weekday = giorni[date[0].lower()] #day to be match as number weekday
                #weekday is chronologically always lower than day
                delta = today - weekday
                if delta >= 0:
                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
                    return datetime(year,month,day).date()
                #lunedì = 0 sabato = 6, mar 1 ven 5
                else:
                    delta += 8
                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
                    return datetime(year,month,day).date()
            #parsing failed
            else:
                return date
        # l = 4
        elif l == 4:
            #Ieri alle ore 23:32
            if date[0].lower() == 'ieri' and date[1] == 'alle':
                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                return datetime(year,month,day).date()
            #domenica alle ore 19:29
            elif date[0].isalpha() and date[1] == 'alle':
                today = datetime.now().weekday() #today as a weekday
                weekday = giorni[date[0].lower()] #day to be match as number weekday
                #weekday is chronologically always lower than day
                delta = today - weekday
                if delta >= 0:
                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
                    return datetime(year,month,day).date()
                #lunedì = 0 sabato = 6, mar 1 ven 5
                else:
                    delta += 8
                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
                    return datetime(year,month,day).date()
            #parsing failed
            else:
                return date
        # l = 5
        elif l == 5:
            if date[2] == 'alle':
                #29 feb alle ore 21:49
                if len(date[1]) == 3:
                    day = int(date[0])
                    month = months_abbr[date[1].lower()]
                    return datetime(year,month,day).date()
                #29 febbraio alle ore 21:49
                else:
                    day = int(date[0])
                    month = months[date[1].lower()]
                    return datetime(year,month,day).date()
            #parsing failed
            else:
                return date
        # l = 6
        elif l == 6:
            if date[3] == 'alle':
                #29 feb 2016 alle ore 21:49
                if len(date[1]) == 3:
                    day = int(date[0])
                    month = months_abbr[date[1].lower()]
                    year = int(date[2])
                    return datetime(year,month,day).date()
                #29 febbraio 2016 alle ore 21:49
                else:
                    day = int(date[0])
                    month = months[date[1].lower()]
                    year = int(date[2])
                    return datetime(year,month,day).date()
            #parsing failed
            else:
                return date
    # =============================================================================
    # English - status:beta
    # =============================================================================
    elif lang == 'en':
        months = {
            'january':1,
            'february':2,
            'march':3,
            'april':4,
            'may':5,
            'june':6,
            'july':7,
            'august':8,
            'september':9,
            'october':10,
            'november':11,
            'december':12
        }

        months_abbr = {
            'jan':1,
            'feb':2,
            'mar':3,
            'apr':4,
            'may':5,
            'jun':6,
            'jul':7,
            'aug':8,
            'sep':9,
            'oct':10,
            'nov':11,
            'dec':12
        }

        days = {
            'monday':0,
            'tuesday':1,
            'wednesday':2,
            'thursday':3,
            'friday':4,
            'saturday':5,
            'sunday':6
        }

        date = init_date[0].split()
        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today

        l = len(date)

        #sanity check
        if l == 0:
            return 'Error: no data'

        #Yesterday, Now, 4hr, 50mins
        elif l == 1:
            if date[0].isalpha():
                if date[0].lower() == 'yesterday':
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    #check that yesterday was not in another month
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                elif date[0].lower() == 'now':
                    return datetime(year,month,day).date() #return today
                else: #not recognized, (return date or init_date)
                    return date
            else:
                #4h, 50min (exploit future parsing)
                l = 2
                new_date = [x for x in date[0] if x.isdigit()]
                date[0] = ''.join(new_date)
                new_date = [x for x in date[0] if not(x.isdigit())]
                date[1] = ''.join(new_date)
        # l = 2
        elif l == 2:
            if date[1] == 'now':
                return datetime(year,month,day).date()
            #22 min (ieri)
            if date[1] == 'min' or date[1] == 'mins':
                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                    return datetime(year,month,day).date()
                #22 min (oggi)
                else:
                    return datetime(year,month,day).date()

            #4 h (ieri)
            elif date[1] == 'hr' or date[1] == 'hrs':
                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                    return datetime(year,month,day).date()
                #4 h (oggi)
                else:
                    return datetime(year,month,day).date()

            #2 jan
            elif len(date[1]) == 3 and date[1].isalpha():
                day = int(date[0])
                month = months_abbr[date[1].lower()]
                return datetime(year,month,day).date()
            #2 january
            elif len(date[1]) > 3 and date[1].isalpha():
                day = int(date[0])
                month = months[date[1]]
                return datetime(year,month,day).date()
            #jan 2
            elif len(date[0]) == 3 and date[0].isalpha():
                day = int(date[1])
                month = months_abbr[date[0].lower()]
                return datetime(year,month,day).date()
            #january 2
            elif len(date[0]) > 3 and date[0].isalpha():
                day = int(date[1])
                month = months[date[0]]
                return datetime(year,month,day).date()
            #parsing failed
            else:
                return date
            return date
        # l = 3
        elif l == 3:
            #5 hours ago
            if date[2] == 'ago':
                if date[1] == 'hour' or date[1] == 'hours' or date[1] == 'hr' or date[1] == 'hrs':
                    # 5 hours ago (yesterday)
                    if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
                        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                        month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                        return datetime(year,month,day).date()
                    # 5 hours ago (today)
                    else:
                        return datetime(year,month,day).date()
                #10 minutes ago
                elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins':
                    #22 minutes ago (yesterday)
                    if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
                        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                        month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                        return datetime(year,month,day).date()
                    #22 minutes ago (today)
                    else:
                        return datetime(year,month,day).date()
                else:
                    return date
            else:
                #21 Jun 2017
                if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit():
                    day = int(date[0])
                    month = months_abbr[date[1].lower()]
                    year = int(date[2])
                    return datetime(year,month,day).date()
                #21 June 2017
                elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit():
                    day = int(date[0])
                    month = months[date[1].lower()]
                    year = int(date[2])
                    return datetime(year,month,day).date()
                #Jul 11, 2016
                elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha():
                    day = int(date[1][:-1])
                    month = months_abbr[date[0].lower()]
                    year = int(date[2])
                    return datetime(year,month,day).date()
                #parsing failed
                else:
                    return date
        # l = 4
        elif l == 4:
            #yesterday at 23:32 PM
            if date[0].lower() == 'yesterday' and date[1] == 'at':
                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
                return datetime(year,month,day).date()
            #Thursday at 4:27 PM
            elif date[1] == 'at':
                today = datetime.now().weekday() #today as a weekday
                weekday = days[date[0].lower()] #day to be match as number weekday
                #weekday is chronologically always lower than day
                delta = today - weekday
                if delta >= 0:
                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
                    return datetime(year,month,day).date()
                #monday = 0 saturday = 6
                else:
                    delta += 8
                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
                    return datetime(year,month,day).date()
            #parsing failed
            else:
                return date
        # l = 5
        elif l == 5:
            if date[2] == 'at':
                #Jan 29 at 10:00 PM
                if len(date[0]) == 3:
                    day = int(date[1])
                    month = months_abbr[date[0].lower()]
                    return datetime(year,month,day).date()
                #29 febbraio alle ore 21:49
                else:
                    day = int(date[1])
                    month = months[date[0].lower()]
                    return datetime(year,month,day).date()
            #parsing failed
            else:
                return date
        # l = 6
        elif l == 6:
            if date[3] == 'at':
                date[1]
                #Aug 25, 2016 at 7:00 PM
                if len(date[0]) == 3:
                    day = int(date[1][:-1])
                    month = months_abbr[date[0].lower()]
                    year = int(date[2])
                    return datetime(year,month,day).date()
                #August 25, 2016 at 7:00 PM
                else:
                    day = int(date[1][:-1])
                    month = months[date[0].lower()]
                    year = int(date[2])
                    return datetime(year,month,day).date()
            #parsing failed
            else:
                return date
        # l > 6
        #parsing failed - l too big
        else:
            return date
    #parsing failed - language not supported
    else:
        return init_date

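# Illustrative calls for parse_date2 (sketch, not part of the commit; relative dates are
# resolved against datetime.now()):
#   parse_date2(['21 giu 2017'], {'lang': 'it'})      # -> datetime.date(2017, 6, 21)
#   parse_date2(['ieri alle 20:45'], {'lang': 'it'})  # -> yesterday's date
#   parse_date2(['Jul 11, 2016'], {'lang': 'en'})     # -> datetime.date(2016, 7, 11)
#   parse_date2(['4 hrs'], {'lang': 'en'})            # -> today (or yesterday, across midnight)
# Strings that match no branch fall through and the split token list is returned as-is.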
def id_strip(post_id):
    import json
@@ -122,11 +578,21 @@ class FbcrawlItem(scrapy.Item):
    likes = scrapy.Field(
        output_processor=reactions_strip
    )
    ahah = scrapy.Field()
    love = scrapy.Field()
    wow = scrapy.Field()
    sigh = scrapy.Field()
    grrr = scrapy.Field()
    ahah = scrapy.Field(
        output_processor=reactions_strip
    )
    love = scrapy.Field(
        output_processor=reactions_strip
    )
    wow = scrapy.Field(
        output_processor=reactions_strip
    )
    sigh = scrapy.Field(
        output_processor=reactions_strip
    )
    grrr = scrapy.Field(
        output_processor=reactions_strip
    )
    share = scrapy.Field() # num of shares
    url = scrapy.Field(
        output_processor=url_strip
@@ -140,7 +606,7 @@ class CommentsItem(scrapy.Item):
    source = scrapy.Field()
    reply_to=scrapy.Field()
    date = scrapy.Field( # when was the post published
        output_processor=parse_date
        output_processor=parse_date2
    )
    text = scrapy.Field(
        output_processor=Join(separator=u'')
@@ -153,9 +619,9 @@ class CommentsItem(scrapy.Item):
    )
    source_url = scrapy.Field()
    url = scrapy.Field()
    #ahah = scrapy.Field()
    #love = scrapy.Field()
    #wow = scrapy.Field()
    #sigh = scrapy.Field()
    #grrr = scrapy.Field()
    #share = scrapy.Field() # num of shares
    ahah = scrapy.Field()
    love = scrapy.Field()
    wow = scrapy.Field()
    sigh = scrapy.Field()
    grrr = scrapy.Field()
    share = scrapy.Field() # num of shares
fbcrawl/settings.py
@@ -88,6 +88,7 @@ DOWNLOAD_DELAY = 3
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
#FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
URLLENGTH_LIMIT = 99999
FEED_EXPORT_ENCODING = 'utf-8'
DUPEFILTER_DEBUG = True
LOG_LEVEL = 'INFO'
fbcrawl/spiders/comments.py
@@ -1,9 +1,11 @@
import scrapy

from scrapy.loader import ItemLoader
from scrapy.exceptions import CloseSpider
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import CommentsItem
from fbcrawl.items import CommentsItem, parse_date, parse_date2

from datetime import datetime

class CommentsSpider(FacebookSpider):
    """
@@ -14,15 +16,117 @@ class CommentsSpider(FacebookSpider):
        'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
                               'source_url','url'],
        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS':1,
        'CONCURRENT_REQUESTS' : 1
    }

    def __init__(self, *args, **kwargs):
        if 'post' in kwargs and 'page' in kwargs:
            raise AttributeError('You need to specifiy only one between post and page')
        elif 'post' in kwargs:
            self.page = kwargs['post']
            self.type = 'post'
        elif 'page' in kwargs:
            self.type = 'page'

        super().__init__(*args,**kwargs)

    def parse_page(self, response):
        '''
        parse page does multiple things:
        '''
        if self.type == 'post':
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_post,
                                 priority=10,
                                 meta={'index':1})
        elif self.type == 'page':
            #select all posts
            for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
                many_features = post.xpath('./@data-ft').get()
                date = []
                date.append(many_features)
                date = parse_date(date,{'lang':self.lang})
                current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date

                if current_date is None:
                    date_string = post.xpath('.//abbr/text()').get()
                    date = parse_date2([date_string],{'lang':self.lang})
                    current_date = datetime(date.year,date.month,date.day) if date is not None else date
                    date = str(date)

                if abs(self.count) + 1 > self.max:
                    raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
                self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))

                #returns full post-link in a list
                post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
                temp_post = response.urljoin(post[0])
                self.count -= 1
                yield scrapy.Request(temp_post,
                                     self.parse_post,
                                     priority = self.count,
                                     meta={'index':1})

            #load following page, try to click on "more"
            #after few pages have been scraped, the "more" link might disappears
            #if not present look for the highest year not parsed yet
            #click once on the year and go back to clicking "more"

            #new_page is different for groups
            if self.group == 1:
                new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
            else:
                new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
                #this is why lang is needed

            if not new_page:
                self.logger.info('[!] "more" link not found, will look for a "year" link')
                #self.k is the year link that we look for
                if response.meta['flag'] == self.k and self.k >= self.year:
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                    if new_page:
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority = -1000,
                                             meta={'flag':self.k})
                    else:
                        while not new_page: #sometimes the years are skipped this handles small year gaps
                            self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
                            self.k -= 1
                            if self.k < self.year:
                                raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                            new_page = response.xpath(xpath).extract()
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority = -1000,
                                             meta={'flag':self.k})
                else:
                    self.logger.info('Crawling has finished with no errors!')
            else:
                new_page = response.urljoin(new_page[0])
                if 'flag' in response.meta:
                    self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority = -1000,
                                         meta={'flag':response.meta['flag']})
                else:
                    self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority = -1000,
                                         meta={'flag':self.k})

    def parse_post(self, response):
        '''
        parse post does multiple things:
        1) loads replied-to-comments page one-by-one (for DFS)
        2) call parse_reply on the nested comments
        3) adds simple (not-replied-to) comments
@@ -37,9 +141,10 @@ class CommentsSpider(FacebookSpider):
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
            ans = response.urljoin(answer[::-1][0])
            self.logger.info('{} nested comment @ page {}'.format(str(response.meta['index']),ans))
            self.logger.info('{} nested comment'.format(str(response.meta['index'])))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={'reply_to':source,
                                       'url':response.url,
                                       'index':response.meta['index'],
@@ -49,7 +154,7 @@ class CommentsSpider(FacebookSpider):
        if not response.xpath(path): #prevents from exec
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for i,reply in enumerate(response.xpath(path2)):
                self.logger.info('{} regular comment @ page {}'.format(i,response.url))
                self.logger.info('{} regular comment'.format(i+1))
                new = ItemLoader(item=CommentsItem(),selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source','.//h3/a/text()')
@@ -71,7 +176,7 @@ class CommentsSpider(FacebookSpider):
            new_page = response.urljoin(new_page[0])
            self.logger.info('New page to be crawled {}'.format(new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 callback=self.parse_post,
                                 meta={'index':1,
                                       'group':1})
        else:
@@ -80,7 +185,7 @@ class CommentsSpider(FacebookSpider):
            new_page = response.urljoin(new_page[0])
            self.logger.info('New page to be crawled {}'.format(new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 callback=self.parse_post,
                                 meta={'index':1,
                                       'group':group_flag})

@@ -88,6 +193,9 @@ class CommentsSpider(FacebookSpider):
        '''
        parse reply to comments, root comment is added if flag
        '''
        # from scrapy.utils.response import open_in_browser
        # open_in_browser(response)

        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
@@ -120,7 +228,7 @@ class CommentsSpider(FacebookSpider):
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=100,
                                     priority = 1000,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
@@ -131,7 +239,7 @@ class CommentsSpider(FacebookSpider):
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_page,
                                     callback=self.parse_post,
                                     meta={'index':response.meta['index']+1,
                                           'group':response.meta['group']})

@@ -155,7 +263,7 @@ class CommentsSpider(FacebookSpider):
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=100,
                                     priority=1000,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
@@ -166,7 +274,7 @@ class CommentsSpider(FacebookSpider):
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_page,
                                     callback=self.parse_post,
                                     meta={'index':response.meta['index']+1,
                                           'group':response.meta['group']})
fbcrawl/spiders/fbcrawl.py
@@ -4,7 +4,7 @@ import logging
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from scrapy.exceptions import CloseSpider
from fbcrawl.items import FbcrawlItem, parse_date
from fbcrawl.items import FbcrawlItem, parse_date, parse_date2
from datetime import datetime

class FacebookSpider(scrapy.Spider):
@@ -15,7 +15,8 @@ class FacebookSpider(scrapy.Spider):
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
                               'reactions','likes','ahah','love','wow', \
                               'sigh','grrr','comments','post_id','url']
                               'sigh','grrr','comments','post_id','url'],
        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
    }

    def __init__(self, *args, **kwargs):
@@ -33,16 +34,19 @@ class FacebookSpider(scrapy.Spider):
            self.logger.info('Email and password provided, will be used to log in')

        #page name parsing (added support for full urls)
        if 'page' not in kwargs:
            raise AttributeError('You need to provide a valid page name to crawl!'
                                 'scrapy fb -a page="PAGENAME"')
        elif self.page.find('https://www.facebook.com/') != -1:
            self.page = self.page[25:]
        elif self.page.find('https://mbasic.facebook.com/') != -1:
            self.page = self.page[28:]
        elif self.page.find('https://m.facebook.com/') != -1:
            self.page = self.page[23:]

        if 'page' in kwargs:
            if self.page.find('/groups/') != -1:
                self.group = 1
            else:
                self.group = 0
            if self.page.find('https://www.facebook.com/') != -1:
                self.page = self.page[25:]
            elif self.page.find('https://mbasic.facebook.com/') != -1:
                self.page = self.page[28:]
            elif self.page.find('https://m.facebook.com/') != -1:
                self.page = self.page[23:]

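        # Illustrative sketch of the normalisation above (group and page names are hypothetical):
        #   page='https://www.facebook.com/groups/GROUPNAME' -> self.page='groups/GROUPNAME', self.group=1
        #   page='https://mbasic.facebook.com/PAGENAME'      -> self.page='PAGENAME',         self.group=0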
        #parse date
        if 'date' not in kwargs:
            self.logger.info('Date attribute not provided, scraping date set to 2004-02-04 (fb launch date)')
@@ -148,11 +152,19 @@ class FacebookSpider(scrapy.Spider):
            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date)
            current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S')

            date = parse_date(date,{'lang':self.lang})
            current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string],{'lang':self.lang})
                current_date = datetime(date.year,date.month,date.day) if date is not None else date
                date = str(date)

            #if 'date' argument is reached stop crawling
            if self.date > current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))

            new = ItemLoader(item=FbcrawlItem(),selector=post)
            if abs(self.count) + 1 > self.max:
                raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
@@ -161,8 +173,8 @@ class FacebookSpider(scrapy.Spider):
            new.add_value('date',date)
            new.add_xpath('post_id','./@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

            #page_url #new.add_value('url',response.url)

            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
@@ -173,18 +185,24 @@ class FacebookSpider(scrapy.Spider):
        #after few pages have been scraped, the "more" link might disappears
        #if not present look for the highest year not parsed yet
        #click once on the year and go back to clicking "more"
        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
        #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^

        #new_page is different for groups
        if self.group == 1:
            new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
        else:
            new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
            #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^

        if not new_page:
            self.logger.info('[!] "more" link not found, will look for a year')
            #self.k is the year that we look for in the link.
            self.logger.info('[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page))
                    self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
                else:
                    while not new_page: #sometimes the years are skipped this handles small year gaps
@@ -194,7 +212,7 @@ class FacebookSpider(scrapy.Spider):
                            raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page))
                    self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})