Adding features: crawl comments from page, crawl posts and comments from groups

rugantio 2019-05-11 15:22:56 +02:00
parent f6e8545236
commit d875e89c52
8 changed files with 648 additions and 55 deletions
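The two headline features (group crawling and per-post/per-page comment crawling) are driven through spider arguments. Below is a minimal, hedged sketch of how they might be exercised programmatically; it is NOT part of the commit. The FacebookSpider import path is confirmed by this diff, while the CommentsSpider module path, the credentials, URLs and the 'date'/'lang' values are placeholders or assumptions. The command-line equivalent would be the usual "scrapy crawl ... -a key=value" invocation hinted at by the spider's own error message ('scrapy fb -a page="PAGENAME"').

# --- usage sketch, not part of the commit ---
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from fbcrawl.spiders.fbcrawl import FacebookSpider      # path confirmed by the imports in this diff
from fbcrawl.spiders.comments import CommentsSpider     # assumed module path

process = CrawlerProcess(get_project_settings())

# New: crawl posts from a group. __init__ now sets self.group = 1 when the page URL
# contains '/groups/', and parse_page then switches to the group-specific "more" xpath.
process.crawl(FacebookSpider,
              email='me@example.com',                                   # placeholder
              password='secret',                                        # placeholder
              page='https://www.facebook.com/groups/0000000000',        # placeholder group URL
              date='2019-01-01',                                        # assumed YYYY-MM-DD format
              lang='it')

# New: crawl the comments of a single post; 'post' and 'page' are mutually exclusive
# in the new CommentsSpider.__init__ (passing 'page' instead walks every post of the page).
process.crawl(CommentsSpider,
              email='me@example.com',
              password='secret',
              post='https://mbasic.facebook.com/story.php?story_fbid=...',  # placeholder post URL
              lang='it')

process.start()
# --- end of sketch ---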

fbcrawl/items.py

@@ -49,14 +49,14 @@ def reactions_strip(string,loader_context):
while newstring.rfind(',') != -1:
newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
return newstring
-# #Mark and other 254,134
-# elif newstring.split()[::-1][1].isdigit():
-# friends = newstring.count(' and ') + newstring.count(',')
-# newstring = newstring.split()[::-1][1]
-# while newstring.rfind(',') != -1:
-# newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
-# return int(newstring) + friends
-# #Philip and 1K others
+#Mark and other 254,134
+elif newstring.split()[::-1][1].isdigit():
+friends = newstring.count(' and ') + newstring.count(',')
+newstring = newstring.split()[::-1][1]
+while newstring.rfind(',') != -1:
+newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
+return int(newstring) + friends
+#Philip and 1K others
else:
return newstring
else:
@@ -79,7 +79,7 @@ def url_strip(url):
else:
return fullurl
-def parse_date(date):
+def parse_date(date,loader_context):
import json
d = json.loads(date[0]) #nested dict of features
@@ -99,7 +99,463 @@ def parse_date(date):
flat_d[key] = value
#returns timestamp in localtime conversion from linux timestamp UTC
-return str(datetime.fromtimestamp(flat_d['publish_time']))
+ret = str(datetime.fromtimestamp(flat_d['publish_time'])) if 'publish_time' in flat_d else None
+return ret
def parse_date2(init_date,loader_context):
lang = loader_context['lang']
# =============================================================================
# Italian - status:final
# =============================================================================
if lang == 'it':
months = {
'gennaio':1,
'febbraio':2,
'marzo':3,
'aprile':4,
'maggio':5,
'giugno':6,
'luglio':7,
'agosto':8,
'settembre':9,
'ottobre':10,
'novembre':11,
'dicembre':12
}
months_abbr = {
'gen':1,
'feb':2,
'mar':3,
'apr':4,
'mag':5,
'giu':6,
'lug':7,
'ago':8,
'set':9,
'ott':10,
'nov':11,
'dic':12
}
giorni = {
'lunedì':0,
'martedì':1,
'mercoledì':2,
'giovedì':3,
'venerdì':4,
'sabato':5,
'domenica':6
}
date = init_date[0].split()
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
l = len(date)
#sanity check
if l == 0:
return 'Error: no data'
#adesso, ieri, 4h, 50min
elif l == 1:
if date[0].isalpha():
if date[0].lower() == 'ieri':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
#check that yesterday was not in another month
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
elif date[0].lower() == 'adesso':
return datetime(year,month,day).date() #return today
else: #not recognized, (return date or init_date)
return date
else:
#4h, 50min (exploit future parsing)
l = 2
new_date = [x for x in date[0] if x.isdigit()]
date[0] = ''.join(new_date)
new_date = [x for x in date[0] if not(x.isdigit())]
date[1] = ''.join(new_date)
# l = 2
elif l == 2:
#22 min (oggi)
if date[1] == 'min':
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#22 min (ieri)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#4 h (oggi)
elif date[1] == 'h':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#4 h (ieri)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#2 gen
elif len(date[1]) == 3 and date[1].isalpha():
day = int(date[0])
month = months_abbr[date[1].lower()]
return datetime(year,month,day).date()
#2 gennaio
elif len(date[1]) > 3 and date[1].isalpha():
day = int(date[0])
month = months[date[1]]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 3
elif l == 3:
#21 giu 2017
if len(date[1]) == 3 and date[2].isdigit():
day = int(date[0])
month = months_abbr[date[1]]
year = int(date[2])
return datetime(year,month,day).date()
#21 giugno 2017
elif len(date[1]) > 3 and date[2].isdigit():
day = int(date[0])
month = months[date[1]]
year = int(date[2])
return datetime(year,month,day).date()
#9 ore fa
elif date[0].isdigit() and date[1][:2] == 'or':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#9 ore fa (ieri)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#7 minuti fa
elif date[0].isdigit() and date[1][:3] == 'min':
return datetime(year,month,day).date()
#ieri alle 20:45
elif date[0].lower() == 'ieri' and date[1] == 'alle':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#oggi alle 11:11
elif date[0].lower() == 'oggi' and date[1] == 'alle':
return datetime(year,month,day).date()
#lunedì alle 12:34
elif date[0].isalpha() and date[1] == 'alle':
today = datetime.now().weekday() #today as a weekday
weekday = giorni[date[0].lower()] #day to be match as number weekday
#weekday is chronologically always lower than day
delta = today - weekday
if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#lunedì = 0 sabato = 6, mar 1 ven 5
else:
delta += 8
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 4
elif l == 4:
#Ieri alle ore 23:32
if date[0].lower() == 'ieri' and date[1] == 'alle':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#domenica alle ore 19:29
elif date[0].isalpha() and date[1] == 'alle':
today = datetime.now().weekday() #today as a weekday
weekday = giorni[date[0].lower()] #day to be match as number weekday
#weekday is chronologically always lower than day
delta = today - weekday
if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#lunedì = 0 sabato = 6, mar 1 ven 5
else:
delta += 8
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 5
elif l == 5:
if date[2] == 'alle':
#29 feb alle ore 21:49
if len(date[1]) == 3:
day = int(date[0])
month = months_abbr[date[1].lower()]
return datetime(year,month,day).date()
#29 febbraio alle ore 21:49
else:
day = int(date[0])
month = months[date[1].lower()]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 6
elif l == 6:
if date[3] == 'alle':
#29 feb 2016 alle ore 21:49
if len(date[1]) == 3:
day = int(date[0])
month = months_abbr[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#29 febbraio 2016 alle ore 21:49
else:
day = int(date[0])
month = months[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# =============================================================================
# English - status:beta
# =============================================================================
elif lang == 'en':
months = {
'january':1,
'february':2,
'march':3,
'april':4,
'may':5,
'june':6,
'july':7,
'august':8,
'september':9,
'october':10,
'november':11,
'december':12
}
months_abbr = {
'jan':1,
'feb':2,
'mar':3,
'apr':4,
'may':5,
'jun':6,
'jul':7,
'aug':8,
'sep':9,
'oct':10,
'nov':11,
'dec':12
}
days = {
'monday':0,
'tuesday':1,
'wednesday':2,
'thursday':3,
'friday':4,
'saturday':5,
'sunday':6
}
date = init_date[0].split()
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
l = len(date)
#sanity check
if l == 0:
return 'Error: no data'
#Yesterday, Now, 4hr, 50mins
elif l == 1:
if date[0].isalpha():
if date[0].lower() == 'yesterday':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
#check that yesterday was not in another month
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
elif date[0].lower() == 'now':
return datetime(year,month,day).date() #return today
else: #not recognized, (return date or init_date)
return date
else:
#4h, 50min (exploit future parsing)
l = 2
new_date = [x for x in date[0] if x.isdigit()]
date[0] = ''.join(new_date)
new_date = [x for x in date[0] if not(x.isdigit())]
date[1] = ''.join(new_date)
# l = 2
elif l == 2:
if date[1] == 'now':
return datetime(year,month,day).date()
#22 min (ieri)
if date[1] == 'min' or date[1] == 'mins':
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#22 min (oggi)
else:
return datetime(year,month,day).date()
#4 h (ieri)
elif date[1] == 'hr' or date[1] == 'hrs':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#4 h (oggi)
else:
return datetime(year,month,day).date()
#2 jan
elif len(date[1]) == 3 and date[1].isalpha():
day = int(date[0])
month = months_abbr[date[1].lower()]
return datetime(year,month,day).date()
#2 january
elif len(date[1]) > 3 and date[1].isalpha():
day = int(date[0])
month = months[date[1]]
return datetime(year,month,day).date()
#jan 2
elif len(date[0]) == 3 and date[0].isalpha():
day = int(date[1])
month = months_abbr[date[0].lower()]
return datetime(year,month,day).date()
#january 2
elif len(date[0]) > 3 and date[0].isalpha():
day = int(date[1])
month = months[date[0]]
return datetime(year,month,day).date()
#parsing failed
else:
return date
return date
# l = 3
elif l == 3:
#5 hours ago
if date[2] == 'ago':
if date[1] == 'hour' or date[1] == 'hours' or date[1] == 'hr' or date[1] == 'hrs':
# 5 hours ago (yesterday)
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
# 5 hours ago (today)
else:
return datetime(year,month,day).date()
#10 minutes ago
elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins':
#22 minutes ago (yesterday)
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#22 minutes ago (today)
else:
return datetime(year,month,day).date()
else:
return date
else:
#21 Jun 2017
if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit():
day = int(date[0])
month = months_abbr[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#21 June 2017
elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit():
day = int(date[0])
month = months[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#Jul 11, 2016
elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha():
day = int(date[1][:-1])
month = months_abbr[date[0].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 4
elif l == 4:
#yesterday at 23:32 PM
if date[0].lower() == 'yesterday' and date[1] == 'at':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#Thursday at 4:27 PM
elif date[1] == 'at':
today = datetime.now().weekday() #today as a weekday
weekday = days[date[0].lower()] #day to be match as number weekday
#weekday is chronologically always lower than day
delta = today - weekday
if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#monday = 0 saturday = 6
else:
delta += 8
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 5
elif l == 5:
if date[2] == 'at':
#Jan 29 at 10:00 PM
if len(date[0]) == 3:
day = int(date[1])
month = months_abbr[date[0].lower()]
return datetime(year,month,day).date()
#29 febbraio alle ore 21:49
else:
day = int(date[1])
month = months[date[0].lower()]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 6
elif l == 6:
if date[3] == 'at':
date[1]
#Aug 25, 2016 at 7:00 PM
if len(date[0]) == 3:
day = int(date[1][:-1])
month = months_abbr[date[0].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#August 25, 2016 at 7:00 PM
else:
day = int(date[1][:-1])
month = months[date[0].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l > 6
#parsing failed - l too big
else:
return date
#parsing failed - language not supported
else:
return init_date
def id_strip(post_id):
import json
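For orientation, here is a small, hedged sanity-check of the new parse_date2 helper added above; it is not part of the commit. The expected values in the comments are read off the branches of the function, and the relative forms ('4 h', '22 min') resolve against the machine clock at run time.

# --- example, not part of the commit ---
from fbcrawl.items import parse_date2

# Absolute Italian dates
print(parse_date2(['21 giu 2017'], {'lang': 'it'}))                      # 2017-06-21
print(parse_date2(['29 febbraio 2016 alle ore 21:49'], {'lang': 'it'}))  # 2016-02-29

# Absolute English dates
print(parse_date2(['Jul 11, 2016'], {'lang': 'en'}))                     # 2016-07-11
print(parse_date2(['Aug 25, 2016 at 7:00 PM'], {'lang': 'en'}))          # 2016-08-25

# Relative forms resolve against datetime.now(), so this yields today or yesterday
print(parse_date2(['4 h'], {'lang': 'it'}))

# Unparsable input falls through and is returned unmodified (here, the token list)
print(parse_date2(['qualcosa di strano'], {'lang': 'it'}))
# --- end of example ---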
@@ -122,11 +578,21 @@ class FbcrawlItem(scrapy.Item):
likes = scrapy.Field(
output_processor=reactions_strip
)
-ahah = scrapy.Field()
-love = scrapy.Field()
-wow = scrapy.Field()
-sigh = scrapy.Field()
-grrr = scrapy.Field()
+ahah = scrapy.Field(
+output_processor=reactions_strip
+)
+love = scrapy.Field(
+output_processor=reactions_strip
+)
+wow = scrapy.Field(
+output_processor=reactions_strip
+)
+sigh = scrapy.Field(
+output_processor=reactions_strip
+)
+grrr = scrapy.Field(
+output_processor=reactions_strip
+)
share = scrapy.Field() # num of shares
url = scrapy.Field(
output_processor=url_strip
@@ -140,7 +606,7 @@ class CommentsItem(scrapy.Item):
source = scrapy.Field()
reply_to=scrapy.Field()
date = scrapy.Field( # when was the post published
-output_processor=parse_date
+output_processor=parse_date2
)
text = scrapy.Field(
output_processor=Join(separator=u'')
@@ -153,9 +619,9 @@ class CommentsItem(scrapy.Item):
)
source_url = scrapy.Field()
url = scrapy.Field()
-#ahah = scrapy.Field()
-#love = scrapy.Field()
-#wow = scrapy.Field()
-#sigh = scrapy.Field()
-#grrr = scrapy.Field()
-#share = scrapy.Field() # num of shares
+ahah = scrapy.Field()
+love = scrapy.Field()
+wow = scrapy.Field()
+sigh = scrapy.Field()
+grrr = scrapy.Field()
+share = scrapy.Field() # num of shares

fbcrawl/settings.py

@@ -88,6 +88,7 @@ DOWNLOAD_DELAY = 3
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
#FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
+URLLENGTH_LIMIT = 99999
FEED_EXPORT_ENCODING = 'utf-8'
DUPEFILTER_DEBUG = True
LOG_LEVEL = 'INFO'

fbcrawl/spiders/comments.py

@@ -1,9 +1,11 @@
import scrapy
from scrapy.loader import ItemLoader
+from scrapy.exceptions import CloseSpider
from fbcrawl.spiders.fbcrawl import FacebookSpider
-from fbcrawl.items import CommentsItem
+from fbcrawl.items import CommentsItem, parse_date, parse_date2
+from datetime import datetime
class CommentsSpider(FacebookSpider):
"""
@@ -14,15 +16,117 @@ class CommentsSpider(FacebookSpider):
'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
'source_url','url'],
'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
-'CONCURRENT_REQUESTS':1,
+'CONCURRENT_REQUESTS' : 1
}
def __init__(self, *args, **kwargs):
if 'post' in kwargs and 'page' in kwargs:
raise AttributeError('You need to specifiy only one between post and page')
elif 'post' in kwargs:
self.page = kwargs['post']
self.type = 'post'
elif 'page' in kwargs:
self.type = 'page'
super().__init__(*args,**kwargs)
def parse_page(self, response):
'''
-parse page does multiple things:
+'''
if self.type == 'post':
yield scrapy.Request(url=response.url,
callback=self.parse_post,
priority=10,
meta={'index':1})
elif self.type == 'page':
#select all posts
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
many_features = post.xpath('./@data-ft').get()
date = []
date.append(many_features)
date = parse_date(date,{'lang':self.lang})
current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date
if current_date is None:
date_string = post.xpath('.//abbr/text()').get()
date = parse_date2([date_string],{'lang':self.lang})
current_date = datetime(date.year,date.month,date.day) if date is not None else date
date = str(date)
if abs(self.count) + 1 > self.max:
raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))
#returns full post-link in a list
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
temp_post = response.urljoin(post[0])
self.count -= 1
yield scrapy.Request(temp_post,
self.parse_post,
priority = self.count,
meta={'index':1})
#load following page, try to click on "more"
#after few pages have been scraped, the "more" link might disappears
#if not present look for the highest year not parsed yet
#click once on the year and go back to clicking "more"
#new_page is different for groups
if self.group == 1:
new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
else:
new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
#this is why lang is needed
if not new_page:
self.logger.info('[!] "more" link not found, will look for a "year" link')
#self.k is the year link that we look for
if response.meta['flag'] == self.k and self.k >= self.year:
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
if new_page:
new_page = response.urljoin(new_page[0])
self.k -= 1
self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
priority = -1000,
meta={'flag':self.k})
else:
while not new_page: #sometimes the years are skipped this handles small year gaps
self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
self.k -= 1
if self.k < self.year:
raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
new_page = response.urljoin(new_page[0])
self.k -= 1
yield scrapy.Request(new_page,
callback=self.parse_page,
priority = -1000,
meta={'flag':self.k})
else:
self.logger.info('Crawling has finished with no errors!')
else:
new_page = response.urljoin(new_page[0])
if 'flag' in response.meta:
self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
priority = -1000,
meta={'flag':response.meta['flag']})
else:
self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
priority = -1000,
meta={'flag':self.k})
def parse_post(self, response):
'''
parse post does multiple things:
1) loads replied-to-comments page one-by-one (for DFS)
2) call parse_reply on the nested comments
3) adds simple (not-replied-to) comments
@@ -37,9 +141,10 @@ class CommentsSpider(FacebookSpider):
source = reply.xpath('.//h3/a/text()').extract()
answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
ans = response.urljoin(answer[::-1][0])
-self.logger.info('{} nested comment @ page {}'.format(str(response.meta['index']),ans))
+self.logger.info('{} nested comment'.format(str(response.meta['index'])))
yield scrapy.Request(ans,
callback=self.parse_reply,
+priority=1000,
meta={'reply_to':source,
'url':response.url,
'index':response.meta['index'],
@@ -49,7 +154,7 @@ class CommentsSpider(FacebookSpider):
if not response.xpath(path): #prevents from exec
path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
for i,reply in enumerate(response.xpath(path2)):
-self.logger.info('{} regular comment @ page {}'.format(i,response.url))
+self.logger.info('{} regular comment'.format(i+1))
new = ItemLoader(item=CommentsItem(),selector=reply)
new.context['lang'] = self.lang
new.add_xpath('source','.//h3/a/text()')
@@ -71,7 +176,7 @@ class CommentsSpider(FacebookSpider):
new_page = response.urljoin(new_page[0])
self.logger.info('New page to be crawled {}'.format(new_page))
yield scrapy.Request(new_page,
-callback=self.parse_page,
+callback=self.parse_post,
meta={'index':1,
'group':1})
else:
@@ -80,7 +185,7 @@ class CommentsSpider(FacebookSpider):
new_page = response.urljoin(new_page[0])
self.logger.info('New page to be crawled {}'.format(new_page))
yield scrapy.Request(new_page,
-callback=self.parse_page,
+callback=self.parse_post,
meta={'index':1,
'group':group_flag})
@@ -88,6 +193,9 @@ class CommentsSpider(FacebookSpider):
'''
parse reply to comments, root comment is added if flag
'''
+# from scrapy.utils.response import open_in_browser
+# open_in_browser(response)
if response.meta['flag'] == 'init':
#parse root comment
for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
@@ -120,7 +228,7 @@ class CommentsSpider(FacebookSpider):
back_page = response.urljoin(back[0])
yield scrapy.Request(back_page,
callback=self.parse_reply,
-priority=100,
+priority = 1000,
meta={'reply_to':response.meta['reply_to'],
'flag':'back',
'url':response.meta['url'],
@@ -131,7 +239,7 @@ class CommentsSpider(FacebookSpider):
next_reply = response.meta['url']
self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
yield scrapy.Request(next_reply,
-callback=self.parse_page,
+callback=self.parse_post,
meta={'index':response.meta['index']+1,
'group':response.meta['group']})
@@ -155,7 +263,7 @@ class CommentsSpider(FacebookSpider):
back_page = response.urljoin(back[0])
yield scrapy.Request(back_page,
callback=self.parse_reply,
-priority=100,
+priority=1000,
meta={'reply_to':response.meta['reply_to'],
'flag':'back',
'url':response.meta['url'],
@@ -166,7 +274,7 @@ class CommentsSpider(FacebookSpider):
next_reply = response.meta['url']
self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
yield scrapy.Request(next_reply,
-callback=self.parse_page,
+callback=self.parse_post,
meta={'index':response.meta['index']+1,
'group':response.meta['group']})

fbcrawl/spiders/fbcrawl.py

@@ -4,7 +4,7 @@ import logging
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from scrapy.exceptions import CloseSpider
-from fbcrawl.items import FbcrawlItem, parse_date
+from fbcrawl.items import FbcrawlItem, parse_date, parse_date2
from datetime import datetime
class FacebookSpider(scrapy.Spider):
@@ -15,7 +15,8 @@ class FacebookSpider(scrapy.Spider):
custom_settings = {
'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
'reactions','likes','ahah','love','wow', \
-'sigh','grrr','comments','post_id','url']
+'sigh','grrr','comments','post_id','url'],
+'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
}
def __init__(self, *args, **kwargs):
@@ -33,16 +34,19 @@ class FacebookSpider(scrapy.Spider):
self.logger.info('Email and password provided, will be used to log in')
#page name parsing (added support for full urls)
-if 'page' not in kwargs:
-raise AttributeError('You need to provide a valid page name to crawl!'
-'scrapy fb -a page="PAGENAME"')
-elif self.page.find('https://www.facebook.com/') != -1:
-self.page = self.page[25:]
-elif self.page.find('https://mbasic.facebook.com/') != -1:
-self.page = self.page[28:]
-elif self.page.find('https://m.facebook.com/') != -1:
-self.page = self.page[23:]
+if 'page' in kwargs:
+if self.page.find('/groups/') != -1:
+self.group = 1
+else:
+self.group = 0
+if self.page.find('https://www.facebook.com/') != -1:
+self.page = self.page[25:]
+elif self.page.find('https://mbasic.facebook.com/') != -1:
+self.page = self.page[28:]
+elif self.page.find('https://m.facebook.com/') != -1:
+self.page = self.page[23:]
#parse date
if 'date' not in kwargs:
self.logger.info('Date attribute not provided, scraping date set to 2004-02-04 (fb launch date)')
@@ -148,11 +152,19 @@ class FacebookSpider(scrapy.Spider):
many_features = post.xpath('./@data-ft').get()
date = []
date.append(many_features)
-date = parse_date(date)
-current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S')
+date = parse_date(date,{'lang':self.lang})
+current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date
+if current_date is None:
+date_string = post.xpath('.//abbr/text()').get()
+date = parse_date2([date_string],{'lang':self.lang})
+current_date = datetime(date.year,date.month,date.day) if date is not None else date
+date = str(date)
+#if 'date' argument is reached stop crawling
if self.date > current_date:
raise CloseSpider('Reached date: {}'.format(self.date))
new = ItemLoader(item=FbcrawlItem(),selector=post)
if abs(self.count) + 1 > self.max:
raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
@@ -161,8 +173,8 @@ class FacebookSpider(scrapy.Spider):
new.add_value('date',date)
new.add_xpath('post_id','./@data-ft')
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
#page_url #new.add_value('url',response.url)
#returns full post-link in a list
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
temp_post = response.urljoin(post[0])
@@ -173,18 +185,24 @@ class FacebookSpider(scrapy.Spider):
#after few pages have been scraped, the "more" link might disappears
#if not present look for the highest year not parsed yet
#click once on the year and go back to clicking "more"
-new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
-#this is why lang is needed                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^
+#new_page is different for groups
+if self.group == 1:
+new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
+else:
+new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
+#this is why lang is needed                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^
if not new_page:
-self.logger.info('[!] "more" link not found, will look for a year')
-#self.k is the year that we look for in the link.
+self.logger.info('[!] "more" link not found, will look for a "year" link')
+#self.k is the year link that we look for
if response.meta['flag'] == self.k and self.k >= self.year:
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
if new_page:
new_page = response.urljoin(new_page[0])
self.k -= 1
-self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page))
+self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
else:
while not new_page: #sometimes the years are skipped this handles small year gaps
@@ -194,7 +212,7 @@ class FacebookSpider(scrapy.Spider):
raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
-self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page))
+self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
new_page = response.urljoin(new_page[0])
self.k -= 1
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})