Adding features: crawl comments from page, crawl posts and comments from groups

rugantio 2019-05-11 15:22:56 +02:00
parent f6e8545236
commit d875e89c52
8 changed files with 648 additions and 55 deletions

fbcrawl/items.py View File

@@ -49,14 +49,14 @@ def reactions_strip(string,loader_context):
while newstring.rfind(',') != -1:
newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
return newstring
# #Mark and other 254,134
# elif newstring.split()[::-1][1].isdigit():
# friends = newstring.count(' and ') + newstring.count(',')
# newstring = newstring.split()[::-1][1]
# while newstring.rfind(',') != -1:
# newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
# return int(newstring) + friends
# #Philip and 1K others
#Mark and other 254,134
elif newstring.split()[::-1][1].isdigit():
friends = newstring.count(' and ') + newstring.count(',')
newstring = newstring.split()[::-1][1]
while newstring.rfind(',') != -1:
newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
return int(newstring) + friends
#Philip and 1K others
else:
return newstring
else:
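The rfind loop is the workhorse of this branch: it deletes thousands separators one character at a time so the remainder casts cleanly to int. A standalone sketch of that normalization (the sep parameter is a generalization of mine; the code above hardcodes ','):

    def strip_separators(s, sep=','):
        # drop every separator, right to left: '19,298,873' -> '19298873'
        while s.rfind(sep) != -1:
            s = s[:s.rfind(sep)] + s[s.rfind(sep) + 1:]
        return s

    print(strip_separators('19,298,873'))  # 19298873
    print(strip_separators('254,134'))     # 254134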
@@ -79,7 +79,7 @@ def url_strip(url):
else:
return fullurl
def parse_date(date):
def parse_date(date,loader_context):
import json
d = json.loads(date[0]) #nested dict of features
@@ -99,7 +99,463 @@ def parse_date(date):
flat_d[key] = value
#returns timestamp in localtime conversion from linux timestamp UTC
return str(datetime.fromtimestamp(flat_d['publish_time']))
ret = str(datetime.fromtimestamp(flat_d['publish_time'])) if 'publish_time' in flat_d else None
return ret
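The new None fallback is what lets the spiders below detect a missing publish_time and retry with parse_date2. A self-contained sketch of the same extract-or-None idea, using a made-up, much-simplified data-ft payload (real ones carry many more keys):

    import json
    from datetime import datetime

    def publish_time_or_none(data_ft):
        # walk the nested data-ft structure; return a local timestamp or None
        stack = [json.loads(data_ft)]
        while stack:
            node = stack.pop()
            if isinstance(node, dict):
                if 'publish_time' in node:
                    return str(datetime.fromtimestamp(node['publish_time']))
                stack.extend(node.values())
            elif isinstance(node, list):
                stack.extend(node)
        return None

    sample = '{"page_insights": {"1": {"post_context": {"publish_time": 1557583376}}}}'
    print(publish_time_or_none(sample))  # e.g. '2019-05-11 13:22:56' (local time)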
def parse_date2(init_date,loader_context):
lang = loader_context['lang']
# =============================================================================
# Italian - status:final
# =============================================================================
if lang == 'it':
months = {
'gennaio':1,
'febbraio':2,
'marzo':3,
'aprile':4,
'maggio':5,
'giugno':6,
'luglio':7,
'agosto':8,
'settembre':9,
'ottobre':10,
'novembre':11,
'dicembre':12
}
months_abbr = {
'gen':1,
'feb':2,
'mar':3,
'apr':4,
'mag':5,
'giu':6,
'lug':7,
'ago':8,
'set':9,
'ott':10,
'nov':11,
'dic':12
}
giorni = {
'lunedì':0,
'martedì':1,
'mercoledì':2,
'giovedì':3,
'venerdì':4,
'sabato':5,
'domenica':6
}
date = init_date[0].split()
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
l = len(date)
#sanity check
if l == 0:
return 'Error: no data'
#adesso, ieri, 4h, 50min
elif l == 1:
if date[0].isalpha():
if date[0].lower() == 'ieri':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
#check that yesterday was not in another month
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
elif date[0].lower() == 'adesso':
return datetime(year,month,day).date() #return today
else: #not recognized, (return date or init_date)
return date
else:
#4h, 50min: normalize so the l == 2 branch below can parse it
l = 2
raw = date[0]
date[0] = ''.join(x for x in raw if x.isdigit())
date.append(''.join(x for x in raw if not x.isdigit()))
# l = 2
elif l == 2:
#22 min (today)
if date[1] == 'min':
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#22 min (yesterday)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#4 h (today)
elif date[1] == 'h':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#4 h (yesterday)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#2 gen
elif len(date[1]) == 3 and date[1].isalpha():
day = int(date[0])
month = months_abbr[date[1].lower()]
return datetime(year,month,day).date()
#2 gennaio
elif len(date[1]) > 3 and date[1].isalpha():
day = int(date[0])
month = months[date[1].lower()]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 3
elif l == 3:
#21 giu 2017
if len(date[1]) == 3 and date[2].isdigit():
day = int(date[0])
month = months_abbr[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#21 giugno 2017
elif len(date[1]) > 3 and date[2].isdigit():
day = int(date[0])
month = months[date[1]]
year = int(date[2])
return datetime(year,month,day).date()
#9 ore fa
elif date[0].isdigit() and date[1][:2] == 'or':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#9 ore fa (ieri)
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#7 minuti fa
elif date[0].isdigit() and date[1][:3] == 'min':
return datetime(year,month,day).date()
#ieri alle 20:45
elif date[0].lower() == 'ieri' and date[1] == 'alle':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#oggi alle 11:11
elif date[0].lower() == 'oggi' and date[1] == 'alle':
return datetime(year,month,day).date()
#lunedì alle 12:34
elif date[0].isalpha() and date[1] == 'alle':
today = datetime.now().weekday() #today as a weekday
weekday = giorni[date[0].lower()] #named day mapped to its weekday number
#the named day always falls in the current or previous week
delta = today - weekday
if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#lunedì = 0, domenica = 6; negative delta means the previous week
else:
delta += 7
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 4
elif l == 4:
#Ieri alle ore 23:32
if date[0].lower() == 'ieri' and date[1] == 'alle':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#domenica alle ore 19:29
elif date[0].isalpha() and date[1] == 'alle':
today = datetime.now().weekday() #today as a weekday
weekday = giorni[date[0].lower()] #named day mapped to its weekday number
#the named day always falls in the current or previous week
delta = today - weekday
if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#lunedì = 0, domenica = 6; negative delta means the previous week
else:
delta += 7
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 5
elif l == 5:
if date[2] == 'alle':
#29 feb alle ore 21:49
if len(date[1]) == 3:
day = int(date[0])
month = months_abbr[date[1].lower()]
return datetime(year,month,day).date()
#29 febbraio alle ore 21:49
else:
day = int(date[0])
month = months[date[1].lower()]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 6
elif l == 6:
if date[3] == 'alle':
#29 feb 2016 alle ore 21:49
if len(date[1]) == 3:
day = int(date[0])
month = months_abbr[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#29 febbraio 2016 alle ore 21:49
else:
day = int(date[0])
month = months[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# =============================================================================
# English - status:beta
# =============================================================================
elif lang == 'en':
months = {
'january':1,
'february':2,
'march':3,
'april':4,
'may':5,
'june':6,
'july':7,
'august':8,
'september':9,
'october':10,
'november':11,
'december':12
}
months_abbr = {
'jan':1,
'feb':2,
'mar':3,
'apr':4,
'may':5,
'jun':6,
'jul':7,
'aug':8,
'sep':9,
'oct':10,
'nov':11,
'dec':12
}
days = {
'monday':0,
'tuesday':1,
'wednesday':2,
'thursday':3,
'friday':4,
'saturday':5,
'sunday':6
}
date = init_date[0].split()
year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
l = len(date)
#sanity check
if l == 0:
return 'Error: no data'
#Yesterday, Now, 4hr, 50mins
elif l == 1:
if date[0].isalpha():
if date[0].lower() == 'yesterday':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
#check that yesterday was not in another month
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
elif date[0].lower() == 'now':
return datetime(year,month,day).date() #return today
else: #not recognized, (return date or init_date)
return date
else:
#4hr, 50mins: normalize so the l == 2 branch below can parse it
l = 2
raw = date[0]
date[0] = ''.join(x for x in raw if x.isdigit())
date.append(''.join(x for x in raw if not x.isdigit()))
# l = 2
elif l == 2:
if date[1] == 'now':
return datetime(year,month,day).date()
#22 min (yesterday)
if date[1] == 'min' or date[1] == 'mins':
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#22 min (today)
else:
return datetime(year,month,day).date()
#4 hr (yesterday)
elif date[1] == 'hr' or date[1] == 'hrs':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#4 hr (today)
else:
return datetime(year,month,day).date()
#2 jan
elif len(date[1]) == 3 and date[1].isalpha():
day = int(date[0])
month = months_abbr[date[1].lower()]
return datetime(year,month,day).date()
#2 january
elif len(date[1]) > 3 and date[1].isalpha():
day = int(date[0])
month = months[date[1].lower()]
return datetime(year,month,day).date()
#jan 2
elif len(date[0]) == 3 and date[0].isalpha():
day = int(date[1])
month = months_abbr[date[0].lower()]
return datetime(year,month,day).date()
#january 2
elif len(date[0]) > 3 and date[0].isalpha():
day = int(date[1])
month = months[date[0].lower()]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 3
elif l == 3:
#5 hours ago
if date[2] == 'ago':
if date[1] == 'hour' or date[1] == 'hours' or date[1] == 'hr' or date[1] == 'hrs':
# 5 hours ago (yesterday)
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
# 5 hours ago (today)
else:
return datetime(year,month,day).date()
#10 minutes ago
elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins':
#22 minutes ago (yesterday)
if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#22 minutes ago (today)
else:
return datetime(year,month,day).date()
else:
return date
else:
#21 Jun 2017
if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit():
day = int(date[0])
month = months_abbr[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#21 June 2017
elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit():
day = int(date[0])
month = months[date[1].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#Jul 11, 2016
elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha():
day = int(date[1][:-1])
month = months_abbr[date[0].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 4
elif l == 4:
#yesterday at 23:32 PM
if date[0].lower() == 'yesterday' and date[1] == 'at':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#Thursday at 4:27 PM
elif date[1] == 'at':
today = datetime.now().weekday() #today as a weekday
weekday = days[date[0].lower()] #named day mapped to its weekday number
#the named day always falls in the current or previous week
delta = today - weekday
if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#monday = 0, sunday = 6; negative delta means the previous week
else:
delta += 7
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 5
elif l == 5:
if date[2] == 'at':
#Jan 29 at 10:00 PM
if len(date[0]) == 3:
day = int(date[1])
month = months_abbr[date[0].lower()]
return datetime(year,month,day).date()
#January 29 at 10:00 PM
else:
day = int(date[1])
month = months[date[0].lower()]
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l = 6
elif l == 6:
if date[3] == 'at':
#Aug 25, 2016 at 7:00 PM
if len(date[0]) == 3:
day = int(date[1][:-1])
month = months_abbr[date[0].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#August 25, 2016 at 7:00 PM
else:
day = int(date[1][:-1])
month = months[date[0].lower()]
year = int(date[2])
return datetime(year,month,day).date()
#parsing failed
else:
return date
# l > 6
#parsing failed - l too big
else:
return date
#parsing failed - language not supported
else:
return init_date
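A quick usage sketch for parse_date2 (assumes fbcrawl is importable; relative forms such as 'ieri alle 20:45' resolve against the machine's clock):

    from fbcrawl.items import parse_date2

    print(parse_date2(['21 giu 2017'], {'lang': 'it'}))            # 2017-06-21
    print(parse_date2(['ieri alle 20:45'], {'lang': 'it'}))        # yesterday's date
    print(parse_date2(['Jul 11, 2016'], {'lang': 'en'}))           # 2016-07-11
    print(parse_date2(['Yesterday at 23:32 PM'], {'lang': 'en'}))  # yesterday's date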
def id_strip(post_id):
import json
@@ -122,11 +578,21 @@ class FbcrawlItem(scrapy.Item):
likes = scrapy.Field(
output_processor=reactions_strip
)
ahah = scrapy.Field()
love = scrapy.Field()
wow = scrapy.Field()
sigh = scrapy.Field()
grrr = scrapy.Field()
ahah = scrapy.Field(
output_processor=reactions_strip
)
love = scrapy.Field(
output_processor=reactions_strip
)
wow = scrapy.Field(
output_processor=reactions_strip
)
sigh = scrapy.Field(
output_processor=reactions_strip
)
grrr = scrapy.Field(
output_processor=reactions_strip
)
share = scrapy.Field() # num of shares
url = scrapy.Field(
output_processor=url_strip
@@ -140,7 +606,7 @@ class CommentsItem(scrapy.Item):
source = scrapy.Field()
reply_to=scrapy.Field()
date = scrapy.Field( # when was the post published
output_processor=parse_date
output_processor=parse_date2
)
text = scrapy.Field(
output_processor=Join(separator=u'')
@@ -153,9 +619,9 @@ class CommentsItem(scrapy.Item):
)
source_url = scrapy.Field()
url = scrapy.Field()
#ahah = scrapy.Field()
#love = scrapy.Field()
#wow = scrapy.Field()
#sigh = scrapy.Field()
#grrr = scrapy.Field()
#share = scrapy.Field() # num of shares
ahah = scrapy.Field()
love = scrapy.Field()
wow = scrapy.Field()
sigh = scrapy.Field()
grrr = scrapy.Field()
share = scrapy.Field() # num of shares
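With ahah/love/wow/sigh/grrr now sharing reactions_strip, every loader that fills these fields must provide the lang context, exactly as the spiders do with new.context['lang'] = self.lang. A minimal wiring sketch (the HTML snippet is invented for illustration):

    from scrapy.selector import Selector
    from scrapy.loader import ItemLoader
    from fbcrawl.items import FbcrawlItem

    sel = Selector(text='<span id="likes">19,298,873</span>')
    loader = ItemLoader(item=FbcrawlItem(), selector=sel)
    loader.context['lang'] = 'en'       # reactions_strip reads lang from here
    loader.add_xpath('likes', '//span[@id="likes"]/text()')
    print(loader.load_item()['likes'])  # '19298873'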

fbcrawl/settings.py View File

@@ -88,6 +88,7 @@ DOWNLOAD_DELAY = 3
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
#FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
URLLENGTH_LIMIT = 99999
FEED_EXPORT_ENCODING = 'utf-8'
DUPEFILTER_DEBUG = True
LOG_LEVEL = 'INFO'

fbcrawl/spiders/comments.py View File

@@ -1,9 +1,11 @@
import scrapy
from scrapy.loader import ItemLoader
from scrapy.exceptions import CloseSpider
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import CommentsItem
from fbcrawl.items import CommentsItem, parse_date, parse_date2
from datetime import datetime
class CommentsSpider(FacebookSpider):
"""
@@ -14,15 +16,117 @@ class CommentsSpider(FacebookSpider):
'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
'source_url','url'],
'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
'CONCURRENT_REQUESTS':1,
'CONCURRENT_REQUESTS' : 1
}
def __init__(self, *args, **kwargs):
if 'post' in kwargs and 'page' in kwargs:
raise AttributeError('You need to specify only one of post and page')
elif 'post' in kwargs:
self.page = kwargs['post']
self.type = 'post'
elif 'page' in kwargs:
self.type = 'page'
super().__init__(*args,**kwargs)
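The constructor now takes either post= (one post's url) or page=, never both. A launch sketch from Python (assumptions: CommentsSpider lives in fbcrawl.spiders.comments, lang is accepted like the other -a options, and all credentials and ids are placeholders; the CLI equivalent is scrapy crawl with the same -a arguments):

    from scrapy.crawler import CrawlerProcess
    from fbcrawl.spiders.comments import CommentsSpider

    process = CrawlerProcess(settings={'FEED_FORMAT': 'csv',
                                       'FEED_URI': 'comments.csv'})
    process.crawl(CommentsSpider,
                  email='EMAIL', password='PASSWORD', lang='it',
                  post='https://mbasic.facebook.com/story.php?story_fbid=POST_ID&id=PAGE_ID')
    process.start()  # blocks until the crawl finishes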
def parse_page(self, response):
'''
parse_page does multiple things:
1) if a single post was requested, hands its url straight to parse_post
2) if a page or group was requested, walks the timeline, yields a
parse_post request for every post found and follows the pagination
'''
if self.type == 'post':
yield scrapy.Request(url=response.url,
callback=self.parse_post,
priority=10,
meta={'index':1})
elif self.type == 'page':
#select all posts
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
many_features = post.xpath('./@data-ft').get()
date = []
date.append(many_features)
date = parse_date(date,{'lang':self.lang})
current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date
if current_date is None:
date_string = post.xpath('.//abbr/text()').get()
date = parse_date2([date_string],{'lang':self.lang})
current_date = datetime(date.year,date.month,date.day) if date is not None else date
date = str(date)
if abs(self.count) + 1 > self.max:
raise CloseSpider('Reached max num of posts: {}. Crawling finished'.format(abs(self.count)))
self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))
#returns full post-link in a list
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
temp_post = response.urljoin(post[0])
self.count -= 1
yield scrapy.Request(temp_post,
self.parse_post,
priority = self.count,
meta={'index':1})
#load following page, try to click on "more"
#after a few pages have been scraped, the "more" link may disappear
#if not present look for the highest year not parsed yet
#click once on the year and go back to clicking "more"
#new_page is different for groups
if self.group == 1:
new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
else:
new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
#this is why lang is needed
if not new_page:
self.logger.info('[!] "more" link not found, will look for a "year" link')
#self.k is the year link that we look for
if response.meta['flag'] == self.k and self.k >= self.year:
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
if new_page:
new_page = response.urljoin(new_page[0])
self.k -= 1
self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
priority = -1000,
meta={'flag':self.k})
else:
while not new_page: #sometimes years are skipped; this handles small gaps
self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
self.k -= 1
if self.k < self.year:
raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
new_page = response.urljoin(new_page[0])
self.k -= 1
yield scrapy.Request(new_page,
callback=self.parse_page,
priority = -1000,
meta={'flag':self.k})
else:
self.logger.info('Crawling has finished with no errors!')
else:
new_page = response.urljoin(new_page[0])
if 'flag' in response.meta:
self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
priority = -1000,
meta={'flag':response.meta['flag']})
else:
self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
priority = -1000,
meta={'flag':self.k})
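When the "more" link vanishes, the code above falls back to year links and tolerates gaps by decrementing self.k until a link matches or self.year is passed. The same loop in isolation (find_year_link is a hypothetical stand-in for the XPath lookup):

    def next_year_page(k, target_year, find_year_link):
        # try year k, then k-1, ... until a link appears or target_year is passed
        while k >= target_year:
            link = find_year_link(k)  # stands in for response.xpath(...).extract()
            if link:
                return link[0], k - 1  # page to crawl next, next year to try
            k -= 1
        return None, k                 # reached the date limit: close the spider

    # toy lookup: only 2016 and 2014 expose a link
    page, k = next_year_page(2017, 2010,
                             lambda y: ['ts=%d' % y] if y in (2016, 2014) else [])
    print(page, k)  # ts=2016 2015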
def parse_post(self, response):
'''
parse post does multiple things:
1) loads replied-to-comments pages one by one (for DFS)
2) calls parse_reply on the nested comments
3) adds simple (not-replied-to) comments
@@ -37,9 +141,10 @@ class CommentsSpider(FacebookSpider):
source = reply.xpath('.//h3/a/text()').extract()
answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
ans = response.urljoin(answer[::-1][0])
self.logger.info('{} nested comment @ page {}'.format(str(response.meta['index']),ans))
self.logger.info('{} nested comment'.format(str(response.meta['index'])))
yield scrapy.Request(ans,
callback=self.parse_reply,
priority=1000,
meta={'reply_to':source,
'url':response.url,
'index':response.meta['index'],
@@ -49,7 +154,7 @@ class CommentsSpider(FacebookSpider):
if not response.xpath(path): #runs only when there are no nested comments
path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
for i,reply in enumerate(response.xpath(path2)):
self.logger.info('{} regular comment @ page {}'.format(i,response.url))
self.logger.info('{} regular comment'.format(i+1))
new = ItemLoader(item=CommentsItem(),selector=reply)
new.context['lang'] = self.lang
new.add_xpath('source','.//h3/a/text()')
@@ -71,7 +176,7 @@ class CommentsSpider(FacebookSpider):
new_page = response.urljoin(new_page[0])
self.logger.info('New page to be crawled {}'.format(new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
callback=self.parse_post,
meta={'index':1,
'group':1})
else:
@@ -80,7 +185,7 @@ class CommentsSpider(FacebookSpider):
new_page = response.urljoin(new_page[0])
self.logger.info('New page to be crawled {}'.format(new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
callback=self.parse_post,
meta={'index':1,
'group':group_flag})
@@ -88,6 +193,9 @@ class CommentsSpider(FacebookSpider):
'''
parses replies to comments; the root comment is added when the 'init' flag is set
'''
# from scrapy.utils.response import open_in_browser
# open_in_browser(response)
if response.meta['flag'] == 'init':
#parse root comment
for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
@@ -120,7 +228,7 @@ class CommentsSpider(FacebookSpider):
back_page = response.urljoin(back[0])
yield scrapy.Request(back_page,
callback=self.parse_reply,
priority=100,
priority = 1000,
meta={'reply_to':response.meta['reply_to'],
'flag':'back',
'url':response.meta['url'],
@@ -131,7 +239,7 @@ class CommentsSpider(FacebookSpider):
next_reply = response.meta['url']
self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
yield scrapy.Request(next_reply,
callback=self.parse_page,
callback=self.parse_post,
meta={'index':response.meta['index']+1,
'group':response.meta['group']})
@@ -155,7 +263,7 @@ class CommentsSpider(FacebookSpider):
back_page = response.urljoin(back[0])
yield scrapy.Request(back_page,
callback=self.parse_reply,
priority=100,
priority=1000,
meta={'reply_to':response.meta['reply_to'],
'flag':'back',
'url':response.meta['url'],
@@ -166,7 +274,7 @@ class CommentsSpider(FacebookSpider):
next_reply = response.meta['url']
self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
yield scrapy.Request(next_reply,
callback=self.parse_page,
callback=self.parse_post,
meta={'index':response.meta['index']+1,
'group':response.meta['group']})
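Note the priority bump from 100 to 1000 on the back-page requests above: Scrapy's scheduler pops higher-priority requests first, so nested replies drain before the crawl moves on, which is what keeps the traversal depth-first. A toy spider isolating just that ordering (example.com is a placeholder):

    import scrapy

    class PriorityDemo(scrapy.Spider):
        name = 'priority_demo'
        start_urls = ['https://example.com/']

        def parse(self, response):
            # scheduled first but crawled second
            yield scrapy.Request(response.url, callback=self.parse_listing,
                                 priority=-1000, dont_filter=True)
            # scheduled second but crawled first: higher priority pops first
            yield scrapy.Request(response.url, callback=self.parse_reply,
                                 priority=1000, dont_filter=True)

        def parse_reply(self, response):
            self.logger.info('replies drained first: %s', response.url)

        def parse_listing(self, response):
            self.logger.info('listing resumes afterwards: %s', response.url)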

fbcrawl/spiders/fbcrawl.py View File

@@ -4,7 +4,7 @@ import logging
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from scrapy.exceptions import CloseSpider
from fbcrawl.items import FbcrawlItem, parse_date
from fbcrawl.items import FbcrawlItem, parse_date, parse_date2
from datetime import datetime
class FacebookSpider(scrapy.Spider):
@@ -15,7 +15,8 @@ class FacebookSpider(scrapy.Spider):
custom_settings = {
'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
'reactions','likes','ahah','love','wow', \
'sigh','grrr','comments','post_id','url']
'sigh','grrr','comments','post_id','url'],
'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
}
def __init__(self, *args, **kwargs):
@@ -33,16 +34,19 @@ class FacebookSpider(scrapy.Spider):
self.logger.info('Email and password provided, will be used to log in')
#page name parsing (added support for full urls)
if 'page' not in kwargs:
raise AttributeError('You need to provide a valid page name to crawl!'
'scrapy fb -a page="PAGENAME"')
elif self.page.find('https://www.facebook.com/') != -1:
self.page = self.page[25:]
elif self.page.find('https://mbasic.facebook.com/') != -1:
self.page = self.page[28:]
elif self.page.find('https://m.facebook.com/') != -1:
self.page = self.page[23:]
if 'page' in kwargs:
if self.page.find('/groups/') != -1:
self.group = 1
else:
self.group = 0
if self.page.find('https://www.facebook.com/') != -1:
self.page = self.page[25:]
elif self.page.find('https://mbasic.facebook.com/') != -1:
self.page = self.page[28:]
elif self.page.find('https://m.facebook.com/') != -1:
self.page = self.page[23:]
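The magic numbers 25, 28 and 23 are just the lengths of the three accepted url prefixes. An equivalent spelling that makes this explicit (a sketch only; the committed code keeps the literal slices):

    def strip_fb_prefix(page):
        # reduce a full Facebook url to the bare page/group path
        for prefix in ('https://www.facebook.com/',
                       'https://mbasic.facebook.com/',
                       'https://m.facebook.com/'):
            if page.startswith(prefix):
                return page[len(prefix):]
        return page

    print(strip_fb_prefix('https://mbasic.facebook.com/groups/fbcrawl'))  # groups/fbcrawl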
#parse date
if 'date' not in kwargs:
self.logger.info('Date attribute not provided, scraping date set to 2004-02-04 (fb launch date)')
@@ -148,11 +152,19 @@ class FacebookSpider(scrapy.Spider):
many_features = post.xpath('./@data-ft').get()
date = []
date.append(many_features)
date = parse_date(date)
current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S')
date = parse_date(date,{'lang':self.lang})
current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date
if current_date is None:
date_string = post.xpath('.//abbr/text()').get()
date = parse_date2([date_string],{'lang':self.lang})
current_date = datetime(date.year,date.month,date.day) if date is not None else date
date = str(date)
#if 'date' argument is reached stop crawling
if self.date > current_date:
raise CloseSpider('Reached date: {}'.format(self.date))
new = ItemLoader(item=FbcrawlItem(),selector=post)
if abs(self.count) + 1 > self.max:
raise CloseSpider('Reached max num of posts: {}. Crawling finished'.format(abs(self.count)))
@@ -161,8 +173,8 @@ class FacebookSpider(scrapy.Spider):
new.add_value('date',date)
new.add_xpath('post_id','./@data-ft')
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
#page_url #new.add_value('url',response.url)
#returns full post-link in a list
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
temp_post = response.urljoin(post[0])
@@ -173,18 +185,24 @@ class FacebookSpider(scrapy.Spider):
#after a few pages have been scraped, the "more" link may disappear
#if not present look for the highest year not parsed yet
#click once on the year and go back to clicking "more"
new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
#this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^
#new_page is different for groups
if self.group == 1:
new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
else:
new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
#this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^
if not new_page:
self.logger.info('[!] "more" link not found, will look for a year')
#self.k is the year that we look for in the link.
self.logger.info('[!] "more" link not found, will look for a "year" link')
#self.k is the year link that we look for
if response.meta['flag'] == self.k and self.k >= self.year:
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
if new_page:
new_page = response.urljoin(new_page[0])
self.k -= 1
self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page))
self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
else:
while not new_page: #sometimes years are skipped; this handles small gaps
@@ -194,7 +212,7 @@ class FacebookSpider(scrapy.Spider):
raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page))
self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
new_page = response.urljoin(new_page[0])
self.k -= 1
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
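Both spiders now share the same two-step date resolution seen in these hunks: parse_date on the machine-readable data-ft JSON first, then parse_date2 on the visible abbr text when publish_time is absent. Extracted as a helper for clarity (a sketch only; the commit inlines this in each parse_page):

    from datetime import datetime
    from fbcrawl.items import parse_date, parse_date2

    def resolve_post_date(data_ft, abbr_text, lang):
        # preferred source: the epoch publish_time buried in data-ft
        date = parse_date([data_ft], {'lang': lang})
        if date is not None:
            return datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        # fallback: the human-readable date rendered under the post
        date = parse_date2([abbr_text], {'lang': lang})
        return datetime(date.year, date.month, date.day) if date is not None else None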