diff --git a/fbcrawl/__pycache__/items.cpython-37.pyc b/fbcrawl/__pycache__/items.cpython-37.pyc index 314db6b..75fdef8 100644 Binary files a/fbcrawl/__pycache__/items.cpython-37.pyc and b/fbcrawl/__pycache__/items.cpython-37.pyc differ diff --git a/fbcrawl/__pycache__/settings.cpython-37.pyc b/fbcrawl/__pycache__/settings.cpython-37.pyc index eef5a3f..b18e5e5 100644 Binary files a/fbcrawl/__pycache__/settings.cpython-37.pyc and b/fbcrawl/__pycache__/settings.cpython-37.pyc differ diff --git a/fbcrawl/items.py b/fbcrawl/items.py index b2a09f7..3d42091 100644 --- a/fbcrawl/items.py +++ b/fbcrawl/items.py @@ -49,14 +49,14 @@ def reactions_strip(string,loader_context): while newstring.rfind(',') != -1: newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:] return newstring -# #Mark and other 254,134 -# elif newstring.split()[::-1][1].isdigit(): -# friends = newstring.count(' and ') + newstring.count(',') -# newstring = newstring.split()[::-1][1] -# while newstring.rfind(',') != -1: -# newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:] -# return int(newstring) + friends -# #Philip and 1K others + #Mark and other 254,134 + elif newstring.split()[::-1][1].isdigit(): + friends = newstring.count(' and ') + newstring.count(',') + newstring = newstring.split()[::-1][1] + while newstring.rfind(',') != -1: + newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:] + return int(newstring) + friends + #Philip and 1K others else: return newstring else: @@ -79,7 +79,7 @@ def url_strip(url): else: return fullurl -def parse_date(date): +def parse_date(date,loader_context): import json d = json.loads(date[0]) #nested dict of features @@ -99,7 +99,463 @@ def parse_date(date): flat_d[key] = value #returns timestamp in localtime conversion from linux timestamp UTC - return str(datetime.fromtimestamp(flat_d['publish_time'])) + ret = str(datetime.fromtimestamp(flat_d['publish_time'])) if 'publish_time' in flat_d else None + return ret + +def parse_date2(init_date,loader_context): + lang = loader_context['lang'] +# ============================================================================= +# Italian - status:final +# ============================================================================= + if lang == 'it': + months = { + 'gennaio':1, + 'febbraio':2, + 'marzo':3, + 'aprile':4, + 'maggio':5, + 'giugno':6, + 'luglio':7, + 'agosto':8, + 'settembre':9, + 'ottobre':10, + 'novembre':11, + 'dicembre':12 + } + + months_abbr = { + 'gen':1, + 'feb':2, + 'mar':3, + 'apr':4, + 'mag':5, + 'giu':6, + 'lug':7, + 'ago':8, + 'set':9, + 'ott':10, + 'nov':11, + 'dic':12 + } + + giorni = { + 'lunedì':0, + 'martedì':1, + 'mercoledì':2, + 'giovedì':3, + 'venerdì':4, + 'sabato':5, + 'domenica':6 + } + + date = init_date[0].split() + year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today + + l = len(date) + + #sanity check + if l == 0: + return 'Error: no data' + + #adesso, ieri, 4h, 50min + elif l == 1: + if date[0].isalpha(): + if date[0].lower() == 'ieri': + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + #check that yesterday was not in another month + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + elif date[0].lower() == 'adesso': + return datetime(year,month,day).date() #return today + else: #not recognized, (return date or init_date) + return date + else: + #4h, 50min (exploit future parsing) + l = 2 + new_date = [x for x in date[0] if x.isdigit()] 
+ date[0] = ''.join(new_date) + new_date = [x for x in date[0] if not(x.isdigit())] + date[1] = ''.join(new_date) +# l = 2 + elif l == 2: + #22 min (oggi) + if date[1] == 'min': + if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0: + return datetime(year,month,day).date() + #22 min (ieri) + else: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #4 h (oggi) + elif date[1] == 'h': + if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0: + return datetime(year,month,day).date() + #4 h (ieri) + else: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #2 gen + elif len(date[1]) == 3 and date[1].isalpha(): + day = int(date[0]) + month = months_abbr[date[1].lower()] + return datetime(year,month,day).date() + #2 gennaio + elif len(date[1]) > 3 and date[1].isalpha(): + day = int(date[0]) + month = months[date[1]] + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 3 + elif l == 3: + #21 giu 2017 + if len(date[1]) == 3 and date[2].isdigit(): + day = int(date[0]) + month = months_abbr[date[1]] + year = int(date[2]) + return datetime(year,month,day).date() + #21 giugno 2017 + elif len(date[1]) > 3 and date[2].isdigit(): + day = int(date[0]) + month = months[date[1]] + year = int(date[2]) + return datetime(year,month,day).date() + #9 ore fa + elif date[0].isdigit() and date[1][:2] == 'or': + if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0: + return datetime(year,month,day).date() + #9 ore fa (ieri) + else: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #7 minuti fa + elif date[0].isdigit() and date[1][:3] == 'min': + return datetime(year,month,day).date() + + #ieri alle 20:45 + elif date[0].lower() == 'ieri' and date[1] == 'alle': + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #oggi alle 11:11 + elif date[0].lower() == 'oggi' and date[1] == 'alle': + return datetime(year,month,day).date() + #lunedì alle 12:34 + elif date[0].isalpha() and date[1] == 'alle': + today = datetime.now().weekday() #today as a weekday + weekday = giorni[date[0].lower()] #day to be match as number weekday + #weekday is chronologically always lower than day + delta = today - weekday + if delta >= 0: + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #lunedì = 0 sabato = 6, mar 1 ven 5 + else: + delta += 8 + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 4 + elif l == 4: + #Ieri alle ore 23:32 + if date[0].lower() == 'ieri' and date[1] == 'alle': + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #domenica alle ore 19:29 + 
elif date[0].isalpha() and date[1] == 'alle': + today = datetime.now().weekday() #today as a weekday + weekday = giorni[date[0].lower()] #day to be match as number weekday + #weekday is chronologically always lower than day + delta = today - weekday + if delta >= 0: + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #lunedì = 0 sabato = 6, mar 1 ven 5 + else: + delta += 8 + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 5 + elif l == 5: + if date[2] == 'alle': + #29 feb alle ore 21:49 + if len(date[1]) == 3: + day = int(date[0]) + month = months_abbr[date[1].lower()] + return datetime(year,month,day).date() + #29 febbraio alle ore 21:49 + else: + day = int(date[0]) + month = months[date[1].lower()] + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 6 + elif l == 6: + if date[3] == 'alle': + #29 feb 2016 alle ore 21:49 + if len(date[1]) == 3: + day = int(date[0]) + month = months_abbr[date[1].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #29 febbraio 2016 alle ore 21:49 + else: + day = int(date[0]) + month = months[date[1].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# ============================================================================= +# English - status:beta +# ============================================================================= + elif lang == 'en': + months = { + 'january':1, + 'february':2, + 'march':3, + 'april':4, + 'may':5, + 'june':6, + 'july':7, + 'august':8, + 'september':9, + 'october':10, + 'november':11, + 'december':12 + } + + months_abbr = { + 'jan':1, + 'feb':2, + 'mar':3, + 'apr':4, + 'may':5, + 'jun':6, + 'jul':7, + 'aug':8, + 'sep':9, + 'oct':10, + 'nov':11, + 'dec':12 + } + + days = { + 'monday':0, + 'tuesday':1, + 'wednesday':2, + 'thursday':3, + 'friday':4, + 'saturday':5, + 'sunday':6 + } + + date = init_date[0].split() + year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today + + l = len(date) + + #sanity check + if l == 0: + return 'Error: no data' + + #Yesterday, Now, 4hr, 50mins + elif l == 1: + if date[0].isalpha(): + if date[0].lower() == 'yesterday': + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + #check that yesterday was not in another month + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + elif date[0].lower() == 'now': + return datetime(year,month,day).date() #return today + else: #not recognized, (return date or init_date) + return date + else: + #4h, 50min (exploit future parsing) + l = 2 + new_date = [x for x in date[0] if x.isdigit()] + date[0] = ''.join(new_date) + new_date = [x for x in date[0] if not(x.isdigit())] + date[1] = ''.join(new_date) +# l = 2 + elif l == 2: + if date[1] == 'now': + return datetime(year,month,day).date() + #22 min (ieri) + if date[1] == 'min' or date[1] == 'mins': + if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return 
datetime(year,month,day).date() + #22 min (oggi) + else: + return datetime(year,month,day).date() + + #4 h (ieri) + elif date[1] == 'hr' or date[1] == 'hrs': + if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #4 h (oggi) + else: + return datetime(year,month,day).date() + + #2 jan + elif len(date[1]) == 3 and date[1].isalpha(): + day = int(date[0]) + month = months_abbr[date[1].lower()] + return datetime(year,month,day).date() + #2 january + elif len(date[1]) > 3 and date[1].isalpha(): + day = int(date[0]) + month = months[date[1]] + return datetime(year,month,day).date() + #jan 2 + elif len(date[0]) == 3 and date[0].isalpha(): + day = int(date[1]) + month = months_abbr[date[0].lower()] + return datetime(year,month,day).date() + #january 2 + elif len(date[0]) > 3 and date[0].isalpha(): + day = int(date[1]) + month = months[date[0]] + return datetime(year,month,day).date() + #parsing failed + else: + return date + return date +# l = 3 + elif l == 3: + #5 hours ago + if date[2] == 'ago': + if date[1] == 'hour' or date[1] == 'hours' or date[1] == 'hr' or date[1] == 'hrs': + # 5 hours ago (yesterday) + if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + # 5 hours ago (today) + else: + return datetime(year,month,day).date() + #10 minutes ago + elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins': + #22 minutes ago (yesterday) + if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0: + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #22 minutes ago (today) + else: + return datetime(year,month,day).date() + else: + return date + else: + #21 Jun 2017 + if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit(): + day = int(date[0]) + month = months_abbr[date[1].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #21 June 2017 + elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit(): + day = int(date[0]) + month = months[date[1].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #Jul 11, 2016 + elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha(): + day = int(date[1][:-1]) + month = months_abbr[date[0].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 4 + elif l == 4: + #yesterday at 23:32 PM + if date[0].lower() == 'yesterday' and date[1] == 'at': + day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) + return datetime(year,month,day).date() + #Thursday at 4:27 PM + elif date[1] == 'at': + today = datetime.now().weekday() #today as a weekday + weekday = days[date[0].lower()] #day to be match as number weekday + #weekday is chronologically always lower than day + delta = today - weekday + if delta >= 0: + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = 
int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #monday = 0 saturday = 6 + else: + delta += 8 + day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) + month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 5 + elif l == 5: + if date[2] == 'at': + #Jan 29 at 10:00 PM + if len(date[0]) == 3: + day = int(date[1]) + month = months_abbr[date[0].lower()] + return datetime(year,month,day).date() + #29 febbraio alle ore 21:49 + else: + day = int(date[1]) + month = months[date[0].lower()] + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l = 6 + elif l == 6: + if date[3] == 'at': + date[1] + #Aug 25, 2016 at 7:00 PM + if len(date[0]) == 3: + day = int(date[1][:-1]) + month = months_abbr[date[0].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #August 25, 2016 at 7:00 PM + else: + day = int(date[1][:-1]) + month = months[date[0].lower()] + year = int(date[2]) + return datetime(year,month,day).date() + #parsing failed + else: + return date +# l > 6 + #parsing failed - l too big + else: + return date + #parsing failed - language not supported + else: + return init_date def id_strip(post_id): import json @@ -122,11 +578,21 @@ class FbcrawlItem(scrapy.Item): likes = scrapy.Field( output_processor=reactions_strip ) - ahah = scrapy.Field() - love = scrapy.Field() - wow = scrapy.Field() - sigh = scrapy.Field() - grrr = scrapy.Field() + ahah = scrapy.Field( + output_processor=reactions_strip + ) + love = scrapy.Field( + output_processor=reactions_strip + ) + wow = scrapy.Field( + output_processor=reactions_strip + ) + sigh = scrapy.Field( + output_processor=reactions_strip + ) + grrr = scrapy.Field( + output_processor=reactions_strip + ) share = scrapy.Field() # num of shares url = scrapy.Field( output_processor=url_strip @@ -140,7 +606,7 @@ class CommentsItem(scrapy.Item): source = scrapy.Field() reply_to=scrapy.Field() date = scrapy.Field( # when was the post published - output_processor=parse_date + output_processor=parse_date2 ) text = scrapy.Field( output_processor=Join(separator=u'') @@ -153,9 +619,9 @@ class CommentsItem(scrapy.Item): ) source_url = scrapy.Field() url = scrapy.Field() - #ahah = scrapy.Field() - #love = scrapy.Field() - #wow = scrapy.Field() - #sigh = scrapy.Field() - #grrr = scrapy.Field() - #share = scrapy.Field() # num of shares + ahah = scrapy.Field() + love = scrapy.Field() + wow = scrapy.Field() + sigh = scrapy.Field() + grrr = scrapy.Field() + share = scrapy.Field() # num of shares diff --git a/fbcrawl/settings.py b/fbcrawl/settings.py index fafad9b..40d3a15 100644 --- a/fbcrawl/settings.py +++ b/fbcrawl/settings.py @@ -88,6 +88,7 @@ DOWNLOAD_DELAY = 3 #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' #FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV +URLLENGTH_LIMIT = 99999 FEED_EXPORT_ENCODING = 'utf-8' DUPEFILTER_DEBUG = True LOG_LEVEL = 'INFO' diff --git a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc index dd59528..2e928fc 100644 Binary files a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc differ diff --git 
a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc
index 74cf76f..826194a 100644
Binary files a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/comments.py b/fbcrawl/spiders/comments.py
index 39c7b3f..2b6331c 100644
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
@@ -1,9 +1,11 @@
 import scrapy
 from scrapy.loader import ItemLoader
+from scrapy.exceptions import CloseSpider
 
 from fbcrawl.spiders.fbcrawl import FacebookSpider
-from fbcrawl.items import CommentsItem
+from fbcrawl.items import CommentsItem, parse_date, parse_date2
+from datetime import datetime
 
 class CommentsSpider(FacebookSpider):
     """
@@ -14,15 +16,117 @@ class CommentsSpider(FacebookSpider):
         'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
                                'source_url','url'],
         'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
-        'CONCURRENT_REQUESTS':1,
+        'CONCURRENT_REQUESTS' : 1
     }
 
     def __init__(self, *args, **kwargs):
+        if 'post' in kwargs and 'page' in kwargs:
+            raise AttributeError('You need to specify either a post or a page, not both')
+        elif 'post' in kwargs:
+            self.page = kwargs['post']
+            self.type = 'post'
+        elif 'page' in kwargs:
+            self.type = 'page'
+
         super().__init__(*args,**kwargs)
 
     def parse_page(self, response):
         '''
-        parse page does multiple things:
+        '''
+        if self.type == 'post':
+            yield scrapy.Request(url=response.url,
+                                 callback=self.parse_post,
+                                 priority=10,
+                                 meta={'index':1})
+        elif self.type == 'page':
+            #select all posts
+            for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
+                many_features = post.xpath('./@data-ft').get()
+                date = []
+                date.append(many_features)
+                date = parse_date(date,{'lang':self.lang})
+                current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date
+
+                if current_date is None:
+                    date_string = post.xpath('.//abbr/text()').get()
+                    date = parse_date2([date_string],{'lang':self.lang})
+                    current_date = datetime(date.year,date.month,date.day) if date is not None else date
+                    date = str(date)
+
+                if abs(self.count) + 1 > self.max:
+                    raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
+                self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))
+
+                #returns full post-link in a list
+                post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
+                temp_post = response.urljoin(post[0])
+                self.count -= 1
+                yield scrapy.Request(temp_post,
+                                     self.parse_post,
+                                     priority = self.count,
+                                     meta={'index':1})
+
+            #load following page, try to click on "more"
+            #after a few pages have been scraped, the "more" link might disappear
+            #if not present look for the highest year not parsed yet
+            #click once on the year and go back to clicking "more"
+
+            #new_page is different for groups
+            if self.group == 1:
+                new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
+            else:
+                new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
+                #this is why lang is needed
+
+            if not new_page:
+                self.logger.info('[!] 
"more" link not found, will look for a "year" link') + #self.k is the year link that we look for + if response.meta['flag'] == self.k and self.k >= self.year: + xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" + new_page = response.xpath(xpath).extract() + if new_page: + new_page = response.urljoin(new_page[0]) + self.k -= 1 + self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) + yield scrapy.Request(new_page, + callback=self.parse_page, + priority = -1000, + meta={'flag':self.k}) + else: + while not new_page: #sometimes the years are skipped this handles small year gaps + self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1)) + self.k -= 1 + if self.k < self.year: + raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date)) + xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" + new_page = response.xpath(xpath).extract() + self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) + new_page = response.urljoin(new_page[0]) + self.k -= 1 + yield scrapy.Request(new_page, + callback=self.parse_page, + priority = -1000, + meta={'flag':self.k}) + else: + self.logger.info('Crawling has finished with no errors!') + else: + new_page = response.urljoin(new_page[0]) + if 'flag' in response.meta: + self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page)) + yield scrapy.Request(new_page, + callback=self.parse_page, + priority = -1000, + meta={'flag':response.meta['flag']}) + else: + self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page)) + yield scrapy.Request(new_page, + callback=self.parse_page, + priority = -1000, + meta={'flag':self.k}) + + def parse_post(self, response): + ''' + parse post does multiple things: 1) loads replied-to-comments page one-by-one (for DFS) 2) call parse_reply on the nested comments 3) adds simple (not-replied-to) comments @@ -37,9 +141,10 @@ class CommentsSpider(FacebookSpider): source = reply.xpath('.//h3/a/text()').extract() answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract() ans = response.urljoin(answer[::-1][0]) - self.logger.info('{} nested comment @ page {}'.format(str(response.meta['index']),ans)) + self.logger.info('{} nested comment'.format(str(response.meta['index']))) yield scrapy.Request(ans, callback=self.parse_reply, + priority=1000, meta={'reply_to':source, 'url':response.url, 'index':response.meta['index'], @@ -49,7 +154,7 @@ class CommentsSpider(FacebookSpider): if not response.xpath(path): #prevents from exec path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]' for i,reply in enumerate(response.xpath(path2)): - self.logger.info('{} regular comment @ page {}'.format(i,response.url)) + self.logger.info('{} regular comment'.format(i+1)) new = ItemLoader(item=CommentsItem(),selector=reply) new.context['lang'] = self.lang new.add_xpath('source','.//h3/a/text()') @@ -71,7 +176,7 @@ class CommentsSpider(FacebookSpider): new_page = response.urljoin(new_page[0]) self.logger.info('New page to be crawled {}'.format(new_page)) yield scrapy.Request(new_page, - callback=self.parse_page, + callback=self.parse_post, meta={'index':1, 'group':1}) else: @@ -80,7 +185,7 @@ class CommentsSpider(FacebookSpider): new_page = response.urljoin(new_page[0]) self.logger.info('New page to be crawled 
{}'.format(new_page)) yield scrapy.Request(new_page, - callback=self.parse_page, + callback=self.parse_post, meta={'index':1, 'group':group_flag}) @@ -88,6 +193,9 @@ class CommentsSpider(FacebookSpider): ''' parse reply to comments, root comment is added if flag ''' +# from scrapy.utils.response import open_in_browser +# open_in_browser(response) + if response.meta['flag'] == 'init': #parse root comment for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'): @@ -120,7 +228,7 @@ class CommentsSpider(FacebookSpider): back_page = response.urljoin(back[0]) yield scrapy.Request(back_page, callback=self.parse_reply, - priority=100, + priority = 1000, meta={'reply_to':response.meta['reply_to'], 'flag':'back', 'url':response.meta['url'], @@ -131,7 +239,7 @@ class CommentsSpider(FacebookSpider): next_reply = response.meta['url'] self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url'])) yield scrapy.Request(next_reply, - callback=self.parse_page, + callback=self.parse_post, meta={'index':response.meta['index']+1, 'group':response.meta['group']}) @@ -155,7 +263,7 @@ class CommentsSpider(FacebookSpider): back_page = response.urljoin(back[0]) yield scrapy.Request(back_page, callback=self.parse_reply, - priority=100, + priority=1000, meta={'reply_to':response.meta['reply_to'], 'flag':'back', 'url':response.meta['url'], @@ -166,7 +274,7 @@ class CommentsSpider(FacebookSpider): next_reply = response.meta['url'] self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url'])) yield scrapy.Request(next_reply, - callback=self.parse_page, + callback=self.parse_post, meta={'index':response.meta['index']+1, 'group':response.meta['group']}) diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py index c455308..f4f07ad 100644 --- a/fbcrawl/spiders/fbcrawl.py +++ b/fbcrawl/spiders/fbcrawl.py @@ -4,7 +4,7 @@ import logging from scrapy.loader import ItemLoader from scrapy.http import FormRequest from scrapy.exceptions import CloseSpider -from fbcrawl.items import FbcrawlItem, parse_date +from fbcrawl.items import FbcrawlItem, parse_date, parse_date2 from datetime import datetime class FacebookSpider(scrapy.Spider): @@ -15,7 +15,8 @@ class FacebookSpider(scrapy.Spider): custom_settings = { 'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \ 'reactions','likes','ahah','love','wow', \ - 'sigh','grrr','comments','post_id','url'] + 'sigh','grrr','comments','post_id','url'], + 'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter', } def __init__(self, *args, **kwargs): @@ -33,16 +34,19 @@ class FacebookSpider(scrapy.Spider): self.logger.info('Email and password provided, will be used to log in') #page name parsing (added support for full urls) - if 'page' not in kwargs: - raise AttributeError('You need to provide a valid page name to crawl!' 
- 'scrapy fb -a page="PAGENAME"') - elif self.page.find('https://www.facebook.com/') != -1: - self.page = self.page[25:] - elif self.page.find('https://mbasic.facebook.com/') != -1: - self.page = self.page[28:] - elif self.page.find('https://m.facebook.com/') != -1: - self.page = self.page[23:] - + if 'page' in kwargs: + if self.page.find('/groups/') != -1: + self.group = 1 + else: + self.group = 0 + if self.page.find('https://www.facebook.com/') != -1: + self.page = self.page[25:] + elif self.page.find('https://mbasic.facebook.com/') != -1: + self.page = self.page[28:] + elif self.page.find('https://m.facebook.com/') != -1: + self.page = self.page[23:] + + #parse date if 'date' not in kwargs: self.logger.info('Date attribute not provided, scraping date set to 2004-02-04 (fb launch date)') @@ -148,11 +152,19 @@ class FacebookSpider(scrapy.Spider): many_features = post.xpath('./@data-ft').get() date = [] date.append(many_features) - date = parse_date(date) - current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') - + date = parse_date(date,{'lang':self.lang}) + current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date + + if current_date is None: + date_string = post.xpath('.//abbr/text()').get() + date = parse_date2([date_string],{'lang':self.lang}) + current_date = datetime(date.year,date.month,date.day) if date is not None else date + date = str(date) + + #if 'date' argument is reached stop crawling if self.date > current_date: raise CloseSpider('Reached date: {}'.format(self.date)) + new = ItemLoader(item=FbcrawlItem(),selector=post) if abs(self.count) + 1 > self.max: raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count))) @@ -161,8 +173,8 @@ class FacebookSpider(scrapy.Spider): new.add_value('date',date) new.add_xpath('post_id','./@data-ft') new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") - #page_url #new.add_value('url',response.url) + #returns full post-link in a list post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() temp_post = response.urljoin(post[0]) @@ -173,18 +185,24 @@ class FacebookSpider(scrapy.Spider): #after few pages have been scraped, the "more" link might disappears #if not present look for the highest year not parsed yet #click once on the year and go back to clicking "more" - new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract() - #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^ + + #new_page is different for groups + if self.group == 1: + new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract() + else: + new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract() + #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^ + if not new_page: - self.logger.info('[!] "more" link not found, will look for a year') - #self.k is the year that we look for in the link. + self.logger.info('[!] 
"more" link not found, will look for a "year" link') + #self.k is the year link that we look for if response.meta['flag'] == self.k and self.k >= self.year: xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" new_page = response.xpath(xpath).extract() if new_page: new_page = response.urljoin(new_page[0]) self.k -= 1 - self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page)) + self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) else: while not new_page: #sometimes the years are skipped this handles small year gaps @@ -194,7 +212,7 @@ class FacebookSpider(scrapy.Spider): raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date)) xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" new_page = response.xpath(xpath).extract() - self.logger.info('Found a link for more posts, click on year "{}", new_page = {}'.format(self.k,new_page)) + self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) new_page = response.urljoin(new_page[0]) self.k -= 1 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})