diff --git a/fbcrawl/__pycache__/items.cpython-37.pyc b/fbcrawl/__pycache__/items.cpython-37.pyc
index ccfb6f5..b2f57d3 100644
Binary files a/fbcrawl/__pycache__/items.cpython-37.pyc and b/fbcrawl/__pycache__/items.cpython-37.pyc differ
diff --git a/fbcrawl/items.py b/fbcrawl/items.py
index 852e968..4f48bae 100644
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@@ -11,8 +11,11 @@ from datetime import datetime, timedelta
 def parse_date(init_date,loader_context):
     lang = loader_context['lang']
+# =============================================================================
+# Italian - status:final
+# =============================================================================
     if lang == 'it':
-        mesi = {
+        months = {
             'gennaio':1,
             'febbraio':2,
             'marzo':3,
@@ -27,7 +30,7 @@ def parse_date(init_date,loader_context):
             'dicembre':12
         }

-        mesi_abbr = {
+        months_abbr = {
             'gen':1,
             'feb':2,
             'mar':3,
@@ -43,101 +46,379 @@ def parse_date(init_date,loader_context):
         }

         giorni = {
-            'domenica':0,
-            'lunedì':1,
-            'martedì':2,
-            'mercoledì':3,
-            'giovedì':4,
-            'venerdì':5,
-            'sabato':6
+            'lunedì':0,
+            'martedì':1,
+            'mercoledì':2,
+            'giovedì':3,
+            'venerdì':4,
+            'sabato':5,
+            'domenica':6
         }

-        date = init_date
-        date = date[0].split()
+        date = init_date[0].split()
         year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
+
+        l = len(date)

         #sanity check
-        if len(date) == 0:
+        if l == 0:
             return 'Error: no data'

-        #yesterday
-        elif len(date) == 1:
-            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
-
-        #4h
-        elif len(date) == 2 and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
-            pass
+        #adesso, ieri, 4h, 50min
+        elif l == 1:
+            if date[0].isalpha():
+                if date[0].lower() == 'ieri':
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    #check that yesterday was not in another month
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+                elif date[0].lower() == 'adesso':
+                    return datetime(year,month,day).date() #return today
+                else: #not recognized, (return date or init_date)
+                    return date
+            else:
+                #4h, 50min -> split digits from the unit and reuse the l == 2 parsing below
+                l = 2
+                digits = [x for x in date[0] if x.isdigit()]
+                unit = [x for x in date[0] if not x.isdigit()]
+                date[0] = ''.join(digits)
+                date.append(''.join(unit))
+# l = 2
+        if l == 2:    #plain 'if' (not elif) so the converted 4h/50min case falls through to here
+            #22 min (today)
+            if date[1] == 'min':
+                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #22 min (yesterday)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #4 h (today)
+            elif date[1] == 'h':
+                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #4 h (yesterday)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #2 gen
+            elif len(date[1]) == 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months_abbr[date[1].lower()]
+                return datetime(year,month,day).date()
+            #2 gennaio
+            elif len(date[1]) > 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months[date[1].lower()]
+                return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# l = 3
+        elif l == 3:
+            #21 giu 2017
+            if len(date[1]) == 3 and date[2].isdigit():
+                day = int(date[0])
+                month = months_abbr[date[1].lower()]
+                year = int(date[2])
+                return datetime(year,month,day).date()
+            #21 giugno 2017
+            elif len(date[1]) > 3 and date[2].isdigit():
+                day = int(date[0])
+                month = months[date[1].lower()]
+                year = int(date[2])
+                return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# l = 4
+        elif l == 4:
+            #Ieri alle ore 23:32
+            if date[0].lower() == 'ieri' and date[1] == 'alle':
+                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                return datetime(year,month,day).date()
+            #domenica alle ore 19:29
+            elif date[0].isalpha() and date[1] == 'alle':
+                today = datetime.now().weekday() #today as a weekday
+                weekday = giorni[date[0].lower()] #day to be matched as a weekday number
+                delta = today - weekday
+                #the named weekday is today or earlier this week
+                if delta >= 0:
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+                #lunedì = 0, sabato = 6; the named weekday belongs to the previous week
+                else:
+                    delta += 7
+                    day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# l = 5
+        elif l == 5:
+            if date[2] == 'alle':
+                #29 feb alle ore 21:49
+                if len(date[1]) == 3:
+                    day = int(date[0])
+                    month = months_abbr[date[1].lower()]
+                    return datetime(year,month,day).date()
+                #29 febbraio alle ore 21:49
+                else:
+                    day = int(date[0])
+                    month = months[date[1].lower()]
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# l = 6
+        elif l == 6:
+            if date[3] == 'alle':
+                #29 feb 2016 alle ore 21:49
+                if len(date[1]) == 3:
+                    day = int(date[0])
+                    month = months_abbr[date[1].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+                #29 febbraio 2016 alle ore 21:49
+                else:
+                    day = int(date[0])
+                    month = months[date[1].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# =============================================================================
+# English - status:beta
+# =============================================================================
+    elif lang == 'en':
+        months = {
+            'january':1,
+            'february':2,
+            'march':3,
+            'april':4,
+            'may':5,
+            'june':6,
+            'july':7,
+            'august':8,
+            'september':9,
+            'october':10,
+            'november':11,
+            'december':12
+        }
+
+        months_abbr = {
+            'jan':1,
+            'feb':2,
+            'mar':3,
+            'apr':4,
+            'may':5,
+            'jun':6,
+            'jul':7,
+            'aug':8,
+            'sep':9,
+            'oct':10,
+            'nov':11,
+            'dec':12
+        }

-        #22h (yesterday)
-        elif date[1] == 'h' and int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0:
-            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+        date = init_date[0].split()
+        year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today
+
+        l = len(date)

-        #yesterday
-        elif date[0].isdigit() == False and date[1].isdigit() == False:
-            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
-
-        #day with 3 month length of this year
-        elif len(date[1]) == 3 and not(date[2].isdigit()):
-            day = int(date[0])
-            month = mesi_abbr[date[1]]
-
-        elif len(date[1]) > 3 and not(date[2].isdigit()):
-            day = int(date[0])
-            month = mesi[date[1]]
-
-        elif len(date[1]) == 3 and date[2].isdigit():
-            day = int(date[0])
-            month = mesi_abbr[date[1]]
-            year = int(date[2])
-
-        #usual dates, with regular length month
-        elif date[0].isdigit() and date[2].isdigit():
-            day = int(date[0])
-            month = mesi[date[1]]
-            year = int(date[2])
-
-        #dates with weekdays (this function assumes that the month is the same)
-        elif date[0].isdigit() == False and date[1].isdigit() == False:
-            today = datetime.now().weekday() #today as a weekday
-            weekday = giorni[date[0]] #day to be match as number weekday
-            #weekday is chronologically always lower than day
-            if weekday < today:
-                day -= today - weekday
-            elif weekday > today:
-                weekday += 7
-                day -= today - weekday
+        #sanity check
+        if l == 0:
+            return 'Error: no data'
+
+        #Yesterday, Now, 4hr, 50mins
+        elif l == 1:
+            if date[0].isalpha():
+                if date[0].lower() == 'yesterday':
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    #check that yesterday was not in another month
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+                elif date[0].lower() == 'now':
+                    return datetime(year,month,day).date() #return today
+                else: #not recognized, (return date or init_date)
+                    return date
+            else:
+                #4hr, 50mins -> split digits from the unit and reuse the l == 2 parsing below
+                l = 2
+                digits = [x for x in date[0] if x.isdigit()]
+                unit = [x for x in date[0] if not x.isdigit()]
+                date[0] = ''.join(digits)
+                date.append(''.join(unit))
+# l = 2
+        if l == 2:    #plain 'if' (not elif) so the converted 4hr/50mins case falls through to here
+            #22 mins (today)
+            if date[1] == 'min' or date[1] == 'mins':
+                if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #22 mins (yesterday)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #4 hrs (today)
+            elif date[1] == 'hr' or date[1] == 'hrs':
+                if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+                    return datetime(year,month,day).date()
+                #4 hrs (yesterday)
+                else:
+                    day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                    month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                    return datetime(year,month,day).date()
+            #2 Jan
+            elif len(date[1]) == 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months_abbr[date[1].lower()]
+                return datetime(year,month,day).date()
+            #2 January
+            elif len(date[1]) > 3 and date[1].isalpha():
+                day = int(date[0])
+                month = months[date[1].lower()]
+                return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# l = 3
+        elif l == 3:
+#            #21 Jun 2017
+#            if len(date[1]) == 3 and date[2].isdigit():
+#                day = int(date[0])
+#                month = months_abbr[date[1].lower()]
+#                year = int(date[2])
+#                return datetime(year,month,day).date()
+#            #21 June 2017
+#            elif len(date[1]) > 3 and date[2].isdigit():
+#                day = int(date[0])
+#                month = months[date[1].lower()]
+#                year = int(date[2])
+#                return datetime(year,month,day).date()
+#            #parsing failed
+#            else:
+            return date
+# l = 4
+        elif l == 4:
+            #Yesterday at 23:32
+            if date[0].lower() == 'yesterday' and date[1] == 'at':
+                day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+                month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+                return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# l = 5
+        elif l == 5:
+            if date[2] == 'at':
+                #Jan 29 at 10:00 PM
+                if len(date[0]) == 3:
+                    day = int(date[1])
+                    month = months_abbr[date[0].lower()]
+                    return datetime(year,month,day).date()
+                #January 29 at 10:00 PM
+                else:
+                    day = int(date[1])
+                    month = months[date[0].lower()]
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# l = 6
+        elif l == 6:
+            if date[3] == 'at':
+                #Aug 25, 2016 at 7:00 PM
+                if len(date[0]) == 3:
+                    day = int(date[1][:-1]) #drop the comma after the day
+                    month = months_abbr[date[0].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+                #August 25, 2016 at 7:00 PM
+                else:
+                    day = int(date[1][:-1])
+                    month = months[date[0].lower()]
+                    year = int(date[2])
+                    return datetime(year,month,day).date()
+            #parsing failed
+            else:
+                return date
+# l > 6
+        #parsing failed - l too big
         else:
-            #date item parser fail. datetime format unknown, check xpath selector or change the language of the interface'
-            return init_date
+            return date
+    #parsing failed - language not supported
     else:
         return init_date
-
-    date = datetime(year,month,day)
-    return date.date()
-
-def comments_strip(string):
-    return string[0].rstrip(' commenti')
-
-def reactions_strip(string):
-    friends = 1 + string[0].count(',')
-    e = 1 + string[0].count(' e ')
-    string = string[0].split()[::-1]
-    if len(string) == 1:
-        string = string[0]
-        while string.rfind('.') != -1:
-            string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
+
+def comments_strip(string,loader_context):
+    lang = loader_context['lang']
+    if lang == 'it':
+        if string[0].rfind('Commenta') != -1:
+            return
+        else:
+            return string[0].rstrip(' commenti')
+
+    elif lang == 'en':
+        new_string = string[0].rstrip(' Comments')
+        while new_string.rfind(',') != -1:
+            new_string = new_string[0:new_string.rfind(',')] + new_string[new_string.rfind(',')+1:]
+        return new_string
+    else:
         return string
-    string = string[0]
-    while string.rfind('.') != -1:
-        string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
-
-    if not string.isdigit():
-        return e
+def reactions_strip(string,loader_context):
+    lang = loader_context['lang']
+    if lang == 'it':
+        newstring = string[0]
+        #19.298.873
+        if len(newstring.split()) == 1:
+            while newstring.rfind('.') != -1:
+                newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
+            return newstring
+        #Pamela, Luigi e altri 4
+        else:
+            return string
+#            friends = newstring.count(' e ') + newstring.count(',')
+#            newstring = newstring.split()[::-1][0]
+#            while newstring.rfind('.') != -1:
+#                newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:]
+#            return int(newstring) + friends
+    elif lang == 'en':
+        newstring = string[0]
+        #19,298,873
+        if len(newstring.split()) == 1:
+            while newstring.rfind(',') != -1:
+                newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
+            return newstring
+#        #Mark and other 254,134
+#        elif newstring.split()[::-1][1].isdigit():
+#            friends = newstring.count(' and ') + newstring.count(',')
+#            newstring = newstring.split()[::-1][1]
+#            while newstring.rfind(',') != -1:
+#                newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:]
+#            return int(newstring) + friends
+#        #Philip and 1K others
+        else:
+            return newstring
     else:
-        return int(string) + friends
+        return string
+
+def url_strip(url):
+    fullurl = url[0]
+    #catching '&id=' is enough to identify the post
+    i = fullurl.find('&id=')
+    if i != -1:
+        j = fullurl[:i+4] + fullurl[i+4:].split('&')[0]
+        return j
+    else:
+        return fullurl

 class FbcrawlItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
     source = scrapy.Field(
         output_processor=TakeFirst()
     ) # page that published the post
@@ -153,10 +434,7 @@ class FbcrawlItem(scrapy.Item):
     comments = scrapy.Field(
         output_processor=comments_strip
-    )
-    commentators = scrapy.Field(
-        output_processor=Join(separator=u'\n')
-    )
+    )

     reactions = scrapy.Field(
         output_processor=reactions_strip
@@ -171,4 +449,6 @@ class FbcrawlItem(scrapy.Item):
     sigh = scrapy.Field()
     grrr = scrapy.Field()
     share = scrapy.Field() # num of shares
-    url = scrapy.Field()
+    url = scrapy.Field(
+        output_processor=url_strip
+    )
diff --git a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc
index 38fa5c5..bb0bb00 100644
Binary files a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc
index 9f0911a..41f9bb3 100644
Binary files a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc differ
diff --git a/fbcrawl/spiders/comments.py b/fbcrawl/spiders/comments.py
index c2d19d1..1ec1239 100644
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
@@ -89,7 +89,6 @@ class FacebookSpider(scrapy.Spider):
         for i in range(len(rispostina)):
             risp = response.urljoin(rispostina[i].extract())
             yield scrapy.Request(risp, callback=self.parse_rispostina)
-
         next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
         if len(next_page) > 0:
diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py
index 8407664..c2ba592 100644
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@@ -3,6 +3,7 @@ import scrapy
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
 from fbcrawl.items import FbcrawlItem
+from scrapy.exceptions import CloseSpider

 class FacebookSpider(scrapy.Spider):
@@ -11,37 +12,51 @@ class FacebookSpider(scrapy.Spider):
     """
     name = "fb"

-    def __init__(self, email='', password='', page='', year=2018, lang='', **kwargs):
+    def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
         super(FacebookSpider, self).__init__(**kwargs)
-        self.year = int(year)    #arguments are passed as strings
-
+        #email & pass need to be passed as attributes!
         if not email or not password:
             raise ValueError("You need to provide valid email and password!")
         else:
             self.email = email
             self.password = password

+        #page name parsing (added support for full urls)
         if not page:
             raise ValueError("You need to provide a valid page name to crawl!")
+        elif page.find('https://www.facebook.com/') != -1:
+            self.page = page[25:]
+        elif page.find('https://mbasic.facebook.com/') != -1:
+            self.page = page[28:]
+        elif page.find('https://m.facebook.com/') != -1:
+            self.page = page[23:]
         else:
             self.page = page
-
-        if not(lang):
-            self.logger.info('Language attribute not provided, assuming "en"')
-            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
-            self.lang = 'en'
+
+        #parse year
+        assert 2015 <= int(year) <= 2019, 'Year must be a number 2015 <= year <= 2019'
+        self.year = int(year) #arguments are passed as strings
+
+        #parse lang; if it is not provided, it will be guessed in parse_home
+        if lang == '_':
+            self.logger.info('Language attribute not provided, I will try to guess it')
+            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.lang = lang
         elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
             self.lang = lang
         else:
-            self.logger.info('Lang:{} not currently supported'.format(lang))
+            self.logger.info('Lang "{}" not currently supported'.format(lang))
             self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
-            self.logger.info('Change your interface lang from facebook and try again')
-            return
+            self.logger.info('Change your interface lang from facebook and try again')
+            raise CloseSpider('Language provided not currently supported')

         self.start_urls = ['https://mbasic.facebook.com']

     def parse(self, response):
+        '''
+        Handle login with the provided credentials
+        '''
         return FormRequest.from_response(
             response,
             formxpath='//form[contains(@action, "login")]',
@@ -51,59 +66,57 @@ class FacebookSpider(scrapy.Spider):
     def parse_home(self, response):
         '''
-        Parse user news feed page. This code is outdate and needs review.
+        This method has multiple purposes:
+        1) Handle failed logins due to facebook 'save-device' redirection
+        2) Set the language interface, if it was not provided
+        3) Navigate to the given page
         '''
-        if response.css('#approvals_code'):
-            # Handle 'Approvals Code' checkpoint (ask user to enter code).
-            if not self.code:
-                # Show facebook messages via logs
-                # and request user for approval code.
-                message = response.css('._50f4::text').extract()[0]
-                self.log(message)
-                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
-                self.log(message)
-                self.code = input('Enter the code: ')
-            self.code = str(self.code)
-            if not (self.code and self.code.isdigit()):
-                self.log('Bad approvals code detected.')
-                return
-            return FormRequest.from_response(
-                response,
-                formdata={'approvals_code': self.code},
-                callback=self.parse_home)
-        elif response.xpath("//div/a[contains(@href,'save-device')]"):
-#        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
-            # Handle 'Save Browser' checkpoint.
+        #handle 'save-device' redirection
+        if response.xpath("//div/a[contains(@href,'save-device')]"):
             return FormRequest.from_response(
                 response,
                 formdata={'name_action_selected': 'dont_save'},
                 callback=self.parse_home)
-        elif response.css('button#checkpointSubmitButton'):
-            # Handle 'Someone tried to log into your account' warning.
-            return FormRequest.from_response(
-                response, callback=self.parse_home)
-        # Else go to the page requested.
-        if self.page.find('https://www.facebook.com/') != -1:
-            self.page = self.page[25:]
+
+        #set language interface
+        if self.lang == '_':
+            if response.xpath("//input[@placeholder='Search Facebook']"):
+                self.lang = 'en'
+            elif response.xpath("//input[@value='Buscar']"):
+                self.lang = 'es'
+            elif response.xpath("//input[@value='Rechercher']"):
+                self.lang = 'fr'
+            elif response.xpath("//input[@value='Cerca']"):
+                self.lang = 'it'
+            elif response.xpath("//input[@value='Pesquisar']"):
+                self.lang = 'pt'
+            else:
+                raise CloseSpider('Language not recognized')
+
+        #navigate to provided page
         href = response.urljoin(self.page)
-        self.logger.info('Parse function called on %s', href)
+        self.logger.info('Parsing facebook page %s', href)
         return scrapy.Request(url=href,callback=self.parse_page)

     def parse_page(self, response):
+        '''
+        Parse the given page, selecting the posts,
+        then recursively request the next page.
+        '''
         #select all posts
         for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
             new = ItemLoader(item=FbcrawlItem(),selector=post)
             new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
             new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
-            #page_url
-            #new.add_value('url',response.url)
+            new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
+
+            #page_url #new.add_value('url',response.url)
             #returns full post-link in a list
             post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
             temp_post = response.urljoin(post[0])
             yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})

         #load following page
-        #next_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href')
         next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
         if len(next_page) == 0:
             if response.meta['flag'] == 4 and self.year <= 2015:
@@ -148,7 +161,6 @@ class FacebookSpider(scrapy.Spider):
         new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
         new.add_xpath('date', '//div/div/abbr/text()')
         new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
-        new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")

         reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
         reactions = response.urljoin(reactions[0].extract())
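
Note on the new parse_date/url_strip processors: a minimal smoke test, assuming the patched fbcrawl.items module is importable and that raw values reach the processors as one-element lists, the way Scrapy's ItemLoader hands extracted text to output processors. The sample strings below are made up for illustration; loader_context is passed positionally, matching the function signatures in this diff.

    from fbcrawl.items import parse_date, url_strip

    #absolute dates resolve independently of the clock
    for s in ['21 giu 2017', '29 febbraio 2016 alle ore 21:49', 'ieri', '4h']:
        print(s, '->', parse_date([s], {'lang': 'it'}))

    for s in ['Aug 25, 2016 at 7:00 PM', 'Yesterday', '4hr', '50mins']:
        print(s, '->', parse_date([s], {'lang': 'en'}))

    #url_strip keeps everything up to and including the '&id=' value,
    #dropping trailing query parameters such as '&refid='
    print(url_strip(['https://mbasic.facebook.com/story.php?story_fbid=123&id=456&refid=17']))
    #-> https://mbasic.facebook.com/story.php?story_fbid=123&id=456

The relative formats ('ieri', '4h', 'Yesterday', '4hr', '50mins') resolve against the current clock, so their expected output depends on when the test runs; the absolute samples should print 2017-06-21, 2016-02-29 and 2016-08-25.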