diff --git a/fbcrawl/__pycache__/settings.cpython-37.pyc b/fbcrawl/__pycache__/settings.cpython-37.pyc index 57cea57..4755c23 100644 Binary files a/fbcrawl/__pycache__/settings.cpython-37.pyc and b/fbcrawl/__pycache__/settings.cpython-37.pyc differ diff --git a/fbcrawl/settings.py b/fbcrawl/settings.py index c833770..fafad9b 100644 --- a/fbcrawl/settings.py +++ b/fbcrawl/settings.py @@ -25,7 +25,7 @@ CONCURRENT_REQUESTS = 16 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -DOWNLOAD_DELAY = 2 +DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 1 diff --git a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc index 9b2b117..05c2a2a 100644 Binary files a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc differ diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py index d793595..6585c34 100644 --- a/fbcrawl/spiders/fbcrawl.py +++ b/fbcrawl/spiders/fbcrawl.py @@ -52,9 +52,8 @@ class FacebookSpider(scrapy.Spider): self.date = datetime(2014,1,1) self.year = 2014 else: - print(type(kwargs['date'])) self.date = datetime.strptime(kwargs['date'],'%Y-%m-%d') - self.year = datetime.now().year - 1 + self.year = self.date.year #parse lang, if not provided (but is supported) it will be guessed in parse_home if 'lang' not in kwargs: @@ -71,7 +70,7 @@ class FacebookSpider(scrapy.Spider): raise AttributeError('Language provided not currently supported') #current year, this variable is needed for parse_page recursion - self.k = 2019 + self.k = datetime.now().year #count number of posts, used to prioritized parsing and correctly insert in the csv self.count = 0 diff --git a/fbcrawl/spiders/new_fbcrawl.py b/fbcrawl/spiders/new_fbcrawl.py new file mode 100644 index 
import scrapy
import logging
import json

from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from scrapy.exceptions import CloseSpider
from fbcrawl.items import FbcrawlItem, parse_date2
from datetime import datetime
from time import sleep


class FacebookSpider(scrapy.Spider):
    '''
    Parse FB pages (needs credentials).

    Required -a arguments: email, password, page (name or full fb url).
    Optional -a arguments:
        date -- 'YYYY-MM-DD' lower bound; crawling stops at older posts
        lang -- interface language ('en', 'es', 'fr', 'it', 'pt');
                guessed in parse_home when omitted
    '''
    name = 'newfb'
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['source', 'shared_from', 'date', 'text',
                               'reactions', 'likes', 'ahah', 'love', 'wow',
                               'sigh', 'grrr', 'comments', 'post_id', 'url']
    }

    def __init__(self, *args, **kwargs):
        # turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
        logger = logging.getLogger('scrapy.middleware')
        logger.setLevel(logging.WARNING)
        super().__init__(*args, **kwargs)

        # email & pass need to be passed as attributes!
        if 'email' not in kwargs or 'password' not in kwargs:
            raise AttributeError('You need to provide valid email and password:\n'
                                 'scrapy fb -a email="EMAIL" -a password="PASSWORD"')
        else:
            self.logger.info('Email and password provided, using these as credentials')

        # page name parsing (added support for full urls); scrapy copies -a kwargs
        # onto the spider, so self.page exists whenever 'page' was provided
        if 'page' not in kwargs:
            raise AttributeError('You need to provide a valid page name to crawl!'
                                 'scrapy fb -a page="PAGENAME"')
        elif self.page.find('https://www.facebook.com/') != -1:
            self.page = self.page[25:]
            self.logger.info('Page attribute provided, scraping "{}"'.format(self.page))
        elif self.page.find('https://mbasic.facebook.com/') != -1:
            self.page = self.page[28:]
            self.logger.info('Page attribute provided, scraping "{}"'.format(self.page))
        elif self.page.find('https://m.facebook.com/') != -1:
            self.page = self.page[23:]
            self.logger.info('Page attribute provided, scraping "{}"'.format(self.page))
        else:
            self.logger.info('Page attribute provided, scraping "{}"'.format(self.page))

        # parse date: posts older than self.date close the spider (see parse_page)
        if 'date' not in kwargs:
            self.date = datetime(2014, 1, 1)
            self.year = 2014
        else:
            self.date = datetime.strptime(kwargs['date'], '%Y-%m-%d')
            # FIX: the year boundary must come from the requested date, not from
            # "current year - 1" (same fix already applied to fbcrawl.py)
            self.year = self.date.year

        # parse lang, if not provided (but is supported) it will be guessed in parse_home
        if 'lang' not in kwargs:
            self.logger.info('Language attribute not provided, I will try to guess it from the fb interface')
            self.logger.info('To specify, add the lang parameter: scrapy fb -a lang="LANGUAGE"')
            self.logger.info('Currently choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"')
            self.lang = '_'
        elif self.lang == 'en' or self.lang == 'es' or self.lang == 'fr' or self.lang == 'it' or self.lang == 'pt':
            self.logger.info('Language attribute recognized, using "{}" for the facebook interface'.format(self.lang))
        else:
            self.logger.info('Lang "{}" not currently supported'.format(self.lang))
            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
            self.logger.info('Change your interface lang from facebook settings and try again')
            raise AttributeError('Language provided not currently supported')

        # current year, this variable is needed for parse_page recursion
        # FIX: do not hard-code 2019, use the actual current year
        # (same fix already applied to fbcrawl.py)
        self.k = datetime.now().year
        # count number of posts, used to prioritize parsing and correctly insert in the csv
        self.count = 0

        self.start_urls = ['https://mbasic.facebook.com/' + self.page]

    def parse(self, response):
        '''
        Bootstrap: read the first post's data-ft JSON to learn the page_id,
        then jump to the mobile "more content" endpoint for that page.
        '''
        data = response.xpath("//div[contains(@data-ft,'top_level_post_id')]/@data-ft").extract()
        json_data = json.loads(data[0])
        page_id = json_data['page_id']
        # NOTE: json_data['top_level_post_id'] is also available here if ever needed

        magic_link = 'https://m.facebook.com/page_content_list_view/more/?page_id=' + \
                     str(page_id)
        return scrapy.Request(url=magic_link)  # ,callback=self.parse_page,meta={'index':1})

    def parse2(self, response):
        '''
        Handle login with provided credentials
        '''
        return FormRequest.from_response(
            response,
            formxpath='//form[contains(@action, "login")]',
            formdata={'email': self.email, 'pass': self.password},
            callback=self.parse_home
        )

    def parse_home(self, response):
        '''
        This method has multiple purposes:
        1) Handle failed logins due to facebook 'save-device' redirection
        2) Set language interface, if not already provided
        3) Navigate to given page
        '''
        # handle 'save-device' redirection
        if response.xpath("//div/a[contains(@href,'save-device')]"):
            self.logger.info('Got stuck in "save-device" checkpoint')
            self.logger.info('I will now try to redirect to the correct page')
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home
            )

        # set language interface by sniffing the search-box placeholder
        if self.lang == '_':
            if response.xpath("//input[@placeholder='Search Facebook']"):
                self.logger.info('Language recognized: lang="en"')
                self.lang = 'en'
            elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
                self.logger.info('Language recognized: lang="es"')
                self.lang = 'es'
            elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"):
                self.logger.info('Language recognized: lang="fr"')
                self.lang = 'fr'
            elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
                self.logger.info('Language recognized: lang="it"')
                self.lang = 'it'
            elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
                self.logger.info('Language recognized: lang="pt"')
                self.lang = 'pt'
            else:
                raise AttributeError('Language not recognized\n'
                                     'Change your interface lang from facebook '
                                     'and try again')

        # navigate to provided page
        href = response.urljoin(self.page)
        self.logger.info('Scraping facebook page {}'.format(href))
        return scrapy.Request(url=href, callback=self.parse_page, meta={'index': 1})

    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
#        #open page in browser for debug
#        from scrapy.utils.response import open_in_browser
#        open_in_browser(response)

        # FIX: defined before the loop so the log below cannot NameError
        # when the page contains no posts
        date = None

        # select all posts
        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):

            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date2(date)
            current_date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

            # posts come newest-first: once we are past the lower bound, stop
            if self.date > current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
            new.add_xpath('date', './@data-ft')
            new.add_xpath('post_id', './@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

            # page_url #new.add_value('url',response.url)
            # returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            # negative, decreasing counts -> earlier posts keep higher priority
            self.count -= 1
            sleep(2)
            yield scrapy.Request(temp_post, self.parse_post, priority=self.count, meta={'item': new})

        # load following page, try to click on "more"
        # after few pages have been scraped, the "more" link might disappear
        # if not present look for the highest year not parsed yet, click once
        # and keep looking for "more"
        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
        if not new_page:
            # FIX: the first request reaching this callback carries no 'flag'
            # in meta (entry points set meta={'index':1}); default to self.k
            # instead of raising KeyError
            if response.meta.get('flag', self.k) == self.k and self.k >= self.year:
                self.logger.info('There are no more, flag set at = {}'.format(self.k))
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Everything OK, new flag: {}'.format(self.k))
                    sleep(2)
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k})
                else:
                    # sometimes the years are skipped, this handles small year gaps
                    while not new_page:
                        self.logger.info('XPATH not found for year {}'.format(self.k - 1))
                        self.k -= 1
                        self.logger.info('Trying with previous year, flag={}'.format(self.k))
                        if self.k < self.year:
                            self.logger.info('The previous year to crawl is less than the parameter year: {} < {}'.format(self.k, self.year))
                            self.logger.info('This is not handled well, please re-run with -a year="{}" or less'.format(self.k))
                            break
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info('New page found with flag {}'.format(self.k))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Now going with flag {}'.format(self.k))
                    sleep(2)
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info('Page scraped, click on more! new_page = {} flag = {}'.format(new_page, date))
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': response.meta['flag']})
            else:
#                self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
                self.logger.info('First page scraped, click on more {}! Flag not set, default flag = {}'.format(new_page, date))
                sleep(2)
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k})

    def parse_post(self, response):
        '''
        Parse a single post page, then follow its reactions link (if any)
        to complete the item with the per-reaction counts.
        '''
        new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
        new.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
        # new.add_xpath('date','//div/div/abbr/text()')
        new.add_xpath('text', '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
        new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")

        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        # FIX: posts with zero reactions used to raise IndexError on [0];
        # yield what we have instead of crashing the request
        if not reactions:
            yield new.load_item()
        else:
            reactions = response.urljoin(reactions[0].extract())
            yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item': new})

    def parse_reactions(self, response):
        '''
        Parse the reactions page and load the per-type counters into the item.
        '''
        new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
        new.context['lang'] = self.lang
        new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()")
        new.add_xpath('ahah', "//a[contains(@href,'reaction_type=4')]/span/text()")
        new.add_xpath('love', "//a[contains(@href,'reaction_type=2')]/span/text()")
        new.add_xpath('wow', "//a[contains(@href,'reaction_type=3')]/span/text()")
        new.add_xpath('sigh', "//a[contains(@href,'reaction_type=7')]/span/text()")
        new.add_xpath('grrr', "//a[contains(@href,'reaction_type=8')]/span/text()")
        yield new.load_item()