diff --git a/fbcrawl/items.py b/fbcrawl/items.py
index 4f48bae..ecafbd9 100644
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@@ -413,35 +413,38 @@ def url_strip(url):
     #catching '&id=' is enough to identify the post
     i = fullurl.find('&id=')
     if i != -1:
-        j = fullurl[:i+4] + fullurl[i+4:].split('&')[0]
-        return j
-    else:
-        return fullurl
+        return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
+    else: #catch photos
+        i = fullurl.find('/photos/')
+        if i != -1:
+            return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
+        else: #catch albums
+            i = fullurl.find('/albums/')
+            if i != -1:
+                return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
+            else:
+                return fullurl
+
 class FbcrawlItem(scrapy.Item):
-    source = scrapy.Field(
-        output_processor=TakeFirst()
-    ) # page that published the post
-
+    source = scrapy.Field(
+       output_processor=TakeFirst()
+    )
     date = scrapy.Field( # when was the post published
-        input_processor=TakeFirst(),
-        output_processor=parse_date
+       input_processor=TakeFirst(),
+       output_processor=parse_date
     )
-
     text = scrapy.Field(
-        output_processor=Join(separator=u'')
+       output_processor=Join(separator=u'')
     ) # full text of the post
-
     comments = scrapy.Field(
-        output_processor=comments_strip
+       output_processor=comments_strip
     )
-
     reactions = scrapy.Field(
-        output_processor=reactions_strip
+       output_processor=reactions_strip
     ) # num of reactions
-
     likes = scrapy.Field(
-        output_processor=reactions_strip
+       output_processor=reactions_strip
     )
     ahah = scrapy.Field()
     love = scrapy.Field()
@@ -451,4 +454,5 @@ class FbcrawlItem(scrapy.Item):
     share = scrapy.Field() # num of shares
     url = scrapy.Field(
         output_processor=url_strip
-    )
+    )
+    shared_from = scrapy.Field()
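The rewritten url_strip above replaces the temporary variable with early returns and adds the /photos/ and /albums/ branches. As a quick sanity check for review, here is the same logic as a standalone function, run on hypothetical mbasic-style links (it assumes fullurl is the already-normalized URL that items.py builds earlier in the function):

```python
# Standalone sketch of the new url_strip branches (a review aid, not part of
# the commit). The example links below are hypothetical mbasic-style URLs.
def url_strip(fullurl):
    i = fullurl.find('&id=')
    if i != -1:  # plain post: keep '&id=' plus the id value, drop the rest
        return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
    i = fullurl.find('/photos/')
    if i != -1:  # photo permalink: cut the '/?type=...' query tail
        return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
    i = fullurl.find('/albums/')
    if i != -1:  # album permalink: same treatment as photos
        return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
    return fullurl

print(url_strip('https://mbasic.facebook.com/story.php?story_fbid=1&id=42&refid=17'))
# -> https://mbasic.facebook.com/story.php?story_fbid=1&id=42
print(url_strip('https://mbasic.facebook.com/page/photos/a.1/2/?type=3&source=48'))
# -> https://mbasic.facebook.com/page/photos/a.1/2
```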
diff --git a/fbcrawl/settings.py b/fbcrawl/settings.py
index ee82e25..0d6f667 100644
--- a/fbcrawl/settings.py
+++ b/fbcrawl/settings.py
@@ -14,7 +14,6 @@ BOT_NAME = 'fbcrawl'
 
 SPIDER_MODULES = ['fbcrawl.spiders']
 NEWSPIDER_MODULE = 'fbcrawl.spiders'
-
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
 
@@ -22,7 +21,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTM
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+CONCURRENT_REQUESTS = 1
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
@@ -88,7 +87,7 @@ ROBOTSTXT_OBEY = False
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
+#FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the columns to export as CSV
 FEED_EXPORT_ENCODING = 'utf-8'
 DUPEFILTER_DEBUG = True
 LOG_LEVEL = 'INFO'
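FEED_EXPORT_FIELDS is commented out here because the export-field order now lives in the fb spider's custom_settings (see fbcrawl.py below). A minimal sketch of why that works: in Scrapy, spider-level custom_settings take precedence over project-level settings.py values (the spider below is hypothetical, for illustration only):

```python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'  # hypothetical spider, not part of this commit
    # takes precedence over any FEED_EXPORT_FIELDS set in settings.py,
    # because spider settings have higher priority than project settings
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['source', 'shared_from', 'date', 'text'],
    }

    def parse(self, response):
        pass
```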
diff --git a/fbcrawl/spiders/comments.py b/fbcrawl/spiders/comments.py
index 1ec1239..c9da10b 100644
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
@@ -4,7 +4,6 @@
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
 from fbcrawl.items import FbcrawlItem
-
 class FacebookSpider(scrapy.Spider):
     """
     Parse FB comments, given a page (needs credentials)
@@ -78,22 +77,27 @@ class FacebookSpider(scrapy.Spider):
         )
 
     def parse_page(self, response):
-        for post in response.xpath('//div[count(@class)=1 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
-            new = ItemLoader(item=FbcrawlItem(),selector=post)
-            new.add_xpath('source', "./div/h3/a/text()")
-            new.add_xpath('text',"//div/div/span[not(contains(text(),' · '))]/text() | ./div/div/text()")
-            yield new.load_item()
-
-        rispostina = response.xpath('//div/a[contains(text(),"rispost")]/@href')
-
-        for i in range(len(rispostina)):
-            risp = response.urljoin(rispostina[i].extract())
+        #select the replies on the page; "Altro" is the Italian-interface
+        #label of the "More" link that expands a reply thread
+        for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
+#            resp = ItemLoader(item=FbcrawlItem(),selector=risposta)
+            rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href')
+            risp = response.urljoin(rispostina[0].extract())
             yield scrapy.Request(risp, callback=self.parse_rispostina)
-
-        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
-        if len(next_page) > 0:
-            next_page = response.urljoin(next_page[0].extract())
-            yield scrapy.Request(next_page, callback=self.parse_page)
+
+#        for i in range(len(rispostina)):
+#            risp = response.urljoin(rispostina[i].extract())
+#
+#        for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
+#            new = ItemLoader(item=FbcrawlItem(),selector=post)
+#            new.add_xpath('source', "./div/h3/a/text()")
+#            new.add_xpath('text',"./div[1]/div[1]/text()")
+#            yield new.load_item()
+#
+#        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
+#        if len(next_page) > 0:
+#            next_page = response.urljoin(next_page[0].extract())
+#            yield scrapy.Request(next_page, callback=self.parse_page)
 
     def parse_rispostina(self,response):
         for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all replies
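parse_page now expands reply threads by following the link whose text is "Altro", the label of the "More" link on the Italian interface, so this spider currently works only with lang="it". A possible generalization, sketched under the assumption of a label table (only the 'it' entry is confirmed by the code above; the other labels are unverified placeholders):

```python
# Hypothetical sketch: parametrize the reply-expansion link text instead of
# hard-coding the Italian "Altro". Only the 'it' label comes from the code
# above; the other entries are unverified placeholders.
MORE_LABELS = {'it': 'Altro', 'en': 'More', 'es': 'Ver más',
               'fr': 'Plus', 'pt': 'Mais'}

def reply_links(selector, lang='it'):
    # parsel/Scrapy selectors accept XPath variables as keyword arguments
    return selector.xpath('./a[@href and text()=$label]/@href',
                          label=MORE_LABELS[lang])
```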
diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py
index c2ba592..10e256f 100644
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@@ -1,30 +1,39 @@
 import scrapy
+import logging
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
 from fbcrawl.items import FbcrawlItem
-from scrapy.exceptions import CloseSpider
-
 class FacebookSpider(scrapy.Spider):
     """
     Parse FB pages (needs credentials)
     """
     name = "fb"
+    custom_settings = {
+        'FEED_EXPORT_FIELDS': ['source','shared_from','date','text',
+                               'reactions','likes','ahah','love','wow',
+                               'sigh','grrr','comments','url']
+    }
 
-    def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
-        super(FacebookSpider, self).__init__(**kwargs)
+    def __init__(self,email='',password='',page='',year=2018,lang='_',*args,**kwargs):
+        #silence the noisy middleware logger; set LOG_LEVEL=DEBUG in settings.py for more logs
+        logger = logging.getLogger('scrapy.middleware')
+        logger.setLevel(logging.WARNING)
+        super().__init__(*args,**kwargs)
 
         #email & pass need to be passed as attributes!
         if not email or not password:
-            raise ValueError("You need to provide valid email and password!")
+            raise AttributeError('You need to provide a valid email and password:\n'
+                                 'scrapy crawl fb -a email="EMAIL" -a password="PASSWORD"')
         else:
             self.email = email
             self.password = password
 
         #page name parsing (added support for full urls)
         if not page:
-            raise ValueError("You need to provide a valid page name to crawl!")
+            raise AttributeError('You need to provide a valid page name to crawl:\n'
+                                 'scrapy crawl fb -a page="PAGENAME"')
         elif page.find('https://www.facebook.com/') != -1:
             self.page = page[25:]
         elif page.find('https://mbasic.facebook.com/') != -1:
@@ -35,22 +44,27 @@ class FacebookSpider(scrapy.Spider):
             self.page = page
 
         #parse year
-        assert int(year) <= 2019 and int(year) >= 2015, 'Year must be a number 2015 <= year <= 2019'
+        assert int(year) <= 2019 and int(year) >= 2006, 'Year must be a number 2006 <= year <= 2019'
         self.year = int(year) #arguments are passed as strings
-
+
+        #parse lang; if not provided (but supported) it will be guessed in parse_home
         if lang=='_':
-            self.logger.info('Language attribute not provided, I will try to guess it')
-            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.logger.info('Language attribute not provided, I will try to guess it from the fb interface')
+            self.logger.info('To specify it, add the lang parameter: scrapy crawl fb -a lang="LANGUAGE"')
+            self.logger.info('Current choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"')
             self.lang=lang
-        elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
-            self.lang = lang
+        elif lang.lower() in ('en', 'es', 'fr', 'it', 'pt'):
+            self.lang = lang.lower()
         else:
             self.logger.info('Lang "{}" not currently supported'.format(lang))
             self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
             self.logger.info('Change your Facebook interface language and try again')
-            raise CloseSpider('Language provided not currently supported')
+            raise AttributeError('Language provided not currently supported')
+
+        #current year, the starting flag for the parse_page recursion
+        self.k = 2019
+        #post counter, used to assign a decreasing priority to requests
+        self.count = 0
+
+        self.start_urls = ['https://mbasic.facebook.com']
 
     def parse(self, response):
@@ -73,29 +87,39 @@ class FacebookSpider(scrapy.Spider):
         '''
         #handle 'save-device' redirection
         if response.xpath("//div/a[contains(@href,'save-device')]"):
+            self.logger.info('Got stuck in the "save-device" checkpoint')
+            self.logger.info('I will now try to redirect to the correct page')
             return FormRequest.from_response(
                 response,
                 formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home)
+                callback=self.parse_home
+            )
 
         #set language interface
         if self.lang == '_':
             if response.xpath("//input[@placeholder='Search Facebook']"):
+                self.logger.info('Language recognized: lang="en"')
                 self.lang = 'en'
-            elif response.xpath("//input[@value='Buscar']"):
+            elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
+                self.logger.info('Language recognized: lang="es"')
                 self.lang = 'es'
-            elif response.xpath("//input[@value='Rechercher']"):
+            elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"):
+                self.logger.info('Language recognized: lang="fr"')
                 self.lang = 'fr'
-            elif response.xpath("//input[@value='Cerca']"):
+            elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
+                self.logger.info('Language recognized: lang="it"')
                 self.lang = 'it'
-            elif response.xpath("//input[@value='Pesquisar']"):
+            elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
+                self.logger.info('Language recognized: lang="pt"')
                 self.lang = 'pt'
             else:
-                raise CloseSpider('Language not recognized')
-
+                raise AttributeError('Language not recognized\n'
+                                     'Change your Facebook interface language '
+                                     'and try again')
+
+        #navigate to the provided page
         href = response.urljoin(self.page)
-        self.logger.info('Parsing facebook page %s', href)
+        self.logger.info('Scraping Facebook page {}'.format(href))
         return scrapy.Request(url=href,callback=self.parse_page)
 
     def parse_page(self, response):
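parse_home now recognizes the interface language from the search box's placeholder instead of the submit-button label, which is less ambiguous because the placeholder is a full, language-specific phrase. For review, the same detection restated as a lookup table (placeholder strings copied from the hunk above; a sketch, not part of the commit):

```python
# Sketch: placeholder-based language detection as a lookup table.
PLACEHOLDERS = {
    'en': 'Search Facebook',
    'es': 'Buscar en Facebook',
    'fr': 'Rechercher sur Facebook',
    'it': 'Cerca su Facebook',
    'pt': 'Pesquisa no Facebook',
}

def guess_lang(response):
    # returns the first language whose search-box placeholder is present
    for lang, placeholder in PLACEHOLDERS.items():
        if response.xpath('//input[@placeholder=$p]', p=placeholder):
            return lang
    raise AttributeError('Language not recognized')
```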
@@ -106,6 +130,7 @@ class FacebookSpider(scrapy.Spider):
         #select all posts
         for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
             new = ItemLoader(item=FbcrawlItem(),selector=post)
+            self.logger.info('Parsing post n = {}'.format(abs(self.count)+1))
             new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
             new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
             new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
@@ -113,54 +138,53 @@ class FacebookSpider(scrapy.Spider):
             #page_url # new.add_value('url',response.url)
             #returns full post-link in a list
             post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
-            temp_post = response.urljoin(post[0])
-            yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
+            temp_post = response.urljoin(post[0])
+            self.count -= 1 #decreasing priority preserves the on-page order of posts
+            yield scrapy.Request(temp_post, self.parse_post, priority=self.count, meta={'item':new})
 
         #load following page
-        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
-        if len(next_page) == 0:
-            if response.meta['flag'] == 4 and self.year <= 2015:
-                self.logger.info('2014 reached, flag = 5')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':5})
-            elif response.meta['flag'] == 3 and self.year <= 2015:
-                self.logger.info('2015 reached, flag = 4')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':4})
-            elif response.meta['flag'] == 2 and self.year <= 2016:
-                self.logger.info('2016 reached, flag = 3')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2016')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':3})
-            elif response.meta['flag'] == 1 and self.year <= 2017:
-                self.logger.info('2017 reached, flag = 2')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2017')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':2})
-            elif response.meta['flag'] == 0 and self.year <= 2018:
-                self.logger.info('2018 reached, flag = 1')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2018')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':1})
+        #try to click on "more"; if there is no such link, look for the
+        #year link matching the current flag and click that one instead
+        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
+        if not new_page:
+            if response.meta['flag'] == self.k and self.year <= self.k:
+                self.logger.info('There are no more posts, clicking on year = {}'.format(self.k))
+                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
+                new_page = response.xpath(xpath).extract()
+                if new_page:
+                    new_page = response.urljoin(new_page[0])
+                    self.k -= 1
+                    self.logger.info('Everything OK, new flag: {}'.format(self.k))
+                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
+                else:
+                    #sometimes a year is skipped on the page; keep counting
+                    #down, but never past the target year, to avoid looping forever
+                    while not new_page and self.k > self.year:
+                        self.logger.info('XPath not found for year {}'.format(self.k-1))
+                        self.k -= 1
+                        self.logger.info('Trying with previous year, flag={}'.format(self.k))
+                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
+                        new_page = response.xpath(xpath).extract()
+                    if new_page:
+                        self.logger.info('New page found with flag {}'.format(self.k))
+                        new_page = response.urljoin(new_page[0])
+                        self.k -= 1
+                        self.logger.info('Now going with flag {}'.format(self.k))
+                        yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
         else:
-            new_page = response.urljoin(next_page[0])
+            new_page = response.urljoin(new_page[0])
             if 'flag' in response.meta:
+                self.logger.info('Page scraped, clicking on "more"! flag = {}'.format(response.meta['flag']))
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
             else:
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':0})
+                self.logger.info('First page scraped, clicking on "more"! Flag not set, using default flag = {}'.format(self.k))
+                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
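The five hard-coded year branches collapse into a single countdown on self.k: when the "more" link is missing, the spider clicks the year link matching the current flag and decrements it, skipping years that are absent from the page. The control flow, isolated as a pure function for review (available_years is a stand-in for the year links actually present on a page):

```python
# Standalone sketch of the year-countdown pagination (not Scrapy code).
def next_year_link(k, target_year, available_years):
    while k >= target_year:
        if k in available_years:
            return k   # click this year's link; the caller resumes with k-1
        k -= 1         # this year is skipped on the page, try the previous one
    return None        # went past the target year: stop paginating

print(next_year_link(2019, 2016, {2018, 2016}))  # -> 2018 (2019 is skipped)
print(next_year_link(2015, 2016, {2018, 2016}))  # -> None (past the target)
```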
 
     def parse_post(self,response):
         new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
         new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
-        new.add_xpath('date', '//div/div/abbr/text()')
+        #original page of the post, when the post is a share
+        new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
+        new.add_xpath('date','//div/div/abbr/text()')
         new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
+        new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
 
         reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
         reactions = response.urljoin(reactions[0].extract())
@@ -175,4 +199,4 @@ class FacebookSpider(scrapy.Spider):
         new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
         new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
         new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
-        yield new.load_item()
+        yield new.load_item()
\ No newline at end of file
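parse_post resolves the per-reaction counts from the reaction/profile page via one XPath per reaction_type code. For reference, the codes visible in this hunk gathered in one place (the codes for likes, ahah, and love are set elsewhere and do not appear in this diff, so they are deliberately omitted):

```python
# Sketch: reaction_type codes taken from the XPaths in the hunk above.
SHOWN_REACTION_CODES = {'wow': 3, 'sigh': 7, 'grrr': 8}

def add_reaction_fields(loader):
    # loader is the scrapy.loader.ItemLoader built in parse_post
    for field, code in SHOWN_REACTION_CODES.items():
        loader.add_xpath(
            field,
            "//a[contains(@href,'reaction_type={}')]/span/text()".format(code))
```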