fixed recursion on pages

rugantio 2019-02-04 19:26:00 +01:00
parent 918cd9ce64
commit dafd01c8bd
11 changed files with 127 additions and 95 deletions

.~lock.Trump.csv# (new file)

@@ -0,0 +1 @@
+,rugantio,alice,04.02.2019 17:42,file:///home/rugantio/.config/libreoffice/4;

(A LibreOffice lock file for Trump.csv committed by accident; it records the user and machine that held the lock.)

fbcrawl/items.py
@@ -413,35 +413,38 @@ def url_strip(url):
     #catchin '&id=' is enough to identify the post
     i = fullurl.find('&id=')
     if i != -1:
-        j = fullurl[:i+4] + fullurl[i+4:].split('&')[0]
-        return j
-    else:
-        return fullurl
+        return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
+    else: #catch photos
+        i = fullurl.find('/photos/')
+        if i != -1:
+            return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
+        else: #catch albums
+            i = fullurl.find('/albums/')
+            if i != -1:
+                return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
+            else:
+                return fullurl
 
 class FbcrawlItem(scrapy.Item):
     source = scrapy.Field(
         output_processor=TakeFirst()
-    ) # page that published the post
+    )
     date = scrapy.Field( # when was the post published
         input_processor=TakeFirst(),
         output_processor=parse_date
     )
     text = scrapy.Field(
         output_processor=Join(separator=u'')
     ) # full text of the post
     comments = scrapy.Field(
         output_processor=comments_strip
     )
     reactions = scrapy.Field(
         output_processor=reactions_strip
     ) # num of reactions
     likes = scrapy.Field(
         output_processor=reactions_strip
     )
     ahah = scrapy.Field()
     love = scrapy.Field()
@@ -451,4 +454,5 @@ class FbcrawlItem(scrapy.Item):
     share = scrapy.Field() # num of shares
     url = scrapy.Field(
         output_processor=url_strip
     )
+    shared_from = scrapy.Field()
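
The effect of the new '/photos/' and '/albums/' branches is easiest to see on concrete links. A minimal sketch of the post-commit logic, rewritten here with a plain-string parameter for illustration (the real url_strip runs as the output processor of the url field, so its input arrives via the item loader):

    def url_strip(fullurl):
        # sketch only; mirrors the committed branches above
        i = fullurl.find('&id=')
        if i != -1:                       # story links: keep up to &id=<page>
            return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
        i = fullurl.find('/photos/')      # photo links: drop the '/?...' tail
        if i != -1:
            return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
        i = fullurl.find('/albums/')      # album links: same treatment
        if i != -1:
            return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
        return fullurl

    # hypothetical inputs, for illustration only:
    url_strip('https://mbasic.facebook.com/story.php?story_fbid=10&id=20&refid=17')
    # -> 'https://mbasic.facebook.com/story.php?story_fbid=10&id=20'
    url_strip('https://mbasic.facebook.com/page/photos/a.123/456/?type=3&source=48')
    # -> 'https://mbasic.facebook.com/page/photos/a.123/456'

In all three branches the slicing keeps the marker itself ('&id=', '/photos/', '/albums/') and cuts at the first query separator that follows, so tracking parameters like refid or type are dropped while the post identifier survives.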

fbcrawl/settings.py
@@ -14,7 +14,6 @@ BOT_NAME = 'fbcrawl'
 SPIDER_MODULES = ['fbcrawl.spiders']
 NEWSPIDER_MODULE = 'fbcrawl.spiders'
-
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
@@ -22,7 +21,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTM
 ROBOTSTXT_OBEY = False
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+CONCURRENT_REQUESTS = 1
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
@@ -88,7 +87,7 @@ ROBOTSTXT_OBEY = False
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
+#FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
 FEED_EXPORT_ENCODING = 'utf-8'
 DUPEFILTER_DEBUG = True
 LOG_LEVEL = 'INFO'
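
Commenting FEED_EXPORT_FIELDS out here hands control of the CSV column order to the spider, which now carries its own custom_settings (see fbcrawl.py below). A short reminder sketch of Scrapy's settings precedence, highest priority first:

    # 1. command-line overrides:        scrapy crawl fb -s LOG_LEVEL=DEBUG
    # 2. the spider's custom_settings:  {'FEED_EXPORT_FIELDS': [...]}
    # 3. the project settings.py:       the module edited above
    # 4. Scrapy's built-in defaults:    e.g. CONCURRENT_REQUESTS = 16

Setting CONCURRENT_REQUESTS = 1 also serializes downloads, which makes the per-request priorities introduced in the spider actually determine fetch order.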

fbcrawl/spiders/comments.py
@@ -4,7 +4,6 @@ from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
 from fbcrawl.items import FbcrawlItem
-
 class FacebookSpider(scrapy.Spider):
     """
     Parse FB comments, given a page (needs credentials)
@@ -78,22 +77,27 @@ class FacebookSpider(scrapy.Spider):
     )
 
     def parse_page(self, response):
-        for post in response.xpath('//div[count(@class)=1 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
-            new = ItemLoader(item=FbcrawlItem(),selector=post)
-            new.add_xpath('source', "./div/h3/a/text()")
-            new.add_xpath('text',"//div/div/span[not(contains(text(),' · '))]/text() | ./div/div/text()")
-            yield new.load_item()
-
-        rispostina = response.xpath('//div/a[contains(text(),"rispost")]/@href')
-        for i in range(len(rispostina)):
-            risp = response.urljoin(rispostina[i].extract())
+        #answer from page
+        for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
+            # resp = ItemLoader(item=FbcrawlItem(),selector=risposta)
+            rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href')
+            risp = response.urljoin(rispostina[0].extract())
             yield scrapy.Request(risp, callback=self.parse_rispostina)
-
-        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
-        if len(next_page) > 0:
-            next_page = response.urljoin(next_page[0].extract())
-            yield scrapy.Request(next_page, callback=self.parse_page)
+
+#        for i in range(len(rispostina)):
+#            risp = response.urljoin(rispostina[i].extract())
+#
+#        for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
+#            new = ItemLoader(item=FbcrawlItem(),selector=post)
+#            new.add_xpath('source', "./div/h3/a/text()")
+#            new.add_xpath('text',"./div[1]/div[1]/text()")
+#            yield new.load_item()
+#
+#        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
+#        if len(next_page) > 0:
+#            next_page = response.urljoin(next_page[0].extract())
+#            yield scrapy.Request(next_page, callback=self.parse_page)
 
     def parse_rispostina(self,response):
         for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all posts
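
The reworked parse_page no longer scrapes posts directly; it walks each reply node ('risposta' is Italian for 'reply', 'rispostina' roughly 'little reply') and follows its "Altro" ("More") link to the full thread. The live part of the new loop, restated in isolation with one defensive change:

    # follow every "Altro" ("More") link under each reply node; sketch only,
    # same selectors as the diff above, but iterating instead of indexing
    for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 '
                                   'and contains("0123456789", substring(@id,1,1))]'):
        for href in risposta.xpath('./a[@href and text()="Altro"]/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_rispostina)

Iterating over the matches instead of taking rispostina[0] sidesteps the IndexError the committed version raises on a reply that has no "Altro" link.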

fbcrawl/spiders/fbcrawl.py
@@ -1,30 +1,39 @@
 import scrapy
+import logging
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
 from fbcrawl.items import FbcrawlItem
-from scrapy.exceptions import CloseSpider
 
 class FacebookSpider(scrapy.Spider):
     """
     Parse FB pages (needs credentials)
     """
     name = "fb"
+    custom_settings = {
+        'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
+                               'reactions','likes','ahah','love','wow', \
+                               'sigh','grrr','comments','url']
+    }
 
-    def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
-        super(FacebookSpider, self).__init__(**kwargs)
+    def __init__(self,email='',password='',page='',year=2018,lang='_',*args,**kwargs):
+        #turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
+        logger = logging.getLogger('scrapy.middleware')
+        logger.setLevel(logging.WARNING)
+
+        super().__init__(**kwargs)
+
         #email & pass need to be passed as attributes!
         if not email or not password:
-            raise ValueError("You need to provide valid email and password!")
+            raise AttributeError('You need to provide valid email and password:\n'
+                                 'scrapy fb -a email="EMAIL" -a password="PASSWORD"')
         else:
             self.email = email
             self.password = password
+
         #page name parsing (added support for full urls)
         if not page:
-            raise ValueError("You need to provide a valid page name to crawl!")
+            raise AttributeError('You need to provide a valid page name to crawl!'
+                                 'scrapy fb -a page="PAGENAME"')
         elif page.find('https://www.facebook.com/') != -1:
             self.page = page[25:]
         elif page.find('https://mbasic.facebook.com/') != -1:
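
With this custom_settings block, any feed export from the fb spider orders its columns as listed, regardless of settings.py; assuming a CSV feed, the header row becomes:

    # source,shared_from,date,text,reactions,likes,ahah,love,wow,sigh,grrr,comments,url
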
@@ -35,22 +44,27 @@ class FacebookSpider(scrapy.Spider):
             self.page = page
 
         #parse year
-        assert int(year) <= 2019 and int(year) >= 2015, 'Year must be a number 2015 <= year <= 2019'
+        assert int(year) <= 2019 and int(year) >= 2006, 'Year must be a number 2006 <= year <= 2019'
         self.year = int(year) #arguments are passed as strings
 
         #parse lang, if not provided (but is supported) it will be guessed in parse_home
         if lang=='_':
-            self.logger.info('Language attribute not provided, I will try to guess it')
-            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.logger.info('Language attribute not provided, I will try to guess it from the fb interface')
+            self.logger.info('To specify, add the lang parameter: scrapy fb -a lang="LANGUAGE"')
+            self.logger.info('Currently choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"')
             self.lang=lang
         elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
-            self.lang = lang
+            self.lang = lang.lower()
         else:
             self.logger.info('Lang "{}" not currently supported'.format(lang))
             self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
             self.logger.info('Change your interface lang from facebook and try again')
-            raise CloseSpider('Language provided not currently supported')
+            raise AttributeError('Language provided not currently supported')
 
+        #current year, this variable is needed for parse_page recursion
+        self.k = 2019
+        self.count = 0
+
         self.start_urls = ['https://mbasic.facebook.com']
 
     def parse(self, response):
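
Two new instance variables drive the fixed recursion: self.k starts at the current year and is decremented every time the spider falls back to a "posts from YEAR" link, while self.count is decremented once per scheduled post so that priority=self.count hands Scrapy ever-lower priorities. Since Scrapy's scheduler pops higher-priority requests first, posts come back roughly in page order. A toy illustration of that ordering, not spider code:

    # requests scheduled later carry lower priority, so they are popped later
    pending = [(-2, 'post_3'), (0, 'post_1'), (-1, 'post_2')]
    for priority, post in sorted(pending, reverse=True):
        print(post)   # post_1, post_2, post_3
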
@@ -73,29 +87,39 @@ class FacebookSpider(scrapy.Spider):
         '''
         #handle 'save-device' redirection
         if response.xpath("//div/a[contains(@href,'save-device')]"):
+            self.logger.info('Got stuck in "save-device" checkpoint')
+            self.logger.info('I will now try to redirect to the correct page')
             return FormRequest.from_response(
                 response,
                 formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home)
+                callback=self.parse_home
+            )
 
         #set language interface
         if self.lang == '_':
             if response.xpath("//input[@placeholder='Search Facebook']"):
+                self.logger.info('Language recognized: lang="en"')
                 self.lang = 'en'
-            elif response.xpath("//input[@value='Buscar']"):
+            elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
+                self.logger.info('Language recognized: lang="es"')
                 self.lang = 'es'
-            elif response.xpath("//input[@value='Rechercher']"):
+            elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"):
+                self.logger.info('Language recognized: lang="fr"')
                 self.lang = 'fr'
-            elif response.xpath("//input[@value='Cerca']"):
+            elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
+                self.logger.info('Language recognized: lang="it"')
                 self.lang = 'it'
-            elif response.xpath("//input[@value='Pesquisar']"):
+            elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
+                self.logger.info('Language recognized: lang="pt"')
                 self.lang = 'pt'
             else:
-                raise CloseSpider('Language not recognized')
+                raise AttributeError('Language not recognized\n'
+                                     'Change your interface lang from facebook '
+                                     'and try again')
 
         #navigate to provided page
         href = response.urljoin(self.page)
-        self.logger.info('Parsing facebook page %s', href)
+        self.logger.info('Scraping facebook page {}'.format(href))
         return scrapy.Request(url=href,callback=self.parse_page)
 
     def parse_page(self, response):
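
The five placeholder checks all follow one pattern, so the detection could equally be table-driven; a compact sketch with the same placeholders (a hypothetical refactor, not what the commit ships):

    placeholders = {'Search Facebook': 'en', 'Buscar en Facebook': 'es',
                    'Rechercher sur Facebook': 'fr', 'Cerca su Facebook': 'it',
                    'Pesquisa no Facebook': 'pt'}
    for text, lang in placeholders.items():
        if response.xpath("//input[@placeholder='{}']".format(text)):
            self.lang = lang
            self.logger.info('Language recognized: lang="{}"'.format(lang))
            break
    else:
        raise AttributeError('Language not recognized\n'
                             'Change your interface lang from facebook and try again')
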
@@ -106,6 +130,7 @@ class FacebookSpider(scrapy.Spider):
         #select all posts
         for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
             new = ItemLoader(item=FbcrawlItem(),selector=post)
+            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
             new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
             new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
             new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
@@ -113,54 +138,53 @@ class FacebookSpider(scrapy.Spider):
             #page_url #new.add_value('url',response.url)
 
             #returns full post-link in a list
             post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
             temp_post = response.urljoin(post[0])
-            yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
+            self.count -= 1
+            yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
 
         #load following page
-        next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
-        if len(next_page) == 0:
-            if response.meta['flag'] == 4 and self.year <= 2015:
-                self.logger.info('2014 reached, flag = 5')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':5})
-            elif response.meta['flag'] == 3 and self.year <= 2015:
-                self.logger.info('2015 reached, flag = 4')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':4})
-            elif response.meta['flag'] == 2 and self.year <= 2016:
-                self.logger.info('2016 reached, flag = 3')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2016')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':3})
-            elif response.meta['flag'] == 1 and self.year <= 2017:
-                self.logger.info('2017 reached, flag = 2')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2017')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':2})
-            elif response.meta['flag'] == 0 and self.year <= 2018:
-                self.logger.info('2018 reached, flag = 1')
-                next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2018')]/@href").extract()
-                self.logger.info('next_page = {}'.format(next_page[0]))
-                new_page = response.urljoin(next_page[0])
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':1})
+        #tries to click on "more", otherwise it looks for the appropriate
+        #year for 1-click only and proceeds to click on others
+        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
+        if not new_page:
+            if response.meta['flag'] == self.k and self.year <= self.k:
+                self.logger.info('There are no more, clicking on year = {}'.format(self.k))
+                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
+                new_page = response.xpath(xpath).extract()
+                if new_page:
+                    new_page = response.urljoin(new_page[0])
+                    self.k -= 1
+                    self.logger.info('Everything OK, new flag: {}'.format(self.k))
+                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
+                else:
+                    while not new_page: #sometimes the years are skipped
+                        self.logger.info('XPATH not found for year {}'.format(self.k-1))
+                        self.k -= 1
+                        self.logger.info('Trying with previous year, flag={}'.format(self.k))
+                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
+                        new_page = response.xpath(xpath).extract()
+                    self.logger.info('New page found with flag {}'.format(self.k))
+                    new_page = response.urljoin(new_page[0])
+                    self.k -= 1
+                    self.logger.info('Now going with flag {}'.format(self.k))
+                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
         else:
-            new_page = response.urljoin(next_page[0])
+            new_page = response.urljoin(new_page[0])
             if 'flag' in response.meta:
+                self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag']))
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
             else:
-                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':0})
+                self.logger.info('FLAG DOES NOT REPRESENT ACTUAL YEAR')
+                self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k))
+                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
 
     def parse_post(self,response):
         new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
         new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
-        new.add_xpath('date', '//div/div/abbr/text()')
+        new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
+        new.add_xpath('date','//div/div/abbr/text()')
         new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
-        new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
 
         reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
         reactions = response.urljoin(reactions[0].extract())
@@ -175,4 +199,4 @@ class FacebookSpider(scrapy.Spider):
         new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
         new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
         new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
         yield new.load_item()
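
The while loop that skips empty years is the core of the recursion fix, but as committed it never terminates when no earlier year link exists at all (new_page stays empty and self.k keeps falling). A bounded restatement of the same search, using the 2006 floor from the year assertion as an assumed stop condition (hypothetical helper, not part of the commit):

    def find_year_link(response, k, floor=2006):
        """Walk k downward until a 'posts from year k' link exists; sketch only."""
        while k >= floor:
            xpath = ("//div/a[contains(@href,'time') and contains(text(),'"
                     + str(k) + "')]/@href")
            links = response.xpath(xpath).extract()
            if links:
                return response.urljoin(links[0]), k - 1   # value for the next flag
            k -= 1                                         # year skipped, try earlier
        return None, k                                     # nothing left to crawl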