fixed recursion on pages

This commit is contained in:
rugantio 2019-02-04 19:26:00 +01:00
parent 918cd9ce64
commit dafd01c8bd
11 changed files with 127 additions and 95 deletions

1
.~lock.Trump.csv# Normal file
View File

@ -0,0 +1 @@
,rugantio,alice,04.02.2019 17:42,file:///home/rugantio/.config/libreoffice/4;

View File

@ -413,33 +413,36 @@ def url_strip(url):
#catching '&id=' is enough to identify the post
i = fullurl.find('&id=')
if i != -1:
j = fullurl[:i+4] + fullurl[i+4:].split('&')[0]
return j
return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
else: #catch photos
i = fullurl.find('/photos/')
if i != -1:
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
else: #catch albums
i = fullurl.find('/albums/')
if i != -1:
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
else:
return fullurl
class FbcrawlItem(scrapy.Item):
source = scrapy.Field(
output_processor=TakeFirst()
) # page that published the post
)
date = scrapy.Field( # when was the post published
input_processor=TakeFirst(),
output_processor=parse_date
)
text = scrapy.Field(
output_processor=Join(separator=u'')
) # full text of the post
comments = scrapy.Field(
output_processor=comments_strip
)
reactions = scrapy.Field(
output_processor=reactions_strip
) # num of reactions
likes = scrapy.Field(
output_processor=reactions_strip
)
@ -452,3 +455,4 @@ class FbcrawlItem(scrapy.Item):
url = scrapy.Field(
output_processor=url_strip
)
shared_from = scrapy.Field()

View File

@ -14,7 +14,6 @@ BOT_NAME = 'fbcrawl'
SPIDER_MODULES = ['fbcrawl.spiders']
NEWSPIDER_MODULE = 'fbcrawl.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
@ -22,7 +21,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTM
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS = 1
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
@ -88,7 +87,7 @@ ROBOTSTXT_OBEY = False
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the columns to export as CSV
#FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the columns to export as CSV
FEED_EXPORT_ENCODING = 'utf-8'
DUPEFILTER_DEBUG = True
LOG_LEVEL = 'INFO'

View File

@ -4,7 +4,6 @@ from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem
class FacebookSpider(scrapy.Spider):
"""
Parse FB comments, given a page (needs credentials)
@ -78,22 +77,27 @@ class FacebookSpider(scrapy.Spider):
)
def parse_page(self, response):
for post in response.xpath('//div[count(@class)=1 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
new = ItemLoader(item=FbcrawlItem(),selector=post)
new.add_xpath('source', "./div/h3/a/text()")
new.add_xpath('text',"//div/div/span[not(contains(text(),' · '))]/text() | ./div/div/text()")
yield new.load_item()
rispostina = response.xpath('//div/a[contains(text(),"rispost")]/@href')
for i in range(len(rispostina)):
risp = response.urljoin(rispostina[i].extract())
#answer from page
for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
# resp = ItemLoader(item=FbcrawlItem(),selector=risposta)
rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href')
risp = response.urljoin(rispostina[0].extract())
yield scrapy.Request(risp, callback=self.parse_rispostina)
next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
if len(next_page) > 0:
next_page = response.urljoin(next_page[0].extract())
yield scrapy.Request(next_page, callback=self.parse_page)
# for i in range(len(rispostina)):
# risp = response.urljoin(rispostina[i].extract())
#
# for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
# new = ItemLoader(item=FbcrawlItem(),selector=post)
# new.add_xpath('source', "./div/h3/a/text()")
# new.add_xpath('text',"./div[1]/div[1]/text()")
# yield new.load_item()
#
# next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
# if len(next_page) > 0:
# next_page = response.urljoin(next_page[0].extract())
# yield scrapy.Request(next_page, callback=self.parse_page)
def parse_rispostina(self,response):
for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all posts

View File

@ -1,30 +1,39 @@
import scrapy
import logging
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem
from scrapy.exceptions import CloseSpider
class FacebookSpider(scrapy.Spider):
"""
Parse FB pages (needs credentials)
"""
name = "fb"
custom_settings = {
'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
'reactions','likes','ahah','love','wow', \
'sigh','grrr','comments','url']
}
def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
super(FacebookSpider, self).__init__(**kwargs)
def __init__(self,email='',password='',page='',year=2018,lang='_',*args,**kwargs):
#turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
logger = logging.getLogger('scrapy.middleware')
logger.setLevel(logging.WARNING)
super().__init__(**kwargs)
#email & pass need to be passed as attributes!
if not email or not password:
raise ValueError("You need to provide valid email and password!")
raise AttributeError('You need to provide valid email and password:\n'
'scrapy fb -a email="EMAIL" -a password="PASSWORD"')
else:
self.email = email
self.password = password
#page name parsing (added support for full urls)
if not page:
raise ValueError("You need to provide a valid page name to crawl!")
raise AttributeError('You need to provide a valid page name to crawl!'
'scrapy fb -a page="PAGENAME"')
elif page.find('https://www.facebook.com/') != -1:
self.page = page[25:]
elif page.find('https://mbasic.facebook.com/') != -1:
@ -35,21 +44,26 @@ class FacebookSpider(scrapy.Spider):
self.page = page
#parse year
assert int(year) <= 2019 and int(year) >= 2015, 'Year must be a number 2015 <= year <= 2019'
assert int(year) <= 2019 and int(year) >= 2006, 'Year must be a number 2006 <= year <= 2019'
self.year = int(year) #arguments are passed as strings
#parse lang, if not provided (but is supported) it will be guessed in parse_home
if lang=='_':
self.logger.info('Language attribute not provided, I will try to guess it')
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
self.logger.info('Language attribute not provided, I will try to guess it from the fb interface')
self.logger.info('To specify, add the lang parameter: scrapy fb -a lang="LANGUAGE"')
self.logger.info('Currently choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"')
self.lang=lang
elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
self.lang = lang
self.lang = lang.lower()
else:
self.logger.info('Lang "{}" not currently supported'.format(lang))
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
self.logger.info('Change your interface lang from facebook and try again')
raise CloseSpider('Language provided not currently supported')
raise AttributeError('Language provided not currently supported')
#current year, this variable is needed for parse_page recursion
self.k = 2019
self.count = 0
self.start_urls = ['https://mbasic.facebook.com']
@ -73,29 +87,39 @@ class FacebookSpider(scrapy.Spider):
'''
#handle 'save-device' redirection
if response.xpath("//div/a[contains(@href,'save-device')]"):
self.logger.info('Got stuck in "save-device" checkpoint')
self.logger.info('I will now try to redirect to the correct page')
return FormRequest.from_response(
response,
formdata={'name_action_selected': 'dont_save'},
callback=self.parse_home)
callback=self.parse_home
)
#set language interface
if self.lang == '_':
if response.xpath("//input[@placeholder='Search Facebook']"):
self.logger.info('Language recognized: lang="en"')
self.lang = 'en'
elif response.xpath("//input[@value='Buscar']"):
elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
self.logger.info('Language recognized: lang="es"')
self.lang = 'es'
elif response.xpath("//input[@value='Rechercher']"):
elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"):
self.logger.info('Language recognized: lang="fr"')
self.lang = 'fr'
elif response.xpath("//input[@value='Cerca']"):
elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
self.logger.info('Language recognized: lang="it"')
self.lang = 'it'
elif response.xpath("//input[@value='Pesquisar']"):
elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
self.logger.info('Language recognized: lang="pt"')
self.lang = 'pt'
else:
raise CloseSpider('Language not recognized')
raise AttributeError('Language not recognized\n'
'Change your interface lang from facebook '
'and try again')
#navigate to provided page
href = response.urljoin(self.page)
self.logger.info('Parsing facebook page %s', href)
self.logger.info('Scraping facebook page {}'.format(href))
return scrapy.Request(url=href,callback=self.parse_page)
def parse_page(self, response):
@ -106,6 +130,7 @@ class FacebookSpider(scrapy.Spider):
#select all posts
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
new = ItemLoader(item=FbcrawlItem(),selector=post)
self.logger.info('Parsing post n = {}'.format(abs(self.count)))
new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
@ -114,53 +139,52 @@ class FacebookSpider(scrapy.Spider):
#returns full post-link in a list
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
temp_post = response.urljoin(post[0])
yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
self.count -= 1
yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
#load following page
next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
if len(next_page) == 0:
if response.meta['flag'] == 4 and self.year <= 2015:
self.logger.info('2014 reached, flag = 5')
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
self.logger.info('next_page = {}'.format(next_page[0]))
new_page = response.urljoin(next_page[0])
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':5})
elif response.meta['flag'] == 3 and self.year <= 2015:
self.logger.info('2015 reached, flag = 4')
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
self.logger.info('next_page = {}'.format(next_page[0]))
new_page = response.urljoin(next_page[0])
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':4})
elif response.meta['flag'] == 2 and self.year <= 2016:
self.logger.info('2016 reached, flag = 3')
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2016')]/@href").extract()
self.logger.info('next_page = {}'.format(next_page[0]))
new_page = response.urljoin(next_page[0])
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':3})
elif response.meta['flag'] == 1 and self.year <= 2017:
self.logger.info('2017 reached, flag = 2')
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2017')]/@href").extract()
self.logger.info('next_page = {}'.format(next_page[0]))
new_page = response.urljoin(next_page[0])
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':2})
elif response.meta['flag'] == 0 and self.year <= 2018:
self.logger.info('2018 reached, flag = 1')
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2018')]/@href").extract()
self.logger.info('next_page = {}'.format(next_page[0]))
new_page = response.urljoin(next_page[0])
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':1})
#try to follow the "more" link first; if it is missing, look for the link
#of the year currently tracked in self.k, follow it, and then move on to earlier years
new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
if not new_page:
if response.meta['flag'] == self.k and self.year <= self.k:
self.logger.info('There are no more, clicking on year = {}'.format(self.k))
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
if new_page:
new_page = response.urljoin(new_page[0])
self.k -= 1
self.logger.info('Everything OK, new flag: {}'.format(self.k))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
else:
new_page = response.urljoin(next_page[0])
while not new_page: #sometimes the years are skipped
self.logger.info('XPATH not found for year {}'.format(self.k-1))
self.k -= 1
self.logger.info('Trying with previous year, flag={}'.format(self.k))
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
new_page = response.xpath(xpath).extract()
self.logger.info('New page found with flag {}'.format(self.k))
new_page = response.urljoin(new_page[0])
self.k -= 1
self.logger.info('Now going with flag {}'.format(self.k))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
else:
new_page = response.urljoin(new_page[0])
if 'flag' in response.meta:
self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag']))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
else:
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':0})
self.logger.info('FLAG DOES NOT REPRESENT ACTUAL YEAR')
self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
def parse_post(self,response):
new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
new.add_xpath('date','//div/div/abbr/text()')
new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
reactions = response.urljoin(reactions[0].extract())