fixed recursion on pages
This commit is contained in:
parent
918cd9ce64
commit
dafd01c8bd
1
.~lock.Trump.csv#
Normal file
1
.~lock.Trump.csv#
Normal file
@ -0,0 +1 @@
|
||||
,rugantio,alice,04.02.2019 17:42,file:///home/rugantio/.config/libreoffice/4;
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -413,35 +413,38 @@ def url_strip(url):
|
||||
#catching '&id=' is enough to identify the post
|
||||
i = fullurl.find('&id=')
|
||||
if i != -1:
|
||||
j = fullurl[:i+4] + fullurl[i+4:].split('&')[0]
|
||||
return j
|
||||
else:
|
||||
return fullurl
|
||||
return fullurl[:i+4] + fullurl[i+4:].split('&')[0]
|
||||
else: #catch photos
|
||||
i = fullurl.find('/photos/')
|
||||
if i != -1:
|
||||
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
|
||||
else: #catch albums
|
||||
i = fullurl.find('/albums/')
|
||||
if i != -1:
|
||||
return fullurl[:i+8] + fullurl[i+8:].split('/?')[0]
|
||||
else:
|
||||
return fullurl
|
||||
|
||||
|
||||
class FbcrawlItem(scrapy.Item):
|
||||
source = scrapy.Field(
|
||||
output_processor=TakeFirst()
|
||||
) # page that published the post
|
||||
|
||||
source = scrapy.Field(
|
||||
output_processor=TakeFirst()
|
||||
)
|
||||
date = scrapy.Field( # when was the post published
|
||||
input_processor=TakeFirst(),
|
||||
output_processor=parse_date
|
||||
input_processor=TakeFirst(),
|
||||
output_processor=parse_date
|
||||
)
|
||||
|
||||
text = scrapy.Field(
|
||||
output_processor=Join(separator=u'')
|
||||
output_processor=Join(separator=u'')
|
||||
) # full text of the post
|
||||
|
||||
comments = scrapy.Field(
|
||||
output_processor=comments_strip
|
||||
output_processor=comments_strip
|
||||
)
|
||||
|
||||
reactions = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
output_processor=reactions_strip
|
||||
) # num of reactions
|
||||
|
||||
likes = scrapy.Field(
|
||||
output_processor=reactions_strip
|
||||
output_processor=reactions_strip
|
||||
)
|
||||
ahah = scrapy.Field()
|
||||
love = scrapy.Field()
|
||||
@ -451,4 +454,5 @@ class FbcrawlItem(scrapy.Item):
|
||||
share = scrapy.Field() # num of shares
|
||||
url = scrapy.Field(
|
||||
output_processor=url_strip
|
||||
)
|
||||
)
|
||||
shared_from = scrapy.Field()
|
||||
|
@ -14,7 +14,6 @@ BOT_NAME = 'fbcrawl'
|
||||
SPIDER_MODULES = ['fbcrawl.spiders']
|
||||
NEWSPIDER_MODULE = 'fbcrawl.spiders'
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
|
||||
|
||||
@ -22,7 +21,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTM
|
||||
ROBOTSTXT_OBEY = False
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
#CONCURRENT_REQUESTS = 32
|
||||
CONCURRENT_REQUESTS = 1
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
@ -88,7 +87,7 @@ ROBOTSTXT_OBEY = False
|
||||
#HTTPCACHE_DIR = 'httpcache'
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
||||
FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
|
||||
#FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
|
||||
FEED_EXPORT_ENCODING = 'utf-8'
|
||||
DUPEFILTER_DEBUG = True
|
||||
LOG_LEVEL = 'INFO'
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -4,7 +4,6 @@ from scrapy.loader import ItemLoader
|
||||
from scrapy.http import FormRequest
|
||||
from fbcrawl.items import FbcrawlItem
|
||||
|
||||
|
||||
class FacebookSpider(scrapy.Spider):
|
||||
"""
|
||||
Parse FB comments, given a page (needs credentials)
|
||||
@ -78,22 +77,27 @@ class FacebookSpider(scrapy.Spider):
|
||||
)
|
||||
|
||||
def parse_page(self, response):
|
||||
for post in response.xpath('//div[count(@class)=1 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
|
||||
new = ItemLoader(item=FbcrawlItem(),selector=post)
|
||||
new.add_xpath('source', "./div/h3/a/text()")
|
||||
new.add_xpath('text',"//div/div/span[not(contains(text(),' · '))]/text() | ./div/div/text()")
|
||||
yield new.load_item()
|
||||
|
||||
rispostina = response.xpath('//div/a[contains(text(),"rispost")]/@href')
|
||||
|
||||
for i in range(len(rispostina)):
|
||||
risp = response.urljoin(rispostina[i].extract())
|
||||
#answer from page
|
||||
for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
|
||||
# resp = ItemLoader(item=FbcrawlItem(),selector=risposta)
|
||||
rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href')
|
||||
risp = response.urljoin(rispostina[0].extract())
|
||||
yield scrapy.Request(risp, callback=self.parse_rispostina)
|
||||
|
||||
next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
|
||||
if len(next_page) > 0:
|
||||
next_page = response.urljoin(next_page[0].extract())
|
||||
yield scrapy.Request(next_page, callback=self.parse_page)
|
||||
|
||||
# for i in range(len(rispostina)):
|
||||
# risp = response.urljoin(rispostina[i].extract())
|
||||
#
|
||||
# for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
|
||||
# new = ItemLoader(item=FbcrawlItem(),selector=post)
|
||||
# new.add_xpath('source', "./div/h3/a/text()")
|
||||
# new.add_xpath('text',"./div[1]/div[1]/text()")
|
||||
# yield new.load_item()
|
||||
#
|
||||
# next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
|
||||
# if len(next_page) > 0:
|
||||
# next_page = response.urljoin(next_page[0].extract())
|
||||
# yield scrapy.Request(next_page, callback=self.parse_page)
|
||||
|
||||
def parse_rispostina(self,response):
|
||||
for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all posts
|
||||
|
@ -1,30 +1,39 @@
|
||||
import scrapy
|
||||
import logging
|
||||
|
||||
from scrapy.loader import ItemLoader
|
||||
from scrapy.http import FormRequest
|
||||
from fbcrawl.items import FbcrawlItem
|
||||
from scrapy.exceptions import CloseSpider
|
||||
|
||||
|
||||
class FacebookSpider(scrapy.Spider):
|
||||
"""
|
||||
Parse FB pages (needs credentials)
|
||||
"""
|
||||
name = "fb"
|
||||
custom_settings = {
|
||||
'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
|
||||
'reactions','likes','ahah','love','wow', \
|
||||
'sigh','grrr','comments','url']
|
||||
}
|
||||
|
||||
def __init__(self, email='', password='', page='', year=2018, lang='_', **kwargs):
|
||||
super(FacebookSpider, self).__init__(**kwargs)
|
||||
def __init__(self,email='',password='',page='',year=2018,lang='_',*args,**kwargs):
|
||||
#turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
|
||||
logger = logging.getLogger('scrapy.middleware')
|
||||
logger.setLevel(logging.WARNING)
|
||||
super().__init__(**kwargs)
|
||||
|
||||
#email & pass need to be passed as attributes!
|
||||
if not email or not password:
|
||||
raise ValueError("You need to provide valid email and password!")
|
||||
raise AttributeError('You need to provide valid email and password:\n'
|
||||
'scrapy fb -a email="EMAIL" -a password="PASSWORD"')
|
||||
else:
|
||||
self.email = email
|
||||
self.password = password
|
||||
|
||||
#page name parsing (added support for full urls)
|
||||
if not page:
|
||||
raise ValueError("You need to provide a valid page name to crawl!")
|
||||
raise AttributeError('You need to provide a valid page name to crawl!'
|
||||
'scrapy fb -a page="PAGENAME"')
|
||||
elif page.find('https://www.facebook.com/') != -1:
|
||||
self.page = page[25:]
|
||||
elif page.find('https://mbasic.facebook.com/') != -1:
|
||||
@ -35,22 +44,27 @@ class FacebookSpider(scrapy.Spider):
|
||||
self.page = page
|
||||
|
||||
#parse year
|
||||
assert int(year) <= 2019 and int(year) >= 2015, 'Year must be a number 2015 <= year <= 2019'
|
||||
assert int(year) <= 2019 and int(year) >= 2006, 'Year must be a number 2006 <= year <= 2019'
|
||||
self.year = int(year) #arguments are passed as strings
|
||||
|
||||
|
||||
#parse lang, if not provided (but is supported) it will be guessed in parse_home
|
||||
if lang=='_':
|
||||
self.logger.info('Language attribute not provided, I will try to guess it')
|
||||
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
|
||||
self.logger.info('Language attribute not provided, I will try to guess it from the fb interface')
|
||||
self.logger.info('To specify, add the lang parameter: scrapy fb -a lang="LANGUAGE"')
|
||||
self.logger.info('Currently choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"')
|
||||
self.lang=lang
|
||||
elif lang == 'en' or lang == 'es' or lang == 'fr' or lang == 'it' or lang == 'pt':
|
||||
self.lang = lang
|
||||
self.lang = lang.lower()
|
||||
else:
|
||||
self.logger.info('Lang "{}" not currently supported'.format(lang))
|
||||
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
|
||||
self.logger.info('Change your interface lang from facebook and try again')
|
||||
raise CloseSpider('Language provided not currently supported')
|
||||
raise AttributeError('Language provided not currently supported')
|
||||
|
||||
#current year, this variable is needed for parse_page recursion
|
||||
self.k = 2019
|
||||
self.count = 0
|
||||
|
||||
self.start_urls = ['https://mbasic.facebook.com']
|
||||
|
||||
def parse(self, response):
|
||||
@ -73,29 +87,39 @@ class FacebookSpider(scrapy.Spider):
|
||||
'''
|
||||
#handle 'save-device' redirection
|
||||
if response.xpath("//div/a[contains(@href,'save-device')]"):
|
||||
self.logger.info('Got stuck in "save-device" checkpoint')
|
||||
self.logger.info('I will now try to redirect to the correct page')
|
||||
return FormRequest.from_response(
|
||||
response,
|
||||
formdata={'name_action_selected': 'dont_save'},
|
||||
callback=self.parse_home)
|
||||
callback=self.parse_home
|
||||
)
|
||||
|
||||
#set language interface
|
||||
if self.lang == '_':
|
||||
if response.xpath("//input[@placeholder='Search Facebook']"):
|
||||
self.logger.info('Language recognized: lang="en"')
|
||||
self.lang = 'en'
|
||||
elif response.xpath("//input[@value='Buscar']"):
|
||||
elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
|
||||
self.logger.info('Language recognized: lang="es"')
|
||||
self.lang = 'es'
|
||||
elif response.xpath("//input[@value='Rechercher']"):
|
||||
elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"):
|
||||
self.logger.info('Language recognized: lang="fr"')
|
||||
self.lang = 'fr'
|
||||
elif response.xpath("//input[@value='Cerca']"):
|
||||
elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
|
||||
self.logger.info('Language recognized: lang="it"')
|
||||
self.lang = 'it'
|
||||
elif response.xpath("//input[@value='Pesquisar']"):
|
||||
elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
|
||||
self.logger.info('Language recognized: lang="pt"')
|
||||
self.lang = 'pt'
|
||||
else:
|
||||
raise CloseSpider('Language not recognized')
|
||||
|
||||
raise AttributeError('Language not recognized\n'
|
||||
'Change your interface lang from facebook '
|
||||
'and try again')
|
||||
|
||||
#navigate to provided page
|
||||
href = response.urljoin(self.page)
|
||||
self.logger.info('Parsing facebook page %s', href)
|
||||
self.logger.info('Scraping facebook page {}'.format(href))
|
||||
return scrapy.Request(url=href,callback=self.parse_page)
|
||||
|
||||
def parse_page(self, response):
|
||||
@ -106,6 +130,7 @@ class FacebookSpider(scrapy.Spider):
|
||||
#select all posts
|
||||
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
|
||||
new = ItemLoader(item=FbcrawlItem(),selector=post)
|
||||
self.logger.info('Parsing post n = {}'.format(abs(self.count)))
|
||||
new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
|
||||
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
|
||||
new.add_xpath('reactions',".//a[contains(@aria-label,'reactions')]/text()")
|
||||
@ -113,54 +138,53 @@ class FacebookSpider(scrapy.Spider):
|
||||
#page_url #new.add_value('url',response.url)
|
||||
#returns full post-link in a list
|
||||
post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
|
||||
temp_post = response.urljoin(post[0])
|
||||
yield scrapy.Request(temp_post, self.parse_post, meta={'item':new})
|
||||
temp_post = response.urljoin(post[0])
|
||||
self.count -= 1
|
||||
yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
|
||||
|
||||
#load following page
|
||||
next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
|
||||
if len(next_page) == 0:
|
||||
if response.meta['flag'] == 4 and self.year <= 2015:
|
||||
self.logger.info('2014 reached, flag = 5')
|
||||
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
|
||||
self.logger.info('next_page = {}'.format(next_page[0]))
|
||||
new_page = response.urljoin(next_page[0])
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':5})
|
||||
elif response.meta['flag'] == 3 and self.year <= 2015:
|
||||
self.logger.info('2015 reached, flag = 4')
|
||||
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
|
||||
self.logger.info('next_page = {}'.format(next_page[0]))
|
||||
new_page = response.urljoin(next_page[0])
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':4})
|
||||
elif response.meta['flag'] == 2 and self.year <= 2016:
|
||||
self.logger.info('2016 reached, flag = 3')
|
||||
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2016')]/@href").extract()
|
||||
self.logger.info('next_page = {}'.format(next_page[0]))
|
||||
new_page = response.urljoin(next_page[0])
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':3})
|
||||
elif response.meta['flag'] == 1 and self.year <= 2017:
|
||||
self.logger.info('2017 reached, flag = 2')
|
||||
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2017')]/@href").extract()
|
||||
self.logger.info('next_page = {}'.format(next_page[0]))
|
||||
new_page = response.urljoin(next_page[0])
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':2})
|
||||
elif response.meta['flag'] == 0 and self.year <= 2018:
|
||||
self.logger.info('2018 reached, flag = 1')
|
||||
next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2018')]/@href").extract()
|
||||
self.logger.info('next_page = {}'.format(next_page[0]))
|
||||
new_page = response.urljoin(next_page[0])
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':1})
|
||||
#tries to click on "more", otherwise it looks for the appropriate
|
||||
#year for 1-click only and proceeds to click on others
|
||||
new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
|
||||
if not new_page:
|
||||
if response.meta['flag'] == self.k and self.year <= self.k:
|
||||
self.logger.info('There are no more, clicking on year = {}'.format(self.k))
|
||||
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
|
||||
new_page = response.xpath(xpath).extract()
|
||||
if new_page:
|
||||
new_page = response.urljoin(new_page[0])
|
||||
self.k -= 1
|
||||
self.logger.info('Everything OK, new flag: {}'.format(self.k))
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
|
||||
else:
|
||||
while not new_page: #sometimes the years are skipped
|
||||
self.logger.info('XPATH not found for year {}'.format(self.k-1))
|
||||
self.k -= 1
|
||||
self.logger.info('Trying with previous year, flag={}'.format(self.k))
|
||||
xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
|
||||
new_page = response.xpath(xpath).extract()
|
||||
self.logger.info('New page found with flag {}'.format(self.k))
|
||||
new_page = response.urljoin(new_page[0])
|
||||
self.k -= 1
|
||||
self.logger.info('Now going with flag {}'.format(self.k))
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
|
||||
else:
|
||||
new_page = response.urljoin(next_page[0])
|
||||
new_page = response.urljoin(new_page[0])
|
||||
if 'flag' in response.meta:
|
||||
self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag']))
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
|
||||
else:
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':0})
|
||||
self.logger.info('FLAG DOES NOT REPRESENT ACTUAL YEAR')
|
||||
self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k))
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
|
||||
|
||||
def parse_post(self,response):
|
||||
new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
|
||||
new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
|
||||
new.add_xpath('date', '//div/div/abbr/text()')
|
||||
new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
|
||||
new.add_xpath('date','//div/div/abbr/text()')
|
||||
new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
|
||||
new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
|
||||
|
||||
reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
|
||||
reactions = response.urljoin(reactions[0].extract())
|
||||
@ -175,4 +199,4 @@ class FacebookSpider(scrapy.Spider):
|
||||
new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
|
||||
new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
|
||||
new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
|
||||
yield new.load_item()
|
||||
yield new.load_item()
|
Loading…
Reference in New Issue
Block a user