[comments.py] Added support for groups

rugantio 2019-04-23 03:41:52 +02:00
parent 2d404a7667
commit 462cb0eff1
4 changed files with 77 additions and 24 deletions

comments.py

@@ -24,10 +24,15 @@ class CommentsSpider(FacebookSpider):
 '''
 parse page does multiple things:
 1) loads replied-to-comments page one-by-one (for DFS)
-2) retrieves not-replied-to comments
+2) call parse_reply on the nested comments
+3) adds simple (not-replied-to) comments
+4) follows to new comment page
 '''
-#loads replied-to comments pages
+#load replied-to comments pages
+#select nested comment one-by-one matching with the index: response.meta['index']
 path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
+group_flag = response.meta['group'] if 'group' in response.meta else None
 for reply in response.xpath(path):
 source = reply.xpath('.//h3/a/text()').extract()
 answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
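Note: the hunk above drives a depth-first walk over nested comments. The positional predicate appended to the XPath picks one replied-to comment per pass, and the new 'group' key simply rides along in request meta. Below is a minimal sketch of that pattern, assuming only Scrapy; the names NESTED_XPATH and follow_nested are illustrative and not part of the commit.

    import scrapy

    # illustrative container XPath; the commit uses a much stricter expression
    NESTED_XPATH = './/div[.//div[contains(@id,"comment_replies")]]'

    def follow_nested(response, parse_reply):
        """Request the comment at position meta['index'], carrying the group flag along."""
        index = response.meta.get('index', 1)      # 1-based DFS position
        group_flag = response.meta.get('group')    # None outside groups
        for reply in response.xpath('{}[{}]'.format(NESTED_XPATH, index)):
            href = reply.xpath('.//a[contains(@href,"repl")]/@href').get()
            if href:
                yield scrapy.Request(response.urljoin(href),
                                     callback=parse_reply,
                                     meta={'reply_to': reply.xpath('.//h3/a/text()').get(),
                                           'url': response.url,
                                           'index': index,
                                           'flag': 'init',
                                           'group': group_flag})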
@@ -38,9 +43,10 @@ class CommentsSpider(FacebookSpider):
 meta={'reply_to':source,
 'url':response.url,
 'index':response.meta['index'],
-'flag':'init'})
-#loads regular comments
-if not response.xpath(path):
+'flag':'init',
+'group':group_flag})
+#load regular comments
+if not response.xpath(path): #prevents from exec
 path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
 for i,reply in enumerate(response.xpath(path2)):
 self.logger.info('{} regular comment @ page {}'.format(i,response.url))
@@ -53,15 +59,29 @@ class CommentsSpider(FacebookSpider):
 new.add_value('url',response.url)
 yield new.load_item()
-#previous comments
+#new comment page
 if not response.xpath(path):
-for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
+#for groups
+next_xpath = './/div[contains(@id,"see_next")]'
+prev_xpath = './/div[contains(@id,"see_prev")]'
+if not response.xpath(next_xpath) or group_flag == 1:
+for next_page in response.xpath(prev_xpath):
 new_page = next_page.xpath('.//@href').extract()
 new_page = response.urljoin(new_page[0])
 self.logger.info('New page to be crawled {}'.format(new_page))
 yield scrapy.Request(new_page,
 callback=self.parse_page,
-meta={'index':1})
+meta={'index':1,
+'group':1})
+else:
+for next_page in response.xpath(next_xpath):
+new_page = next_page.xpath('.//@href').extract()
+new_page = response.urljoin(new_page[0])
+self.logger.info('New page to be crawled {}'.format(new_page))
+yield scrapy.Request(new_page,
+callback=self.parse_page,
+meta={'index':1,
+'group':group_flag})
 def parse_reply(self,response):
 '''
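Note: on group pages (or when the "see_next" link is missing) the new branch follows "see_prev" instead and marks the follow-up request with group=1. A compact sketch of that decision, assuming Scrapy; next_comment_page is a hypothetical helper, not the spider's real method.

    import scrapy

    NEXT_XPATH = './/div[contains(@id,"see_next")]'
    PREV_XPATH = './/div[contains(@id,"see_prev")]'

    def next_comment_page(response, parse_page):
        """Follow pagination, preferring 'see_prev' when crawling a group."""
        group_flag = response.meta.get('group')
        backwards = group_flag == 1 or not response.xpath(NEXT_XPATH)
        xpath = PREV_XPATH if backwards else NEXT_XPATH
        for href in response.xpath(xpath + '//@href').getall():
            yield scrapy.Request(response.urljoin(href),
                                 callback=parse_page,
                                 meta={'index': 1,   # restart the DFS on the new page
                                       'group': 1 if backwards else group_flag})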
@@ -101,13 +121,16 @@ class CommentsSpider(FacebookSpider):
 meta={'reply_to':response.meta['reply_to'],
 'flag':'back',
 'url':response.meta['url'],
-'index':response.meta['index']})
+'index':response.meta['index'],
+'group':response.meta['group']})
 else:
 next_reply = response.meta['url']
 self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
 yield scrapy.Request(next_reply,
 callback=self.parse_page,
-meta={'index':response.meta['index']+1})
+meta={'index':response.meta['index']+1,
+'group':response.meta['group']})
 elif response.meta['flag'] == 'back':
 #parse all comments
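Note: parse_reply threads its state through request meta: while 'flag' is 'init' it keeps walking the reply thread, and once the thread is exhausted it hands control back to parse_page with index incremented by one, now also forwarding the group flag. A toy sketch of that hand-off, assuming Scrapy; parse_page_stub and the link XPath are placeholders, not the spider's real code.

    import scrapy

    def parse_page_stub(response):
        """Stand-in for the spider's parse_page, only here to keep the sketch self-contained."""
        yield from ()

    def parse_reply_sketch(response):
        """'init' keeps following the reply thread; otherwise control returns to parse_page."""
        more = response.xpath('.//a[contains(@href,"repl")]/@href').get()  # placeholder link
        if response.meta['flag'] == 'init' and more:
            yield scrapy.Request(response.urljoin(more),
                                 callback=parse_reply_sketch,
                                 meta=dict(response.meta, flag='back'))    # same state, new flag
        else:
            yield scrapy.Request(response.meta['url'],
                                 callback=parse_page_stub,
                                 meta={'index': response.meta['index'] + 1,  # next nested comment
                                       'group': response.meta.get('group')})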
@@ -132,10 +155,39 @@ class CommentsSpider(FacebookSpider):
 meta={'reply_to':response.meta['reply_to'],
 'flag':'back',
 'url':response.meta['url'],
-'index':response.meta['index']})
+'index':response.meta['index'],
+'group':response.meta['group']})
 else:
 next_reply = response.meta['url']
 self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
 yield scrapy.Request(next_reply,
 callback=self.parse_page,
-meta={'index':response.meta['index']+1})
+meta={'index':response.meta['index']+1,
+'group':response.meta['group']})
+# =============================================================================
+# CRAWL REACTIONS
+# =============================================================================
+# def parse_reactions(self,response):
+# new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
+# new.context['lang'] = self.lang
+# new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
+# new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
+# new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
+# new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
+# new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
+# new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
+# yield new.load_item()
+#
+# #substitute the "yield new.load_item()" above with the block below
+# #response --> reply/root
+# reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
+# reactions = response.urljoin(reactions[0].extract())
+# if reactions:
+# yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
+# else:
+# yield new.load_item()
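Note: the commented-out block above sketches how per-comment reactions could be attached: instead of yielding the item right away, the half-filled ItemLoader would travel through meta to a parse_reactions callback and be yielded from there. Below is a hedged, self-contained version of that hand-off, assuming Scrapy's ItemLoader; the helper names and the use of load_item() instead of the parent= argument are editorial choices, only the reaction_type codes come from the block above.

    import scrapy
    from scrapy.loader import ItemLoader

    REACTION_CODES = {'likes': 1, 'love': 2, 'wow': 3, 'ahah': 4, 'sigh': 7, 'grrr': 8}

    def yield_or_follow_reactions(new, response, parse_reactions):
        """Return the finished item, or a detour Request through the reactions page."""
        reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href").get()
        if reactions:
            # hand the partially filled loader to the callback via meta
            return scrapy.Request(response.urljoin(reactions),
                                  callback=parse_reactions,
                                  meta={'item': new})
        return new.load_item()

    def parse_reactions(response):
        """Add the reaction counters to the carried item, then emit it."""
        new = ItemLoader(item=response.meta['item'].load_item(), response=response)
        for field, code in REACTION_CODES.items():
            new.add_xpath(field, "//a[contains(@href,'reaction_type={}')]/span/text()".format(code))
        yield new.load_item()

A spider callback would then simply do: yield yield_or_follow_reactions(new, response, parse_reactions), since Scrapy accepts either an item or a Request from a callback.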

fbcrawl.py

@@ -66,7 +66,7 @@ class FacebookSpider(scrapy.Spider):
 else:
 self.logger.info('Lang "{}" not currently supported'.format(self.lang))
 self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
-self.logger.info('Change your interface lang from facebook and try again')
+self.logger.info('Change your interface lang from facebook settings and try again')
 raise AttributeError('Language provided not currently supported')
 #current year, this variable is needed for parse_page recursion
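Note: the check above rejects unsupported interface languages early. A tiny sketch of the same guard, assuming the supported set from the log line above; SUPPORTED_LANGS and check_lang are illustrative names.

    SUPPORTED_LANGS = {'en', 'es', 'fr', 'it', 'pt'}

    def check_lang(lang):
        """Raise early when the Facebook interface language is not supported."""
        if lang not in SUPPORTED_LANGS:
            raise AttributeError('Language provided not currently supported')
        return lang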
@@ -140,7 +140,7 @@ class FacebookSpider(scrapy.Spider):
 for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
 new = ItemLoader(item=FbcrawlItem(),selector=post)
 self.logger.info('Parsing post n = {}'.format(abs(self.count)))
-new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
+new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
 new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
 #page_url #new.add_value('url',response.url)
@@ -150,9 +150,10 @@ class FacebookSpider(scrapy.Spider):
 self.count -= 1
 yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
-#load following page
-#tries to click on "more", otherwise it looks for the appropriate
-#year for 1-click only and proceeds to click on others
+#load following page, try to click on "more"
+#after few pages have gone scraped, the "more" link disappears
+#if not present look for the highest year not parsed yet, click once
+#and keep looking for "more"
 new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
 if not new_page:
 if response.meta['flag'] == self.k and self.k >= self.year:
@@ -165,7 +166,7 @@ class FacebookSpider(scrapy.Spider):
 self.logger.info('Everything OK, new flag: {}'.format(self.k))
 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
 else:
-while not new_page: #sometimes the years are skipped
+while not new_page: #sometimes the years are skipped, this handles small year gaps
 self.logger.info('XPATH not found for year {}'.format(self.k-1))
 self.k -= 1
 self.logger.info('Trying with previous year, flag={}'.format(self.k))
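Note: the reworded comments describe the fallback at the bottom of parse_page in fbcrawl.py: keep clicking "more" while the link exists; once it disappears, step self.k back one year at a time until a year link is found, then resume. A condensed sketch of that control flow, assuming Scrapy; follow_older_posts and year_link_xpath are illustrative helpers, only the "more"-link XPath is taken from the hunk above.

    import scrapy

    def year_link_xpath(year):
        """Illustrative XPath for a 'posts from <year>' link; the real spider builds its own."""
        return "//div/a[contains(@href,'time') and contains(text(),'{}')]/@href".format(year)

    def follow_older_posts(response, spider):
        """Click 'more' when present, otherwise fall back through years via spider.k."""
        more = response.xpath("//div[2]/a[contains(@href,'timestart=') and "
                              "not(contains(text(),'ent')) and "
                              "not(contains(text(),number()))]/@href").get()
        if more:
            yield scrapy.Request(response.urljoin(more),
                                 callback=spider.parse_page,
                                 meta={'flag': spider.k})
            return
        while spider.k >= spider.year:                    # never go past the requested year
            link = response.xpath(year_link_xpath(spider.k)).get()
            if link:
                yield scrapy.Request(response.urljoin(link),
                                     callback=spider.parse_page,
                                     meta={'flag': spider.k})
                return
            spider.k -= 1                                 # year missing: try the one before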