[comments.py] Added support for groups

rugantio 2019-04-23 03:41:52 +02:00
parent 2d404a7667
commit 462cb0eff1
4 changed files with 77 additions and 24 deletions

comments.py

@@ -24,10 +24,15 @@ class CommentsSpider(FacebookSpider):
         '''
         parse page does multiple things:
             1) loads replied-to-comments page one-by-one (for DFS)
-            2) retrieves not-replied-to comments
-            3) adds simple (not-replied-to) comments
-            4) follows to new comment page
+            2) call parse_reply on the nested comments
         '''
-        #loads replied-to comments pages
+        #load replied-to comments pages
+        #select nested comment one-by-one matching with the index: response.meta['index']
         path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
+        group_flag = response.meta['group'] if 'group' in response.meta else None
         for reply in response.xpath(path):
             source = reply.xpath('.//h3/a/text()').extract()
             answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
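
The new group_flag is read once from response.meta and then threaded through every later request, so the fact that the crawl entered via a group survives each hop. A minimal sketch of that pattern, assuming nothing beyond standard Scrapy meta semantics (spider name and URL are placeholders):

    import scrapy

    class MetaFlagSpider(scrapy.Spider):
        name = 'meta_flag_demo'                   # placeholder
        start_urls = ['https://example.com/']     # placeholder

        def parse(self, response):
            # default to None when no upstream request ever set the flag
            group_flag = response.meta['group'] if 'group' in response.meta else None
            for href in response.xpath('//a/@href').extract():
                yield scrapy.Request(response.urljoin(href),
                                     callback=self.parse,
                                     meta={'group': group_flag})  # flag survives the hop

response.meta.get('group') would be the more idiomatic spelling of the same default-to-None lookup.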
@@ -38,9 +43,10 @@ class CommentsSpider(FacebookSpider):
                                  meta={'reply_to':source,
                                        'url':response.url,
                                        'index':response.meta['index'],
-                                       'flag':'init'})
-        #loads regular comments
-        if not response.xpath(path):
+                                       'flag':'init',
+                                       'group':group_flag})
+        #load regular comments
+        if not response.xpath(path): #prevents from exec
             path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
             for i,reply in enumerate(response.xpath(path2)):
                 self.logger.info('{} regular comment @ page {}'.format(i,response.url))
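
Both selectors lean on mbasic Facebook markup, where every comment is a div with a two-character class and an id starting with a digit, and a nested comment_replies div is what separates threads from simple comments. The predicate can be sanity-checked with parsel (Scrapy's selector library) on toy HTML; the snippet below is purely illustrative:

    from parsel import Selector

    # toy markup: the first comment has replies, the second does not
    html = '''
    <div class="ab" id="1234"><h3><a>alice</a></h3>
      <div id="comment_replies_1"></div></div>
    <div class="cd" id="5678"><h3><a>bob</a></h3></div>
    '''
    sel = Selector(text=html)
    simple = sel.xpath('.//div[string-length(@class) = 2 and count(@id)=1 '
                       'and contains("0123456789", substring(@id,1,1)) '
                       'and not(.//div[contains(@id,"comment_replies")])]')
    print([d.xpath('./@id').extract_first() for d in simple])   # ['5678']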
@@ -53,15 +59,29 @@ class CommentsSpider(FacebookSpider):
                 new.add_value('url',response.url)
                 yield new.load_item()
-        #previous comments
+        #new comment page
         if not response.xpath(path):
-            for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
-                new_page = next_page.xpath('.//@href').extract()
-                new_page = response.urljoin(new_page[0])
-                self.logger.info('New page to be crawled {}'.format(new_page))
-                yield scrapy.Request(new_page,
-                                     callback=self.parse_page,
-                                     meta={'index':1})
+            #for groups
+            next_xpath = './/div[contains(@id,"see_next")]'
+            prev_xpath = './/div[contains(@id,"see_prev")]'
+            if not response.xpath(next_xpath) or group_flag == 1:
+                for next_page in response.xpath(prev_xpath):
+                    new_page = next_page.xpath('.//@href').extract()
+                    new_page = response.urljoin(new_page[0])
+                    self.logger.info('New page to be crawled {}'.format(new_page))
+                    yield scrapy.Request(new_page,
+                                         callback=self.parse_page,
+                                         meta={'index':1,
+                                               'group':1})
+            else:
+                for next_page in response.xpath(next_xpath):
+                    new_page = next_page.xpath('.//@href').extract()
+                    new_page = response.urljoin(new_page[0])
+                    self.logger.info('New page to be crawled {}'.format(new_page))
+                    yield scrapy.Request(new_page,
+                                         callback=self.parse_page,
+                                         meta={'index':1,
+                                               'group':group_flag})
 
     def parse_reply(self,response):
         '''
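
This branch is the heart of the group support: group comment pages expose no forward "see_next" link, so when it is missing (or the group flag is already set) the spider walks the backward "see_prev" links instead and pins group=1 on every follow-up request. Condensed into a hypothetical helper, same logic:

    import scrapy

    def follow_comment_pages(self, response, group_flag):
        # hypothetical helper mirroring the branch added above
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            # group pages paginate backwards: follow "see_prev", remember the group
            pages, group = response.xpath(prev_xpath), 1
        else:
            pages, group = response.xpath(next_xpath), group_flag
        for page in pages:
            url = response.urljoin(page.xpath('.//@href').extract_first())
            yield scrapy.Request(url, callback=self.parse_page,
                                 meta={'index': 1, 'group': group})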
@@ -101,13 +121,16 @@ class CommentsSpider(FacebookSpider):
                                  meta={'reply_to':response.meta['reply_to'],
                                        'flag':'back',
                                        'url':response.meta['url'],
-                                       'index':response.meta['index']})
+                                       'index':response.meta['index'],
+                                       'group':response.meta['group']})
             else:
                 next_reply = response.meta['url']
                 self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
                 yield scrapy.Request(next_reply,
                                      callback=self.parse_page,
-                                     meta={'index':response.meta['index']+1})
+                                     meta={'index':response.meta['index']+1,
+                                           'group':response.meta['group']})
 
         elif response.meta['flag'] == 'back':
             #parse all comments
@@ -132,10 +155,39 @@ class CommentsSpider(FacebookSpider):
                                  meta={'reply_to':response.meta['reply_to'],
                                        'flag':'back',
                                        'url':response.meta['url'],
-                                       'index':response.meta['index']})
+                                       'index':response.meta['index'],
+                                       'group':response.meta['group']})
             else:
                 next_reply = response.meta['url']
                 self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                 yield scrapy.Request(next_reply,
                                      callback=self.parse_page,
-                                     meta={'index':response.meta['index']+1})
+                                     meta={'index':response.meta['index']+1,
+                                           'group':response.meta['group']})
+# =============================================================================
+# CRAWL REACTIONS
+# =============================================================================
+# def parse_reactions(self,response):
+#     new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
+#     new.context['lang'] = self.lang
+#     new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
+#     new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
+#     new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
+#     new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
+#     new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
+#     new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
+#     yield new.load_item()
+#
+# #substitute the 'yield new.load_item()' above with the following:
+# #response --> reply/root
+# reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
+# reactions = response.urljoin(reactions[0].extract())
+# if reactions:
+#     yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
+# else:
+#     yield new.load_item()
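
The draft above also shows a standard Scrapy hand-off: instead of yielding the item straight away, the partially-filled ItemLoader rides along in meta so a second callback can enrich it before load_item(). One caveat: the draft joins reactions[0] before testing if reactions:, which would raise IndexError on comments without a reactions link (and after urljoin the truthiness test runs on a string anyway), so the check must come first. A distilled sketch, with a hypothetical caller name and imports assuming the project's own CommentsItem:

    import scrapy
    from scrapy.loader import ItemLoader
    from fbcrawl.items import CommentsItem

    def parse_comment(self, response):   # hypothetical caller
        new = ItemLoader(item=CommentsItem(), response=response)
        reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
        if reactions:
            # hand the half-built loader to a second callback via meta
            url = response.urljoin(reactions[0].extract())
            yield scrapy.Request(url, callback=self.parse_reactions, meta={'item': new})
        else:
            yield new.load_item()

    def parse_reactions(self, response):
        # parent= chains this loader onto the one built by the first callback
        new = ItemLoader(item=CommentsItem(), response=response, parent=response.meta['item'])
        new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()")
        yield new.load_item()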

fbcrawl.py

@@ -66,7 +66,7 @@ class FacebookSpider(scrapy.Spider):
         else:
             self.logger.info('Lang "{}" not currently supported'.format(self.lang))
             self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
-            self.logger.info('Change your interface lang from facebook and try again')
+            self.logger.info('Change your interface lang from facebook settings and try again')
             raise AttributeError('Language provided not currently supported')
 
         #current year, this variable is needed for parse_page recursion
@@ -85,7 +85,7 @@ class FacebookSpider(scrapy.Spider):
                 formxpath='//form[contains(@action, "login")]',
                 formdata={'email': self.email,'pass': self.password},
                 callback=self.parse_home
-        )
+                )
 
     def parse_home(self, response):
         '''
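
For context, the touched line closes the login step, which relies on Scrapy's stock form-submission helper. The general shape, with placeholder credentials:

    from scrapy import FormRequest

    # general shape of the login step (credentials are placeholders)
    def parse(self, response):
        return FormRequest.from_response(
                response,
                formxpath='//form[contains(@action, "login")]',
                formdata={'email': 'user@example.com', 'pass': 'secret'},
                callback=self.parse_home
                )

FormRequest.from_response pre-populates the matched form's remaining (hidden) fields from the response, which is why only email and pass need to be supplied.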
@@ -140,7 +140,7 @@ class FacebookSpider(scrapy.Spider):
         for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
             new = ItemLoader(item=FbcrawlItem(),selector=post)
             self.logger.info('Parsing post n = {}'.format(abs(self.count)))
-            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
+            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
             new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
 
             #page_url #new.add_value('url',response.url)
@@ -150,9 +150,10 @@ class FacebookSpider(scrapy.Spider):
             self.count -= 1
             yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
 
-        #load following page
-        #tries to click on "more", otherwise it looks for the appropriate
-        #year for 1-click only and proceeds to click on others
+        #load following page, try to click on "more"
+        #after a few pages have been scraped, the "more" link disappears
+        #if it is not present, look for the highest year not yet parsed, click it once
+        #and keep looking for "more"
         new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
         if not new_page:
             if response.meta['flag'] == self.k and self.k >= self.year:
@@ -165,7 +166,7 @@ class FacebookSpider(scrapy.Spider):
                 self.logger.info('Everything OK, new flag: {}'.format(self.k))
                 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
             else:
-                while not new_page: #sometimes the years are skipped
+                while not new_page: #sometimes years are skipped; this handles small year gaps
                     self.logger.info('XPATH not found for year {}'.format(self.k-1))
                     self.k -= 1
                     self.logger.info('Trying with previous year, flag={}'.format(self.k))
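
The rewritten comments describe parse_page's two-stage pagination: follow "more" while it exists, and once it disappears fall back to the year sections, decrementing self.k past years that published nothing. A minimal sketch of that fallback under the same assumptions (self.k is the year currently tried, self.year the lower bound; the selector is illustrative, not the spider's exact one):

    def next_year_link(self, response):   # hypothetical helper
        new_page = []
        # walk backwards over the year sections until one yields a link
        while not new_page and self.k >= self.year:
            xpath = ("//div/a[contains(@href,'time') and contains(text(),'"
                     + str(self.k) + "')]/@href")   # illustrative selector
            new_page = response.xpath(xpath).extract()
            if not new_page:
                self.logger.info('XPATH not found for year {}'.format(self.k))
                self.k -= 1   # year gap: try the previous one
        return response.urljoin(new_page[0]) if new_page else None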