[comments.py] Added support for groups
This commit is contained in:
parent
2d404a7667
commit
462cb0eff1
Binary file not shown.
Binary file not shown.
@ -24,10 +24,15 @@ class CommentsSpider(FacebookSpider):
|
||||
'''
|
||||
parse page does multiple things:
|
||||
1) loads replied-to-comments page one-by-one (for DFS)
|
||||
2) retrieves not-replied-to comments
|
||||
2) calls parse_reply on the nested comments
|
||||
3) adds simple (not-replied-to) comments
|
||||
4) follows to new comment page
|
||||
'''
|
||||
#loads replied-to comments pages
|
||||
#load replied-to comments pages
|
||||
#select nested comment one-by-one matching with the index: response.meta['index']
|
||||
path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
|
||||
group_flag = response.meta['group'] if 'group' in response.meta else None
|
||||
|
||||
for reply in response.xpath(path):
|
||||
source = reply.xpath('.//h3/a/text()').extract()
|
||||
answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
|
||||
@ -38,9 +43,10 @@ class CommentsSpider(FacebookSpider):
|
||||
meta={'reply_to':source,
|
||||
'url':response.url,
|
||||
'index':response.meta['index'],
|
||||
'flag':'init'})
|
||||
#loads regular comments
|
||||
if not response.xpath(path):
|
||||
'flag':'init',
|
||||
'group':group_flag})
|
||||
#load regular comments
|
||||
if not response.xpath(path): #run only when no replied-to comment matches the current index
|
||||
path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
|
||||
for i,reply in enumerate(response.xpath(path2)):
|
||||
self.logger.info('{} regular comment @ page {}'.format(i,response.url))
|
||||
@ -53,15 +59,29 @@ class CommentsSpider(FacebookSpider):
|
||||
new.add_value('url',response.url)
|
||||
yield new.load_item()
|
||||
|
||||
#previous comments
|
||||
#new comment page
|
||||
if not response.xpath(path):
|
||||
for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
|
||||
new_page = next_page.xpath('.//@href').extract()
|
||||
new_page = response.urljoin(new_page[0])
|
||||
self.logger.info('New page to be crawled {}'.format(new_page))
|
||||
yield scrapy.Request(new_page,
|
||||
callback=self.parse_page,
|
||||
meta={'index':1})
|
||||
#for groups
|
||||
next_xpath = './/div[contains(@id,"see_next")]'
|
||||
prev_xpath = './/div[contains(@id,"see_prev")]'
|
||||
if not response.xpath(next_xpath) or group_flag == 1:
|
||||
for next_page in response.xpath(prev_xpath):
|
||||
new_page = next_page.xpath('.//@href').extract()
|
||||
new_page = response.urljoin(new_page[0])
|
||||
self.logger.info('New page to be crawled {}'.format(new_page))
|
||||
yield scrapy.Request(new_page,
|
||||
callback=self.parse_page,
|
||||
meta={'index':1,
|
||||
'group':1})
|
||||
else:
|
||||
for next_page in response.xpath(next_xpath):
|
||||
new_page = next_page.xpath('.//@href').extract()
|
||||
new_page = response.urljoin(new_page[0])
|
||||
self.logger.info('New page to be crawled {}'.format(new_page))
|
||||
yield scrapy.Request(new_page,
|
||||
callback=self.parse_page,
|
||||
meta={'index':1,
|
||||
'group':group_flag})
|
||||
|
||||
def parse_reply(self,response):
|
||||
'''
|
||||
@ -101,13 +121,16 @@ class CommentsSpider(FacebookSpider):
|
||||
meta={'reply_to':response.meta['reply_to'],
|
||||
'flag':'back',
|
||||
'url':response.meta['url'],
|
||||
'index':response.meta['index']})
|
||||
'index':response.meta['index'],
|
||||
'group':response.meta['group']})
|
||||
|
||||
else:
|
||||
next_reply = response.meta['url']
|
||||
self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
|
||||
yield scrapy.Request(next_reply,
|
||||
callback=self.parse_page,
|
||||
meta={'index':response.meta['index']+1})
|
||||
meta={'index':response.meta['index']+1,
|
||||
'group':response.meta['group']})
|
||||
|
||||
elif response.meta['flag'] == 'back':
|
||||
#parse all comments
|
||||
@ -132,10 +155,39 @@ class CommentsSpider(FacebookSpider):
|
||||
meta={'reply_to':response.meta['reply_to'],
|
||||
'flag':'back',
|
||||
'url':response.meta['url'],
|
||||
'index':response.meta['index']})
|
||||
'index':response.meta['index'],
|
||||
'group':response.meta['group']})
|
||||
|
||||
else:
|
||||
next_reply = response.meta['url']
|
||||
self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
|
||||
yield scrapy.Request(next_reply,
|
||||
callback=self.parse_page,
|
||||
meta={'index':response.meta['index']+1})
|
||||
meta={'index':response.meta['index']+1,
|
||||
'group':response.meta['group']})
|
||||
|
||||
# =============================================================================
|
||||
# CRAWL REACTIONS
|
||||
# =============================================================================
|
||||
# def parse_reactions(self,response):
|
||||
# new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
|
||||
# new.context['lang'] = self.lang
|
||||
# new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
|
||||
# new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
|
||||
# new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
|
||||
# new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
|
||||
# new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
|
||||
# new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
|
||||
# yield new.load_item()
|
||||
#
|
||||
# #substitute
|
||||
# yield new.load_item()
|
||||
# ‾‾‾‾‾‾‾‾‾|‾‾‾‾‾‾‾‾‾‾‾
|
||||
# _________v___
|
||||
# #response --> reply/root
|
||||
# reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
|
||||
# reactions = response.urljoin(reactions[0].extract())
|
||||
# if reactions:
|
||||
# yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
|
||||
# else:
|
||||
# yield new.load_item()
|
@ -66,7 +66,7 @@ class FacebookSpider(scrapy.Spider):
|
||||
else:
|
||||
self.logger.info('Lang "{}" not currently supported'.format(self.lang))
|
||||
self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
|
||||
self.logger.info('Change your interface lang from facebook and try again')
|
||||
self.logger.info('Change your interface lang from facebook settings and try again')
|
||||
raise AttributeError('Language provided not currently supported')
|
||||
|
||||
#current year, this variable is needed for parse_page recursion
|
||||
@ -85,7 +85,7 @@ class FacebookSpider(scrapy.Spider):
|
||||
formxpath='//form[contains(@action, "login")]',
|
||||
formdata={'email': self.email,'pass': self.password},
|
||||
callback=self.parse_home
|
||||
)
|
||||
)
|
||||
|
||||
def parse_home(self, response):
|
||||
'''
|
||||
@ -140,7 +140,7 @@ class FacebookSpider(scrapy.Spider):
|
||||
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
|
||||
new = ItemLoader(item=FbcrawlItem(),selector=post)
|
||||
self.logger.info('Parsing post n = {}'.format(abs(self.count)))
|
||||
new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
|
||||
new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
|
||||
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
|
||||
|
||||
#page_url #new.add_value('url',response.url)
|
||||
@ -150,9 +150,10 @@ class FacebookSpider(scrapy.Spider):
|
||||
self.count -= 1
|
||||
yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
|
||||
|
||||
#load following page
|
||||
#tries to click on "more", otherwise it looks for the appropriate
|
||||
#year for 1-click only and proceeds to click on others
|
||||
#load following page, try to click on "more"
|
||||
#after a few pages have been scraped, the "more" link disappears
|
||||
#if not present look for the highest year not parsed yet, click once
|
||||
#and keep looking for "more"
|
||||
new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
|
||||
if not new_page:
|
||||
if response.meta['flag'] == self.k and self.k >= self.year:
|
||||
@ -165,7 +166,7 @@ class FacebookSpider(scrapy.Spider):
|
||||
self.logger.info('Everything OK, new flag: {}'.format(self.k))
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
|
||||
else:
|
||||
while not new_page: #sometimes the years are skipped
|
||||
while not new_page: #sometimes the years are skipped; this handles small year gaps
|
||||
self.logger.info('XPATH not found for year {}'.format(self.k-1))
|
||||
self.k -= 1
|
||||
self.logger.info('Trying with previous year, flag={}'.format(self.k))
|
||||
|
Loading…
Reference in New Issue
Block a user