refactoring comments spider

This commit is contained in:
rugantio 2019-02-18 07:18:34 +01:00
parent 069f64f61e
commit dc1d0f29c0
5 changed files with 9 additions and 9 deletions

View File

@@ -513,7 +513,5 @@ class CommentsItem(scrapy.Item):
sigh = scrapy.Field()
grrr = scrapy.Field()
share = scrapy.Field() # num of shares
url = scrapy.Field(
output_processor=url_strip
)
url = scrapy.Field()
shared_from = scrapy.Field()

View File

@@ -11,9 +11,8 @@ class CommentsSpider(FacebookSpider):
"""
name = "comments"
custom_settings = {
'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
'reactions','likes','ahah','love','wow', \
'sigh','grrr','url'],
'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
'url'],
'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
'CONCURRENT_REQUESTS':1,
}
@@ -25,7 +24,7 @@ class CommentsSpider(FacebookSpider):
'''
parse page does multiple things:
1) loads replied-to-comments page one-by-one (for DFS)
2) gets common not-replied-to comments
2) retrieves not-replied-to comments
'''
#loads replied-to comments pages
path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
@@ -43,7 +42,6 @@ class CommentsSpider(FacebookSpider):
#loads regular comments
if not response.xpath(path):
path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
for i,reply in enumerate(response.xpath(path2)):
self.logger.info('{} regular comment @ page {}'.format(i,response.url))
new = ItemLoader(item=CommentsItem(),selector=reply)
@@ -51,6 +49,7 @@ class CommentsSpider(FacebookSpider):
new.add_xpath('source','.//h3/a/text()')
new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
new.add_value('url',response.url)
yield new.load_item()
@@ -77,6 +76,7 @@ class CommentsSpider(FacebookSpider):
new.add_value('reply_to','ROOT')
new.add_xpath('text','.//div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
new.add_value('url',response.url)
yield new.load_item()
#parse all replies in the page
@@ -87,6 +87,7 @@ class CommentsSpider(FacebookSpider):
new.add_value('reply_to',response.meta['reply_to'])
new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
new.add_value('url',response.url)
yield new.load_item()
@@ -117,6 +118,7 @@ class CommentsSpider(FacebookSpider):
new.add_value('reply_to',response.meta['reply_to'])
new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
new.add_value('url',response.url)
yield new.load_item()
#keep going backwards

View File

@@ -213,4 +213,4 @@ class FacebookSpider(scrapy.Spider):
new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
yield new.load_item()
yield new.load_item()