refactoring comments spider

This commit is contained in:
rugantio 2019-02-18 07:18:34 +01:00
parent 069f64f61e
commit dc1d0f29c0
5 changed files with 9 additions and 9 deletions

View File

@@ -513,7 +513,5 @@ class CommentsItem(scrapy.Item):
sigh = scrapy.Field() sigh = scrapy.Field()
grrr = scrapy.Field() grrr = scrapy.Field()
share = scrapy.Field() # num of shares share = scrapy.Field() # num of shares
url = scrapy.Field( url = scrapy.Field()
output_processor=url_strip
)
shared_from = scrapy.Field() shared_from = scrapy.Field()

View File

@@ -11,9 +11,8 @@ class CommentsSpider(FacebookSpider):
""" """
name = "comments" name = "comments"
custom_settings = { custom_settings = {
'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \ 'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
'reactions','likes','ahah','love','wow', \ 'url'],
'sigh','grrr','url'],
'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter', 'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
'CONCURRENT_REQUESTS':1, 'CONCURRENT_REQUESTS':1,
} }
@@ -25,7 +24,7 @@ class CommentsSpider(FacebookSpider):
''' '''
parse page does multiple things: parse page does multiple things:
1) loads replied-to-comments page one-by-one (for DFS) 1) loads replied-to-comments page one-by-one (for DFS)
2) gets common not-replied-to comments 2) retrieves not-replied-to comments
''' '''
#loads replied-to comments pages #loads replied-to comments pages
path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']' path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
@@ -43,7 +42,6 @@ class CommentsSpider(FacebookSpider):
#loads regular comments #loads regular comments
if not response.xpath(path): if not response.xpath(path):
path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]' path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
for i,reply in enumerate(response.xpath(path2)): for i,reply in enumerate(response.xpath(path2)):
self.logger.info('{} regular comment @ page {}'.format(i,response.url)) self.logger.info('{} regular comment @ page {}'.format(i,response.url))
new = ItemLoader(item=CommentsItem(),selector=reply) new = ItemLoader(item=CommentsItem(),selector=reply)
@@ -51,6 +49,7 @@ class CommentsSpider(FacebookSpider):
new.add_xpath('source','.//h3/a/text()') new.add_xpath('source','.//h3/a/text()')
new.add_xpath('text','.//div[h3]/div[1]//text()') new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()') new.add_xpath('date','.//abbr/text()')
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
new.add_value('url',response.url) new.add_value('url',response.url)
yield new.load_item() yield new.load_item()
@@ -77,6 +76,7 @@ class CommentsSpider(FacebookSpider):
new.add_value('reply_to','ROOT') new.add_value('reply_to','ROOT')
new.add_xpath('text','.//div[1]//text()') new.add_xpath('text','.//div[1]//text()')
new.add_xpath('date','.//abbr/text()') new.add_xpath('date','.//abbr/text()')
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
new.add_value('url',response.url) new.add_value('url',response.url)
yield new.load_item() yield new.load_item()
#parse all replies in the page #parse all replies in the page
@@ -87,6 +87,7 @@ class CommentsSpider(FacebookSpider):
new.add_value('reply_to',response.meta['reply_to']) new.add_value('reply_to',response.meta['reply_to'])
new.add_xpath('text','.//div[h3]/div[1]//text()') new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()') new.add_xpath('date','.//abbr/text()')
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
new.add_value('url',response.url) new.add_value('url',response.url)
yield new.load_item() yield new.load_item()
@@ -117,6 +118,7 @@ class CommentsSpider(FacebookSpider):
new.add_value('reply_to',response.meta['reply_to']) new.add_value('reply_to',response.meta['reply_to'])
new.add_xpath('text','.//div[h3]/div[1]//text()') new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()') new.add_xpath('date','.//abbr/text()')
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
new.add_value('url',response.url) new.add_value('url',response.url)
yield new.load_item() yield new.load_item()
#keep going backwards #keep going backwards

View File

@@ -213,4 +213,4 @@ class FacebookSpider(scrapy.Spider):
new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()") new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()") new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()") new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
yield new.load_item() yield new.load_item()