refactoring comments spider
This commit is contained in:
parent
069f64f61e
commit
dc1d0f29c0
Binary file not shown.
@ -513,7 +513,5 @@ class CommentsItem(scrapy.Item):
|
|||||||
sigh = scrapy.Field()
|
sigh = scrapy.Field()
|
||||||
grrr = scrapy.Field()
|
grrr = scrapy.Field()
|
||||||
share = scrapy.Field() # num of shares
|
share = scrapy.Field() # num of shares
|
||||||
url = scrapy.Field(
|
url = scrapy.Field()
|
||||||
output_processor=url_strip
|
|
||||||
)
|
|
||||||
shared_from = scrapy.Field()
|
shared_from = scrapy.Field()
|
||||||
|
Binary file not shown.
@ -11,9 +11,8 @@ class CommentsSpider(FacebookSpider):
|
|||||||
"""
|
"""
|
||||||
name = "comments"
|
name = "comments"
|
||||||
custom_settings = {
|
custom_settings = {
|
||||||
'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
|
'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
|
||||||
'reactions','likes','ahah','love','wow', \
|
'url'],
|
||||||
'sigh','grrr','url'],
|
|
||||||
'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
|
'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
|
||||||
'CONCURRENT_REQUESTS':1,
|
'CONCURRENT_REQUESTS':1,
|
||||||
}
|
}
|
||||||
@ -25,7 +24,7 @@ class CommentsSpider(FacebookSpider):
|
|||||||
'''
|
'''
|
||||||
parse page does multiple things:
|
parse page does multiple things:
|
||||||
1) loads replied-to-comments page one-by-one (for DFS)
|
1) loads replied-to-comments page one-by-one (for DFS)
|
||||||
2) gets common not-replied-to comments
|
2) retrieves not-replied-to comments
|
||||||
'''
|
'''
|
||||||
#loads replied-to comments pages
|
#loads replied-to comments pages
|
||||||
path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
|
path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
|
||||||
@ -43,7 +42,6 @@ class CommentsSpider(FacebookSpider):
|
|||||||
#loads regular comments
|
#loads regular comments
|
||||||
if not response.xpath(path):
|
if not response.xpath(path):
|
||||||
path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
|
path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
|
||||||
|
|
||||||
for i,reply in enumerate(response.xpath(path2)):
|
for i,reply in enumerate(response.xpath(path2)):
|
||||||
self.logger.info('{} regular comment @ page {}'.format(i,response.url))
|
self.logger.info('{} regular comment @ page {}'.format(i,response.url))
|
||||||
new = ItemLoader(item=CommentsItem(),selector=reply)
|
new = ItemLoader(item=CommentsItem(),selector=reply)
|
||||||
@ -51,6 +49,7 @@ class CommentsSpider(FacebookSpider):
|
|||||||
new.add_xpath('source','.//h3/a/text()')
|
new.add_xpath('source','.//h3/a/text()')
|
||||||
new.add_xpath('text','.//div[h3]/div[1]//text()')
|
new.add_xpath('text','.//div[h3]/div[1]//text()')
|
||||||
new.add_xpath('date','.//abbr/text()')
|
new.add_xpath('date','.//abbr/text()')
|
||||||
|
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
|
||||||
new.add_value('url',response.url)
|
new.add_value('url',response.url)
|
||||||
yield new.load_item()
|
yield new.load_item()
|
||||||
|
|
||||||
@ -77,6 +76,7 @@ class CommentsSpider(FacebookSpider):
|
|||||||
new.add_value('reply_to','ROOT')
|
new.add_value('reply_to','ROOT')
|
||||||
new.add_xpath('text','.//div[1]//text()')
|
new.add_xpath('text','.//div[1]//text()')
|
||||||
new.add_xpath('date','.//abbr/text()')
|
new.add_xpath('date','.//abbr/text()')
|
||||||
|
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
|
||||||
new.add_value('url',response.url)
|
new.add_value('url',response.url)
|
||||||
yield new.load_item()
|
yield new.load_item()
|
||||||
#parse all replies in the page
|
#parse all replies in the page
|
||||||
@ -87,6 +87,7 @@ class CommentsSpider(FacebookSpider):
|
|||||||
new.add_value('reply_to',response.meta['reply_to'])
|
new.add_value('reply_to',response.meta['reply_to'])
|
||||||
new.add_xpath('text','.//div[h3]/div[1]//text()')
|
new.add_xpath('text','.//div[h3]/div[1]//text()')
|
||||||
new.add_xpath('date','.//abbr/text()')
|
new.add_xpath('date','.//abbr/text()')
|
||||||
|
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
|
||||||
new.add_value('url',response.url)
|
new.add_value('url',response.url)
|
||||||
yield new.load_item()
|
yield new.load_item()
|
||||||
|
|
||||||
@ -117,6 +118,7 @@ class CommentsSpider(FacebookSpider):
|
|||||||
new.add_value('reply_to',response.meta['reply_to'])
|
new.add_value('reply_to',response.meta['reply_to'])
|
||||||
new.add_xpath('text','.//div[h3]/div[1]//text()')
|
new.add_xpath('text','.//div[h3]/div[1]//text()')
|
||||||
new.add_xpath('date','.//abbr/text()')
|
new.add_xpath('date','.//abbr/text()')
|
||||||
|
new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
|
||||||
new.add_value('url',response.url)
|
new.add_value('url',response.url)
|
||||||
yield new.load_item()
|
yield new.load_item()
|
||||||
#keep going backwards
|
#keep going backwards
|
||||||
|
@ -213,4 +213,4 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
|
new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
|
||||||
new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
|
new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
|
||||||
new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
|
new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
|
||||||
yield new.load_item()
|
yield new.load_item()
|
Loading…
Reference in New Issue
Block a user