refactoring comments spider

2019-02-18 07:18:34 +01:00 · 2019-02-18 07:18:34 +01:00 · dc1d0f29c0
commit dc1d0f29c0
parent 069f64f61e
5 changed files with 9 additions and 9 deletions
--- a/fbcrawl/__pycache__/items.cpython-37.pyc
+++ b/fbcrawl/__pycache__/items.cpython-37.pyc
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@@ -513,7 +513,5 @@ class CommentsItem(scrapy.Item):
    sigh = scrapy.Field()                      
    grrr = scrapy.Field()                      
    share = scrapy.Field()                      # num of shares
-    url = scrapy.Field(
-        output_processor=url_strip
-    )
+    url = scrapy.Field()
    shared_from = scrapy.Field()
--- a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc
+++ b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
@@ -11,9 +11,8 @@ class CommentsSpider(FacebookSpider):
    """    
    name = "comments"
    custom_settings = {
-        'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
-                               'reactions','likes','ahah','love','wow', \
-                               'sigh','grrr','url'],
+        'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
+                               'url'],
        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS':1, 
    }
@@ -25,7 +24,7 @@ class CommentsSpider(FacebookSpider):
        '''
        parse page does multiple things:
            1) loads replied-to-comments page one-by-one (for DFS)
-            2) gets common not-replied-to comments
+            2) retrieves not-replied-to comments
        '''
        #loads replied-to comments pages
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]'  + '['+ str(response.meta['index']) + ']'
@@ -43,7 +42,6 @@ class CommentsSpider(FacebookSpider):
        #loads regular comments     
        if not response.xpath(path):
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
-            
            for i,reply in enumerate(response.xpath(path2)):
                self.logger.info('{} regular comment @ page {}'.format(i,response.url))
                new = ItemLoader(item=CommentsItem(),selector=reply)
@@ -51,6 +49,7 @@ class CommentsSpider(FacebookSpider):
                new.add_xpath('source','.//h3/a/text()')  
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
+                new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url',response.url)
                yield new.load_item()
            
@@ -77,6 +76,7 @@ class CommentsSpider(FacebookSpider):
                new.add_value('reply_to','ROOT')
                new.add_xpath('text','.//div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
+                new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url',response.url)
                yield new.load_item()
            #parse all replies in the page
@@ -87,6 +87,7 @@ class CommentsSpider(FacebookSpider):
                new.add_value('reply_to',response.meta['reply_to'])
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
+                new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url',response.url)   
                yield new.load_item()
                
@@ -117,6 +118,7 @@ class CommentsSpider(FacebookSpider):
                new.add_value('reply_to',response.meta['reply_to'])
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
+                new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url',response.url)   
                yield new.load_item()
            #keep going backwards
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@@ -213,4 +213,4 @@ class FacebookSpider(scrapy.Spider):
        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")        
-        yield new.load_item()
+        yield new.load_item()