refactoring comments spider

2019-02-18 05:07:38 +01:00 · 2019-02-18 05:07:38 +01:00 · bd41255361
commit bd41255361
parent 96d3423b8d
3 changed files with 18 additions and 14 deletions
--- a/fbcrawl/spiders/pycache/comments.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/comments.cpython-37.pyc
--- a/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
--- a/fbcrawl/spiders/comments.py
+++ b/fbcrawl/spiders/comments.py
@ -1,7 +1,6 @@
 import scrapy

 from scrapy.loader import ItemLoader
-from scrapy.http import FormRequest
 from fbcrawl.spiders.fbcrawl import FacebookSpider
 from fbcrawl.items import CommentsItem

@ -14,7 +13,9 @@ class CommentsSpider(FacebookSpider):
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
                               'reactions','likes','ahah','love','wow', \
-                               'sigh','grrr','url']
+                               'sigh','grrr','url'],
+        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
+        'CONCURRENT_REQUESTS':1, 
    }

    def __init__(self, *args, **kwargs):
@ -32,7 +33,7 @@ class CommentsSpider(FacebookSpider):
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
            ans = response.urljoin(answer[::-1][0])
-            self.logger.info('Nested comment at page {}'.format(ans))
+            self.logger.info('{} nested comment @ page {}'.format(str(response.meta['index']),ans))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 meta={'reply_to':source,
@ -42,16 +43,19 @@ class CommentsSpider(FacebookSpider):
        #loads regular comments     
        if not response.xpath(path):
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
-            for reply in response.xpath(path2):
+            
+            for i,reply in enumerate(response.xpath(path2)):
+                self.logger.info('{} regular comment @ page {}'.format(i,response.url))
                new = ItemLoader(item=CommentsItem(),selector=reply)
                new.context['lang'] = self.lang           
                new.add_xpath('source','.//h3/a/text()')  
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
                yield new.load_item()
-#            
-#        #previous comments
-        if not response.xpath(path) and not response.xpath(path2):
+            
+        #previous comments
+        if not response.xpath(path):
            for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
@ -59,7 +63,7 @@ class CommentsSpider(FacebookSpider):
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'index':1})        
-#        
+        
    def parse_reply(self,response):
        '''
        parse reply to comments, root comment is added if flag
@ -88,7 +92,7 @@ class CommentsSpider(FacebookSpider):
                
            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            if back:
-                self.logger.info('Back found, trying to go back')
+                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page, 
                                     callback=self.parse_reply,
@ -99,8 +103,8 @@ class CommentsSpider(FacebookSpider):
                                           'index':response.meta['index']})
            else:
                next_reply = response.meta['url']
-                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
-                yield scrapy.Request(next_reply, dont_filter=True,
+                self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
+                yield scrapy.Request(next_reply,
                                     callback=self.parse_page,
                                     meta={'index':response.meta['index']+1})
                
@ -117,7 +121,7 @@ class CommentsSpider(FacebookSpider):
                yield new.load_item()
            #keep going backwards
            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
-            self.logger.info('Back found, trying to go back')
+            self.logger.info('Back found, more nested comments')
            if back:
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page, 
@ -130,6 +134,6 @@ class CommentsSpider(FacebookSpider):
            else:
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
-                yield scrapy.Request(next_reply, dont_filter=True,
+                yield scrapy.Request(next_reply,
                                     callback=self.parse_page,
-                                     meta={'index':response.meta['index']+1})
+                                     meta={'index':response.meta['index']+1})