diff --git a/fbcrawl/__pycache__/items.cpython-37.pyc b/fbcrawl/__pycache__/items.cpython-37.pyc index 9bf9b78..dd77293 100644 Binary files a/fbcrawl/__pycache__/items.cpython-37.pyc and b/fbcrawl/__pycache__/items.cpython-37.pyc differ diff --git a/fbcrawl/items.py b/fbcrawl/items.py index 9a54bb1..d54eb0f 100644 --- a/fbcrawl/items.py +++ b/fbcrawl/items.py @@ -513,7 +513,5 @@ class CommentsItem(scrapy.Item): sigh = scrapy.Field() grrr = scrapy.Field() share = scrapy.Field() # num of shares - url = scrapy.Field( - output_processor=url_strip - ) + url = scrapy.Field() shared_from = scrapy.Field() diff --git a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc index c6999a2..995eab3 100644 Binary files a/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc differ diff --git a/fbcrawl/spiders/comments.py b/fbcrawl/spiders/comments.py index 41bccd5..76af529 100644 --- a/fbcrawl/spiders/comments.py +++ b/fbcrawl/spiders/comments.py @@ -11,9 +11,8 @@ class CommentsSpider(FacebookSpider): """ name = "comments" custom_settings = { - 'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \ - 'reactions','likes','ahah','love','wow', \ - 'sigh','grrr','url'], + 'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \ + 'url'], 'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter', 'CONCURRENT_REQUESTS':1, } @@ -25,7 +24,7 @@ class CommentsSpider(FacebookSpider): ''' parse page does multiple things: 1) loads replied-to-comments page one-by-one (for DFS) - 2) gets common not-replied-to comments + 2) retrieves not-replied-to comments ''' #loads replied-to comments pages path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']' @@ -43,7 +42,6 @@ class CommentsSpider(FacebookSpider): #loads regular comments if not response.xpath(path): path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]' - for i,reply in enumerate(response.xpath(path2)): self.logger.info('{} regular comment @ page {}'.format(i,response.url)) new = ItemLoader(item=CommentsItem(),selector=reply) @@ -51,6 +49,7 @@ class CommentsSpider(FacebookSpider): new.add_xpath('source','.//h3/a/text()') new.add_xpath('text','.//div[h3]/div[1]//text()') new.add_xpath('date','.//abbr/text()') + new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()') new.add_value('url',response.url) yield new.load_item() @@ -77,6 +76,7 @@ class CommentsSpider(FacebookSpider): new.add_value('reply_to','ROOT') new.add_xpath('text','.//div[1]//text()') new.add_xpath('date','.//abbr/text()') + new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()') new.add_value('url',response.url) yield new.load_item() #parse all replies in the page @@ -87,6 +87,7 @@ class CommentsSpider(FacebookSpider): new.add_value('reply_to',response.meta['reply_to']) new.add_xpath('text','.//div[h3]/div[1]//text()') new.add_xpath('date','.//abbr/text()') + new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()') new.add_value('url',response.url) yield new.load_item() @@ -117,6 +118,7 @@ class CommentsSpider(FacebookSpider): new.add_value('reply_to',response.meta['reply_to']) new.add_xpath('text','.//div[h3]/div[1]//text()') new.add_xpath('date','.//abbr/text()') + new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()') new.add_value('url',response.url) yield new.load_item() #keep going backwards diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py index 11832b6..9e7f0db 100644 --- a/fbcrawl/spiders/fbcrawl.py +++ b/fbcrawl/spiders/fbcrawl.py @@ -213,4 +213,4 @@ class FacebookSpider(scrapy.Spider): new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()") new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()") new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()") - yield new.load_item() \ No newline at end of file + yield new.load_item() \ No newline at end of file