# Patch summary: adds a `post_id` field to the scraped Facebook post items.
#   * fbcrawl/items.py: new helper `id_strip(post_id)` that JSON-decodes
#     `post_id[0]` and returns its 'top_level_post_id' entry; `FbcrawlItem`
#     gains `post_id = scrapy.Field(output_processor=id_strip)`.
#   * fbcrawl/spiders/fbcrawl.py: 'post_id' is inserted into
#     FEED_EXPORT_FIELDS between 'comments' and 'url', and
#     `new.add_xpath('post_id','./@data-ft')` is added next to the existing
#     `new.add_xpath('date','./@data-ft')` (both read the same attribute).
#   * Two compiled files under __pycache__/ (*.cpython-37.pyc) are reported
#     as changed binaries.
# NOTE(review): the patch text below sits on one physical line, so the
#   original line breaks and blank context lines of the hunks are not
#   preserved here.
# NOTE(review): `id_strip` has no guards. An empty `post_id` (IndexError),
#   text that is not valid JSON (json.JSONDecodeError) or a payload without
#   'top_level_post_id' (KeyError) would raise -- confirm every post's
#   data-ft value carries that key, or handle the failure explicitly.
# NOTE(review): `import json` is placed inside the function body; whether
#   items.py already imports json at module level is not visible in this
#   diff -- confirm and prefer a single top-of-file import.
# NOTE(review): *.pyc build artefacts are normally not versioned -- consider
#   dropping them from this change and ignoring __pycache__/.
diff --git a/fbcrawl/__pycache__/items.cpython-37.pyc b/fbcrawl/__pycache__/items.cpython-37.pyc index aec6e3c..ec0c829 100644 Binary files a/fbcrawl/__pycache__/items.cpython-37.pyc and b/fbcrawl/__pycache__/items.cpython-37.pyc differ diff --git a/fbcrawl/items.py b/fbcrawl/items.py index f424258..4fcde54 100644 --- a/fbcrawl/items.py +++ b/fbcrawl/items.py @@ -483,6 +483,12 @@ def parse_date2(date): return str(datetime.fromtimestamp(flat_d['publish_time']) - timedelta(hours=5)) +def id_strip(post_id): + import json + + d = json.loads(post_id[0]) #nested dict of features + return d['top_level_post_id'] + class FbcrawlItem(scrapy.Item): @@ -511,6 +517,9 @@ class FbcrawlItem(scrapy.Item): url = scrapy.Field( output_processor=url_strip ) + post_id = scrapy.Field( + output_processor=id_strip + ) shared_from = scrapy.Field() class CommentsItem(scrapy.Item): diff --git a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc index a0830c7..324ab93 100644 Binary files a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc and b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc differ diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py index 4dcc049..87febcd 100644 --- a/fbcrawl/spiders/fbcrawl.py +++ b/fbcrawl/spiders/fbcrawl.py @@ -15,7 +15,7 @@ class FacebookSpider(scrapy.Spider): custom_settings = { 'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \ 'reactions','likes','ahah','love','wow', \ - 'sigh','grrr','comments','url'] + 'sigh','grrr','comments','post_id','url'] } def __init__(self, *args, **kwargs): @@ -151,6 +151,7 @@ class FacebookSpider(scrapy.Spider): self.logger.info('Parsing post n = {}'.format(abs(self.count))) new.add_xpath('comments', './div[2]/div[2]/a[1]/text()') new.add_xpath('date','./@data-ft') + new.add_xpath('post_id','./@data-ft') new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") #page_url #new.add_value('url',response.url)