added post_id column

This commit is contained in:
rugantio 2019-04-24 17:26:53 +02:00
parent 8baa108aab
commit 4a379f3af4
4 changed files with 11 additions and 1 deletions

View File

@ -483,6 +483,12 @@ def parse_date2(date):
return str(datetime.fromtimestamp(flat_d['publish_time']) - timedelta(hours=5)) return str(datetime.fromtimestamp(flat_d['publish_time']) - timedelta(hours=5))
def id_strip(post_id):
    """Extract the top-level post id from a post's ``data-ft`` attribute.

    Used as the output processor of the ``post_id`` item field, so it is
    called with the list of values collected for that field.

    Args:
        post_id: Sequence of collected values; the first element is the
            JSON text of the ``data-ft`` attribute (a nested dict of
            features).

    Returns:
        The value stored under ``'top_level_post_id'``, or ``None`` when no
        value was collected or the key is absent. Returning ``None`` rather
        than raising lets the rest of the item still be produced
        (NOTE(review): relies on the item loader skipping ``None`` output
        values - confirm against the Scrapy version in use).

    Raises:
        ValueError: If the first value is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    import json  # kept function-local, as in the original

    if not post_id:
        # Nothing was extracted for this post; there is no id to report.
        return None

    features = json.loads(post_id[0])  # nested dict of features
    # Not every data-ft payload is guaranteed to carry this key.
    return features.get('top_level_post_id')
class FbcrawlItem(scrapy.Item): class FbcrawlItem(scrapy.Item):
@ -511,6 +517,9 @@ class FbcrawlItem(scrapy.Item):
url = scrapy.Field( url = scrapy.Field(
output_processor=url_strip output_processor=url_strip
) )
post_id = scrapy.Field(
output_processor=id_strip
)
shared_from = scrapy.Field() shared_from = scrapy.Field()
class CommentsItem(scrapy.Item): class CommentsItem(scrapy.Item):

View File

@ -15,7 +15,7 @@ class FacebookSpider(scrapy.Spider):
custom_settings = { custom_settings = {
'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \ 'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
'reactions','likes','ahah','love','wow', \ 'reactions','likes','ahah','love','wow', \
'sigh','grrr','comments','url'] 'sigh','grrr','comments','post_id','url']
} }
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -151,6 +151,7 @@ class FacebookSpider(scrapy.Spider):
self.logger.info('Parsing post n = {}'.format(abs(self.count))) self.logger.info('Parsing post n = {}'.format(abs(self.count)))
new.add_xpath('comments', './div[2]/div[2]/a[1]/text()') new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
new.add_xpath('date','./@data-ft') new.add_xpath('date','./@data-ft')
new.add_xpath('post_id','./@data-ft')
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
#page_url #new.add_value('url',response.url) #page_url #new.add_value('url',response.url)