added post_id column
This commit is contained in:
parent
8baa108aab
commit
4a379f3af4
Binary file not shown.
@ -483,6 +483,12 @@ def parse_date2(date):
|
||||
|
||||
return str(datetime.fromtimestamp(flat_d['publish_time']) - timedelta(hours=5))
|
||||
|
||||
def id_strip(post_id):
    """Extract the top-level post id from a post's ``data-ft`` attribute.

    Used as the output processor of the ``post_id`` item field.

    Args:
        post_id: sequence of collected values; the first element is a
            JSON string holding a dict of post features.

    Returns:
        The value stored under ``'top_level_post_id'``, or ``None`` when
        no value was collected or the key is absent.

    Raises:
        ValueError: if the first element is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    import json

    # Nothing collected: return None instead of raising IndexError.
    if not post_id:
        return None

    features = json.loads(post_id[0])  # nested dict of features
    # A missing key yields None rather than a KeyError, so one post without
    # the id does not abort loading of the whole item.
    return features.get('top_level_post_id')
|
||||
|
||||
|
||||
|
||||
class FbcrawlItem(scrapy.Item):
|
||||
@ -511,6 +517,9 @@ class FbcrawlItem(scrapy.Item):
|
||||
url = scrapy.Field(
|
||||
output_processor=url_strip
|
||||
)
|
||||
post_id = scrapy.Field(
|
||||
output_processor=id_strip
|
||||
)
|
||||
shared_from = scrapy.Field()
|
||||
|
||||
class CommentsItem(scrapy.Item):
|
||||
|
Binary file not shown.
@ -15,7 +15,7 @@ class FacebookSpider(scrapy.Spider):
|
||||
custom_settings = {
|
||||
'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
|
||||
'reactions','likes','ahah','love','wow', \
|
||||
'sigh','grrr','comments','url']
|
||||
'sigh','grrr','comments','post_id','url']
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
@ -151,6 +151,7 @@ class FacebookSpider(scrapy.Spider):
|
||||
self.logger.info('Parsing post n = {}'.format(abs(self.count)))
|
||||
new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
|
||||
new.add_xpath('date','./@data-ft')
|
||||
new.add_xpath('post_id','./@data-ft')
|
||||
new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
|
||||
|
||||
#page_url #new.add_value('url',response.url)
|
||||
|
Loading…
Reference in New Issue
Block a user