This commit is contained in:
rugantio 2018-08-26 17:09:16 +02:00
parent a26cbd969c
commit 8359748b81
6 changed files with 22 additions and 5 deletions

View File

@ -71,6 +71,7 @@ def comments_strip(string):
def reactions_strip(string):
friends = 1 + string[0].count(',')
e = 1 + string[0].count(' e ')
string = string[0].split()[::-1]
if len(string) == 1:
string = string[0]
@ -81,7 +82,11 @@ def reactions_strip(string):
string = string[0]
while string.rfind('.') != -1:
string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
return int(string) + friends
if not string.isdigit():
return e
else:
return int(string) + friends
class FbcrawlItem(scrapy.Item):
# define the fields for your item here like:

View File

@ -78,9 +78,7 @@ class FacebookSpider(scrapy.Spider):
)
def parse_page(self, response):
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): #select all posts
self.logger.info('Parsing post %s', post)
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): #select all posts
new = ItemLoader(item=FbcrawlItem(),selector=post)
new.add_xpath('comments', ".//div/a[contains(text(),'comment')]/text()")
new.add_xpath('url', ".//a[contains(text(),'Notizia completa')]/@href")
@ -118,4 +116,4 @@ class FacebookSpider(scrapy.Spider):
new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
yield new.load_item()
yield new.load_item()

14
post_processing.py Normal file
View File

@ -0,0 +1,14 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 26 16:16:58 2018
@author: rugantio
"""
import pandas as pd
# Read the input CSV, convert its 'date' column to datetimes, and order the
# rows from the most recent date to the oldest.
df = (
    pd.read_csv('./exploit.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date', ascending=False)
)
# Write the sorted rows without the index column; floats use '%.12g'.
df.to_csv('./exploit_sorted.csv', index=False, float_format='%.12g')