final

2018-08-26 17:09:16 +02:00 · 2018-08-26 17:09:16 +02:00 · 8359748b81
commit 8359748b81
parent a26cbd969c
6 changed files with 22 additions and 5 deletions
--- a/fbcrawl/pycache/items.cpython-37.pyc
+++ b/fbcrawl/pycache/items.cpython-37.pyc
--- a/fbcrawl/pycache/settings.cpython-37.pyc
+++ b/fbcrawl/pycache/settings.cpython-37.pyc
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@ -71,6 +71,7 @@ def comments_strip(string):

 def reactions_strip(string):
    friends = 1 + string[0].count(',')
+    e = 1 + string[0].count(' e ')
    string = string[0].split()[::-1]
    if len(string) == 1:
        string = string[0]
@ -81,7 +82,11 @@ def reactions_strip(string):
    string = string[0]
    while string.rfind('.') != -1:
        string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
-    return int(string) + friends
+    
+    if not string.isdigit():
+        return e
+    else:
+        return int(string) + friends

 class FbcrawlItem(scrapy.Item):
    # define the fields for your item here like:
--- a/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@ -78,9 +78,7 @@ class FacebookSpider(scrapy.Spider):
        )

    def parse_page(self, response):        
-        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): #select all posts
-            self.logger.info('Parsing post %s', post)
-            
+        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): #select all posts            
            new = ItemLoader(item=FbcrawlItem(),selector=post)
            new.add_xpath('comments', ".//div/a[contains(text(),'comment')]/text()")
            new.add_xpath('url', ".//a[contains(text(),'Notizia completa')]/@href")
@ -118,4 +116,4 @@ class FacebookSpider(scrapy.Spider):
        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")        
-        yield new.load_item()
+        yield new.load_item()
--- a/post_processing.py
+++ b/post_processing.py
@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Aug 26 16:16:58 2018
+
+@author: rugantio
+"""
+import pandas as pd
+df = pd.read_csv('./exploit.csv')
+df['date'] = pd.to_datetime(df['date'])
+df = df.sort_values(by='date',ascending=False)
+df.to_csv('./exploit_sorted.csv',index=False, float_format = '%.12g')
+  
+