final
This commit is contained in:
parent
a26cbd969c
commit
8359748b81
Binary file not shown.
Binary file not shown.
@ -71,6 +71,7 @@ def comments_strip(string):
|
||||
|
||||
def reactions_strip(string):
|
||||
friends = 1 + string[0].count(',')
|
||||
e = 1 + string[0].count(' e ')
|
||||
string = string[0].split()[::-1]
|
||||
if len(string) == 1:
|
||||
string = string[0]
|
||||
@ -81,7 +82,11 @@ def reactions_strip(string):
|
||||
string = string[0]
|
||||
while string.rfind('.') != -1:
|
||||
string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
|
||||
return int(string) + friends
|
||||
|
||||
if not string.isdigit():
|
||||
return e
|
||||
else:
|
||||
return int(string) + friends
|
||||
|
||||
class FbcrawlItem(scrapy.Item):
|
||||
# define the fields for your item here like:
|
||||
|
Binary file not shown.
@ -78,9 +78,7 @@ class FacebookSpider(scrapy.Spider):
|
||||
)
|
||||
|
||||
def parse_page(self, response):
|
||||
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): #select all posts
|
||||
self.logger.info('Parsing post %s', post)
|
||||
|
||||
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): #select all posts
|
||||
new = ItemLoader(item=FbcrawlItem(),selector=post)
|
||||
new.add_xpath('comments', ".//div/a[contains(text(),'comment')]/text()")
|
||||
new.add_xpath('url', ".//a[contains(text(),'Notizia completa')]/@href")
|
||||
@ -118,4 +116,4 @@ class FacebookSpider(scrapy.Spider):
|
||||
new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
|
||||
new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
|
||||
new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
|
||||
yield new.load_item()
|
||||
yield new.load_item()
|
||||
|
14
post_processing.py
Normal file
14
post_processing.py
Normal file
@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 26 16:16:58 2018

@author: rugantio

Post-processing script: load ./exploit.csv, convert its 'date' column to
datetimes, order the rows from most recent to oldest, and write the result
to ./exploit_sorted.csv.
"""
import pandas as pd

# Read, convert and sort in one pipeline; `assign` overwrites the existing
# 'date' column in place, so the column order of the CSV is preserved.
df = (
    pd.read_csv('./exploit.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date', ascending=False)
)

# The index is not written out; floats are formatted with at most 12
# significant digits.
df.to_csv('./exploit_sorted.csv', index=False, float_format='%.12g')
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user