rugantio 2018-08-26 14:08:36 +02:00
parent 888ebeab70
commit 31c30c7b52
6 changed files with 7 additions and 8 deletions

.~lock.exploit.csv# (new file)

@@ -0,0 +1 @@
+,rugantio,alice,26.08.2018 14:07,file:///home/rugantio/.config/libreoffice/4;

@@ -70,18 +70,18 @@ def comments_strip(string):
     return string[0].rstrip(" commenti")
 
 def reactions_strip(string):
-    friends = 1 + string[0].count(',')
-    string = string[0].split()[::-1]
     if len(string) == 1:
         string = string[0]
         while string.rfind('.') != -1:
             string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
         return string
-    string = string[::-1][0]
+    string = string[0].split()
+    string = string[0]
     while string.rfind('.') != -1:
         string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
-    return int(string) + friends
+    return int(string) + 1
 
 class FbcrawlItem(scrapy.Item):
     # define the fields for your item here like:
@@ -119,5 +119,4 @@ class FbcrawlItem(scrapy.Item):
     sigh = scrapy.Field()
     grrr = scrapy.Field()
     share = scrapy.Field() # num of shares
-    num_id = scrapy.Field() # progressive int associated to the entry in the final table, not present in the webpage
     url = scrapy.Field()
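Note on the hunk above (presumably the project's items.py): both branches of reactions_strip rely on the same rfind-based loop to drop the dots that the Italian-locale pages use as thousands separators in reaction counts. A minimal standalone sketch of just that loop, under a hypothetical helper name:

    def strip_thousands_separators(count):
        # Mirrors the loop in reactions_strip: remove every '.' used as a
        # thousands separator, e.g. "19.298.873" -> "19298873".
        while count.rfind('.') != -1:
            count = count[0:count.rfind('.')] + count[count.rfind('.')+1:]
        return count

    print(int(strip_thousands_separators("19.298.873")))  # 19298873

count.replace('.', '') would give the same result in one call; the loop above just follows the form used by the item processor.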

@@ -77,7 +77,6 @@ class FacebookSpider(scrapy.Spider):
                 callback=self.parse_page,
             )
 
-
     def parse_page(self, response):
         for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): #select all posts
             self.logger.info('Parsing post %s', post)
@@ -102,7 +101,7 @@ class FacebookSpider(scrapy.Spider):
 
     def parse_post(self,response):
         new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
-        new.add_xpath('source', '//span/strong/a/text() | //div/a/strong/text() | //td/div/h3/strong/a/text()')
+        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
         new.add_xpath('date', '//div/div/abbr/text()')
         new.add_xpath('text','//div[@data-ft]//p//text()')
         new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
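Note on the parse_post change (presumably the spider module): the 'source' field now looks for the author link under //td/div/h3 and for anchors whose href contains 'post_id', replacing the old //div/a/strong alternative. A union XPath like this can be tried in isolation with Scrapy's Selector against a saved snippet; the markup below is invented purely to show the mechanics:

    from scrapy.selector import Selector

    # Hypothetical, simplified stand-in for a saved mobile-Facebook post page.
    html = '<table><tr><td><div><h3><strong><a href="/some.page">Some Page</a></strong></h3></div></td></tr></table>'

    source_xpath = ("//td/div/h3/strong/a/text() | //span/strong/a/text() | "
                    "//div/div/div/a[contains(@href,'post_id')]/strong/text()")
    print(Selector(text=html).xpath(source_xpath).extract())  # ['Some Page']

Inside the spider, the same expression is evaluated against the live response by new.add_xpath('source', ...).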