final

2018-08-26 03:42:52 +02:00 · 2018-08-26 03:42:52 +02:00 · 8babf7aa1d
commit 8babf7aa1d
parent cdf6bdc68e
18 changed files with 264 additions and 180 deletions
--- a/README.md
+++ b/README.md
@ -1,11 +1,2 @@
 # fbcrawl
 A Facebook crawler
-
-## TODO
-work in progress
-
-## DISCLAIMER
-This software is NOT to be used. It violates Facebook's terms and conditions. It is for educational purposes only, to show how a crawler can be made to recursively parse a web page.
-
-## Contribute
-Pull requests are welcomed!!
--- a/fbcrawl/init.py
+++ b/fbcrawl/init.py
--- a/fbcrawl/pycache/init.cpython-37.pyc
+++ b/fbcrawl/pycache/init.cpython-37.pyc
--- a/fbcrawl/pycache/items.cpython-37.pyc
+++ b/fbcrawl/pycache/items.cpython-37.pyc
--- a/fbcrawl/pycache/pipelines.cpython-37.pyc
+++ b/fbcrawl/pycache/pipelines.cpython-37.pyc
--- a/fbcrawl/pycache/settings.cpython-37.pyc
+++ b/fbcrawl/pycache/settings.cpython-37.pyc
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+from scrapy.loader.processors import TakeFirst, Join, MapCompose
+from datetime import datetime, timedelta
+
+def parse_date(date):
+    """Convert the Italian date label of a post into a ``datetime.date``.
+
+    ``date`` is the list of strings collected for the ``date`` field; only
+    its first element is parsed. The label is split on whitespace and read
+    as follows:
+
+    * first word ``Ieri``: the post is dated yesterday;
+    * a single word, or ``<n> h``: the post is dated today;
+    * ``<day> <month> [<year>] ...``: ``<month>`` is an Italian month name
+      or its three-letter abbreviation; when the third word is missing or
+      is not a number the current year is used.
+
+    Returns a ``datetime.date``, or the string ``"Error: no data"`` when
+    there is nothing to parse (kept as-is for backward compatibility).
+    Raises ``KeyError`` for an unknown month word and ``ValueError`` for a
+    non-numeric day.
+    """
+    month_names = {
+        "gennaio": 1,
+        "febbraio": 2,
+        "marzo": 3,
+        "aprile": 4,
+        "maggio": 5,
+        "giugno": 6,
+        "luglio": 7,
+        "agosto": 8,
+        "settembre": 9,
+        "ottobre": 10,
+        "novembre": 11,
+        "dicembre": 12,
+    }
+    month_abbrs = {
+        "gen": 1,
+        "feb": 2,
+        "mar": 3,
+        "apr": 4,
+        "mag": 5,
+        "giu": 6,
+        "lug": 7,
+        "ago": 8,
+        "set": 9,
+        "ott": 10,
+        "nov": 11,
+        "dic": 12,
+    }
+    # One lookup table, so full and abbreviated names work in every position.
+    months = {**month_names, **month_abbrs}
+
+    words = date[0].split() if date else []
+    if not words:
+        return "Error: no data"
+
+    today = datetime.now().date()
+    if words[0] == 'Ieri':
+        # Subtract from the whole date: taking only the day from yesterday
+        # breaks on the first day of a month or of a year.
+        return today - timedelta(days=1)
+    if len(words) == 1 or words[1] == 'h':  # 'Adesso' or "n hours" ago
+        return today
+
+    day = int(words[0])
+    month = months[words[1]]
+    # The third word is either a year or the start of the time part
+    # (e.g. 'alle'); without a year the post belongs to the current one.
+    if len(words) > 2 and words[2].isdigit():
+        year = int(words[2])
+    else:
+        year = today.year
+    return datetime(year, month, day).date()
+
+def comments_strip(string):
+    """Return the first collected value without its trailing comments label.
+
+    ``string`` is the list of values collected for the ``comments`` field.
+    A trailing ``commenti`` / ``commento`` word (with the whitespace around
+    it) is removed from the first value; the rest is returned as a string.
+    """
+    import re  # local import: keeps the block independent of the file header
+
+    # str.rstrip(" commenti") strips a *set of characters*, not a suffix, so
+    # it also eats into whatever text precedes the label. Match the label.
+    return re.sub(r"\s*comment[io]\s*$", "", string[0]).rstrip()
+
+def reactions_strip(string):
+    """Normalise the text collected for a reactions counter.
+
+    With exactly one collected value, that value is returned as a string
+    with every '.' removed. Otherwise the last whitespace-separated word of
+    the first value is taken, its '.' characters are removed and the result
+    is returned as ``int(word) + 1``.
+    """
+    first = string[0]
+    if len(string) == 1:
+        return first.replace('.', '')
+
+    last_word = first.split()[-1]
+    return int(last_word.replace('.', '')) + 1
+
+class FbcrawlItem(scrapy.Item):
+    """Fields collected for one post."""
+
+    # page that published the post
+    source = scrapy.Field(output_processor=TakeFirst())
+
+    # when the post was published
+    date = scrapy.Field(input_processor=TakeFirst(), output_processor=parse_date)
+
+    # full text of the post
+    text = scrapy.Field(output_processor=Join(separator=u''))
+
+    comments = scrapy.Field(output_processor=comments_strip)
+    commentators = scrapy.Field(output_processor=Join(separator=u'\n'))
+
+    # number of reactions
+    reactions = scrapy.Field(output_processor=reactions_strip)
+
+    likes = scrapy.Field(output_processor=reactions_strip)
+    ahah = scrapy.Field()
+    love = scrapy.Field()
+    wow = scrapy.Field()
+    sigh = scrapy.Field()
+    grrr = scrapy.Field()
+
+    # number of shares
+    share = scrapy.Field()
+    # progressive int associated to the entry in the final table,
+    # not present in the webpage
+    num_id = scrapy.Field()
+    url = scrapy.Field()
--- a/fbcrawl/middlewares.py
+++ b/fbcrawl/middlewares.py
--- a/fbcrawl/pipelines.py
+++ b/fbcrawl/pipelines.py
@ -5,7 +5,12 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

+from scrapy.exceptions import DropItem
+from datetime import datetime

 class FbcrawlPipeline(object):
    def process_item(self, item, spider):
+        # Let through only items dated 4 March 2017 or later; anything older
+        # is discarded by raising DropItem.
+        # NOTE(review): assumes item['date'] is always present and is a
+        # datetime.date -- TODO confirm; a missing key or a non-date value
+        # would raise here instead of dropping the item cleanly.
+        if item['date'] < datetime(2017,3,4).date():
+            raise DropItem("Dropping element because it's older than 04/03/2017")
+        else:
            return item
--- a/fbcrawl/settings.py
+++ b/fbcrawl/settings.py
@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = False
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 #ITEM_PIPELINES = {
-#    'fbcrawl.pipelines.FbcrawlPipeline': 300,
+    #'fbcrawl.pipelines.FbcrawlPipeline': 300,
 #}

 # Enable and configure the AutoThrottle extension (disabled by default)
@ -88,6 +88,6 @@ ROBOTSTXT_OBEY = False
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-FEED_EXPORT_FIELDS = ["source", "date", "text", "commentators","comments","like", "share"] # specifies the order of the column to export as CSV
+FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # order of the columns in the exported CSV
 FEED_EXPORT_ENCODING = 'utf-8'
 DUPEFILTER_DEBUG = True
--- a/fbcrawl/spiders/init.py
+++ b/fbcrawl/spiders/init.py
--- a/fbcrawl/spiders/pycache/init.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/init.cpython-37.pyc
--- a/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@ -0,0 +1,122 @@
+import scrapy
+
+from scrapy.loader import ItemLoader
+from scrapy.http import FormRequest
+from fbcrawl.items import FbcrawlItem
+
+
+class FacebookSpider(scrapy.Spider):
+    """
+    Parse FB pages (needs credentials)
+
+    Logs in through the form served at the start URL, opens the requested
+    page and yields one FbcrawlItem per post, following each post's
+    full-story link and its reactions page.
+    """
+    name = "fb"
+
+    def __init__(self, email='', password='', page='', code='', **kwargs):
+        """Store the spider arguments.
+
+        email, password -- credentials submitted to the login form.
+        page -- page name (or URL) to crawl; joined onto the URL of the
+            response received after login.
+        code -- optional approvals code; asked for interactively when the
+            approvals checkpoint shows up and no code was given.
+
+        Raises ValueError when email, password or page is empty.
+        """
+        super(FacebookSpider, self).__init__(**kwargs)
+
+        if not email or not password:
+            raise ValueError("You need to provide valid email and password!")
+        if not page:
+            raise ValueError("You need to provide a valid page name to crawl!")
+
+        self.email = email
+        self.password = password
+        self.page = page
+        # parse_home reads self.code; without a default the approvals
+        # checkpoint failed with AttributeError unless a code was passed
+        # as a spider argument.
+        self.code = code
+        self.start_urls = ['https://mbasic.facebook.com']
+
+    def parse(self, response):
+        """Submit the login form with the stored credentials."""
+        return FormRequest.from_response(
+                response,
+                formxpath='//form[contains(@action, "login")]',
+                formdata={'email': self.email,'pass': self.password},
+                callback=self.parse_home
+        )
+
+    def parse_home(self, response):
+        '''Handle the post-login checkpoints, then request the target page.'''
+        if response.css('#approvals_code'):
+            # Handle 'Approvals Code' checkpoint (ask user to enter code).
+            if not self.code:
+                # Show facebook messages via logs
+                # and request user for approval code.
+                message = response.css('._50f4::text').extract()[0]
+                self.log(message)
+                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
+                self.log(message)
+                self.code = input('Enter the code: ')
+            self.code = str(self.code)
+            if not (self.code and self.code.isdigit()):
+                self.log('Bad approvals code detected.')
+                return
+            return FormRequest.from_response(
+                response,
+                formdata={'approvals_code': self.code},
+                callback=self.parse_home,
+            )
+        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
+            # Handle 'Save Browser' checkpoint.
+            return FormRequest.from_response(
+                response,
+                formdata={'name_action_selected': 'dont_save'},
+                callback=self.parse_home,
+                dont_filter=True,
+            )
+        elif response.css('button#checkpointSubmitButton'):
+            # Handle 'Someone tried to log into your account' warning.
+            return FormRequest.from_response(
+                response, callback=self.parse_home, dont_filter=True,)
+        # Else go to the requested page.
+        href = response.urljoin(self.page)
+        self.logger.info('Parse function called on %s', href)
+        return scrapy.Request(
+            url=href,
+            callback=self.parse_page,
+        )
+
+    def parse_page(self, response):
+        """Request every post listed on the page, then the next page of posts."""
+        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): #select all posts
+            self.logger.info('Parsing post %s', post)
+
+            new = ItemLoader(item=FbcrawlItem(),selector=post)
+            new.add_xpath('comments', ".//div/a[contains(text(),'comment')]/text()")
+            new.add_xpath('url', ".//a[contains(text(),'Notizia completa')]/@href")
+
+            # link to the full post; None when the post has no such link
+            post_link = post.xpath(".//a[contains(text(),'Notizia completa')]/@href").extract_first()
+            if post_link is None:
+                # Nothing to follow: skip this post instead of aborting the
+                # whole page with an IndexError.
+                self.logger.warning('Post without full-story link skipped on %s', response.url)
+                continue
+            yield scrapy.Request(response.urljoin(post_link), self.parse_post,
+                                 dont_filter=True, meta={'item': new})
+
+        # Follow the first pagination link found: 'Altri' first, then the
+        # link labelled '2017'.
+        for label in ('Altri', '2017'):
+            next_page = response.xpath("//div/a[contains(text(),'%s')]/@href" % label).extract_first()
+            if next_page is not None:
+                yield scrapy.Request(response.urljoin(next_page), callback=self.parse_page)
+                break
+
+    def parse_post(self,response):
+        """Fill source, date, text and reactions, then request the reactions page."""
+        new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
+        new.add_xpath('source', '//span/strong/a/text() | //div/a/strong/text() | //td/div/h3/strong/a/text()')
+        new.add_xpath('date', '//div/div/abbr/text()')
+        new.add_xpath('text','//div[@data-ft]//p//text()')
+        new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
+
+        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href").extract_first()
+        if reactions is None:
+            # No reactions link on this post: emit what was collected rather
+            # than losing the item to an IndexError.
+            yield new.load_item()
+            return
+        yield scrapy.Request(response.urljoin(reactions), callback=self.parse_reactions,
+                             dont_filter=True, meta={'item': new})
+
+    def parse_reactions(self,response):
+        """Add the per-type reaction counters and emit the finished item."""
+        new = ItemLoader(item=FbcrawlItem(),response=response, parent=response.meta['item'])
+        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
+        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
+        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
+        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
+        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
+        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
+        yield new.load_item()
--- a/items.py
+++ b/items.py
@ -1,34 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://doc.scrapy.org/en/latest/topics/items.html
-
-import scrapy
-from scrapy.loader.processors import TakeFirst, Join
-
-class FbcrawlItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    source = scrapy.Field()                     # page that published the post
-
-    date = scrapy.Field(
-            output_processor=TakeFirst()
-    )       
-                                    # when was the post published
-    text = scrapy.Field(
-            output_processor=Join(separator=u'')
-    )                       # full text of the post
-
-    comments = scrapy.Field(
-            output_processor=Join(separator=u'\n')
-    )                       # full text of the post
-    commentators = scrapy.Field(
-            output_processor=Join(separator=u'\n')
-    )                       # full text of the post
-
-    like = scrapy.Field()                       # num of likes
-    share = scrapy.Field()                      # num of shares
-    num_id = scrapy.Field()                     # progressive int associated to the entry in the final table, not present in the webpage
-    
--- a/scrapy.cfg
+++ b/scrapy.cfg
@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = fbcrawl.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = fbcrawl
--- a/spiders/fbcrawl.py
+++ b/spiders/fbcrawl.py
@ -1,134 +0,0 @@
-import scrapy
-
-from datetime import datetime
-
-
-from scrapy.loader import ItemLoader
-from scrapy.http import FormRequest
-from fbcrawl.items import FbcrawlItem
-
-
-class FacebookSpider(scrapy.Spider):
-    """
-    Parse FB pages (needs credentials)
-    """    
-    name = "fb"
-
-    def __init__(self, email='', password='', til='2004-1-1', **kwargs):
-        super(FacebookSpider, self).__init__(**kwargs)
-
-        til = til.split(sep='-')
-        self.til = datetime(int(til[0]),int(til[1]),int(til[2]))
-        
-        self.email = email
-        self.password = password
-        self.start_urls = ['https://mbasic.facebook.com']    
-
-    def parse(self, response):
-        return FormRequest.from_response(
-                response,
-                formxpath='//form[contains(@action, "login")]',
-                formdata={'email': self.email,'pass': self.password},
-                callback=self.parse_home
-        )
-  
-    def parse_home(self, response):
-        '''Parse user news feed page'''
-        if response.css('#approvals_code'):
-            # Handle 'Approvals Code' checkpoint (ask user to enter code).
-            if not self.code:
-                # Show facebook messages via logs
-                # and request user for approval code.
-                message = response.css('._50f4::text').extract()[0]
-                self.log(message)
-                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
-                self.log(message)
-                self.code = input('Enter the code: ')
-            self.code = str(self.code)
-            if not (self.code and self.code.isdigit()):
-                self.log('Bad approvals code detected.')
-                return
-            return FormRequest.from_response(
-                response,
-                formdata={'approvals_code': self.code},
-                callback=self.parse_home,
-            )
-        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
-            # Handle 'Save Browser' checkpoint.
-            return FormRequest.from_response(
-                response,
-                formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home,
-                dont_filter=True,
-            )
-        elif response.css('button#checkpointSubmitButton'):
-            # Handle `Someone tried to log into your account` warning.
-            return FormRequest.from_response(
-                response, callback=self.parse_home, dont_filter=True,)
-        # Else go to the user profile.
-        href = 'https://mbasic.facebook.com/ivacciniealtricomplottileggendari'
-        self.logger.info('Parse function called on %s', href)
-        return scrapy.Request(
-            url=href,
-            callback=self.parse_page,
-        )
-
-
-
-    def parse_page(self, response):
-#        from scrapy.utils.response import open_in_browser
-#        open_in_browser(response)
-        
-        for post in response.xpath("//div[contains(@id,'u_0_')]"):
-#            self.logger.info('Parse function called on %s', response.url)
-#            self.logger.info('Parsing page number %d', i)
-#            from scrapy.utils.response import open_in_browser
-#            open_in_browser(response)
-            post = post.xpath("//a[contains(text(),'Notizia completa')]/@href").extract()
-#   
-            for i in range(len(post)):
-                temp_post = response.urljoin(post[i])        
-                yield scrapy.Request(temp_post, self.parse_post,dont_filter = True)            
-
-#        next_page = response.xpath("//div/a[contains(text(),'Altri')]/@href")
-#        if len(next_page) > 0:
-#            next_page = response.urljoin(next_page[0].extract())
-#            yield scrapy.Request(next_page, callback=self.parse_page)
-#    
-#        else:
-#            next_page = response.xpath("//div/a[contains(text(),'2017')]/@href")
-#            if len(next_page) > 0:
-#                next_page = response.urljoin(next_page[0].extract())
-#                yield scrapy.Request(next_page, callback=self.parse_page)
-#                
-    def parse_post(self,response):
-        new = ItemLoader(item=FbcrawlItem(),response=response)
-#        from scrapy.utils.response import open_in_browser
-#        open_in_browser(response)
-#         #        ("//div[string-length(@id)=15 or string-length(@id)=16]")
-       # new.add_xpath('comments',"//div[string-length(@id)=15 or string-length(@id)=16]//div/text()")               
-# {}' .format(next_comment_page))
-        new.add_xpath('source', '//span/strong/a/text()')
-        new.add_xpath('date', '//div/div/abbr/text()')
-        new.add_xpath('text','//div[@data-ft]//p//text()')
-        
-        next_comment_page = response.xpath("//div/div[contains(@id,'see_next')]/a/@href")
-        while len(next_comment_page) > 0:
-            next_comment_page = response.urljoin(next_comment_page[0].extract())        
-            yield scrapy.Request(next_comment_page, callback=self.parse_comments, dont_filter = True, \
-                             meta={'new':new})
-#            self.logger.info('Parsing page number %d', i)
-
-#            from scrapy.utils.response import open_in_browser
-#            open_in_browser(response)0
-#            new.load_item()
-
-
-# 
-#        yield new.load_item()
-
-    def parse_comments(self,response):
-        self.logger.info('\n\n PAGINA COMMENTI  \n\n')
-        new = response.meta['new']    
-        new.add_xpath('commentators',"//div[number(@id)>1]/div/h3/a[@href]/text()")
-        yield new.load_item()
--- a/trump.png
+++ b/trump.png