commit 8babf7aa1d
parent cdf6bdc68e
Author: rugantio
Date: 2018-08-26 03:42:52 +02:00
18 changed files with 264 additions and 180 deletions

README.md

@@ -1,11 +1,2 @@
 # fbcrawl
 A Facebook crawler
-## TODO
-work in progress
-## DISCLAIMER
-This software is NOT to be used. It violates Facebook's terms and conditions. It is for educational purposes only, to show how a crawler can be made to recursively parse a web page.
-## Contribute
-Pull requests are welcomed!!

4 binary files changed (contents not shown)

fbcrawl/items.py (new file, 123 lines)

@@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import TakeFirst, Join
from datetime import datetime, timedelta
def parse_date(date):
    # Facebook's Italian mbasic UI renders dates as 'Adesso', '3 h',
    # 'Ieri alle 21:30', '12 ago', '4 marzo' or '4 marzo 2017'
    date = date[0].split()

    mesi = {  # full Italian month names, as displayed by Facebook
        "gennaio": 1, "febbraio": 2, "marzo": 3, "aprile": 4,
        "maggio": 5, "giugno": 6, "luglio": 7, "agosto": 8,
        "settembre": 9, "ottobre": 10, "novembre": 11, "dicembre": 12
    }
    mesi_abbr = {  # three-letter abbreviations
        "gen": 1, "feb": 2, "mar": 3, "apr": 4, "mag": 5, "giu": 6,
        "lug": 7, "ago": 8, "set": 9, "ott": 10, "nov": 11, "dic": 12
    }

    now = datetime.now()
    if len(date) == 0:
        return "Error: no data"  # nothing was scraped for this post
    elif len(date) == 1 or date[1] == 'h':
        # 'Adesso' (just now) or 'n h' (n hours ago): use today's date
        day, month, year = now.day, now.month, now.year
    elif date[0] == 'Ieri':
        # 'Ieri' (yesterday): take day, month and year from yesterday's date,
        # so the result is also correct on the first day of a month
        yesterday = now - timedelta(days=1)
        day, month, year = yesterday.day, yesterday.month, yesterday.year
    elif len(date) in (2, 4) and len(date[1]) == 3:
        # abbreviated month ('12 ago'), implicitly the current year
        day = int(date[0])
        month = mesi_abbr[date[1]]
        year = now.year
    elif date[2] != 'alle':
        # 'day month year': the year is explicit
        day = int(date[0])
        month = mesi[date[1]]
        year = int(date[2])
    else:
        # 'day month alle hh:mm': implicitly the current year
        day = int(date[0])
        month = mesi[date[1]]
        year = now.year
    return datetime(year, month, day).date()
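For reference, the input shapes this parser handles; the sample values are illustrative, not taken from the commit:

    # parse_date(['Adesso'])                -> today
    # parse_date(['3', 'h'])                -> today (posted 3 hours ago)
    # parse_date(['Ieri', 'alle', '21:30']) -> yesterday
    # parse_date(['12', 'ago'])             -> 12 August of the current year
    # parse_date(['4', 'marzo', '2017'])    -> datetime.date(2017, 3, 4)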
def comments_strip(string):
    # '1.234 commenti' -> '1.234'; str.replace() is used because
    # rstrip(" commenti") would strip a *set* of characters, not a suffix
    return string[0].replace(' commenti', '')
def reactions_strip(string):
    if len(string) == 1:
        # a bare count such as '1.234': drop the thousands separators
        return string[0].replace('.', '')
    # otherwise the first text node is a sentence such as 'Tu e altri 1.234'
    # ('You and 1.234 others'): the count is the last word, plus 1 for the
    # named reactor
    count = string[0].split()[-1].replace('.', '')
    return int(count) + 1
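Illustrative inputs for the two branches (the second case assumes the reactions xpath returned several text nodes, the first being the 'Tu e altri N' sentence):

    # reactions_strip(['1.234'])                        -> '1234'
    # reactions_strip(['Tu e altri 5.678', '<node>'])   -> 5679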
class FbcrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    source = scrapy.Field(
        output_processor=TakeFirst()
    )  # page that published the post
    date = scrapy.Field(  # when the post was published
        input_processor=TakeFirst(),
        output_processor=parse_date
    )
    text = scrapy.Field(
        output_processor=Join(separator=u'')
    )  # full text of the post
    comments = scrapy.Field(
        output_processor=comments_strip
    )
    commentators = scrapy.Field(
        output_processor=Join(separator=u'\n')
    )
    reactions = scrapy.Field(
        output_processor=reactions_strip
    )  # num of reactions
    likes = scrapy.Field(
        output_processor=reactions_strip
    )
    ahah = scrapy.Field()
    love = scrapy.Field()
    wow = scrapy.Field()
    sigh = scrapy.Field()
    grrr = scrapy.Field()
    share = scrapy.Field()  # num of shares
    num_id = scrapy.Field()  # progressive int associated to the entry in the final table, not present in the webpage
    url = scrapy.Field()
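A minimal sketch (not part of this commit) of how these processors fire: an ItemLoader collects raw values, and the field's output_processor produces the stored value when load_item() is called:

    from scrapy.loader import ItemLoader
    from fbcrawl.items import FbcrawlItem

    l = ItemLoader(item=FbcrawlItem())
    l.add_value('comments', '42 commenti')  # collected as ['42 commenti']
    item = l.load_item()                    # comments_strip -> item['comments'] == '42'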

fbcrawl/pipelines.py

@@ -5,7 +5,12 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

+from scrapy.exceptions import DropItem
+from datetime import datetime
+
 class FbcrawlPipeline(object):
     def process_item(self, item, spider):
-        return item
+        if item['date'] < datetime(2017, 3, 4).date():
+            raise DropItem("Dropping element because it's older than 04/03/2017")
+        else:
+            return item
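Note that, as committed, this pipeline never runs: the settings diff below keeps ITEM_PIPELINES commented out. A minimal sketch of what enabling it in fbcrawl/settings.py would look like:

    ITEM_PIPELINES = {
        'fbcrawl.pipelines.FbcrawlPipeline': 300,
    }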

fbcrawl/settings.py

@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = False
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 #ITEM_PIPELINES = {
-#    'fbcrawl.pipelines.FbcrawlPipeline': 300,
+#'fbcrawl.pipelines.FbcrawlPipeline': 300,
 #}

@@ -88,6 +88,6 @@ ROBOTSTXT_OBEY = False
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-FEED_EXPORT_FIELDS = ["source", "date", "text", "commentators","comments","like", "share"] # specifies the order of the column to export as CSV
+FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions", "likes", "ahah", "love", "wow", "sigh", "grrr", "comments", "url"]  # specifies the order of the columns in the exported CSV
 FEED_EXPORT_ENCODING = 'utf-8'
 DUPEFILTER_DEBUG = True
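With these export fields in place, a typical run writes one CSV row per post. An example invocation (credentials and page name are placeholders; the -a arguments map to the kwargs of the spider's __init__):

    scrapy crawl fb -a email="EMAIL" -a password="PASSWORD" -a page="PAGENAME" -o posts.csv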

2 binary files changed (contents not shown)

fbcrawl/spiders/fbcrawl.py (new file, 122 lines)

@@ -0,0 +1,122 @@
import scrapy
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem


class FacebookSpider(scrapy.Spider):
    """
    Parse FB pages (needs credentials)
    """
    name = "fb"

    def __init__(self, email='', password='', page='', **kwargs):
        super(FacebookSpider, self).__init__(**kwargs)
        if not email or not password:
            raise ValueError("You need to provide a valid email and password!")
        self.email = email
        self.password = password
        if not page:
            raise ValueError("You need to provide a valid page name to crawl!")
        self.page = page
        self.code = None  # 2FA approvals code; parse_home reads this attribute, so it must exist
        self.start_urls = ['https://mbasic.facebook.com']
    def parse(self, response):
        # submit the login form on mbasic.facebook.com
        return FormRequest.from_response(
            response,
            formxpath='//form[contains(@action, "login")]',
            formdata={'email': self.email, 'pass': self.password},
            callback=self.parse_home
        )
    def parse_home(self, response):
        '''Parse user news feed page'''
        if response.css('#approvals_code'):
            # Handle 'Approvals Code' checkpoint (ask user to enter code).
            if not self.code:
                # Show facebook messages via logs
                # and request user for approval code.
                message = response.css('._50f4::text').extract()[0]
                self.log(message)
                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
                self.log(message)
                self.code = input('Enter the code: ')
            self.code = str(self.code)
            if not (self.code and self.code.isdigit()):
                self.log('Bad approvals code detected.')
                return
            return FormRequest.from_response(
                response,
                formdata={'approvals_code': self.code},
                callback=self.parse_home,
            )
        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
            # Handle 'Save Browser' checkpoint.
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home,
                dont_filter=True,
            )
        elif response.css('button#checkpointSubmitButton'):
            # Handle 'Someone tried to log into your account' warning.
            return FormRequest.from_response(
                response, callback=self.parse_home, dont_filter=True,
            )
        # Else go to the page to crawl.
        href = response.urljoin(self.page)
        self.logger.info('Parse function called on %s', href)
        return scrapy.Request(
            url=href,
            callback=self.parse_page,
        )
    def parse_page(self, response):
        # select all posts on the page
        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
            self.logger.info('Parsing post %s', post)
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            new.add_xpath('comments', ".//div/a[contains(text(),'comment')]/text()")
            new.add_xpath('url', ".//a[contains(text(),'Notizia completa')]/@href")
            # follow the link to the full post ('Notizia completa' = 'Full story'),
            # carrying the partially filled loader in the request meta
            post = post.xpath(".//a[contains(text(),'Notizia completa')]/@href").extract()  # full post link, in a list
            temp_post = response.urljoin(post[0])
            yield scrapy.Request(temp_post, self.parse_post, dont_filter=True, meta={'item': new})

        # pagination: 'Altri' ('More') leads to the next batch of posts;
        # when it is missing, fall back to the year link
        next_page = response.xpath("//div/a[contains(text(),'Altri')]/@href")
        if len(next_page) > 0:
            next_page = response.urljoin(next_page[0].extract())
            yield scrapy.Request(next_page, callback=self.parse_page)
        else:
            next_page = response.xpath("//div/a[contains(text(),'2017')]/@href")
            if len(next_page) > 0:
                next_page = response.urljoin(next_page[0].extract())
                yield scrapy.Request(next_page, callback=self.parse_page)
    def parse_post(self, response):
        # keep filling the loader that was started in parse_page
        new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
        new.add_xpath('source', '//span/strong/a/text() | //div/a/strong/text() | //td/div/h3/strong/a/text()')
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath('text', '//div[@data-ft]//p//text()')
        new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")
        # follow the reactions page to collect the per-type breakdown
        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions, callback=self.parse_reactions, dont_filter=True, meta={'item': new})
    def parse_reactions(self, response):
        new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
        # each reaction type is a tab whose href carries a numeric reaction_type id
        new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()")
        new.add_xpath('ahah', "//a[contains(@href,'reaction_type=4')]/span/text()")
        new.add_xpath('love', "//a[contains(@href,'reaction_type=2')]/span/text()")
        new.add_xpath('wow', "//a[contains(@href,'reaction_type=3')]/span/text()")
        new.add_xpath('sigh', "//a[contains(@href,'reaction_type=7')]/span/text()")
        new.add_xpath('grrr', "//a[contains(@href,'reaction_type=8')]/span/text()")
        yield new.load_item()
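Taken together, the new spider walks a fixed request chain, carrying one ItemLoader per post through the request meta; a schematic recap (comments only, not part of the commit):

    # parse (login form)
    #   -> parse_home (2FA / checkpoint handling, then the target page)
    #     -> parse_page (one request per post; loader passed via meta)
    #       -> parse_post (source, date, text, total reactions)
    #         -> parse_reactions (per-type counts) -> load_item()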

Previous items.py (deleted, 34 lines)

@@ -1,34 +0,0 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import TakeFirst, Join


class FbcrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    source = scrapy.Field()  # page that published the post
    date = scrapy.Field(
        output_processor=TakeFirst()
    )  # when the post was published
    text = scrapy.Field(
        output_processor=Join(separator=u'')
    )  # full text of the post
    comments = scrapy.Field(
        output_processor=Join(separator=u'\n')
    )  # comments on the post
    commentators = scrapy.Field(
        output_processor=Join(separator=u'\n')
    )  # authors of the comments
    like = scrapy.Field()  # num of likes
    share = scrapy.Field()  # num of shares
    num_id = scrapy.Field()  # progressive int associated to the entry in the final table, not present in the webpage

scrapy.cfg (new file, 11 lines)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = fbcrawl.settings

[deploy]
#url = http://localhost:6800/
project = fbcrawl

Previous spider module (deleted, 134 lines)

@@ -1,134 +0,0 @@
import scrapy
from datetime import datetime
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem


class FacebookSpider(scrapy.Spider):
    """
    Parse FB pages (needs credentials)
    """
    name = "fb"

    def __init__(self, email='', password='', til='2004-1-1', **kwargs):
        super(FacebookSpider, self).__init__(**kwargs)
        til = til.split(sep='-')
        self.til = datetime(int(til[0]), int(til[1]), int(til[2]))
        self.email = email
        self.password = password
        self.start_urls = ['https://mbasic.facebook.com']

    def parse(self, response):
        return FormRequest.from_response(
            response,
            formxpath='//form[contains(@action, "login")]',
            formdata={'email': self.email, 'pass': self.password},
            callback=self.parse_home
        )

    def parse_home(self, response):
        '''Parse user news feed page'''
        if response.css('#approvals_code'):
            # Handle 'Approvals Code' checkpoint (ask user to enter code).
            if not self.code:
                # Show facebook messages via logs
                # and request user for approval code.
                message = response.css('._50f4::text').extract()[0]
                self.log(message)
                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
                self.log(message)
                self.code = input('Enter the code: ')
            self.code = str(self.code)
            if not (self.code and self.code.isdigit()):
                self.log('Bad approvals code detected.')
                return
            return FormRequest.from_response(
                response,
                formdata={'approvals_code': self.code},
                callback=self.parse_home,
            )
        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
            # Handle 'Save Browser' checkpoint.
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home,
                dont_filter=True,
            )
        elif response.css('button#checkpointSubmitButton'):
            # Handle 'Someone tried to log into your account' warning.
            return FormRequest.from_response(
                response, callback=self.parse_home, dont_filter=True,
            )
        # Else go to the user profile (note the hardcoded target page).
        href = 'https://mbasic.facebook.com/ivacciniealtricomplottileggendari'
        self.logger.info('Parse function called on %s', href)
        return scrapy.Request(
            url=href,
            callback=self.parse_page,
        )

    def parse_page(self, response):
        # from scrapy.utils.response import open_in_browser
        # open_in_browser(response)
        for post in response.xpath("//div[contains(@id,'u_0_')]"):
            post = post.xpath("//a[contains(text(),'Notizia completa')]/@href").extract()
            for i in range(len(post)):
                temp_post = response.urljoin(post[i])
                yield scrapy.Request(temp_post, self.parse_post, dont_filter=True)
        # next_page = response.xpath("//div/a[contains(text(),'Altri')]/@href")
        # if len(next_page) > 0:
        #     next_page = response.urljoin(next_page[0].extract())
        #     yield scrapy.Request(next_page, callback=self.parse_page)
        # else:
        #     next_page = response.xpath("//div/a[contains(text(),'2017')]/@href")
        #     if len(next_page) > 0:
        #         next_page = response.urljoin(next_page[0].extract())
        #         yield scrapy.Request(next_page, callback=self.parse_page)

    def parse_post(self, response):
        new = ItemLoader(item=FbcrawlItem(), response=response)
        # new.add_xpath('comments', "//div[string-length(@id)=15 or string-length(@id)=16]//div/text()")
        new.add_xpath('source', '//span/strong/a/text()')
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath('text', '//div[@data-ft]//p//text()')
        next_comment_page = response.xpath("//div/div[contains(@id,'see_next')]/a/@href")
        while len(next_comment_page) > 0:
            next_comment_page = response.urljoin(next_comment_page[0].extract())
            yield scrapy.Request(next_comment_page, callback=self.parse_comments,
                                 dont_filter=True, meta={'new': new})
        # yield new.load_item()

    def parse_comments(self, response):
        self.logger.info('\n\n COMMENTS PAGE \n\n')  # was logged in Italian: 'PAGINA COMMENTI'
        new = response.meta['new']
        new.add_xpath('commentators', "//div[number(@id)>1]/div/h3/a[@href]/text()")
        yield new.load_item()

trump.png (new binary file, 554 KiB; content not shown)