[fb] in items.py refactoring parse_date, introducing "date" attribute

2019-04-23 07:31:23 +02:00 · 2019-04-23 07:31:23 +02:00 · efda9a956e
commit efda9a956e
parent 1acf5c2106
5 changed files with 48 additions and 17 deletions
--- a/fbcrawl/pycache/items.cpython-37.pyc
+++ b/fbcrawl/pycache/items.cpython-37.pyc
--- a/fbcrawl/items.py
+++ b/fbcrawl/items.py
@ -462,12 +462,33 @@ def url_strip(url):
            else:
                return fullurl
    
+def parse_date2(date):  # date: indexable whose first element is a JSON string (see date[0] below)
+    import json  # function-scope import
+    # Decode date[0] and flatten the resulting nested dict into flat_d.
+    d = json.loads(date[0]) #nested dict of features decoded from the JSON string in date[0]
+    flat_d = dict() #only the non-dict ('leaf') key:value pairs of d, at any depth
+    # Helper generator: walks nested dicts and yields only their leaf pairs.
+    def recursive_items(dictionary):
+        '''
+        Get most nested key:value pair of nested dict
+        '''
+        for key, value in dictionary.items():
+            if type(value) is dict:
+                yield from recursive_items(value)  # descend into the nested dict
+            else:
+                yield (key, value)  # leaf: any non-dict value (lists are not descended into)
+    # A key seen again at another nesting level overwrites the earlier value.
+    for key, value in recursive_items(d):
+        flat_d[key] = value
+    # NOTE(review): fromtimestamp() yields local time; the fixed 5h shift looks like a timezone fix - confirm. KeyError if 'publish_time' is absent.
+    return str(datetime.fromtimestamp(flat_d['publish_time']) - timedelta(hours=5))  
+    
+    

 class FbcrawlItem(scrapy.Item):
    source = scrapy.Field()   
    date = scrapy.Field(      # when was the post published
-        input_processor=TakeFirst(),
-        output_processor=parse_date
+        output_processor=parse_date2
    )       
    text = scrapy.Field(
        output_processor=Join(separator=u'')
--- a/fbcrawl/spiders/pycache/comments.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/comments.cpython-37.pyc
--- a/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
+++ b/fbcrawl/spiders/pycache/fbcrawl.cpython-37.pyc
--- a/fbcrawl/spiders/fbcrawl.py
+++ b/fbcrawl/spiders/fbcrawl.py
@ -3,7 +3,9 @@ import logging

 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
-from fbcrawl.items import FbcrawlItem
+from scrapy.exceptions import CloseSpider
+from fbcrawl.items import FbcrawlItem, parse_date2
+from datetime import datetime

 class FacebookSpider(scrapy.Spider):
    '''
@ -45,15 +47,14 @@ class FacebookSpider(scrapy.Spider):
        else:
            self.logger.info('Page attribute provided, scraping "{}"'.format(self.page))
       
-        #parse year
-        if 'year' not in kwargs:
-            self.year = 2018
-            self.logger.info('Year attribute not found, set scraping back to {}'.format(self.year))
+        #parse date
+        if 'date' not in kwargs:
+            self.date = datetime(2014,1,1)
+            self.year = 2014
        else:
-            assert int(self.year) <= 2019 and int(self.year) >= 2006,\
-            'Year must be an int number 2006 <= year <= 2019'
-            self.year = int(self.year)    #arguments are passed as strings
-            self.logger.info('Year attribute found, set scraping back to {}'.format(self.year))
+            print(type(kwargs['date']))
+            self.date = datetime.strptime(kwargs['date'],'%Y-%m-%d')
+            self.year = datetime.now().year - 1

        #parse lang, if not provided (but is supported) it will be guessed in parse_home
        if 'lang' not in kwargs:
@ -138,9 +139,18 @@ class FacebookSpider(scrapy.Spider):
        '''
        #select all posts
        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):     
+            many_features = post.xpath('./@data-ft').get()
+            date = []
+            date.append(many_features)
+            date = parse_date2(date)
+            current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S')
+
+            if self.date > current_date:
+                raise CloseSpider('Reached date: {}'.format(self.date))
            new = ItemLoader(item=FbcrawlItem(),selector=post)
            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')     
+            new.add_xpath('date','./@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

            #page_url #new.add_value('url',response.url)
@ -151,7 +161,7 @@ class FacebookSpider(scrapy.Spider):
            yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})       

        #load following page, try to click on "more"
-        #after few pages have gone scraped, the "more" link disappears 
+        #after a few pages have been scraped, the "more" link might disappear
        #if not present look for the highest year not parsed yet, click once 
        #and keep looking for "more"
        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()      
@ -197,7 +207,7 @@ class FacebookSpider(scrapy.Spider):
        new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
        new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
-        new.add_xpath('date','//div/div/abbr/text()')
+     #   new.add_xpath('date','//div/div/abbr/text()')
        new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
        new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")