blocking mitigation

This commit is contained in:
rugantio 2019-04-25 23:41:33 +02:00
parent 4a379f3af4
commit 55dc799374
6 changed files with 13 additions and 10 deletions

View File

@@ -485,10 +485,8 @@ def parse_date2(date):
def id_strip(post_id): def id_strip(post_id):
import json import json
d = json.loads(post_id[::-1][0]) #nested dict of features
d = json.loads(post_id[0]) #nested dict of features return str(d['top_level_post_id'])
return d['top_level_post_id']
class FbcrawlItem(scrapy.Item): class FbcrawlItem(scrapy.Item):

View File

@@ -16,7 +16,6 @@ NEWSPIDER_MODULE = 'fbcrawl.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = False ROBOTSTXT_OBEY = False
@@ -26,9 +25,10 @@ CONCURRENT_REQUESTS = 16
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 1
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)

View File

@@ -137,8 +137,13 @@ class FacebookSpider(scrapy.Spider):
Parse the given page selecting the posts. Parse the given page selecting the posts.
Then ask recursively for another page. Then ask recursively for another page.
''' '''
# #open page in browser for debug
# from scrapy.utils.response import open_in_browser
# open_in_browser(response)
#select all posts #select all posts
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
many_features = post.xpath('./@data-ft').get() many_features = post.xpath('./@data-ft').get()
date = [] date = []
date.append(many_features) date.append(many_features)
@@ -197,11 +202,11 @@ class FacebookSpider(scrapy.Spider):
else: else:
new_page = response.urljoin(new_page[0]) new_page = response.urljoin(new_page[0])
if 'flag' in response.meta: if 'flag' in response.meta:
self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag'])) self.logger.info('Page scraped, click on more! new_page = {} flag = {}'.format(new_page,date))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']}) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
else: else:
self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR') # self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k)) self.logger.info('First page scraped, click on more {}! Flag not set, default flag = {}'.format(new_page,date))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
def parse_post(self,response): def parse_post(self,response):