From 55dc79937451d260915d830249c0a845c45cea9b Mon Sep 17 00:00:00 2001 From: rugantio Date: Thu, 25 Apr 2019 23:41:33 +0200 Subject: [PATCH] blocking mitigation --- fbcrawl/__pycache__/items.cpython-37.pyc | Bin 9206 -> 9230 bytes fbcrawl/__pycache__/settings.cpython-37.pyc | Bin 496 -> 488 bytes fbcrawl/items.py | 6 ++---- fbcrawl/settings.py | 6 +++--- .../__pycache__/fbcrawl.cpython-37.pyc | Bin 8518 -> 8473 bytes fbcrawl/spiders/fbcrawl.py | 11 ++++++++--- 6 files changed, 13 insertions(+), 10 deletions(-) diff --git a/fbcrawl/__pycache__/items.cpython-37.pyc b/fbcrawl/__pycache__/items.cpython-37.pyc index ec0c82933202449e0cf76fdf6fcbc254f75df941..bc0a412a1e277ecf4a6b98134abd08504450dea7 100644 GIT binary patch delta 189 zcmez7-si#V#LLUY00b2W55(ASn_)^L51lMn>+* L8p@}c*@WZ)>yj*u delta 106 zcmeD4_~y>*#LLUY00cMd55zca6>4bnUj;KZ>ndYpb=qcXszIrnO9n&P+?&f zKXHb!zOkNxp0SyQv7VuUu|jZST4GVAzNxV}P|WS60MMi=*3yzRU5k)gtS+ujq3#oZ Syya$LVPausabaO(VFCb&Q6EYG delta 100 zcmaFC{DGO*iIndYpb?&#my%yztl$@-U}&Ic yV66ZYFf*}MsF>Ji%ncOKGd4B0&@(rmcvhB`#naEtf8vL?+{`Q>=*+^%!UzD$=o=jX diff --git a/fbcrawl/items.py b/fbcrawl/items.py index 4fcde54..c1d6b19 100644 --- a/fbcrawl/items.py +++ b/fbcrawl/items.py @@ -485,10 +485,8 @@ def parse_date2(date): def id_strip(post_id): import json - - d = json.loads(post_id[0]) #nested dict of features - return d['top_level_post_id'] - + d = json.loads(post_id[::-1][0]) #nested dict of features + return str(d['top_level_post_id']) class FbcrawlItem(scrapy.Item): diff --git a/fbcrawl/settings.py b/fbcrawl/settings.py index 946b209..c833770 100644 --- a/fbcrawl/settings.py +++ b/fbcrawl/settings.py @@ -16,7 +16,6 @@ NEWSPIDER_MODULE = 'fbcrawl.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' - # Obey robots.txt rules ROBOTSTXT_OBEY = False @@ -26,9 +25,10 @@ CONCURRENT_REQUESTS = 16 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 +DOWNLOAD_DELAY = 2 + # The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_DOMAIN = 1 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) diff --git a/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc b/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc index 324ab939c5ec2e6b26a78f54d17836de386cfa13..9b2b11754b5a9320560ce6a0d7e224138440e2c0 100644 GIT binary patch delta 184 zcmX@+G}DRCiISO+Q`Qu$mp?ISnw1RW6k7y!jghwH4ItY3wToG7jV>Y z)UYgMn#?1jET{(+Re+0b5SC+0Q4D6#RN7o1GJ~1XWHP6OtYltld3-@)da8o0LUpY| zT25j*m{sLEIaWfE87Q{7PC|{5BMsyNE&-;=Yb0Y>MHm&BMJ7KGkl3s)^^1w|%4Q|m TUM9u|lQ+meVf39mRbe9l(k3&o delta 215 zcmbQ~bj*p*iIEO?5Ev1jrR zvqV#bBpDVmPUa9%7PW%PC@f@TWPq|2C$A7zV@pvAX3$jLoFFoTnNe->PjOlEw4B6r z1zUyc+A1wKA4hiu7k}4a1wa1~1xKH7$H-uXAlHB(*I-vapn!8osH2ZUq^o04mD6N> z3B}F95^9Vb86Y>Z2{2A>l#F4MU{qk@Vch&sQka?X{N}GRy-bX^CpXJKVf2`sp|BAE DwedOb diff --git a/fbcrawl/spiders/fbcrawl.py b/fbcrawl/spiders/fbcrawl.py index 87febcd..d793595 100644 --- a/fbcrawl/spiders/fbcrawl.py +++ b/fbcrawl/spiders/fbcrawl.py @@ -137,8 +137,13 @@ class FacebookSpider(scrapy.Spider): Parse the given page selecting the posts. Then ask recursively for another page. ''' +# #open page in browser for debug +# from scrapy.utils.response import open_in_browser +# open_in_browser(response) + #select all posts for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): + many_features = post.xpath('./@data-ft').get() date = [] date.append(many_features) @@ -197,11 +202,11 @@ class FacebookSpider(scrapy.Spider): else: new_page = response.urljoin(new_page[0]) if 'flag' in response.meta: - self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag'])) + self.logger.info('Page scraped, click on more! new_page = {} flag = {}'.format(new_page,date)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']}) else: - self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR') - self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k)) +# self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR') + self.logger.info('First page scraped, click on more {}! Flag not set, default flag = {}'.format(new_page,date)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) def parse_post(self,response):