blocking mitigation

This commit is contained in:
rugantio 2019-04-25 23:41:33 +02:00
parent 4a379f3af4
commit 55dc799374
6 changed files with 13 additions and 10 deletions

View File

@@ -485,10 +485,8 @@ def parse_date2(date):
def id_strip(post_id): def id_strip(post_id):
import json import json
d = json.loads(post_id[::-1][0]) #nested dict of features
d = json.loads(post_id[0]) #nested dict of features return str(d['top_level_post_id'])
return d['top_level_post_id']
class FbcrawlItem(scrapy.Item): class FbcrawlItem(scrapy.Item):

View File

@@ -16,7 +16,6 @@ NEWSPIDER_MODULE = 'fbcrawl.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = False ROBOTSTXT_OBEY = False
@@ -26,9 +25,10 @@ CONCURRENT_REQUESTS = 16
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 1
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)

View File

@@ -137,8 +137,13 @@ class FacebookSpider(scrapy.Spider):
Parse the given page selecting the posts. Parse the given page selecting the posts.
Then ask recursively for another page. Then ask recursively for another page.
''' '''
# #open page in browser for debug
# from scrapy.utils.response import open_in_browser
# open_in_browser(response)
#select all posts #select all posts
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
many_features = post.xpath('./@data-ft').get() many_features = post.xpath('./@data-ft').get()
date = [] date = []
date.append(many_features) date.append(many_features)
@@ -197,11 +202,11 @@ class FacebookSpider(scrapy.Spider):
else: else:
new_page = response.urljoin(new_page[0]) new_page = response.urljoin(new_page[0])
if 'flag' in response.meta: if 'flag' in response.meta:
self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag'])) self.logger.info('Page scraped, click on more! new_page = {} flag = {}'.format(new_page,date))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']}) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
else: else:
self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR') # self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k)) self.logger.info('First page scraped, click on more {}! Flag not set, default flag = {}'.format(new_page,date))
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
def parse_post(self,response): def parse_post(self,response):