blocking mitigation
This commit is contained in:
parent
4a379f3af4
commit
55dc799374
Binary file not shown.
Binary file not shown.
@ -485,10 +485,8 @@ def parse_date2(date):
|
||||
|
||||
def id_strip(post_id):
|
||||
import json
|
||||
|
||||
d = json.loads(post_id[0]) #nested dict of features
|
||||
return d['top_level_post_id']
|
||||
|
||||
d = json.loads(post_id[::-1][0]) #nested dict of features
|
||||
return str(d['top_level_post_id'])
|
||||
|
||||
|
||||
class FbcrawlItem(scrapy.Item):
|
||||
|
@ -16,7 +16,6 @@ NEWSPIDER_MODULE = 'fbcrawl.spiders'
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = False
|
||||
|
||||
@ -26,9 +25,10 @@ CONCURRENT_REQUESTS = 16
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
#DOWNLOAD_DELAY = 3
|
||||
DOWNLOAD_DELAY = 2
|
||||
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 1
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
|
Binary file not shown.
@ -137,8 +137,13 @@ class FacebookSpider(scrapy.Spider):
|
||||
Parse the given page selecting the posts.
|
||||
Then ask recursively for another page.
|
||||
'''
|
||||
# #open page in browser for debug
|
||||
# from scrapy.utils.response import open_in_browser
|
||||
# open_in_browser(response)
|
||||
|
||||
#select all posts
|
||||
for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
|
||||
|
||||
many_features = post.xpath('./@data-ft').get()
|
||||
date = []
|
||||
date.append(many_features)
|
||||
@ -197,11 +202,11 @@ class FacebookSpider(scrapy.Spider):
|
||||
else:
|
||||
new_page = response.urljoin(new_page[0])
|
||||
if 'flag' in response.meta:
|
||||
self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag']))
|
||||
self.logger.info('Page scraped, click on more! new_page = {} flag = {}'.format(new_page,date))
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
|
||||
else:
|
||||
self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
|
||||
self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k))
|
||||
# self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
|
||||
self.logger.info('First page scraped, click on more {}! Flag not set, default flag = {}'.format(new_page,date))
|
||||
yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
|
||||
|
||||
def parse_post(self,response):
|
||||
|
Loading…
Reference in New Issue
Block a user