[fbcrawl] fixing date attribute parsing
parent 55dc799374
commit ea431c029c
Binary file not shown.
@@ -25,7 +25,7 @@ CONCURRENT_REQUESTS = 16
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 2
+DOWNLOAD_DELAY = 3
 
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 1
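For context: raising DOWNLOAD_DELAY from 2 to 3 slows the crawl. With Scrapy's RANDOMIZE_DOWNLOAD_DELAY (enabled by default), the actual wait between requests to the same site is drawn uniformly from 0.5x to 1.5x this value; a minimal sketch of that behaviour, not project code:

    import random

    DOWNLOAD_DELAY = 3               # value set by this commit
    RANDOMIZE_DOWNLOAD_DELAY = True  # Scrapy default

    def effective_delay():
        # Scrapy randomizes the delay to look less robotic
        if RANDOMIZE_DOWNLOAD_DELAY:
            return random.uniform(0.5 * DOWNLOAD_DELAY, 1.5 * DOWNLOAD_DELAY)
        return DOWNLOAD_DELAY

    print(effective_delay())  # somewhere in [1.5, 4.5] seconds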
Binary file not shown.
@@ -52,9 +52,8 @@ class FacebookSpider(scrapy.Spider):
             self.date = datetime(2014,1,1)
             self.year = 2014
         else:
-            print(type(kwargs['date']))
             self.date = datetime.strptime(kwargs['date'],'%Y-%m-%d')
-            self.year = datetime.now().year - 1
+            self.year = self.date.year
 
         #parse lang, if not provided (but is supported) it will be guessed in parse_home
         if 'lang' not in kwargs:
@@ -71,7 +70,7 @@ class FacebookSpider(scrapy.Spider):
             raise AttributeError('Language provided not currently supported')
 
         #current year, this variable is needed for parse_page recursion
-        self.k = 2019
+        self.k = datetime.now().year
         #count number of posts, used to prioritized parsing and correctly insert in the csv
         self.count = 0
 
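The substance of the fix: self.year is now derived from the parsed date argument instead of being pinned to last year, self.k tracks the current year instead of a hard-coded 2019, and the stray debug print is gone. A standalone sketch of the corrected parsing (the helper name is ours, not the project's):

    from datetime import datetime

    def parse_date_attribute(kwargs):
        # Fall back to 2014-01-01 when no date is given,
        # otherwise parse the ISO date and take the year from it.
        if 'date' not in kwargs:
            date = datetime(2014, 1, 1)
        else:
            date = datetime.strptime(kwargs['date'], '%Y-%m-%d')
        return date, date.year

    print(parse_date_attribute({'date': '2017-06-30'}))  # (datetime.datetime(2017, 6, 30, 0, 0), 2017)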
fbcrawl/spiders/new_fbcrawl.py (new file, 248 lines)
@@ -0,0 +1,248 @@
+import scrapy
+import logging
+
+from scrapy.loader import ItemLoader
+from scrapy.http import FormRequest
+from scrapy.exceptions import CloseSpider
+from fbcrawl.items import FbcrawlItem, parse_date2
+from datetime import datetime
+from time import sleep
+
+class FacebookSpider(scrapy.Spider):
+    '''
+    Parse FB pages (needs credentials)
+    '''
+    name = 'newfb'
+    custom_settings = {
+        'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
+                               'reactions','likes','ahah','love','wow', \
+                               'sigh','grrr','comments','post_id','url']
+    }
+
+    def __init__(self, *args, **kwargs):
+        #turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
+        logger = logging.getLogger('scrapy.middleware')
+        logger.setLevel(logging.WARNING)
+        super().__init__(*args,**kwargs)
+
+        #email & pass need to be passed as attributes!
+        if 'email' not in kwargs or 'password' not in kwargs:
+            raise AttributeError('You need to provide valid email and password:\n'
+                                 'scrapy fb -a email="EMAIL" -a password="PASSWORD"')
+        else:
+            self.logger.info('Email and password provided, using these as credentials')
+
+        #page name parsing (added support for full urls)
+        if 'page' not in kwargs:
+            raise AttributeError('You need to provide a valid page name to crawl!'
+                                 'scrapy fb -a page="PAGENAME"')
+        elif self.page.find('https://www.facebook.com/') != -1:
+            self.page = self.page[25:]
+            self.logger.info('Page attribute provided, scraping "{}"'.format(self.page))
+        elif self.page.find('https://mbasic.facebook.com/') != -1:
+            self.page = self.page[28:]
+            self.logger.info('Page attribute provided, scraping "{}"'.format(self.page))
+        elif self.page.find('https://m.facebook.com/') != -1:
+            self.page = self.page[23:]
+            self.logger.info('Page attribute provided, scraping "{}"'.format(self.page))
+        else:
+            self.logger.info('Page attribute provided, scraping "{}"'.format(self.page))
+
+        #parse date
+        if 'date' not in kwargs:
+            self.date = datetime(2014,1,1)
+            self.year = 2014
+        else:
+            self.date = datetime.strptime(kwargs['date'],'%Y-%m-%d')
+            self.year = datetime.now().year - 1
+
+        #parse lang, if not provided (but is supported) it will be guessed in parse_home
+        if 'lang' not in kwargs:
+            self.logger.info('Language attribute not provided, I will try to guess it from the fb interface')
+            self.logger.info('To specify, add the lang parameter: scrapy fb -a lang="LANGUAGE"')
+            self.logger.info('Currently choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"')
+            self.lang = '_'
+        elif self.lang == 'en' or self.lang == 'es' or self.lang == 'fr' or self.lang == 'it' or self.lang == 'pt':
+            self.logger.info('Language attribute recognized, using "{}" for the facebook interface'.format(self.lang))
+        else:
+            self.logger.info('Lang "{}" not currently supported'.format(self.lang))
+            self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"')
+            self.logger.info('Change your interface lang from facebook settings and try again')
+            raise AttributeError('Language provided not currently supported')
+
+        #current year, this variable is needed for parse_page recursion
+        self.k = 2019
+        #count number of posts, used to prioritized parsing and correctly insert in the csv
+        self.count = 0
+
+        self.start_urls = ['https://mbasic.facebook.com/' + self.page]
+
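Judging from name = 'newfb' and the error strings above, this spider is meant to be launched from the CLI with scrapy crawl newfb and -a attributes. An equivalent, hypothetical launch from Python, with placeholder credentials and feed settings of that Scrapy era:

    from scrapy.crawler import CrawlerProcess
    from fbcrawl.spiders.new_fbcrawl import FacebookSpider

    process = CrawlerProcess(settings={'FEED_FORMAT': 'csv', 'FEED_URI': 'posts.csv'})
    process.crawl(FacebookSpider, email='EMAIL', password='PASSWORD',
                  page='PAGENAME', date='2018-01-01', lang='it')
    process.start()  # blocks until the crawl finishes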
+    def parse(self,response):
+        data = response.xpath("//div[contains(@data-ft,'top_level_post_id')]/@data-ft").extract()
+        import json
+        json_data = json.loads(data[0])
+        page_id = json_data['page_id']
+        first_post = json_data['top_level_post_id']
+
+        magic_link = 'https://m.facebook.com/page_content_list_view/more/?page_id=' + \
+                     str(page_id)
+        return scrapy.Request(url=magic_link)#,callback=self.parse_page,meta={'index':1})
+
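parse() above lifts page_id and top_level_post_id out of the post's data-ft attribute, which carries JSON. A minimal illustration with a fabricated attribute value (real payloads carry many more keys):

    import json

    # fabricated example of a data-ft value; only the two keys used above
    data_ft = '{"page_id": "123456789", "top_level_post_id": "987654321"}'

    payload = json.loads(data_ft)
    print(payload['page_id'], payload['top_level_post_id'])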
+    def parse2(self, response):
+        '''
+        Handle login with provided credentials
+        '''
+        return FormRequest.from_response(
+                response,
+                formxpath='//form[contains(@action, "login")]',
+                formdata={'email': self.email,'pass': self.password},
+                callback=self.parse_home
+                )
+
+    def parse_home(self, response):
+        '''
+        This method has multiple purposes:
+        1) Handle failed logins due to facebook 'save-device' redirection
+        2) Set language interface, if not already provided
+        3) Navigate to given page
+        '''
+        #handle 'save-device' redirection
+        if response.xpath("//div/a[contains(@href,'save-device')]"):
+            self.logger.info('Got stuck in "save-device" checkpoint')
+            self.logger.info('I will now try to redirect to the correct page')
+            return FormRequest.from_response(
+                    response,
+                    formdata={'name_action_selected': 'dont_save'},
+                    callback=self.parse_home
+                    )
+
+        #set language interface
+        if self.lang == '_':
+            if response.xpath("//input[@placeholder='Search Facebook']"):
+                self.logger.info('Language recognized: lang="en"')
+                self.lang = 'en'
+            elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
+                self.logger.info('Language recognized: lang="es"')
+                self.lang = 'es'
+            elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"):
+                self.logger.info('Language recognized: lang="fr"')
+                self.lang = 'fr'
+            elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
+                self.logger.info('Language recognized: lang="it"')
+                self.lang = 'it'
+            elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
+                self.logger.info('Language recognized: lang="pt"')
+                self.lang = 'pt'
+            else:
+                raise AttributeError('Language not recognized\n'
+                                     'Change your interface lang from facebook '
+                                     'and try again')
+
+        #navigate to provided page
+        href = response.urljoin(self.page)
+        self.logger.info('Scraping facebook page {}'.format(href))
+        return scrapy.Request(url=href,callback=self.parse_page,meta={'index':1})
+
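For reference, the search-box placeholders parse_home matches against to guess the interface language, collected in one place (taken straight from the XPaths above):

    # placeholder text of the mbasic search input -> lang code
    SEARCH_PLACEHOLDERS = {
        'Search Facebook': 'en',
        'Buscar en Facebook': 'es',
        'Rechercher sur Facebook': 'fr',
        'Cerca su Facebook': 'it',
        'Pesquisa no Facebook': 'pt',
    }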
+    def parse_page(self, response):
+        '''
+        Parse the given page selecting the posts.
+        Then ask recursively for another page.
+        '''
+        # #open page in browser for debug
+        # from scrapy.utils.response import open_in_browser
+        # open_in_browser(response)
+
+        #select all posts
+        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
+
+            many_features = post.xpath('./@data-ft').get()
+            date = []
+            date.append(many_features)
+            date = parse_date2(date)
+            current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S')
+
+            if self.date > current_date:
+                raise CloseSpider('Reached date: {}'.format(self.date))
+            new = ItemLoader(item=FbcrawlItem(),selector=post)
+            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
+            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
+            new.add_xpath('date','./@data-ft')
+            new.add_xpath('post_id','./@data-ft')
+            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
+
+            #page_url #new.add_value('url',response.url)
+            #returns full post-link in a list
+            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
+            temp_post = response.urljoin(post[0])
+            self.count -= 1
+            sleep(2)
+            yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})
+
+        #load following page, try to click on "more"
+        #after few pages have been scraped, the "more" link might disappears
+        #if not present look for the highest year not parsed yet, click once
+        #and keep looking for "more"
+        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
+        if not new_page:
+            if response.meta['flag'] == self.k and self.k >= self.year:
+                self.logger.info('There are no more, flag set at = {}'.format(self.k))
+                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
+                new_page = response.xpath(xpath).extract()
+                if new_page:
+                    new_page = response.urljoin(new_page[0])
+                    self.k -= 1
+                    self.logger.info('Everything OK, new flag: {}'.format(self.k))
+                    sleep(2)
+                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
+                else:
+                    while not new_page: #sometimes the years are skipped this handles small year gaps
+                        self.logger.info('XPATH not found for year {}'.format(self.k-1))
+                        self.k -= 1
+                        self.logger.info('Trying with previous year, flag={}'.format(self.k))
+                        if self.k < self.year:
+                            self.logger.info('The previous year to crawl is less than the parameter year: {} < {}'.format(self.k,self.year))
+                            self.logger.info('This is not handled well, please re-run with -a year="{}" or less'.format(self.k))
+                            break
+                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
+                        new_page = response.xpath(xpath).extract()
+                    self.logger.info('New page found with flag {}'.format(self.k))
+                    new_page = response.urljoin(new_page[0])
+                    self.k -= 1
+                    self.logger.info('Now going with flag {}'.format(self.k))
+                    sleep(2)
+                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
+            else:
+                self.logger.info('Crawling has finished with no errors!')
+        else:
+            new_page = response.urljoin(new_page[0])
+            if 'flag' in response.meta:
+                self.logger.info('Page scraped, click on more! new_page = {} flag = {}'.format(new_page,date))
+                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
+            else:
+                # self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
+                self.logger.info('First page scraped, click on more {}! Flag not set, default flag = {}'.format(new_page,date))
+                sleep(2)
+                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
+
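parse_page's recursion leans on two counters: self.k, the year currently being clicked through, and self.year, the lower bound derived from the date argument. When the "more" link disappears, the spider walks k backwards until a year link exists on the page or k drops below the bound. A compact, Scrapy-free sketch of that countdown (function and argument names are ours):

    def year_fallback(k, target_year, page_has_year):
        # Step back one year at a time until a year link is found
        # on the page, or we fall past the target year.
        while not page_has_year(k):
            k -= 1
            if k < target_year:
                return None  # nothing left to crawl
        return k

    # e.g. with year links present only for 2016 and 2014:
    print(year_fallback(2018, 2014, lambda y: y in (2016, 2014)))  # -> 2016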
+    def parse_post(self,response):
+        new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
+        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
+        new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
+        # new.add_xpath('date','//div/div/abbr/text()')
+        new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
+        new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
+
+        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
+        reactions = response.urljoin(reactions[0].extract())
+        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
+
+    def parse_reactions(self,response):
+        new = ItemLoader(item=FbcrawlItem(),response=response, parent=response.meta['item'])
+        new.context['lang'] = self.lang
+        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
+        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
+        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
+        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
+        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
+        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
+        yield new.load_item()
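For quick reference, the reaction_type codes parse_reactions keys on, gathered from the XPaths above (these are mbasic.facebook.com query-string values, not an official API):

    # reaction_type value -> exported field
    REACTION_TYPES = {
        1: 'likes',
        2: 'love',
        3: 'wow',
        4: 'ahah',
        7: 'sigh',
        8: 'grrr',
    }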