refactoring comments spider

rugantio 2019-02-18 02:12:52 +01:00
parent bdeae9f4b5
commit b3d12c4e6b
6 changed files with 188 additions and 104 deletions

fbcrawl/items.py

@@ -128,6 +128,39 @@ def parse_date(init_date,loader_context):
         month = months[date[1]]
         year = int(date[2])
         return datetime(year,month,day).date()
+    #9 ore fa
+    elif date[0].isdigit() and date[1] == 'ore':
+        if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+            return datetime(year,month,day).date()
+        #9 ore fa (ieri)
+        else:
+            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+            return datetime(year,month,day).date()
+    #ieri alle 20:45
+    elif date[0].lower() == 'ieri' and date[1] == 'alle':
+        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+        month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+        return datetime(year,month,day).date()
+    #oggi alle 11:11
+    elif date[0].lower() == 'oggi' and date[1] == 'alle':
+        return datetime(year,month,day).date()
+    #lunedì alle 12:34
+    elif date[0].isalpha() and date[1] == 'alle':
+        today = datetime.now().weekday()    #today as a weekday number
+        weekday = giorni[date[0].lower()]   #named day as a weekday number
+        #the named weekday already occurred this week (delta >= 0)
+        delta = today - weekday
+        if delta >= 0:
+            day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+            return datetime(year,month,day).date()
+        #lunedì = 0, sabato = 6; a negative delta means last week
+        else:
+            delta += 8
+            day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+            return datetime(year,month,day).date()
     #parsing failed
     else:
         return date
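The new relative-date branches patch negative weekday deltas up by hand in the else branch. The same bookkeeping can be written with modular arithmetic; a minimal standalone sketch, not part of the commit, assuming a giorni mapping like the one the code already uses and that mbasic never labels today's posts with a weekday name:

from datetime import datetime, timedelta

#Italian weekday names mapped to Python's weekday() numbering (Monday = 0)
giorni = {'lunedì':0,'martedì':1,'mercoledì':2,'giovedì':3,
          'venerdì':4,'sabato':5,'domenica':6}

def last_weekday(name):
    '''Date of the most recent past occurrence of the named weekday.'''
    today = datetime.now().date()
    delta = (today.weekday() - giorni[name.lower()]) % 7
    return today - timedelta(delta or 7)   #same name as today -> a week ago

The modulo folds both the delta >= 0 and the negative case into one expression, and also sidesteps splitting the ISO date string to recover day and month.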
@@ -427,9 +460,7 @@ def url_strip(url):
 class FbcrawlItem(scrapy.Item):
-    source = scrapy.Field(
-        output_processor=TakeFirst()
-    )
+    source = scrapy.Field()
     date = scrapy.Field( # when was the post published
         input_processor=TakeFirst(),
         output_processor=parse_date
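Dropping TakeFirst() from source means the field now keeps whatever list the selector extracts rather than only the first match. As a quick reminder of what these loader processors do (a sketch, using the processors module this era of Scrapy ships):

from scrapy.loader.processors import TakeFirst, Join

TakeFirst()(['rugantio', 'anon'])   #-> 'rugantio' (first non-null value)
Join(separator=u'')(['ci', 'ao'])   #-> 'ciao' (used below for comment text)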
@@ -456,3 +487,29 @@ class FbcrawlItem(scrapy.Item):
         output_processor=url_strip
     )
     shared_from = scrapy.Field()
+
+class CommentsItem(scrapy.Item):
+    source = scrapy.Field()
+    reply_to = scrapy.Field()
+    date = scrapy.Field( # when was the comment published
+        output_processor=parse_date
+    )
+    text = scrapy.Field(
+        output_processor=Join(separator=u'')
+    ) # full text of the comment
+    reactions = scrapy.Field(
+        output_processor=reactions_strip
+    ) # num of reactions
+    likes = scrapy.Field(
+        output_processor=reactions_strip
+    )
+    ahah = scrapy.Field()
+    love = scrapy.Field()
+    wow = scrapy.Field()
+    sigh = scrapy.Field()
+    grrr = scrapy.Field()
+    share = scrapy.Field() # num of shares
+    url = scrapy.Field(
+        output_processor=url_strip
+    )
+    shared_from = scrapy.Field()
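CommentsItem is filled through an ItemLoader in the refactored spider below: each add_xpath call collects raw strings, and the field's output_processor condenses them when load_item() runs. A hypothetical, self-contained usage sketch (the HTML snippet stands in for a real comment node):

from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from fbcrawl.items import CommentsItem

#stand-in for a comment node the spider would select from the page
comment_node = Selector(text='<div><h3><a href="#">rugantio</a></h3><div>ciao!</div></div>')

loader = ItemLoader(item=CommentsItem(), selector=comment_node)
loader.context['lang'] = 'it'   #the spiders pass self.lang the same way
loader.add_xpath('source', './/h3/a/text()')
loader.add_xpath('text', './/div[h3]/div[1]//text()')  #Join('') glues the text nodes
loader.add_xpath('date', './/abbr/text()')   #parse_date condenses e.g. 'ieri alle 20:45'
item = loader.load_item()   #-> {'source': ['rugantio'], 'text': 'ciao!'} roughly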

fbcrawl/spiders/comments.py

@@ -2,106 +2,134 @@ import scrapy
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
-from fbcrawl.items import FbcrawlItem
+from fbcrawl.spiders.fbcrawl import FacebookSpider
+from fbcrawl.items import CommentsItem
 
-class FacebookSpider(scrapy.Spider):
+class CommentsSpider(FacebookSpider):
     """
-    Parse FB comments, given a page (needs credentials)
+    Parse FB comments, given a post (needs credentials)
     """
     name = "comments"
+    custom_settings = {
+        'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
+                               'reactions','likes','ahah','love','wow', \
+                               'sigh','grrr','url']
+    }
 
-    def __init__(self, email='', password='', page='', **kwargs):
-        super(FacebookSpider, self).__init__(**kwargs)
-
-        if not email or not password:
-            raise ValueError("You need to provide valid email and password!")
-        else:
-            self.email = email
-            self.password = password
-
-        if not page:
-            raise ValueError("You need to provide a valid page name to crawl!")
-        else:
-            self.page = page
-
-        self.start_urls = ['https://mbasic.facebook.com']
-
-    def parse(self, response):
-        return FormRequest.from_response(
-            response,
-            formxpath='//form[contains(@action, "login")]',
-            formdata={'email': self.email,'pass': self.password},
-            callback=self.parse_home
-        )
-
-    def parse_home(self, response):
-        '''Parse user news feed page'''
-        if response.css('#approvals_code'):
-            # Handle 'Approvals Code' checkpoint (ask user to enter code).
-            if not self.code:
-                # Show facebook messages via logs
-                # and request user for approval code.
-                message = response.css('._50f4::text').extract()[0]
-                self.log(message)
-                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
-                self.log(message)
-                self.code = input('Enter the code: ')
-            self.code = str(self.code)
-            if not (self.code and self.code.isdigit()):
-                self.log('Bad approvals code detected.')
-                return
-            return FormRequest.from_response(
-                response,
-                formdata={'approvals_code': self.code},
-                callback=self.parse_home,
-            )
-        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
-            # Handle 'Save Browser' checkpoint.
-            return FormRequest.from_response(
-                response,
-                formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home,
-                dont_filter=True,
-            )
-        elif response.css('button#checkpointSubmitButton'):
-            # Handle 'Someone tried to log into your account' warning.
-            return FormRequest.from_response(
-                response, callback=self.parse_home, dont_filter=True,)
-        # Else go to the user profile.
-        href = response.urljoin(self.page)
-        self.logger.info('Parse function called on %s', href)
-        return scrapy.Request(
-            url=href,
-            callback=self.parse_page,
-        )
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args,**kwargs)
 
     def parse_page(self, response):
-        #answer from page
-        for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
-            # resp = ItemLoader(item=FbcrawlItem(),selector=risposta)
-            rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href')
-            risp = response.urljoin(rispostina[0].extract())
-            yield scrapy.Request(risp, callback=self.parse_rispostina)
-
-#        for i in range(len(rispostina)):
-#            risp = response.urljoin(rispostina[i].extract())
-#
-#        for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
-#            new = ItemLoader(item=FbcrawlItem(),selector=post)
-#            new.add_xpath('source', "./div/h3/a/text()")
-#            new.add_xpath('text',"./div[1]/div[1]/text()")
-#            yield new.load_item()
-#
-#        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
-#        if len(next_page) > 0:
-#            next_page = response.urljoin(next_page[0].extract())
-#            yield scrapy.Request(next_page, callback=self.parse_page)
-
-    def parse_rispostina(self,response):
-        for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all posts
-            new = ItemLoader(item=FbcrawlItem(),selector=daje)
-            new.add_xpath('source', ".//h3/a/text()")#| ./div/div/h3/a/text()")
-            new.add_xpath('text',".//span[not(contains(text(),' · ')) and not(contains(text(),'Visualizza'))]/text() | .//div/text()")
-            yield new.load_item()
+        '''
+        parse_page does multiple things:
+        1) loads replied-to comments pages one by one (for DFS)
+        2) gets common (not-replied-to) comments
+        '''
+        #loads replied-to comments pages
+        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(response.meta['index']) + ']'
+        for reply in response.xpath(path):
+            source = reply.xpath('.//h3/a/text()').extract()
+            answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
+            ans = response.urljoin(answer[::-1][0])
+            self.logger.info('Nested comment at page {}'.format(ans))
+            yield scrapy.Request(ans,
+                                 callback=self.parse_reply,
+                                 meta={'reply_to':source,
+                                       'url':response.url,
+                                       'index':response.meta['index'],
+                                       'flag':'init'})
+        #loads regular comments
+        if not response.xpath(path):
+            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
+            for reply in response.xpath(path2):
+                new = ItemLoader(item=CommentsItem(),selector=reply)
+                new.context['lang'] = self.lang
+                new.add_xpath('source','.//h3/a/text()')
+                new.add_xpath('text','.//div[h3]/div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                yield new.load_item()
+
+        #previous comments
+        if not response.xpath(path) and not response.xpath(path2):
+            for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
+                new_page = next_page.xpath('.//@href').extract()
+                new_page = response.urljoin(new_page[0])
+                self.logger.info('New page to be crawled {}'.format(new_page))
+                yield scrapy.Request(new_page,
+                                     callback=self.parse_page,
+                                     meta={'index':1})
+
+    def parse_reply(self,response):
+        '''
+        parse replies to a comment; the root comment is added when flag == 'init'
+        '''
+        if response.meta['flag'] == 'init':
+            #parse root comment
+            for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
+                new = ItemLoader(item=CommentsItem(),selector=root)
+                new.context['lang'] = self.lang
+                new.add_xpath('source', './/h3/a/text()')
+                new.add_value('reply_to','ROOT')
+                new.add_xpath('text','.//div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
+                yield new.load_item()
+            #parse all replies in the page
+            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
+                new = ItemLoader(item=CommentsItem(),selector=reply)
+                new.context['lang'] = self.lang
+                new.add_xpath('source', './/h3/a/text()')
+                new.add_value('reply_to',response.meta['reply_to'])
+                new.add_xpath('text','.//div[h3]/div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
+                yield new.load_item()
+
+            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
+            if back:
+                self.logger.info('Back found, trying to go back')
+                back_page = response.urljoin(back[0])
+                yield scrapy.Request(back_page,
+                                     callback=self.parse_reply,
+                                     priority=100,
+                                     meta={'reply_to':response.meta['reply_to'],
+                                           'flag':'back',
+                                           'url':response.meta['url'],
+                                           'index':response.meta['index']})
+            else:
+                next_reply = response.meta['url']
+                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
+                yield scrapy.Request(next_reply, dont_filter=True,
+                                     callback=self.parse_page,
+                                     meta={'index':response.meta['index']+1})
+
+        elif response.meta['flag'] == 'back':
+            #parse all comments
+            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
+                new = ItemLoader(item=CommentsItem(),selector=reply)
+                new.context['lang'] = self.lang
+                new.add_xpath('source', './/h3/a/text()')
+                new.add_value('reply_to',response.meta['reply_to'])
+                new.add_xpath('text','.//div[h3]/div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
+                yield new.load_item()
+
+            #keep going backwards
+            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
+            if back:
+                self.logger.info('Back found, trying to go back')
+                back_page = response.urljoin(back[0])
+                yield scrapy.Request(back_page,
+                                     callback=self.parse_reply,
+                                     priority=100,
+                                     meta={'reply_to':response.meta['reply_to'],
+                                           'flag':'back',
+                                           'url':response.meta['url'],
+                                           'index':response.meta['index']})
+            else:
+                next_reply = response.meta['url']
+                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
+                yield scrapy.Request(next_reply, dont_filter=True,
+                                     callback=self.parse_page,
+                                     meta={'index':response.meta['index']+1})
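The request meta dict is the entire traversal state for this depth-first walk. A descriptive sketch of the contract the two callbacks pass back and forth (key names from the commit; placeholder values are illustrative):

#parse_page  --flag='init'--> parse_reply   open the index-th replied-to thread
#parse_reply --flag='back'--> parse_reply   keep following 'comment_replies_more_1'
#parse_reply --index+1------> parse_page    thread exhausted, back to the post page
meta = {
    'reply_to': ['comment author'],   #copied onto every reply item
    'url': 'https://mbasic.facebook.com/<post>',   #page to return to afterwards
    'index': 1,       #1-based position of the replied-to thread being walked
    'flag': 'init',   #'init' on first visit, 'back' while paging backwards
}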

fbcrawl/spiders/fbcrawl.py

@@ -10,7 +10,6 @@ class FacebookSpider(scrapy.Spider):
     Parse FB pages (needs credentials)
     """
     name = "fb"
-    is_debug = True
     custom_settings = {
         'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
                                'reactions','likes','ahah','love','wow', \
@@ -21,7 +20,7 @@ class FacebookSpider(scrapy.Spider):
         #turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
         logger = logging.getLogger('scrapy.middleware')
         logger.setLevel(logging.WARNING)
-        super().__init__(**kwargs)
+        super().__init__(*args,**kwargs)
 
         #email & pass need to be passed as attributes!
         if 'email' not in kwargs or 'password' not in kwargs:
@@ -130,7 +129,7 @@ class FacebookSpider(scrapy.Spider):
         #navigate to provided page
         href = response.urljoin(self.page)
         self.logger.info('Scraping facebook page {}'.format(href))
-        return scrapy.Request(url=href,callback=self.parse_page)
+        return scrapy.Request(url=href,callback=self.parse_page,meta={'index':1})
 
     def parse_page(self, response):
         '''
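Since CommentsSpider now inherits the login flow and page navigation from FacebookSpider, it is launched like the fb spider, pointing page at a post. A typical invocation might look like this (the -a attribute names match the parent spider's kwargs; -o is standard Scrapy feed export):

scrapy crawl comments -a email="EMAIL" -a password="PASSWORD" -a page="PAGE_OR_POST_LINK" -o comments.csv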