refactoring comments spider

rugantio 2019-02-18 02:12:52 +01:00
parent bdeae9f4b5
commit b3d12c4e6b
6 changed files with 188 additions and 104 deletions

View File: fbcrawl/items.py

@@ -128,6 +128,39 @@ def parse_date(init_date,loader_context):
month = months[date[1]]
year = int(date[2])
return datetime(year,month,day).date()
#"9 ore fa" (9 hours ago)
elif date[0].isdigit() and date[1] == 'ore':
if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
return datetime(year,month,day).date()
#"9 ore fa" (9 hours ago) crossing midnight, i.e. yesterday
else:
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#"ieri alle 20:45" (yesterday at 20:45)
elif date[0].lower() == 'ieri' and date[1] == 'alle':
day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
return datetime(year,month,day).date()
#"oggi alle 11:11" (today at 11:11)
elif date[0].lower() == 'oggi' and date[1] == 'alle':
return datetime(year,month,day).date()
#"lunedì alle 12:34" (Monday at 12:34)
elif date[0].isalpha() and date[1] == 'alle':
today = datetime.now().weekday() #today as a weekday
weekday = giorni[date[0].lower()] #the named day, as a weekday number
#a non-negative delta means the named day already occurred this week
delta = today - weekday
if delta >= 0:
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#weekday(): lunedì (Mon) = 0 ... domenica (Sun) = 6, e.g. mar = 1, ven = 4
else:
delta += 7 #the named day is later in the week than today, so it was last week
day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
return datetime(year,month,day).date()
#parsing failed
else:
return date
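The weekday branch above is the subtle case: mbasic Facebook only shows a bare weekday name for comments from the last week, so a delta that is not positive has to wrap back to the previous week. A minimal standalone sketch of that arithmetic, assuming a giorni mapping like the one parse_date relies on:

```python
from datetime import datetime, timedelta

#assumed to mirror the giorni dict that parse_date uses
giorni = {'lunedì':0,'martedì':1,'mercoledì':2,'giovedì':3,
          'venerdì':4,'sabato':5,'domenica':6}

def date_from_weekday(name, now=None):
    '''resolve a bare Italian weekday name to the most recent past date'''
    now = now or datetime.now()
    delta = now.weekday() - giorni[name.lower()]
    if delta <= 0: #today or "ahead" of today: FB would say "oggi", so wrap a week back
        delta += 7
    return (now - timedelta(days=delta)).date()

#the commit date, 2019-02-18, was a Monday, so "sabato" resolves to 2019-02-16
assert date_from_weekday('sabato', now=datetime(2019,2,18)) == datetime(2019,2,16).date()
```

Treating a zero delta as last week matches the fact that a same-day comment would be labelled "oggi", not with its weekday name.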
@@ -427,9 +460,7 @@ def url_strip(url):
class FbcrawlItem(scrapy.Item):
source = scrapy.Field(
output_processor=TakeFirst()
)
source = scrapy.Field()
date = scrapy.Field( # when was the post published
input_processor=TakeFirst(),
output_processor=parse_date
@@ -456,3 +487,29 @@ class FbcrawlItem(scrapy.Item):
output_processor=url_strip
)
shared_from = scrapy.Field()
class CommentsItem(scrapy.Item):
source = scrapy.Field()
reply_to = scrapy.Field() # author of the comment being replied to, or 'ROOT'
date = scrapy.Field( # when was the comment published
output_processor=parse_date
)
text = scrapy.Field(
output_processor=Join(separator=u'')
) # full text of the comment
reactions = scrapy.Field(
output_processor=reactions_strip
) # num of reactions
likes = scrapy.Field(
output_processor=reactions_strip
)
ahah = scrapy.Field()
love = scrapy.Field()
wow = scrapy.Field()
sigh = scrapy.Field()
grrr = scrapy.Field()
share = scrapy.Field() # num of shares
url = scrapy.Field(
output_processor=url_strip
)
shared_from = scrapy.Field()
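For reference, these processors fire when an ItemLoader fills a CommentsItem: input processors run on each add_* call, output processors when load_item() is invoked. A minimal sketch against an invented HTML fragment shaped like the markup the spider targets:

```python
from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from fbcrawl.items import CommentsItem

html = '<div><h3><a>Mario</a></h3><div>bel post!</div><abbr>oggi alle 11:11</abbr></div>'
loader = ItemLoader(item=CommentsItem(), selector=Selector(text=html))
loader.context['lang'] = 'it' #the spiders set this before loading
loader.add_xpath('source', './/h3/a/text()')
loader.add_xpath('text', './/div/div//text()')  #Join('') glues the text nodes together
loader.add_xpath('date', './/abbr/text()')      #parse_date resolves "oggi alle 11:11"
item = loader.load_item()
```

The "oggi alle 11:11" value goes through parse_date, which resolves it to today's date as shown in the hunk above.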

View File: fbcrawl/spiders/comments.py

@@ -2,106 +2,134 @@ import scrapy
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import CommentsItem
class FacebookSpider(scrapy.Spider):
class CommentsSpider(FacebookSpider):
"""
Parse FB comments, given a page (needs credentials)
Parse FB comments, given a post (needs credentials)
"""
name = "comments"
custom_settings = {
'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
'reactions','likes','ahah','love','wow', \
'sigh','grrr','url']
}
def __init__(self, email='', password='', page='', **kwargs):
super(FacebookSpider, self).__init__(**kwargs)
if not email or not password:
raise ValueError("You need to provide valid email and password!")
else:
self.email = email
self.password = password
if not page:
raise ValueError("You need to provide a valid page name to crawl!")
else:
self.page = page
self.start_urls = ['https://mbasic.facebook.com']
def parse(self, response):
return FormRequest.from_response(
response,
formxpath='//form[contains(@action, "login")]',
formdata={'email': self.email,'pass': self.password},
callback=self.parse_home
)
def parse_home(self, response):
'''Parse user news feed page'''
if response.css('#approvals_code'):
# Handle 'Approvals Code' checkpoint (ask user to enter code).
if not self.code:
# Show facebook messages via logs
# and request user for approval code.
message = response.css('._50f4::text').extract()[0]
self.log(message)
message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
self.log(message)
self.code = input('Enter the code: ')
self.code = str(self.code)
if not (self.code and self.code.isdigit()):
self.log('Bad approvals code detected.')
return
return FormRequest.from_response(
response,
formdata={'approvals_code': self.code},
callback=self.parse_home,
)
elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
# Handle 'Save Browser' checkpoint.
return FormRequest.from_response(
response,
formdata={'name_action_selected': 'dont_save'},
callback=self.parse_home,
dont_filter=True,
)
elif response.css('button#checkpointSubmitButton'):
# Handle 'Someone tried to log into your account' warning.
return FormRequest.from_response(
response, callback=self.parse_home, dont_filter=True,)
# Else go to the user profile.
href = response.urljoin(self.page)
self.logger.info('Parse function called on %s', href)
return scrapy.Request(
url=href,
callback=self.parse_page,
)
def __init__(self, *args, **kwargs):
super().__init__(*args,**kwargs)
def parse_page(self, response):
#answer from page
for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
# resp = ItemLoader(item=FbcrawlItem(),selector=risposta)
rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href')
risp = response.urljoin(rispostina[0].extract())
yield scrapy.Request(risp, callback=self.parse_rispostina)
# for i in range(len(rispostina)):
# risp = response.urljoin(rispostina[i].extract())
'''
parse_page does multiple things:
1) loads the replied-to comments pages one by one (depth-first)
2) gets the plain, not-replied-to comments
'''
#loads replied-to comments pages
path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
for reply in response.xpath(path):
source = reply.xpath('.//h3/a/text()').extract()
answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
ans = response.urljoin(answer[::-1][0]) #the last "repl" link opens the replies page
self.logger.info('Nested comment at page {}'.format(ans))
yield scrapy.Request(ans,
callback=self.parse_reply,
meta={'reply_to':source,
'url':response.url,
'index':response.meta['index'],
'flag':'init'})
#loads regular comments
if not response.xpath(path):
path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
for reply in response.xpath(path2):
new = ItemLoader(item=CommentsItem(),selector=reply)
new.context['lang'] = self.lang
new.add_xpath('source','.//h3/a/text()')
new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()')
yield new.load_item()
#
# for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
# new = ItemLoader(item=FbcrawlItem(),selector=post)
# new.add_xpath('source', "./div/h3/a/text()")
# new.add_xpath('text',"./div[1]/div[1]/text()")
# yield new.load_item()
#previous comments
if not response.xpath(path) and not response.xpath(path2):
for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
new_page = next_page.xpath('.//@href').extract()
new_page = response.urljoin(new_page[0])
self.logger.info('New page to be crawled {}'.format(new_page))
yield scrapy.Request(new_page,
callback=self.parse_page,
meta={'index':1})
#
# next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
# if len(next_page) > 0:
# next_page = response.urljoin(next_page[0].extract())
# yield scrapy.Request(next_page, callback=self.parse_page)
def parse_reply(self,response):
'''
parse replies to comments; the root comment is scraped too when flag == 'init'
'''
if response.meta['flag'] == 'init':
#parse root comment
for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
new = ItemLoader(item=CommentsItem(),selector=root)
new.context['lang'] = self.lang
new.add_xpath('source', './/h3/a/text()')
new.add_value('reply_to','ROOT')
new.add_xpath('text','.//div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_value('url',response.url)
yield new.load_item()
#parse all replies in the page
for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
new = ItemLoader(item=CommentsItem(),selector=reply)
new.context['lang'] = self.lang
new.add_xpath('source', './/h3/a/text()')
new.add_value('reply_to',response.meta['reply_to'])
new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_value('url',response.url)
yield new.load_item()
def parse_rispostina(self,response):
for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all posts
new = ItemLoader(item=FbcrawlItem(),selector=daje)
new.add_xpath('source', ".//h3/a/text()")#| ./div/div/h3/a/text()")
new.add_xpath('text',".//span[not(contains(text(),' · ')) and not(contains(text(),'Visualizza'))]/text() | .//div/text()")
yield new.load_item()
back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
if back:
self.logger.info('Back found, trying to go back')
back_page = response.urljoin(back[0])
yield scrapy.Request(back_page,
callback=self.parse_reply,
priority=100,
meta={'reply_to':response.meta['reply_to'],
'flag':'back',
'url':response.meta['url'],
'index':response.meta['index']})
else:
next_reply = response.meta['url']
self.logger.info('Nested comments crawl finished, heading back to the comments page: {}'.format(response.meta['url']))
yield scrapy.Request(next_reply, dont_filter=True,
callback=self.parse_page,
meta={'index':response.meta['index']+1})
elif response.meta['flag'] == 'back':
#parse all comments
for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
new = ItemLoader(item=CommentsItem(),selector=reply)
new.context['lang'] = self.lang
new.add_xpath('source', './/h3/a/text()')
new.add_value('reply_to',response.meta['reply_to'])
new.add_xpath('text','.//div[h3]/div[1]//text()')
new.add_xpath('date','.//abbr/text()')
new.add_value('url',response.url)
yield new.load_item()
#keep going backwards
back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
if back:
self.logger.info('Back found, trying to go back')
back_page = response.urljoin(back[0])
yield scrapy.Request(back_page,
callback=self.parse_reply,
priority=100,
meta={'reply_to':response.meta['reply_to'],
'flag':'back',
'url':response.meta['url'],
'index':response.meta['index']})
else:
next_reply = response.meta['url']
self.logger.info('Nested comments crawl finished, heading back to the comments page: {}'.format(response.meta['url']))
yield scrapy.Request(next_reply, dont_filter=True,
callback=self.parse_page,
meta={'index':response.meta['index']+1})
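Since the login flow now lives in the FacebookSpider base class, the comments spider can be launched like any other scrapy spider. A hypothetical programmatic run (credentials and the post link are placeholders):

```python
from scrapy.crawler import CrawlerProcess
from fbcrawl.spiders.comments import CommentsSpider

process = CrawlerProcess(settings={'FEED_FORMAT': 'csv', 'FEED_URI': 'comments.csv'})
#email, password and page are the -a options the base spider validates;
#it may expect more (e.g. lang), depending on its checks
process.crawl(CommentsSpider, email='me@example.com', password='secret',
              page='/story.php?story_fbid=...&id=...')
process.start()
```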

View File: fbcrawl/spiders/fbcrawl.py

@@ -10,7 +10,6 @@ class FacebookSpider(scrapy.Spider):
Parse FB pages (needs credentials)
"""
name = "fb"
is_debug = True
custom_settings = {
'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
'reactions','likes','ahah','love','wow', \
@@ -21,7 +20,7 @@ class FacebookSpider(scrapy.Spider):
#turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
logger = logging.getLogger('scrapy.middleware')
logger.setLevel(logging.WARNING)
super().__init__(**kwargs)
super().__init__(*args,**kwargs)
#email & pass need to be passed as attributes!
if 'email' not in kwargs or 'password' not in kwargs:
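Forwarding *args alongside **kwargs matters now that CommentsSpider inherits from FacebookSpider: without it, positional arguments passed to the subclass would be dropped before reaching scrapy.Spider. A minimal sketch of the pattern:

```python
import scrapy

class Base(scrapy.Spider):
    name = 'base'
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs) #everything reaches scrapy.Spider intact

class Child(Base):
    name = 'child'
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs) #nothing lost between the two levels
```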
@@ -130,7 +129,7 @@ class FacebookSpider(scrapy.Spider):
#navigate to provided page
href = response.urljoin(self.page)
self.logger.info('Scraping facebook page {}'.format(href))
return scrapy.Request(url=href,callback=self.parse_page)
return scrapy.Request(url=href,callback=self.parse_page,meta={'index':1})
def parse_page(self, response):
'''