refactoring comments spider
commit b3d12c4e6b (parent bdeae9f4b5)
@@ -128,6 +128,39 @@ def parse_date(init_date,loader_context):
        month = months[date[1]]
        year = int(date[2])
        return datetime(year,month,day).date()
    #9 ore fa
    elif date[0].isdigit() and date[1] == 'ore':
        if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
            return datetime(year,month,day).date()
        #9 ore fa (ieri)
        else:
            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
            month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
            return datetime(year,month,day).date()
    #ieri alle 20:45
    elif date[0].lower() == 'ieri' and date[1] == 'alle':
        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
        month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
        return datetime(year,month,day).date()
    #oggi alle 11:11
    elif date[0].lower() == 'oggi' and date[1] == 'alle':
        return datetime(year,month,day).date()
    #lunedì alle 12:34
    elif date[0].isalpha() and date[1] == 'alle':
        today = datetime.now().weekday()      #today as a weekday number
        weekday = giorni[date[0].lower()]     #weekday number of the day to be matched
        #the referenced weekday is never later than today within the current week
        delta = today - weekday
        if delta >= 0:
            day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
            month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
            return datetime(year,month,day).date()
        #lunedì = 0, sabato = 6 (e.g. mar = 1, ven = 5), so wrap back to the previous week
        else:
            delta += 8
            day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
            month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
            return datetime(year,month,day).date()
    #parsing failed
    else:
        return date
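Editor's note: the branches added above parse Italian relative timestamps such as "9 ore fa", "ieri alle 20:45", "oggi alle 11:11" and "lunedì alle 12:34" (the `months` and `giorni` lookup tables are defined earlier in items.py). For reference, a standalone sketch of the "last weekday" arithmetic the final branch performs — illustration only, not part of the commit; the `giorni` mapping below is assumed, and this sketch wraps with `+ 7` where the committed branch uses `delta += 8`:

```python
from datetime import datetime, timedelta

# assumed weekday mapping (Monday = 0 ... Sunday = 6), mirroring `giorni` in items.py
giorni = {'lunedì': 0, 'martedì': 1, 'mercoledì': 2, 'giovedì': 3,
          'venerdì': 4, 'sabato': 5, 'domenica': 6}

def last_weekday(name):
    today = datetime.now().weekday()     # today as a weekday number
    delta = today - giorni[name.lower()]
    if delta < 0:                        # named day is later in the week -> previous week
        delta += 7
    return (datetime.now() - timedelta(delta)).date()

print(last_weekday('lunedì'))            # date of the most recent Monday
```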
@@ -427,9 +460,7 @@ def url_strip(url):


class FbcrawlItem(scrapy.Item):
    source = scrapy.Field(
        output_processor=TakeFirst()
    )
    source = scrapy.Field()
    date = scrapy.Field(       # when was the post published
        input_processor=TakeFirst(),
        output_processor=parse_date
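Editor's note: in the hunk above, `source` goes from declaring `TakeFirst()` as its output processor to a bare `scrapy.Field()`. With Scrapy's ItemLoader, a field without an output processor keeps the full list of collected values instead of only the first non-empty one. A quick illustration, not part of the commit:

```python
from scrapy.loader.processors import TakeFirst

values = ['Page Name', 'Page Name (mobile)']   # hypothetical extracted values
print(TakeFirst()(values))   # 'Page Name'  -> what the old `source` field stored
print(values)                # full list    -> what load_item() keeps without TakeFirst
```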
@@ -456,3 +487,29 @@ class FbcrawlItem(scrapy.Item):
        output_processor=url_strip
    )
    shared_from = scrapy.Field()

class CommentsItem(scrapy.Item):
    source = scrapy.Field()
    reply_to = scrapy.Field()
    date = scrapy.Field(       # when was the post published
        output_processor=parse_date
    )
    text = scrapy.Field(
        output_processor=Join(separator=u'')
    )                          # full text of the post
    reactions = scrapy.Field(
        output_processor=reactions_strip
    )                          # num of reactions
    likes = scrapy.Field(
        output_processor=reactions_strip
    )
    ahah = scrapy.Field()
    love = scrapy.Field()
    wow = scrapy.Field()
    sigh = scrapy.Field()
    grrr = scrapy.Field()
    share = scrapy.Field()     # num of shares
    url = scrapy.Field(
        output_processor=url_strip
    )
    shared_from = scrapy.Field()
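Editor's note: the new CommentsItem is meant to be filled through an ItemLoader, as the refactored spider below does; the declared output processors run when `load_item()` is called (`parse_date`, `reactions_strip` and `url_strip` are helpers defined earlier in items.py). A minimal sketch with made-up values, not part of the commit:

```python
from scrapy.loader import ItemLoader
from fbcrawl.items import CommentsItem

loader = ItemLoader(item=CommentsItem())
loader.add_value('source', 'Mario Rossi')            # no output processor: stored as a list
loader.add_value('reply_to', 'ROOT')
loader.add_value('text', ['Che bel ', 'post', '!'])  # Join(u'') -> 'Che bel post!'
print(loader.load_item())
```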
@@ -2,106 +2,134 @@ import scrapy

from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import CommentsItem

class FacebookSpider(scrapy.Spider):
class CommentsSpider(FacebookSpider):
    """
    Parse FB comments, given a page (needs credentials)
    Parse FB comments, given a post (needs credentials)
    """
    name = "comments"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
                               'reactions','likes','ahah','love','wow', \
                               'sigh','grrr','url']
    }

    def __init__(self, email='', password='', page='', **kwargs):
        super(FacebookSpider, self).__init__(**kwargs)

        if not email or not password:
            raise ValueError("You need to provide valid email and password!")
        else:
            self.email = email
            self.password = password

        if not page:
            raise ValueError("You need to provide a valid page name to crawl!")
        else:
            self.page = page

        self.start_urls = ['https://mbasic.facebook.com']

    def parse(self, response):
        return FormRequest.from_response(
            response,
            formxpath='//form[contains(@action, "login")]',
            formdata={'email': self.email,'pass': self.password},
            callback=self.parse_home
        )

    def parse_home(self, response):
        '''Parse user news feed page'''
        if response.css('#approvals_code'):
            # Handle 'Approvals Code' checkpoint (ask user to enter code).
            if not self.code:
                # Show facebook messages via logs
                # and request user for approval code.
                message = response.css('._50f4::text').extract()[0]
                self.log(message)
                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
                self.log(message)
                self.code = input('Enter the code: ')
            self.code = str(self.code)
            if not (self.code and self.code.isdigit()):
                self.log('Bad approvals code detected.')
                return
            return FormRequest.from_response(
                response,
                formdata={'approvals_code': self.code},
                callback=self.parse_home,
            )
        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
            # Handle 'Save Browser' checkpoint.
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home,
                dont_filter=True,
            )
        elif response.css('button#checkpointSubmitButton'):
            # Handle 'Someone tried to log into your account' warning.
            return FormRequest.from_response(
                response, callback=self.parse_home, dont_filter=True,)
        # Else go to the user profile.
        href = response.urljoin(self.page)
        self.logger.info('Parse function called on %s', href)
        return scrapy.Request(
            url=href,
            callback=self.parse_page,
        )

    def __init__(self, *args, **kwargs):
        super().__init__(*args,**kwargs)

    def parse_page(self, response):
        #answer from page
        for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
            # resp = ItemLoader(item=FbcrawlItem(),selector=risposta)
            rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href')
            risp = response.urljoin(rispostina[0].extract())
            yield scrapy.Request(risp, callback=self.parse_rispostina)

        # for i in range(len(rispostina)):
        #     risp = response.urljoin(rispostina[i].extract())
        '''
        parse_page does multiple things:
            1) loads replied-to comments pages one by one (for DFS)
            2) gets common, not-replied-to comments
        '''
        #loads replied-to comments pages
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
        for reply in response.xpath(path):
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
            ans = response.urljoin(answer[::-1][0])
            self.logger.info('Nested comment at page {}'.format(ans))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 meta={'reply_to':source,
                                       'url':response.url,
                                       'index':response.meta['index'],
                                       'flag':'init'})
        #loads regular comments
        if not response.xpath(path):
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for reply in response.xpath(path2):
                new = ItemLoader(item=CommentsItem(),selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source','.//h3/a/text()')
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
                yield new.load_item()
        #
        # for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
        #     new = ItemLoader(item=FbcrawlItem(),selector=post)
        #     new.add_xpath('source', "./div/h3/a/text()")
        #     new.add_xpath('text',"./div[1]/div[1]/text()")
        #     yield new.load_item()
        # #previous comments
        if not response.xpath(path) and not response.xpath(path2):
            for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info('New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'index':1})
        #
        # next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
        # if len(next_page) > 0:
        #     next_page = response.urljoin(next_page[0].extract())
        #     yield scrapy.Request(next_page, callback=self.parse_page)

    def parse_reply(self,response):
        '''
        parse replies to a comment; the root comment is emitted only when flag == 'init'
        '''
        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
                new = ItemLoader(item=CommentsItem(),selector=root)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_value('reply_to','ROOT')
                new.add_xpath('text','.//div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
                new.add_value('url',response.url)
                yield new.load_item()
            #parse all replies in the page
            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
                new = ItemLoader(item=CommentsItem(),selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_value('reply_to',response.meta['reply_to'])
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
                new.add_value('url',response.url)
                yield new.load_item()

    def parse_rispostina(self,response):
        for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all posts
            new = ItemLoader(item=FbcrawlItem(),selector=daje)
            new.add_xpath('source', ".//h3/a/text()")  #| ./div/div/h3/a/text()")
            new.add_xpath('text',".//span[not(contains(text(),' · ')) and not(contains(text(),'Visualizza'))]/text() | .//div/text()")
            yield new.load_item()
            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            if back:
                self.logger.info('Back found, trying to go back')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=100,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
                                           'index':response.meta['index']})
            else:
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply, dont_filter=True,
                                     callback=self.parse_page,
                                     meta={'index':response.meta['index']+1})

        elif response.meta['flag'] == 'back':
            #parse all comments
            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
                new = ItemLoader(item=CommentsItem(),selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_value('reply_to',response.meta['reply_to'])
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
                new.add_value('url',response.url)
                yield new.load_item()
            #keep going backwards
            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            self.logger.info('Back found, trying to go back')
            if back:
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=100,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
                                           'index':response.meta['index']})
            else:
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply, dont_filter=True,
                                     callback=self.parse_page,
                                     meta={'index':response.meta['index']+1})
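Editor's note: taken together, the refactored parse_page/parse_reply walk comment threads depth-first. `meta['index']` selects which replied-to thread to open on the comments page, `flag` distinguishes the first reply page ('init', where the root comment is also emitted) from subsequent 'back' pages, and `meta['url']` remembers the comments page to return to; when a thread has no more back links, the spider re-requests that page with `index + 1`. A toy model of this control flow, illustration only and not part of the commit:

```python
def crawl_comments(threads):
    """threads: list of threads, each thread a list of pages, each page a list of comments."""
    index = 1
    while index <= len(threads):          # parse_page: open the index-th replied-to thread
        for page in threads[index - 1]:   # parse_reply: 'init' page, then each 'back' page
            yield from page               # emit one item per comment on the page
        index += 1                        # thread exhausted -> back to parse_page with index+1

# e.g. two threads, the first spanning two reply pages
items = list(crawl_comments([[['root', 'reply1'], ['reply2']], [['root2']]]))
print(items)                              # ['root', 'reply1', 'reply2', 'root2']
```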
@@ -10,7 +10,6 @@ class FacebookSpider(scrapy.Spider):
    Parse FB pages (needs credentials)
    """
    name = "fb"
    is_debug = True
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
                               'reactions','likes','ahah','love','wow', \
@@ -21,7 +20,7 @@ class FacebookSpider(scrapy.Spider):
        #turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
        logger = logging.getLogger('scrapy.middleware')
        logger.setLevel(logging.WARNING)
        super().__init__(**kwargs)
        super().__init__(*args,**kwargs)

        #email & pass need to be passed as attributes!
        if 'email' not in kwargs or 'password' not in kwargs:
@@ -130,7 +129,7 @@ class FacebookSpider(scrapy.Spider):
        #navigate to provided page
        href = response.urljoin(self.page)
        self.logger.info('Scraping facebook page {}'.format(href))
        return scrapy.Request(url=href,callback=self.parse_page)
        return scrapy.Request(url=href,callback=self.parse_page,meta={'index':1})

    def parse_page(self, response):
        '''
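Editor's note: the fbcrawl.py hunks remove the `is_debug` attribute, forward positional arguments through `super().__init__(*args,**kwargs)` so CommentsSpider can delegate its setup to FacebookSpider, and seed `meta={'index':1}` on the first parse_page request so the comments DFS has a starting counter. Both spiders are normally launched from the command line with `scrapy crawl <name> -a email=... -a password=... -a page=...`; below is a hedged sketch of launching the comments spider programmatically instead (the module path and feed settings are assumptions, not from the commit):

```python
# A minimal sketch, assuming the spider lives in fbcrawl/spiders/comments.py and the
# installed Scrapy version accepts FEED_FORMAT/FEED_URI settings; not part of the commit.
from scrapy.crawler import CrawlerProcess
from fbcrawl.spiders.comments import CommentsSpider

process = CrawlerProcess(settings={'FEED_FORMAT': 'csv', 'FEED_URI': 'comments.csv'})
process.crawl(CommentsSpider,
              email='user@example.com',     # placeholder credentials
              password='secret',
              page='PAGE_NAME')             # placeholder page to crawl
process.start()
```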