refactoring comments spider

parent bdeae9f4b5
commit b3d12c4e6b
@@ -128,6 +128,39 @@ def parse_date(init_date,loader_context):
         month = months[date[1]]
         year = int(date[2])
         return datetime(year,month,day).date()
+    #9 ore fa
+    elif date[0].isdigit() and date[1] == 'ore':
+        if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0:
+            return datetime(year,month,day).date()
+        #9 ore fa (ieri)
+        else:
+            day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+            return datetime(year,month,day).date()
+    #ieri alle 20:45
+    elif date[0].lower() == 'ieri' and date[1] == 'alle':
+        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
+        month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1])
+        return datetime(year,month,day).date()
+    #oggi alle 11:11
+    elif date[0].lower() == 'oggi' and date[1] == 'alle':
+        return datetime(year,month,day).date()
+    #lunedì alle 12:34
+    elif date[0].isalpha() and date[1] == 'alle':
+        today = datetime.now().weekday() #today as a weekday
+        weekday = giorni[date[0].lower()] #day to be matched as a weekday number
+        #weekday is chronologically always lower than day
+        delta = today - weekday
+        if delta >= 0:
+            day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+            return datetime(year,month,day).date()
+        #lunedì = 0, sabato = 6; mar 1, ven 5
+        else:
+            delta += 8
+            day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2])
+            month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1])
+            return datetime(year,month,day).date()
     #parsing failed
     else:
         return date
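The new elif branches translate Italian relative phrases ("9 ore fa", "ieri alle 20:45", "oggi alle 11:11", "lunedì alle 12:34") into absolute dates with datetime and timedelta. Below is a minimal standalone sketch of the same idea, for illustration only: it simplifies the string handling and does not use the months and giorni lookup tables that the real parse_date relies on elsewhere in items.py.

from datetime import datetime, timedelta

def resolve_relative(phrase):
    # Simplified restatement of the new branches, not the patch's exact code.
    date = phrase.split()
    if date[0].lower() == 'oggi' and date[1] == 'alle':       # "oggi alle 11:11" -> today
        return datetime.now().date()
    elif date[0].lower() == 'ieri' and date[1] == 'alle':     # "ieri alle 20:45" -> yesterday
        return (datetime.now() - timedelta(1)).date()
    elif date[0].isdigit() and date[1] == 'ore':              # "9 ore fa" -> today or yesterday
        return (datetime.now() - timedelta(hours=int(date[0]))).date()
    return phrase                                             # parsing failed, keep raw value

print(resolve_relative('ieri alle 20:45'))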
@@ -427,9 +460,7 @@ def url_strip(url):
 
 
 class FbcrawlItem(scrapy.Item):
-    source = scrapy.Field(
-        output_processor=TakeFirst()
-    )
+    source = scrapy.Field()
     date = scrapy.Field( # when was the post published
         input_processor=TakeFirst(),
         output_processor=parse_date
@@ -456,3 +487,29 @@ class FbcrawlItem(scrapy.Item):
         output_processor=url_strip
     )
     shared_from = scrapy.Field()
+
+class CommentsItem(scrapy.Item):
+    source = scrapy.Field()
+    reply_to=scrapy.Field()
+    date = scrapy.Field( # when was the post published
+        output_processor=parse_date
+    )
+    text = scrapy.Field(
+        output_processor=Join(separator=u'')
+    ) # full text of the post
+    reactions = scrapy.Field(
+        output_processor=reactions_strip
+    ) # num of reactions
+    likes = scrapy.Field(
+        output_processor=reactions_strip
+    )
+    ahah = scrapy.Field()
+    love = scrapy.Field()
+    wow = scrapy.Field()
+    sigh = scrapy.Field()
+    grrr = scrapy.Field()
+    share = scrapy.Field() # num of shares
+    url = scrapy.Field(
+        output_processor=url_strip
+    )
+    shared_from = scrapy.Field()
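The new CommentsItem mirrors FbcrawlItem but adds a reply_to field and drops most per-field input processors. A rough usage sketch of how the declared output processors behave when an ItemLoader populates it (this assumes it runs inside the fbcrawl project so the import resolves; the values are placeholders):

from scrapy.loader import ItemLoader
from fbcrawl.items import CommentsItem

loader = ItemLoader(item=CommentsItem())
loader.add_value('source', 'Some User')                     # no processor declared: stays a list
loader.add_value('reply_to', 'ROOT')
loader.add_value('text', ['first part, ', 'second part'])   # Join(separator=u'') concatenates
item = loader.load_item()
print(item)   # 'date' is left out here; it would be run through parse_date when loaded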
@@ -2,106 +2,134 @@ import scrapy
 
 from scrapy.loader import ItemLoader
 from scrapy.http import FormRequest
-from fbcrawl.items import FbcrawlItem
+from fbcrawl.spiders.fbcrawl import FacebookSpider
+from fbcrawl.items import CommentsItem
 
-class FacebookSpider(scrapy.Spider):
+
+class CommentsSpider(FacebookSpider):
     """
-    Parse FB comments, given a page (needs credentials)
+    Parse FB comments, given a post (needs credentials)
     """
     name = "comments"
+    custom_settings = {
+        'FEED_EXPORT_FIELDS': ['source','reply_to','date','text', \
+                               'reactions','likes','ahah','love','wow', \
+                               'sigh','grrr','url']
+    }
 
-    def __init__(self, email='', password='', page='', **kwargs):
-        super(FacebookSpider, self).__init__(**kwargs)
-
-        if not email or not password:
-            raise ValueError("You need to provide valid email and password!")
-        else:
-            self.email = email
-            self.password = password
-
-        if not page:
-            raise ValueError("You need to provide a valid page name to crawl!")
-        else:
-            self.page = page
-
-        self.start_urls = ['https://mbasic.facebook.com']
-
-    def parse(self, response):
-        return FormRequest.from_response(
-            response,
-            formxpath='//form[contains(@action, "login")]',
-            formdata={'email': self.email,'pass': self.password},
-            callback=self.parse_home
-        )
-
-    def parse_home(self, response):
-        '''Parse user news feed page'''
-        if response.css('#approvals_code'):
-            # Handle 'Approvals Code' checkpoint (ask user to enter code).
-            if not self.code:
-                # Show facebook messages via logs
-                # and request user for approval code.
-                message = response.css('._50f4::text').extract()[0]
-                self.log(message)
-                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
-                self.log(message)
-                self.code = input('Enter the code: ')
-            self.code = str(self.code)
-            if not (self.code and self.code.isdigit()):
-                self.log('Bad approvals code detected.')
-                return
-            return FormRequest.from_response(
-                response,
-                formdata={'approvals_code': self.code},
-                callback=self.parse_home,
-            )
-        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
-            # Handle 'Save Browser' checkpoint.
-            return FormRequest.from_response(
-                response,
-                formdata={'name_action_selected': 'dont_save'},
-                callback=self.parse_home,
-                dont_filter=True,
-            )
-        elif response.css('button#checkpointSubmitButton'):
-            # Handle 'Someone tried to log into your account' warning.
-            return FormRequest.from_response(
-                response, callback=self.parse_home, dont_filter=True,)
-        # Else go to the user profile.
-        href = response.urljoin(self.page)
-        self.logger.info('Parse function called on %s', href)
-        return scrapy.Request(
-            url=href,
-            callback=self.parse_page,
-        )
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args,**kwargs)
 
     def parse_page(self, response):
-        #answer from page
-        for risposta in response.xpath('./div[string-length(@class) = 5 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
-#            resp = ItemLoader(item=FbcrawlItem(),selector=risposta)
-            rispostina = risposta.xpath('./a[@href and text()="Altro"]/@href')
-            risp = response.urljoin(rispostina[0].extract())
-            yield scrapy.Request(risp, callback=self.parse_rispostina)
-
-#        for i in range(len(rispostina)):
-#            risp = response.urljoin(rispostina[i].extract())
-#
-#        for post in response.xpath('//div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): #select all posts
-#            new = ItemLoader(item=FbcrawlItem(),selector=post)
-#            new.add_xpath('source', "./div/h3/a/text()")
-#            new.add_xpath('text',"./div[1]/div[1]/text()")
-#            yield new.load_item()
-#
-#        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
-#        if len(next_page) > 0:
-#            next_page = response.urljoin(next_page[0].extract())
-#            yield scrapy.Request(next_page, callback=self.parse_page)
-
-    def parse_rispostina(self,response):
-        for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"): #select all posts
-            new = ItemLoader(item=FbcrawlItem(),selector=daje)
-            new.add_xpath('source', ".//h3/a/text()")#| ./div/div/h3/a/text()")
-            new.add_xpath('text',".//span[not(contains(text(),' · ')) and not(contains(text(),'Visualizza'))]/text() | .//div/text()")
-            yield new.load_item()
+        '''
+        parse page does multiple things:
+            1) loads replied-to-comments page one-by-one (for DFS)
+            2) gets common not-replied-to comments
+        '''
+        #loads replied-to comments pages
+        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
+        for reply in response.xpath(path):
+            source = reply.xpath('.//h3/a/text()').extract()
+            answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
+            ans = response.urljoin(answer[::-1][0])
+            self.logger.info('Nested comment at page {}'.format(ans))
+            yield scrapy.Request(ans,
+                                 callback=self.parse_reply,
+                                 meta={'reply_to':source,
+                                       'url':response.url,
+                                       'index':response.meta['index'],
+                                       'flag':'init'})
+        #loads regular comments
+        if not response.xpath(path):
+            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
+            for reply in response.xpath(path2):
+                new = ItemLoader(item=CommentsItem(),selector=reply)
+                new.context['lang'] = self.lang
+                new.add_xpath('source','.//h3/a/text()')
+                new.add_xpath('text','.//div[h3]/div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                yield new.load_item()
+#
+#        #previous comments
+        if not response.xpath(path) and not response.xpath(path2):
+            for next_page in response.xpath('.//div[contains(@id,"see_next")]'):
+                new_page = next_page.xpath('.//@href').extract()
+                new_page = response.urljoin(new_page[0])
+                self.logger.info('New page to be crawled {}'.format(new_page))
+                yield scrapy.Request(new_page,
+                                     callback=self.parse_page,
+                                     meta={'index':1})
+#
+    def parse_reply(self,response):
+        '''
+        parse reply to comments, root comment is added if flag
+        '''
+        if response.meta['flag'] == 'init':
+            #parse root comment
+            for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
+                new = ItemLoader(item=CommentsItem(),selector=root)
+                new.context['lang'] = self.lang
+                new.add_xpath('source', './/h3/a/text()')
+                new.add_value('reply_to','ROOT')
+                new.add_xpath('text','.//div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
+                yield new.load_item()
+            #parse all replies in the page
+            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
+                new = ItemLoader(item=CommentsItem(),selector=reply)
+                new.context['lang'] = self.lang
+                new.add_xpath('source', './/h3/a/text()')
+                new.add_value('reply_to',response.meta['reply_to'])
+                new.add_xpath('text','.//div[h3]/div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
+                yield new.load_item()
+
+            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
+            if back:
+                self.logger.info('Back found, trying to go back')
+                back_page = response.urljoin(back[0])
+                yield scrapy.Request(back_page,
+                                     callback=self.parse_reply,
+                                     priority=100,
+                                     meta={'reply_to':response.meta['reply_to'],
+                                           'flag':'back',
+                                           'url':response.meta['url'],
+                                           'index':response.meta['index']})
+            else:
+                next_reply = response.meta['url']
+                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
+                yield scrapy.Request(next_reply, dont_filter=True,
+                                     callback=self.parse_page,
+                                     meta={'index':response.meta['index']+1})
+
+        elif response.meta['flag'] == 'back':
+            #parse all comments
+            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
+                new = ItemLoader(item=CommentsItem(),selector=reply)
+                new.context['lang'] = self.lang
+                new.add_xpath('source', './/h3/a/text()')
+                new.add_value('reply_to',response.meta['reply_to'])
+                new.add_xpath('text','.//div[h3]/div[1]//text()')
+                new.add_xpath('date','.//abbr/text()')
+                new.add_value('url',response.url)
+                yield new.load_item()
+            #keep going backwards
+            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
+            self.logger.info('Back found, trying to go back')
+            if back:
+                back_page = response.urljoin(back[0])
+                yield scrapy.Request(back_page,
+                                     callback=self.parse_reply,
+                                     priority=100,
+                                     meta={'reply_to':response.meta['reply_to'],
+                                           'flag':'back',
+                                           'url':response.meta['url'],
+                                           'index':response.meta['index']})
+            else:
+                next_reply = response.meta['url']
+                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
+                yield scrapy.Request(next_reply, dont_filter=True,
+                                     callback=self.parse_page,
+                                     meta={'index':response.meta['index']+1})
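The refactored spider now inherits login and checkpoint handling from FacebookSpider instead of duplicating it, and would typically be launched with the usual scrapy arguments (for example scrapy crawl comments -a email=... -a password=... -a page=...), since the parent __init__ rejects missing credentials. Below is a pure-Python illustration (no Scrapy involved, all values are placeholders) of the meta hand-off sketched above: parse_page sends flag 'init' for one replied-to thread, parse_reply keeps following "previous replies" links with flag 'back', and control returns to parse_page with the index advanced so the next thread is picked up.

def walk_thread(index, back_pages):
    # Illustration only: mimics the meta dict carried between parse_page and parse_reply.
    meta = {'reply_to': 'some user', 'url': '/post', 'index': index, 'flag': 'init'}
    while back_pages:                      # simulate comment_replies_more_1 "back" links
        meta = {**meta, 'flag': 'back'}
        back_pages -= 1
    return {'index': meta['index'] + 1}    # next parse_page pass starts on the next thread

print(walk_thread(1, back_pages=2))        # -> {'index': 2}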
@@ -10,7 +10,6 @@ class FacebookSpider(scrapy.Spider):
     Parse FB pages (needs credentials)
     """
     name = "fb"
-    is_debug = True
     custom_settings = {
         'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \
                                'reactions','likes','ahah','love','wow', \
@@ -21,7 +20,7 @@ class FacebookSpider(scrapy.Spider):
         #turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs
         logger = logging.getLogger('scrapy.middleware')
         logger.setLevel(logging.WARNING)
-        super().__init__(**kwargs)
+        super().__init__(*args,**kwargs)
 
         #email & pass need to be passed as attributes!
         if 'email' not in kwargs or 'password' not in kwargs:
@@ -130,7 +129,7 @@ class FacebookSpider(scrapy.Spider):
         #navigate to provided page
         href = response.urljoin(self.page)
         self.logger.info('Scraping facebook page {}'.format(href))
-        return scrapy.Request(url=href,callback=self.parse_page)
+        return scrapy.Request(url=href,callback=self.parse_page,meta={'index':1})
 
     def parse_page(self, response):
         '''
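The meta={'index':1} added here is what seeds CommentsSpider.parse_page above: the index is appended to the replied-to-comments XPath so that only one thread is selected per pass, and it is incremented each time a nested crawl finishes. A small illustration of how the index turns the base expression into a positional selector:

# Illustration only: the base expression is the one built in CommentsSpider.parse_page.
base = ('.//div[string-length(@class) = 2 and count(@id)=1 '
        'and contains("0123456789", substring(@id,1,1)) '
        'and .//div[contains(@id,"comment_replies")]]')

for index in (1, 2, 3):
    path = base + '[' + str(index) + ']'   # [1], [2], [3]: one replied-to thread per pass
    print(path)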