Adding profile crawler
parent dad1a19435
commit 0cda010e64
Binary file not shown.
fbcrawl/items.py
@@ -627,3 +627,14 @@ class CommentsItem(scrapy.Item):
     sigh = scrapy.Field()
     grrr = scrapy.Field()
     share = scrapy.Field() # num of shares
+
+class ProfileItem(scrapy.Item):
+    name = scrapy.Field()
+    gender = scrapy.Field()
+    birthday = scrapy.Field()
+    current_city = scrapy.Field()
+    hometown = scrapy.Field()
+    work = scrapy.Field()
+    education = scrapy.Field()
+    interested_in = scrapy.Field()
+    page = scrapy.Field()
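For reference, a minimal sketch (not part of the commit) of how the new ProfileItem behaves as a standard scrapy.Item; the field values are made up and it assumes the fbcrawl package is importable:

from fbcrawl.items import ProfileItem

item = ProfileItem()
item['name'] = 'Jane Doe'          # hypothetical value
item['current_city'] = 'Turin'     # hypothetical value
print(dict(item))                  # -> {'name': 'Jane Doe', 'current_city': 'Turin'}

Assigning any key that is not declared as a scrapy.Field above would raise a KeyError, which is why the spider below only loads the nine declared fields.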
289  fbcrawl/spiders/profiles.py  Normal file
@@ -0,0 +1,289 @@
import scrapy

from scrapy.loader import ItemLoader
from scrapy.exceptions import CloseSpider
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import ProfileItem, parse_date, parse_date2

from datetime import datetime

class ProfileSpider(FacebookSpider):
    """
    Parse FB profiles
    """
    name = "profiles"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['name','gender','birthday','current_city',
                               'hometown','work','education','interested_in',
                               'page'],
        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS' : 1
    }

    def __init__(self, *args, **kwargs):
        if 'post' in kwargs and 'page' in kwargs:
            raise AttributeError('You need to specify only one between post and page')
        elif 'post' in kwargs:
            self.page = kwargs['post']
            self.type = 'post'
        elif 'page' in kwargs:
            self.type = 'page'

        super().__init__(*args,**kwargs)

    def parse_page(self, response):
        '''
        Parse a single post or a whole page of posts, forwarding each post to parse_post.
        '''
        if self.type == 'post':
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_post,
                                 priority=10,
                                 meta={'index':1})
        elif self.type == 'page':
            #select all posts
            for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
                many_features = post.xpath('./@data-ft').get()
                date = []
                date.append(many_features)
                date = parse_date(date,{'lang':self.lang})
                current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date

                if current_date is None:
                    date_string = post.xpath('.//abbr/text()').get()
                    date = parse_date2([date_string],{'lang':self.lang})
                    current_date = datetime(date.year,date.month,date.day) if date is not None else date
                    date = str(date)

                if abs(self.count) + 1 > self.max:
                    raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
                self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))

                #returns full post-link in a list
                post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
                temp_post = response.urljoin(post[0])
                self.count -= 1
                yield scrapy.Request(temp_post,
                                     self.parse_post,
                                     priority = self.count,
                                     meta={'index':1})

            #load the following page, try to click on "more"
            #after a few pages have been scraped, the "more" link might disappear
            #if not present, look for the highest year not parsed yet
            #click once on the year and go back to clicking "more"

            #new_page is different for groups
            if self.group == 1:
                new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
            else:
                new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
                #this is why lang is needed

            if not new_page:
                self.logger.info('[!] "more" link not found, will look for a "year" link')
                #self.k is the year link that we look for
                if response.meta['flag'] == self.k and self.k >= self.year:
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                    if new_page:
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority = -1000,
                                             meta={'flag':self.k})
                    else:
                        while not new_page: #sometimes the years are skipped, this handles small year gaps
                            self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
                            self.k -= 1
                            if self.k < self.year:
                                raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                            new_page = response.xpath(xpath).extract()
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority = -1000,
                                             meta={'flag':self.k})
                else:
                    self.logger.info('Crawling has finished with no errors!')
            else:
                new_page = response.urljoin(new_page[0])
                if 'flag' in response.meta:
                    self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority = -1000,
                                         meta={'flag':response.meta['flag']})
                else:
                    self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority = -1000,
                                         meta={'flag':self.k})

    def parse_post(self, response):
        '''
        parse_post does multiple things:
            1) selects the replied-to comments one-by-one (for DFS)
            2) requests the '/about' profile page of each commenter via parse_profile
            3) does the same for simple (not-replied-to) comments
            4) follows to the next comment page
        '''
        #load replied-to comments pages
        #select nested comment one-by-one matching with the index: response.meta['index']
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']'
        group_flag = response.meta['group'] if 'group' in response.meta else None

        for reply in response.xpath(path):
            rep = reply.xpath('.//h3/a/@href').get()
            profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
            yield scrapy.Request(profile,
                                 callback=self.parse_profile,
                                 priority=1000,
                                 meta={'url':response.url,
                                       'index':response.meta['index'],
                                       'flag':'init',
                                       'group':group_flag})
        #load regular comments
        if not response.xpath(path): #prevents from executing while replied-to comments are still being processed
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for i,reply in enumerate(response.xpath(path2)):
                self.logger.info('{} regular comment'.format(i+1))
                rep = reply.xpath('.//h3/a/@href').get()
                profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     priority=1000,
                                     meta={'url':response.url,
                                           'index':response.meta['index'],
                                           'flag':'init',
                                           'group':group_flag})

        #new comment page
        if not response.xpath(path):
            #for groups
            next_xpath = './/div[contains(@id,"see_next")]'
            prev_xpath = './/div[contains(@id,"see_prev")]'
            if not response.xpath(next_xpath) or group_flag == 1:
                for next_page in response.xpath(prev_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info('New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={'index':1,
                                               'group':1})
            else:
                for next_page in response.xpath(next_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info('New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={'index':1,
                                               'group':group_flag})

    def parse_reply(self,response):
        '''
        parse replies to comments; the root comment's author is also requested when the flag is 'init'
        '''
        # from scrapy.utils.response import open_in_browser
        # open_in_browser(response)

        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
                rep = root.xpath('.//h3/a/@href').get()
                profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     priority=1000,
                                     meta={'url':response.url,
                                           'index':response.meta['index'],
                                           'flag':'init',
                                           'group':response.meta['group']})
            #parse all replies in the page
            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
                rep = reply.xpath('.//h3/a/@href').get()
                profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     priority=1000,
                                     meta={'url':response.url,
                                           'index':response.meta['index'],
                                           'flag':'init',
                                           'group':response.meta['group']})

            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority = 1000,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
                                           'index':response.meta['index'],
                                           'group':response.meta['group']})

            else:
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={'index':response.meta['index']+1,
                                           'group':response.meta['group']})

        elif response.meta['flag'] == 'back':
            #parse all comments
            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
                rep = reply.xpath('.//h3/a/@href').extract()[0]
                profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     priority=1000,
                                     meta={'url':response.url,
                                           'index':response.meta['index'],
                                           'flag':'init',
                                           'group':response.meta['group']})
            #keep going backwards
            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            self.logger.info('Back found, more nested comments')
            if back:
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
                                           'index':response.meta['index'],
                                           'group':response.meta['group']})

            else:
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={'index':response.meta['index']+1,
                                           'group':response.meta['group']})

    def parse_profile(self,response):
        new = ItemLoader(item=ProfileItem(),response=response)
        self.logger.info('Crawling profile info')
        new.add_xpath('name','//span/div/span/strong/text()')
        new.add_xpath('gender',"//div[@id='basic-info']//div[@title='Gender']//div/text()")
        new.add_xpath('birthday',"//div[@id='basic-info']//div[@title='Birthday']//div/text()")
        new.add_xpath('current_city',"//div[@id='living']//div[@title='Current City']//a/text()")
        new.add_xpath('hometown',"//div[@id='living']//div[@title='Hometown']//a/text()")
        new.add_xpath('work',"//div[@id='work']//a/text()")
        new.add_xpath('education',"//div[@id='education']//a/text()")
        new.add_xpath('interested_in',"//div[@id='interested-in']//div[not(contains(text(),'Interested In'))]/text()")
        new.add_xpath('page',"//div[@id='contact-info']//div[@title='Facebook']//div/text()")
        yield new.load_item()
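To make the string handling above concrete, here is a minimal sketch (not part of the commit) of the link manipulation that parse_post and parse_reply perform on every commenter link before requesting parse_profile; the href value is hypothetical:

# hypothetical comment-author link as it appears on mbasic.facebook.com
rep = '/some.user?rc=p&refid=18'
# drop the query string (everything from '?rc') and point at the About page
profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
print(profile)  # https://mbasic.facebook.com/some.user/about

The spider itself would presumably be launched like the existing fb and comments spiders, e.g. scrapy crawl profiles -a email="EMAIL" -a password="PASSWORD" -a post="POSTLINK" -o profiles.csv, passing either post or page but not both (the __init__ above raises AttributeError otherwise).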
3780  trump_comments.csv  Normal file
File diff suppressed because it is too large