Adding events crawler
This commit is contained in:
parent
be8a9c2f5f
commit
5c3128cfef
@ -638,3 +638,12 @@ class ProfileItem(scrapy.Item):
|
||||
education = scrapy.Field()
|
||||
interested_in = scrapy.Field()
|
||||
page = scrapy.Field()
|
||||
|
||||
class EventsItem(scrapy.Item):
    # One scraped Facebook event; field names match the
    # FEED_EXPORT_FIELDS order used by the "events" spider.
    name = scrapy.Field()         # event title
    location = scrapy.Field()     # venue address / place detail
    where = scrapy.Field()        # venue name
    photo = scrapy.Field()        # URL of the event cover image
    start_date = scrapy.Field()   # raw text before the '–' separator
    end_date = scrapy.Field()     # raw text after the '–' separator
    description = scrapy.Field()  # event description text
|
58
fbcrawl/spiders/events.py
Normal file
58
fbcrawl/spiders/events.py
Normal file
@ -0,0 +1,58 @@
|
||||
import scrapy
|
||||
|
||||
from scrapy.loader import ItemLoader
|
||||
from scrapy.exceptions import CloseSpider
|
||||
from fbcrawl.spiders.fbcrawl import FacebookSpider
|
||||
from fbcrawl.items import EventsItem, parse_date, parse_date2
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
class EventsSpider(FacebookSpider):
    """
    Parse FB events, given a page (needs credentials).

    Crawls ``<page>/events`` on the mobile site, follows every event
    link found in the listing table and yields one ``EventsItem`` per
    event page.
    """
    name = "events"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['name','where','location','photo','start_date', \
                               'end_date','description'],
        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS' : 1
    }

    def __init__(self, *args, **kwargs):
        # Fail early with a clear spider error instead of a bare
        # KeyError when the mandatory -a page=... argument is missing.
        if 'page' not in kwargs:
            raise CloseSpider('Missing required argument "page"')
        self.page = kwargs['page']
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Jump from the page's front page to its /events listing."""
        yield scrapy.Request(url=response.urljoin('%s/events' % self.page),
                             callback=self.parse_events,
                             priority=10,
                             meta={'index': 1})

    def parse_events(self, response):
        """Follow the link of each event listed on the events page."""
        TABLE_XPATH = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div/div/div[2]/div/table/tbody/tr'
        for event in response.xpath(TABLE_XPATH):
            # BUGFIX: the inner XPath must be relative ('.//'); the
            # original absolute '//td/...' searched the whole document
            # and resolved to the same first link for every row.
            url = event.xpath('.//td/div/div/span[3]/div/a[1]/@href').extract_first()
            # Rows without a link (layout variations) are skipped
            # instead of crashing response.follow(None).
            if url is not None:
                yield response.follow(url, callback=self.parse_event)

    def parse_event(self, response):
        """Extract a single EventsItem from one event page."""
        EVENT_NAME='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[2]/div[1]/h3/text()'
        EVENT_WHERE='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_LOCATION='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dd/div/text()'
        DATE='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[1]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_DESCRIPTION='/html/body/div/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td/div[2]/div[2]/div[2]/div[2]/text()'
        EVENT_COVER='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[1]/a/img/@src'

        date = response.xpath(DATE).extract_first()
        # The date string is usually "start – end", but the node may be
        # missing and the '–' separator may be absent (single-day
        # events); the original unconditional split crashed on both.
        parts = date.split('–') if date else []
        start_date = (parts[0] or None) if parts else None
        end_date = (parts[1] or None) if len(parts) > 1 else None

        name = response.xpath(EVENT_NAME).extract_first()
        self.logger.info('Parsing event %s' % name)
        yield EventsItem(
            name=name,
            where=response.xpath(EVENT_WHERE).extract_first(),
            location=response.xpath(EVENT_LOCATION).extract_first(),
            photo=response.xpath(EVENT_COVER).extract_first(),
            start_date=start_date,
            end_date=end_date,
            description=response.xpath(EVENT_DESCRIPTION).extract_first()
        )
|
Loading…
Reference in New Issue
Block a user