Adding events crawler

StefanYohansson 2019-06-29 22:01:04 -03:00
parent be8a9c2f5f
commit 5c3128cfef
2 changed files with 171 additions and 104 deletions

fbcrawl/items.py

@@ -638,3 +638,12 @@ class ProfileItem(scrapy.Item):
    education = scrapy.Field()
    interested_in = scrapy.Field()
    page = scrapy.Field()
class EventsItem(scrapy.Item):
    name = scrapy.Field()
    location = scrapy.Field()
    where = scrapy.Field()
    photo = scrapy.Field()
    start_date = scrapy.Field()
    end_date = scrapy.Field()
    description = scrapy.Field()
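
The spider below imports Scrapy's ItemLoader but ends up filling EventsItem directly with keyword arguments. For comparison, a minimal sketch (not part of this commit) of populating the same item through an ItemLoader, which is where input processors such as the imported parse_date helpers could later hook in; the XPaths here are placeholders:

from scrapy.loader import ItemLoader

from fbcrawl.items import EventsItem

def load_event(response):
    # hypothetical helper with placeholder XPaths; the real ones live in
    # EventsSpider.parse_event below
    loader = ItemLoader(item=EventsItem(), response=response)
    loader.add_xpath('name', '//h3/text()')
    loader.add_xpath('description', '//div[@id="description"]//text()')
    return loader.load_item()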

fbcrawl/spiders/events.py (new file, 58 lines)

@@ -0,0 +1,58 @@
import scrapy
from scrapy.loader import ItemLoader
from scrapy.exceptions import CloseSpider
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import EventsItem, parse_date, parse_date2
from datetime import datetime


class EventsSpider(FacebookSpider):
    """
    Parse FB events, given a page (needs credentials)
    """
    name = "events"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['name', 'where', 'location', 'photo',
                               'start_date', 'end_date', 'description'],
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS': 1
    }

    def __init__(self, *args, **kwargs):
        # the target page is a required spider argument (-a page="...")
        self.page = kwargs['page']
        super().__init__(*args, **kwargs)
    def parse_page(self, response):
        # once FacebookSpider has logged in, jump to the page's events tab
        yield scrapy.Request(url=response.urljoin('%s/events' % self.page),
                             callback=self.parse_events,
                             priority=10,
                             meta={'index': 1})
    def parse_events(self, response):
        TABLE_XPATH = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div/div/div[2]/div/table/tbody/tr'
        for event in response.xpath(TABLE_XPATH):
            # relative './/' keeps the lookup inside each row; a bare '//'
            # would search from the document root and repeat the first link
            url = event.xpath('.//td/div/div/span[3]/div/a[1]/@href').extract_first()
            yield response.follow(url, callback=self.parse_event)
    def parse_event(self, response):
        EVENT_NAME = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[2]/div[1]/h3/text()'
        EVENT_WHERE = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_LOCATION = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dd/div/text()'
        DATE = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[1]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_DESCRIPTION = '/html/body/div/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td/div[2]/div[2]/div[2]/div[2]/text()'
        EVENT_COVER = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[1]/a/img/@src'

        date = response.xpath(DATE).extract_first()
        # the start and end times are assumed to share one string separated by
        # an en dash; single-date events leave end_date as None
        date_parts = date.split('–') if date else []
        start_date = (date_parts[0].strip() or None) if date_parts else None
        end_date = (date_parts[1].strip() or None) if len(date_parts) > 1 else None

        name = response.xpath(EVENT_NAME).extract_first()
        self.logger.info('Parsing event %s' % name)
        yield EventsItem(
            name=name,
            where=response.xpath(EVENT_WHERE).extract_first(),
            location=response.xpath(EVENT_LOCATION).extract_first(),
            photo=response.xpath(EVENT_COVER).extract_first(),
            start_date=start_date,
            end_date=end_date,
            description=response.xpath(EVENT_DESCRIPTION).extract_first()
        )
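
With both files in place, the new spider runs like fbcrawl's other spiders via scrapy crawl events and the usual -a arguments. A minimal scripted sketch, assuming the email/password arguments the base FacebookSpider expects; every value below is a placeholder:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from fbcrawl.spiders.events import EventsSpider

# placeholder credentials and page name; 'page' is the required argument
# consumed by EventsSpider.__init__ above
process = CrawlerProcess(get_project_settings())
process.crawl(EventsSpider, email='EMAIL', password='PASSWORD',
              page='PAGENAME')
process.start()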