From 5c3128cfeff15e6b85610d4a098641db63d55a20 Mon Sep 17 00:00:00 2001 From: StefanYohansson Date: Sat, 29 Jun 2019 22:01:04 -0300 Subject: [PATCH] Adding events crawler --- fbcrawl/items.py | 217 ++++++++++++++++++++------------------ fbcrawl/spiders/events.py | 58 ++++++++++ 2 files changed, 171 insertions(+), 104 deletions(-) create mode 100644 fbcrawl/spiders/events.py diff --git a/fbcrawl/items.py b/fbcrawl/items.py index 794821d..360f41f 100644 --- a/fbcrawl/items.py +++ b/fbcrawl/items.py @@ -8,7 +8,7 @@ import scrapy from scrapy.loader.processors import TakeFirst, Join, MapCompose from datetime import datetime, timedelta - + def comments_strip(string,loader_context): lang = loader_context['lang'] if lang == 'it': @@ -16,7 +16,7 @@ def comments_strip(string,loader_context): return else: return string[0].rstrip(' commenti') - + elif lang == 'en': if(string[0] == 'Share'): return '0' @@ -31,13 +31,13 @@ def reactions_strip(string,loader_context): lang = loader_context['lang'] if lang == 'it': newstring = string[0] - #19.298.873 + #19.298.873 if len(newstring.split()) == 1: while newstring.rfind('.') != -1: newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:] return newstring #Pamela, Luigi e altri 4 - else: + else: return string friends = newstring.count(' e ') + newstring.count(',') newstring = newstring.split()[::-1][0] @@ -46,13 +46,13 @@ def reactions_strip(string,loader_context): return int(newstring) + friends elif lang == 'en': newstring = string[0] - #19,298,873 + #19,298,873 if len(newstring.split()) == 1: while newstring.rfind(',') != -1: newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:] return newstring - #Mark and other 254,134 - elif newstring.split()[::-1][1].isdigit(): + #Mark and other 254,134 + elif newstring.split()[::-1][1].isdigit(): friends = newstring.count(' and ') + newstring.count(',') newstring = newstring.split()[::-1][1] while newstring.rfind(',') != -1: @@ -70,7 +70,7 
@@ def url_strip(url): i = fullurl.find('&id=') if i != -1: return fullurl[:i+4] + fullurl[i+4:].split('&')[0] - else: #catch photos + else: #catch photos i = fullurl.find('/photos/') if i != -1: return fullurl[:i+8] + fullurl[i+8:].split('/?')[0] @@ -80,13 +80,13 @@ def url_strip(url): return fullurl[:i+8] + fullurl[i+8:].split('/?')[0] else: return fullurl - + def parse_date(date,loader_context): import json - + d = json.loads(date[0]) #nested dict of features flat_d = dict() #only retain 'leaves' of d tree - + def recursive_items(dictionary): ''' Get most nested key:value pair of nested dict @@ -138,7 +138,7 @@ def parse_date2(init_date,loader_context): 'ott':10, 'nov':11, 'dic':12 - } + } giorni = { 'lunedì':0, @@ -148,8 +148,8 @@ def parse_date2(init_date,loader_context): 'venerdì':4, 'sabato':5, 'domenica':6 - } - + } + date = init_date[0].split() year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today @@ -161,7 +161,7 @@ def parse_date2(init_date,loader_context): #adesso, ieri, 4h, 50min elif l == 1: - if date[0].isalpha(): + if date[0].isalpha(): if date[0].lower() == 'ieri': day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) #check that yesterday was not in another month @@ -169,15 +169,15 @@ def parse_date2(init_date,loader_context): elif date[0].lower() == 'adesso': return datetime(year,month,day).date() #return today else: #not recognized, (return date or init_date) - return date - else: + return date + else: #4h, 50min (exploit future parsing) l = 2 new_date = [x for x in date[0] if x.isdigit()] date[0] = ''.join(new_date) new_date = [x for x in date[0] if not(x.isdigit())] - date[1] = ''.join(new_date) -# l = 2 + date[1] = ''.join(new_date) +# l = 2 elif l == 2: #22 min (oggi) if date[1] == 'min': @@ -187,7 +187,7 @@ def parse_date2(init_date,loader_context): else: day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) month = 
int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) - return datetime(year,month,day).date() + return datetime(year,month,day).date() #4 h (oggi) elif date[1] == 'h': if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0: @@ -196,34 +196,34 @@ def parse_date2(init_date,loader_context): else: day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) - return datetime(year,month,day).date() + return datetime(year,month,day).date() #2 gen elif len(date[1]) == 3 and date[1].isalpha(): day = int(date[0]) - month = months_abbr[date[1].lower()] - return datetime(year,month,day).date() + month = months_abbr[date[1].lower()] + return datetime(year,month,day).date() #2 gennaio elif len(date[1]) > 3 and date[1].isalpha(): day = int(date[0]) month = months[date[1]] - return datetime(year,month,day).date() + return datetime(year,month,day).date() #parsing failed else: return date # l = 3 elif l == 3: - #21 giu 2017 + #21 giu 2017 if len(date[1]) == 3 and date[2].isdigit(): day = int(date[0]) month = months_abbr[date[1]] year = int(date[2]) - return datetime(year,month,day).date() - #21 giugno 2017 + return datetime(year,month,day).date() + #21 giugno 2017 elif len(date[1]) > 3 and date[2].isdigit(): day = int(date[0]) month = months[date[1]] year = int(date[2]) - return datetime(year,month,day).date() + return datetime(year,month,day).date() #9 ore fa elif date[0].isdigit() and date[1][:2] == 'or': if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0: @@ -232,25 +232,25 @@ def parse_date2(init_date,loader_context): else: day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) - return datetime(year,month,day).date() + return datetime(year,month,day).date() #7 minuti fa elif date[0].isdigit() and date[1][:3] == 'min': - return datetime(year,month,day).date() + return 
datetime(year,month,day).date() - #ieri alle 20:45 + #ieri alle 20:45 elif date[0].lower() == 'ieri' and date[1] == 'alle': day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) - return datetime(year,month,day).date() - #oggi alle 11:11 + return datetime(year,month,day).date() + #oggi alle 11:11 elif date[0].lower() == 'oggi' and date[1] == 'alle': - return datetime(year,month,day).date() + return datetime(year,month,day).date() #lunedì alle 12:34 elif date[0].isalpha() and date[1] == 'alle': today = datetime.now().weekday() #today as a weekday weekday = giorni[date[0].lower()] #day to be match as number weekday #weekday is chronologically always lower than day - delta = today - weekday + delta = today - weekday if delta >= 0: day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) @@ -270,13 +270,13 @@ def parse_date2(init_date,loader_context): if date[0].lower() == 'ieri' and date[1] == 'alle': day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) - return datetime(year,month,day).date() + return datetime(year,month,day).date() #domenica alle ore 19:29 elif date[0].isalpha() and date[1] == 'alle': today = datetime.now().weekday() #today as a weekday weekday = giorni[date[0].lower()] #day to be match as number weekday #weekday is chronologically always lower than day - delta = today - weekday + delta = today - weekday if delta >= 0: day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) @@ -286,7 +286,7 @@ def parse_date2(init_date,loader_context): delta += 8 day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) - return 
datetime(year,month,day).date() + return datetime(year,month,day).date() #parsing failed else: return date @@ -297,16 +297,16 @@ def parse_date2(init_date,loader_context): if len(date[1]) == 3: day = int(date[0]) month = months_abbr[date[1].lower()] - return datetime(year,month,day).date() - #29 febbraio alle ore 21:49 + return datetime(year,month,day).date() + #29 febbraio alle ore 21:49 else: day = int(date[0]) month = months[date[1].lower()] - return datetime(year,month,day).date() + return datetime(year,month,day).date() #parsing failed else: return date -# l = 6 +# l = 6 elif l == 6: if date[3] == 'alle': #29 feb 2016 alle ore 21:49 @@ -314,14 +314,14 @@ def parse_date2(init_date,loader_context): day = int(date[0]) month = months_abbr[date[1].lower()] year = int(date[2]) - return datetime(year,month,day).date() - #29 febbraio 2016 alle ore 21:49 + return datetime(year,month,day).date() + #29 febbraio 2016 alle ore 21:49 else: day = int(date[0]) month = months[date[1].lower()] year = int(date[2]) - return datetime(year,month,day).date() - #parsing failed + return datetime(year,month,day).date() + #parsing failed else: return date # ============================================================================= @@ -356,7 +356,7 @@ def parse_date2(init_date,loader_context): 'oct':10, 'nov':11, 'dec':12 - } + } days = { 'monday':0, @@ -366,7 +366,7 @@ def parse_date2(init_date,loader_context): 'friday':4, 'saturday':5, 'sunday':6 - } + } date = init_date[0].split() year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today @@ -379,7 +379,7 @@ def parse_date2(init_date,loader_context): #Yesterday, Now, 4hr, 50mins elif l == 1: - if date[0].isalpha(): + if date[0].isalpha(): if date[0].lower() == 'yesterday': day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) #check that yesterday was not in another month @@ -387,15 +387,15 @@ def parse_date2(init_date,loader_context): elif date[0].lower() == 'now': return 
datetime(year,month,day).date() #return today else: #not recognized, (return date or init_date) - return date - else: + return date + else: #4h, 50min (exploit future parsing) l = 2 new_date = [x for x in date[0] if x.isdigit()] date[0] = ''.join(new_date) new_date = [x for x in date[0] if not(x.isdigit())] - date[1] = ''.join(new_date) -# l = 2 + date[1] = ''.join(new_date) +# l = 2 elif l == 2: if date[1] == 'now': return datetime(year,month,day).date() @@ -414,31 +414,31 @@ def parse_date2(init_date,loader_context): if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) - return datetime(year,month,day).date() + return datetime(year,month,day).date() #4 h (oggi) - else: + else: return datetime(year,month,day).date() #2 jan elif len(date[1]) == 3 and date[1].isalpha(): day = int(date[0]) - month = months_abbr[date[1].lower()] - return datetime(year,month,day).date() + month = months_abbr[date[1].lower()] + return datetime(year,month,day).date() #2 january elif len(date[1]) > 3 and date[1].isalpha(): day = int(date[0]) month = months[date[1]] - return datetime(year,month,day).date() + return datetime(year,month,day).date() #jan 2 elif len(date[0]) == 3 and date[0].isalpha(): day = int(date[1]) - month = months_abbr[date[0].lower()] - return datetime(year,month,day).date() + month = months_abbr[date[0].lower()] + return datetime(year,month,day).date() #january 2 elif len(date[0]) > 3 and date[0].isalpha(): day = int(date[1]) month = months[date[0]] - return datetime(year,month,day).date() + return datetime(year,month,day).date() #parsing failed else: return date @@ -452,35 +452,35 @@ def parse_date2(init_date,loader_context): if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) month = 
int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) - return datetime(year,month,day).date() - # 5 hours ago (today) - else: return datetime(year,month,day).date() - #10 minutes ago + # 5 hours ago (today) + else: + return datetime(year,month,day).date() + #10 minutes ago elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins': #22 minutes ago (yesterday) if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0: day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) - return datetime(year,month,day).date() + return datetime(year,month,day).date() #22 minutes ago (today) else: return datetime(year,month,day).date() else: - return date + return date else: #21 Jun 2017 if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit(): day = int(date[0]) month = months_abbr[date[1].lower()] year = int(date[2]) - return datetime(year,month,day).date() - #21 June 2017 + return datetime(year,month,day).date() + #21 June 2017 elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit(): day = int(date[0]) month = months[date[1].lower()] year = int(date[2]) - return datetime(year,month,day).date() + return datetime(year,month,day).date() #Jul 11, 2016 elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha(): day = int(date[1][:-1]) @@ -496,13 +496,13 @@ def parse_date2(init_date,loader_context): if date[0].lower() == 'yesterday' and date[1] == 'at': day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) - return datetime(year,month,day).date() + return datetime(year,month,day).date() #Thursday at 4:27 PM elif date[1] == 'at': today = datetime.now().weekday() #today as a weekday weekday = days[date[0].lower()] #day to be match as number weekday #weekday is chronologically always 
lower than day - delta = today - weekday + delta = today - weekday if delta >= 0: day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) @@ -519,82 +519,82 @@ def parse_date2(init_date,loader_context): # l = 5 elif l == 5: if date[2] == 'at': - #Jan 29 at 10:00 PM + #Jan 29 at 10:00 PM if len(date[0]) == 3: day = int(date[1]) month = months_abbr[date[0].lower()] - return datetime(year,month,day).date() - #29 febbraio alle ore 21:49 + return datetime(year,month,day).date() + #29 febbraio alle ore 21:49 else: day = int(date[1]) month = months[date[0].lower()] - return datetime(year,month,day).date() + return datetime(year,month,day).date() #parsing failed else: return date -# l = 6 +# l = 6 elif l == 6: if date[3] == 'at': date[1] - #Aug 25, 2016 at 7:00 PM + #Aug 25, 2016 at 7:00 PM if len(date[0]) == 3: day = int(date[1][:-1]) month = months_abbr[date[0].lower()] year = int(date[2]) - return datetime(year,month,day).date() - #August 25, 2016 at 7:00 PM + return datetime(year,month,day).date() + #August 25, 2016 at 7:00 PM else: day = int(date[1][:-1]) month = months[date[0].lower()] year = int(date[2]) - return datetime(year,month,day).date() - #parsing failed + return datetime(year,month,day).date() + #parsing failed else: return date -# l > 6 +# l > 6 #parsing failed - l too big else: return date #parsing failed - language not supported else: return init_date - + def id_strip(post_id): import json d = json.loads(post_id[::-1][0]) #nested dict of features return str(d['top_level_post_id']) - + class FbcrawlItem(scrapy.Item): - source = scrapy.Field() - date = scrapy.Field() + source = scrapy.Field() + date = scrapy.Field() text = scrapy.Field( output_processor=Join(separator=u'') ) # full text of the post comments = scrapy.Field( output_processor=comments_strip - ) + ) reactions = scrapy.Field( output_processor=reactions_strip ) # num of reactions likes = scrapy.Field( 
output_processor=reactions_strip - ) + ) ahah = scrapy.Field( output_processor=reactions_strip - ) + ) love = scrapy.Field( output_processor=reactions_strip - ) + ) wow = scrapy.Field( output_processor=reactions_strip - ) + ) sigh = scrapy.Field( output_processor=reactions_strip - ) + ) grrr = scrapy.Field( output_processor=reactions_strip - ) + ) share = scrapy.Field() # num of shares url = scrapy.Field( output_processor=url_strip @@ -605,11 +605,11 @@ class FbcrawlItem(scrapy.Item): shared_from = scrapy.Field() class CommentsItem(scrapy.Item): - source = scrapy.Field() + source = scrapy.Field() reply_to=scrapy.Field() date = scrapy.Field( # when was the post published output_processor=parse_date2 - ) + ) text = scrapy.Field( output_processor=Join(separator=u'') ) # full text of the post @@ -618,18 +618,18 @@ class CommentsItem(scrapy.Item): ) # num of reactions likes = scrapy.Field( output_processor=reactions_strip - ) - source_url = scrapy.Field() + ) + source_url = scrapy.Field() url = scrapy.Field() - ahah = scrapy.Field() - love = scrapy.Field() - wow = scrapy.Field() - sigh = scrapy.Field() - grrr = scrapy.Field() + ahah = scrapy.Field() + love = scrapy.Field() + wow = scrapy.Field() + sigh = scrapy.Field() + grrr = scrapy.Field() share = scrapy.Field() # num of shares class ProfileItem(scrapy.Item): - name = scrapy.Field() + name = scrapy.Field() gender = scrapy.Field() birthday = scrapy.Field() current_city = scrapy.Field() @@ -638,3 +638,12 @@ class ProfileItem(scrapy.Item): education = scrapy.Field() interested_in = scrapy.Field() page = scrapy.Field() + +class EventsItem(scrapy.Item): + name = scrapy.Field() + location = scrapy.Field() + where = scrapy.Field() + photo = scrapy.Field() + start_date = scrapy.Field() + end_date = scrapy.Field() + description = scrapy.Field() diff --git a/fbcrawl/spiders/events.py b/fbcrawl/spiders/events.py new file mode 100644 index 0000000..a1152ed --- /dev/null +++ b/fbcrawl/spiders/events.py @@ -0,0 +1,58 @@ +import 
import scrapy

from scrapy.loader import ItemLoader
from scrapy.exceptions import CloseSpider
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import EventsItem, parse_date, parse_date2

from datetime import datetime


class EventsSpider(FacebookSpider):
    """
    Parse FB events, given a page (needs credentials).

    Flow implemented here: ``parse_page`` requests ``<page>/events``,
    ``parse_events`` follows one link per row of the events table, and
    ``parse_event`` yields a single ``EventsItem`` per event page.

    NOTE(review): presumably ``FacebookSpider`` performs the login and then
    invokes ``parse_page`` -- confirm against the base class.
    NOTE(review): the absolute XPaths below contain ``tbody`` steps; they only
    match if the served HTML really contains ``<tbody>`` elements -- confirm
    against a raw response rather than a browser-rendered DOM.
    """
    name = "events"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['name', 'where', 'location', 'photo',
                               'start_date', 'end_date', 'description'],
        # BaseDupeFilter does no filtering, so any URL yielded twice is
        # fetched twice.
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS': 1
    }

    def __init__(self, *args, **kwargs):
        """
        Store the target page before delegating to the base spider.

        Raises:
            KeyError: if the ``page`` keyword argument is not supplied.
        """
        self.page = kwargs['page']
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Request the ``<page>/events`` listing, resolved against ``response``."""
        yield scrapy.Request(url=response.urljoin('%s/events' % self.page),
                             callback=self.parse_events,
                             priority=10,
                             meta={'index': 1})

    def parse_events(self, response):
        """
        Yield one request per event row found in the events table.

        Rows without an event link are skipped instead of aborting the crawl.
        """
        TABLE_XPATH = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div/div/div[2]/div/table/tbody/tr'
        for event in response.xpath(TABLE_XPATH):
            # The leading '.' scopes the query to the current row. A bare
            # '//td/...' is evaluated against the whole document and returns
            # the same (first) link for every row.
            url = event.xpath('.//td/div/div/span[3]/div/a[1]/@href').extract_first()
            if url is None:
                self.logger.debug('Skipping events row without a link on %s',
                                  response.url)
                continue
            yield response.follow(url, callback=self.parse_event)

    @staticmethod
    def _split_date_range(date):
        """
        Split a ``'start–end'`` string on the en dash (U+2013).

        Returns a ``(start_date, end_date)`` tuple. A part is ``None`` when it
        is empty, when there is no en dash (no end date), or when ``date``
        itself is ``None`` or empty. The parts are not stripped of whitespace.
        """
        if not date:
            return None, None
        parts = date.split('–')
        start_date = parts[0] or None
        # Only look at parts[1] if the separator was actually present.
        end_date = (parts[1] or None) if len(parts) > 1 else None
        return start_date, end_date

    def parse_event(self, response):
        """
        Extract the fields of a single event page and yield an ``EventsItem``.

        Every field is the first match of its XPath, or ``None`` when the
        XPath matches nothing.
        """
        EVENT_NAME = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[2]/div[1]/h3/text()'
        EVENT_WHERE = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_LOCATION = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dd/div/text()'
        DATE = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[1]/table/tbody/tr/td[2]/dt/div/text()'
        EVENT_DESCRIPTION = '/html/body/div/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td/div[2]/div[2]/div[2]/div[2]/text()'
        EVENT_COVER = '/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[1]/a/img/@src'

        date = response.xpath(DATE).extract_first()
        # Tolerates a missing date node and events that have no end date.
        start_date, end_date = self._split_date_range(date)
        name = response.xpath(EVENT_NAME).extract_first()
        self.logger.info('Parsing event %s', name)
        yield EventsItem(
            name=name,
            where=response.xpath(EVENT_WHERE).extract_first(),
            location=response.xpath(EVENT_LOCATION).extract_first(),
            photo=response.xpath(EVENT_COVER).extract_first(),
            start_date=start_date,
            end_date=end_date,
            description=response.xpath(EVENT_DESCRIPTION).extract_first()
        )