rugantio 2018-07-25 21:43:53 +02:00
parent fa320a2978
commit 2e00226d52
7 changed files with 379 additions and 0 deletions

0
fbcrawl/__init__.py Normal file

34
fbcrawl/items.py Normal file

@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.loader.processors import TakeFirst, Join
class FbcrawlItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
source = scrapy.Field() # page that published the post
date = scrapy.Field(
output_processor=TakeFirst()
)
# when was the post published
text = scrapy.Field(
output_processor=Join(separator=u'')
) # full text of the post
comments = scrapy.Field(
output_processor=Join(separator=u'\n')
) # full text of the comments
commentators = scrapy.Field(
output_processor=Join(separator=u'\n')
) # names of the commentators
like = scrapy.Field() # num of likes
share = scrapy.Field() # num of shares
num_id = scrapy.Field() # progressive int associated to the entry in the final table, not present in the webpage
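For reference, a minimal sketch (not part of this commit) of how these output processors combine values when the fields are filled through an ItemLoader, which is how the spider below populates them:

from scrapy.loader import ItemLoader
from fbcrawl.items import FbcrawlItem

loader = ItemLoader(item=FbcrawlItem())
loader.add_value('date', ['25 luglio 2018', 'ignored'])            # TakeFirst() keeps only the first value
loader.add_value('text', ['Post body ', 'continued.'])             # Join('') concatenates the fragments
loader.add_value('comments', ['first comment', 'second comment'])  # Join('\n') gives one comment per line
item = loader.load_item()
# item['date']     == '25 luglio 2018'
# item['text']     == 'Post body continued.'
# item['comments'] == 'first comment\nsecond comment'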

103
fbcrawl/middlewares.py Normal file

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class FbcrawlSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class FbcrawlDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
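Both classes above are the stock middleware templates generated by scrapy startproject, and neither is enabled in settings.py. Purely as a hypothetical illustration (not part of this commit), a process_request hook in FbcrawlDownloaderMiddleware could pin a mobile user agent, since the spider targets mbasic.facebook.com; the user-agent string below is an arbitrary example and the middleware would also need to be registered in DOWNLOADER_MIDDLEWARES:

# Hypothetical sketch only -- not part of this commit.
MOBILE_UA = 'Mozilla/5.0 (Linux; Android 8.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36'

def process_request(self, request, spider):
    # Send a mobile user agent unless the request already sets one.
    request.headers.setdefault('User-Agent', MOBILE_UA)
    return None  # continue normal processing of the request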

11
fbcrawl/pipelines.py Normal file

@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class FbcrawlPipeline(object):
def process_item(self, item, spider):
return item
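The pipeline above is still the pass-through stub from the project template and is not enabled in ITEM_PIPELINES. Purely as an illustration of what could eventually live here (not part of this commit), a pipeline might drop items that were scraped without any post text:

# Hypothetical example -- not part of this commit.
from scrapy.exceptions import DropItem

class DropEmptyPostsPipeline(object):
    """Discard items whose 'text' field came back empty."""
    def process_item(self, item, spider):
        if not item.get('text'):
            raise DropItem('post from %r has no text' % item.get('source'))
        return item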

93
fbcrawl/settings.py Normal file

@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
# Scrapy settings for fbcrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'fbcrawl'
SPIDER_MODULES = ['fbcrawl.spiders']
NEWSPIDER_MODULE = 'fbcrawl.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fbcrawl (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'fbcrawl.middlewares.FbcrawlSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'fbcrawl.middlewares.FbcrawlDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'fbcrawl.pipelines.FbcrawlPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
FEED_EXPORT_FIELDS = ["source", "date", "text", "commentators", "comments", "like", "share"] # specifies the column order of the exported CSV
FEED_EXPORT_ENCODING = 'utf-8'
DUPEFILTER_DEBUG = True
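FEED_EXPORT_FIELDS fixes the column order of the exported feed, so a run exported with -o produces a CSV laid out as source, date, text, commentators, comments, like, share. A typical invocation would look like the line below; the credentials and date are placeholders, and the -a arguments map to the spider's __init__ parameters in fbcrawl/spiders/fbcrawl.py:

scrapy crawl fb -a email="user@example.com" -a password="secret" -a til="2018-1-1" -o posts.csv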

4
fbcrawl/spiders/__init__.py Normal file

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

134
fbcrawl/spiders/fbcrawl.py Normal file

@@ -0,0 +1,134 @@
import scrapy
from datetime import datetime
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem
class FacebookSpider(scrapy.Spider):
"""
Parse FB pages (needs credentials)
"""
name = "fb"
def __init__(self, email='', password='', til='2004-1-1', **kwargs):
    super(FacebookSpider, self).__init__(**kwargs)
    til = til.split(sep='-')
    self.til = datetime(int(til[0]), int(til[1]), int(til[2]))  # date cutoff passed as -a til=YYYY-M-D
    self.email = email
    self.password = password
    self.code = ''  # two-factor/checkpoint approval code, requested interactively in parse_home
    self.start_urls = ['https://mbasic.facebook.com']
def parse(self, response):
return FormRequest.from_response(
response,
formxpath='//form[contains(@action, "login")]',
formdata={'email': self.email,'pass': self.password},
callback=self.parse_home
)
def parse_home(self, response):
'''Handle the post-login page and any security checkpoints, then request the target page.'''
if response.css('#approvals_code'):
# Handle 'Approvals Code' checkpoint (ask user to enter code).
if not self.code:
# Show facebook messages via logs
# and request user for approval code.
message = response.css('._50f4::text').extract()[0]
self.log(message)
message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
self.log(message)
self.code = input('Enter the code: ')
self.code = str(self.code)
if not (self.code and self.code.isdigit()):
self.log('Bad approvals code detected.')
return
return FormRequest.from_response(
response,
formdata={'approvals_code': self.code},
callback=self.parse_home,
)
elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
# Handle 'Save Browser' checkpoint.
return FormRequest.from_response(
response,
formdata={'name_action_selected': 'dont_save'},
callback=self.parse_home,
dont_filter=True,
)
elif response.css('button#checkpointSubmitButton'):
# Handle `Someone tried to log into your account` warning.
return FormRequest.from_response(
response, callback=self.parse_home, dont_filter=True,)
# Otherwise proceed to the target page (hard-coded below for now).
href = 'https://mbasic.facebook.com/ivacciniealtricomplottileggendari'
self.logger.info('Parse function called on %s', href)
return scrapy.Request(
url=href,
callback=self.parse_page,
)
def parse_page(self, response):
    # from scrapy.utils.response import open_in_browser
    # open_in_browser(response)
    # Collect the link to the full view of each post on the page
    # ('Notizia completa' is the 'Full story' link label on the Italian mbasic interface).
    posts = response.xpath("//a[contains(text(),'Notizia completa')]/@href").extract()
    for post in posts:
        yield scrapy.Request(response.urljoin(post), callback=self.parse_post, dont_filter=True)
    # Pagination ('Altri' = 'More') is not followed yet; the scaffolding below is kept for reference.
    # next_page = response.xpath("//div/a[contains(text(),'Altri')]/@href")
    # if len(next_page) > 0:
    #     next_page = response.urljoin(next_page[0].extract())
    #     yield scrapy.Request(next_page, callback=self.parse_page)
    # else:
    #     next_page = response.xpath("//div/a[contains(text(),'2017')]/@href")
    #     if len(next_page) > 0:
    #         next_page = response.urljoin(next_page[0].extract())
    #         yield scrapy.Request(next_page, callback=self.parse_page)
def parse_post(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response)
    # from scrapy.utils.response import open_in_browser
    # open_in_browser(response)
    # Possible selector for the comment text, not enabled yet:
    # new.add_xpath('comments', "//div[string-length(@id)=15 or string-length(@id)=16]//div/text()")
    new.add_xpath('source', '//span/strong/a/text()')
    new.add_xpath('date', '//div/div/abbr/text()')
    new.add_xpath('text', '//div[@data-ft]//p//text()')
    # Follow the 'see next comments' link and carry the loader along in meta;
    # if there is none the item is complete and can be yielded right away.
    next_comment_page = response.xpath("//div/div[contains(@id,'see_next')]/a/@href")
    if len(next_comment_page) > 0:
        next_comment_page = response.urljoin(next_comment_page[0].extract())
        yield scrapy.Request(next_comment_page, callback=self.parse_comments,
                             dont_filter=True, meta={'new': new})
    else:
        yield new.load_item()
def parse_comments(self, response):
    self.logger.info('Parsing comments page: %s', response.url)
    # Re-bind the loader to the comments response so that add_xpath runs
    # against this page rather than the post page the loader was created from.
    new = ItemLoader(item=response.meta['new'].load_item(), response=response)
    new.add_xpath('commentators', "//div[number(@id)>1]/div/h3/a[@href]/text()")
    yield new.load_item()
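One loose end worth noting: the til argument is converted to a datetime in __init__ but is never consulted in the callbacks, so the date cutoff has no effect yet. A hedged sketch of how it could eventually be applied; the date format below is an assumption, since mbasic serves localized date strings that would need their own parsing:

# Hypothetical sketch only -- not part of this commit.
from datetime import datetime

def older_than_cutoff(raw_date, til):
    """Return True if a post's date falls before the 'til' cutoff."""
    post_date = datetime.strptime(raw_date, '%d/%m/%Y')  # assumed format, not the real mbasic one
    return post_date < til

parse_page could then stop following pagination links once older_than_cutoff(date, self.til) starts returning True for the posts on a page.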