update

parent fa320a2978
commit 2e00226d52
0  fbcrawl/__init__.py  Normal file
34  fbcrawl/items.py  Normal file
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import TakeFirst, Join


class FbcrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    source = scrapy.Field()  # page that published the post

    date = scrapy.Field(
        output_processor=TakeFirst()
    )  # when the post was published
    text = scrapy.Field(
        output_processor=Join(separator=u'')
    )  # full text of the post

    comments = scrapy.Field(
        output_processor=Join(separator=u'\n')
    )  # full text of the comments
    commentators = scrapy.Field(
        output_processor=Join(separator=u'\n')
    )  # names of the users who commented

    like = scrapy.Field()  # num of likes
    share = scrapy.Field()  # num of shares
    num_id = scrapy.Field()  # progressive int associated with the entry in the final table, not present in the webpage
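A quick illustration of the processors declared above (not part of the commit): TakeFirst keeps the first non-null value collected, while Join concatenates the collected strings with the given separator, once the item is filled through an ItemLoader.

    from scrapy.loader import ItemLoader
    from fbcrawl.items import FbcrawlItem

    l = ItemLoader(item=FbcrawlItem())
    l.add_value('date', ['4 hrs', 'Yesterday'])   # TakeFirst() -> '4 hrs'
    l.add_value('text', ['Hello, ', 'world'])     # Join('')    -> 'Hello, world'
    l.add_value('comments', ['nice', 'spam'])     # Join('\n')  -> 'nice\nspam'
    item = l.load_item()
    assert item['date'] == '4 hrs' and item['text'] == 'Hello, world'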
103  fbcrawl/middlewares.py  Normal file
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class FbcrawlSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class FbcrawlDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
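Neither middleware is enabled yet (see DOWNLOADER_MIDDLEWARES in settings.py below). As a rough sketch of what process_request is for, assuming one wanted to pin a mobile User-Agent when crawling mbasic.facebook.com — the class name and header value here are made up for illustration, not part of the commit:

    class MobileUserAgentMiddleware(object):
        def process_request(self, request, spider):
            # set a mobile UA unless the request already carries one
            request.headers.setdefault('User-Agent',
                                       'Mozilla/5.0 (Linux; Android 8.0) Mobile Safari')
            return None  # let the request continue through the chain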
11  fbcrawl/pipelines.py  Normal file
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class FbcrawlPipeline(object):
    def process_item(self, item, spider):
        return item
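items.py declares a num_id field described as "not present in the webpage"; a pipeline like the stub above would be the natural place to assign it. A minimal sketch under that assumption (not in the commit; the class name is hypothetical, and it would also need an entry in ITEM_PIPELINES):

    class NumberingPipeline(object):
        def open_spider(self, spider):
            self.counter = 0

        def process_item(self, item, spider):
            # assign the progressive id declared in FbcrawlItem
            self.counter += 1
            item['num_id'] = self.counter
            return item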
93  fbcrawl/settings.py  Normal file
@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-

# Scrapy settings for fbcrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'fbcrawl'

SPIDER_MODULES = ['fbcrawl.spiders']
NEWSPIDER_MODULE = 'fbcrawl.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fbcrawl (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'fbcrawl.middlewares.FbcrawlSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'fbcrawl.middlewares.FbcrawlDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'fbcrawl.pipelines.FbcrawlPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

FEED_EXPORT_FIELDS = ["source", "date", "text", "commentators", "comments", "like", "share"]  # order of the columns in the exported CSV
FEED_EXPORT_ENCODING = 'utf-8'
DUPEFILTER_DEBUG = True
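With these export settings, a run such as scrapy crawl fb -a email="EMAIL" -a password="PASSWORD" -a til="2017-1-1" -o posts.csv (the -a arguments map onto the spider's __init__ parameters defined below) would produce a UTF-8 CSV whose header row follows the declared field order:

    source,date,text,commentators,comments,like,share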
4  fbcrawl/spiders/__init__.py  Normal file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
134  fbcrawl/spiders/fbcrawl.py  Normal file
@@ -0,0 +1,134 @@
import scrapy

from datetime import datetime

from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem


class FacebookSpider(scrapy.Spider):
    """
    Parse FB pages (needs credentials)
    """
    name = "fb"

    def __init__(self, email='', password='', til='2004-1-1', **kwargs):
        super(FacebookSpider, self).__init__(**kwargs)

        # crawl back until this date (year-month-day)
        til = til.split(sep='-')
        self.til = datetime(int(til[0]), int(til[1]), int(til[2]))

        self.email = email
        self.password = password
        self.code = ''  # two-factor approvals code, requested interactively if needed
        self.start_urls = ['https://mbasic.facebook.com']

    def parse(self, response):
        # submit the login form with the given credentials
        return FormRequest.from_response(
            response,
            formxpath='//form[contains(@action, "login")]',
            formdata={'email': self.email, 'pass': self.password},
            callback=self.parse_home
        )

    def parse_home(self, response):
        '''Parse user news feed page'''
        if response.css('#approvals_code'):
            # Handle 'Approvals Code' checkpoint (ask user to enter code).
            if not self.code:
                # Show facebook messages via logs
                # and request user for approval code.
                message = response.css('._50f4::text').extract()[0]
                self.log(message)
                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
                self.log(message)
                self.code = input('Enter the code: ')
            self.code = str(self.code)
            if not (self.code and self.code.isdigit()):
                self.log('Bad approvals code detected.')
                return
            return FormRequest.from_response(
                response,
                formdata={'approvals_code': self.code},
                callback=self.parse_home,
            )
        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
            # Handle 'Save Browser' checkpoint.
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home,
                dont_filter=True,
            )
        elif response.css('button#checkpointSubmitButton'):
            # Handle `Someone tried to log into your account` warning.
            return FormRequest.from_response(
                response, callback=self.parse_home, dont_filter=True,
            )
        # Else go to the user profile.
        href = 'https://mbasic.facebook.com/ivacciniealtricomplottileggendari'
        self.logger.info('Parse function called on %s', href)
        return scrapy.Request(
            url=href,
            callback=self.parse_page,
        )

    def parse_page(self, response):
        # from scrapy.utils.response import open_in_browser
        # open_in_browser(response)

        # follow the 'Notizia completa' ('Full story') link of every post on the page
        for post in response.xpath("//div[contains(@id,'u_0_')]"):
            post = post.xpath("//a[contains(text(),'Notizia completa')]/@href").extract()
            for i in range(len(post)):
                temp_post = response.urljoin(post[i])
                yield scrapy.Request(temp_post, self.parse_post, dont_filter=True)

        # pagination, still work in progress:
        # next_page = response.xpath("//div/a[contains(text(),'Altri')]/@href")
        # if len(next_page) > 0:
        #     next_page = response.urljoin(next_page[0].extract())
        #     yield scrapy.Request(next_page, callback=self.parse_page)
        # else:
        #     next_page = response.xpath("//div/a[contains(text(),'2017')]/@href")
        #     if len(next_page) > 0:
        #         next_page = response.urljoin(next_page[0].extract())
        #         yield scrapy.Request(next_page, callback=self.parse_page)

    def parse_post(self, response):
        new = ItemLoader(item=FbcrawlItem(), response=response)
        # comments extraction, still work in progress:
        # new.add_xpath('comments', "//div[string-length(@id)=15 or string-length(@id)=16]//div/text()")
        new.add_xpath('source', '//span/strong/a/text()')
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath('text', '//div[@data-ft]//p//text()')

        # follow the comments page, carrying the loader along in the request meta
        next_comment_page = response.xpath("//div/div[contains(@id,'see_next')]/a/@href")
        if len(next_comment_page) > 0:
            next_comment_page = response.urljoin(next_comment_page[0].extract())
            yield scrapy.Request(next_comment_page, callback=self.parse_comments,
                                 dont_filter=True, meta={'new': new})
        else:
            # no comments page to follow: emit the item as-is
            yield new.load_item()

    def parse_comments(self, response):
        self.logger.info('\n\n COMMENTS PAGE \n\n')
        new = response.meta['new']
        new.add_xpath('commentators', "//div[number(@id)>1]/div/h3/a[@href]/text()")
        yield new.load_item()
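Note that parse_comments stops after the first comments page. If later comment pages expose the same 'see_next' link that parse_post follows, a natural extension (a sketch under that assumption, not part of the commit) would recurse until the link disappears:

    def parse_comments(self, response):
        new = response.meta['new']
        new.add_xpath('commentators', "//div[number(@id)>1]/div/h3/a[@href]/text()")
        next_page = response.xpath("//div/div[contains(@id,'see_next')]/a/@href")
        if len(next_page) > 0:
            # keep accumulating commentators across pages
            yield scrapy.Request(response.urljoin(next_page[0].extract()),
                                 callback=self.parse_comments,
                                 dont_filter=True, meta={'new': new})
        else:
            yield new.load_item()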