final
commit 8babf7aa1d (parent cdf6bdc68e)
@@ -1,11 +1,2 @@
 # fbcrawl
 A Facebook crawler
-
-## TODO
-work in progress
-
-## DISCLAIMER
-This software is NOT to be used. It violates Facebook's terms and conditions. It is for educational purposes only, to show how a crawler can be made to recursively parse a web page.
-
-## Contribute
-Pull requests are welcomed!!
BIN  fbcrawl/__pycache__/__init__.cpython-37.pyc (new file, binary not shown)
BIN  fbcrawl/__pycache__/items.cpython-37.pyc (new file, binary not shown)
BIN  fbcrawl/__pycache__/pipelines.cpython-37.pyc (new file, binary not shown)
BIN  fbcrawl/__pycache__/settings.cpython-37.pyc (new file, binary not shown)
fbcrawl/items.py (new file, 123 lines)
@@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import TakeFirst, Join, MapCompose
from datetime import datetime, timedelta

def parse_date(date):
    date = date[0].split()

    mesi = {
        "gennaio":1,
        "febbraio":2,
        "marzo":3,
        "aprile":4,
        "maggio":5,
        "giugno":6,
        "luglio":7,
        "agosto":8,
        "settembre":9,
        "ottobre":10,
        "novembre":11,
        "dicembre":12
    }

    mesi_abbr = {
        "gen":1,
        "feb":2,
        "mar":3,
        "apr":4,
        "mag":5,
        "giu":6,
        "lug":7,
        "ago":8,
        "set":9,
        "ott":10,
        "nov":11,
        "dic":12
    }
    if len(date) == 0:
        return "Error: no data"
    elif len(date) == 1 or date[1] == 'h':  # meaning that date[0] == 'Adesso' or "n hours" ago
        day = int(str(datetime.now().date()).split(sep='-')[2])
        month = int(str(datetime.now().date()).split(sep='-')[1])
        year = int(str(datetime.now().date()).split(sep='-')[0])
    elif date[0] == 'Ieri':
        day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2])
        month = int(str(datetime.now().date()).split(sep='-')[1])
        year = int(str(datetime.now().date()).split(sep='-')[0])
    elif (len(date) == 2 and len(date[1]) == 3) or (len(date) == 4 and len(date[1]) == 3):
        day = int(date[0])
        month = mesi_abbr[date[1]]
        year = int(str(datetime.now().date()).split(sep='-')[0])
    elif date[2] != 'alle':
        day = int(date[0])
        month = mesi[date[1]]
        year = int(date[2])
    else:
        day = int(date[0])
        month = mesi[date[1]]
        year = int(str(datetime.now().date()).split(sep='-')[0])
    date = datetime(year,month,day)
    return date.date()

def comments_strip(string):
    return string[0].rstrip(" commenti")

def reactions_strip(string):
    if len(string) == 1:
        string = string[0]
        while string.rfind('.') != -1:
            string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]
        return string
    string = string[0].split()
    string = string[::-1][0]

    while string.rfind('.') != -1:
        string = string[0:string.rfind('.')] + string[string.rfind('.')+1:]

    return int(string) + 1

class FbcrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    source = scrapy.Field(
        output_processor=TakeFirst()
    )  # page that published the post

    date = scrapy.Field(  # when was the post published
        input_processor=TakeFirst(),
        output_processor=parse_date
    )

    text = scrapy.Field(
        output_processor=Join(separator=u'')
    )  # full text of the post

    comments = scrapy.Field(
        output_processor=comments_strip
    )
    commentators = scrapy.Field(
        output_processor=Join(separator=u'\n')
    )

    reactions = scrapy.Field(
        output_processor=reactions_strip
    )  # num of reactions

    likes = scrapy.Field(
        output_processor=reactions_strip
    )
    ahah = scrapy.Field()
    love = scrapy.Field()
    wow = scrapy.Field()
    sigh = scrapy.Field()
    grrr = scrapy.Field()
    share = scrapy.Field()  # num of shares
    num_id = scrapy.Field()  # progressive int associated to the entry in the final table, not present in the webpage
    url = scrapy.Field()
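parse_date is wired in as the ItemLoader output processor for the date field, so it receives the list of values extracted by XPath and parses the first one; the strings it handles are the Italian relative and absolute dates rendered by mbasic.facebook.com. A minimal sanity-check sketch (the sample strings below are illustrative guesses, not taken from this commit):

from fbcrawl.items import parse_date

print(parse_date(['21 dic']))           # abbreviated month -> 21 December of the current year
print(parse_date(['3 gennaio 2018']))   # full month name with explicit year -> 2018-01-03
print(parse_date(['Ieri alle 17:04']))  # "Ieri" (yesterday) resolves relative to today
print(parse_date(['Adesso']))           # "Adesso" (now) -> today's date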
@@ -5,7 +5,12 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 
+from scrapy.exceptions import DropItem
+from datetime import datetime
 
 class FbcrawlPipeline(object):
     def process_item(self, item, spider):
-        return item
+        if item['date'] < datetime(2017,3,4).date():
+            raise DropItem("Dropping element because it's older than 04/03/2017")
+        else:
+            return item
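The updated FbcrawlPipeline drops every item whose parsed date is older than 2017-03-04. A pipeline only runs when it is registered, and the settings change below still leaves ITEM_PIPELINES commented out, so activating this filter would mean uncommenting that block in fbcrawl/settings.py (a sketch of that change, not part of this commit):

ITEM_PIPELINES = {
    'fbcrawl.pipelines.FbcrawlPipeline': 300,
}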
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = False
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 #ITEM_PIPELINES = {
-#    'fbcrawl.pipelines.FbcrawlPipeline': 300,
+#'fbcrawl.pipelines.FbcrawlPipeline': 300,
 #}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
@@ -88,6 +88,6 @@ ROBOTSTXT_OBEY = False
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-FEED_EXPORT_FIELDS = ["source", "date", "text", "commentators","comments","like", "share"] # specifies the order of the column to export as CSV
+FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV
 FEED_EXPORT_ENCODING = 'utf-8'
 DUPEFILTER_DEBUG = True
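FEED_EXPORT_FIELDS fixes the column order of Scrapy's feed export and FEED_EXPORT_ENCODING forces UTF-8 output. A quick way to confirm the header of a crawl exported with -o posts.csv (a hypothetical output file name):

import csv

with open('posts.csv', encoding='utf-8') as f:
    header = next(csv.reader(f))

# Expected to match FEED_EXPORT_FIELDS:
# source, date, text, reactions, likes, ahah, love, wow, sigh, grrr, comments, url
print(header)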
BIN  fbcrawl/spiders/__pycache__/__init__.cpython-37.pyc (new file, binary not shown)
BIN  fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc (new file, binary not shown)
fbcrawl/spiders/fbcrawl.py (new file, 122 lines)
@@ -0,0 +1,122 @@
import scrapy

from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem


class FacebookSpider(scrapy.Spider):
    """
    Parse FB pages (needs credentials)
    """
    name = "fb"

    def __init__(self, email='', password='', page='', **kwargs):
        super(FacebookSpider, self).__init__(**kwargs)

        if not email or not password:
            raise ValueError("You need to provide valid email and password!")
        else:
            self.email = email
            self.password = password

        if not page:
            raise ValueError("You need to provide a valid page name to crawl!")
        else:
            self.page = page

        self.start_urls = ['https://mbasic.facebook.com']

    def parse(self, response):
        return FormRequest.from_response(
            response,
            formxpath='//form[contains(@action, "login")]',
            formdata={'email': self.email, 'pass': self.password},
            callback=self.parse_home
        )

    def parse_home(self, response):
        '''Parse user news feed page'''
        if response.css('#approvals_code'):
            # Handle 'Approvals Code' checkpoint (ask user to enter code).
            if not self.code:
                # Show facebook messages via logs
                # and request user for approval code.
                message = response.css('._50f4::text').extract()[0]
                self.log(message)
                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
                self.log(message)
                self.code = input('Enter the code: ')
            self.code = str(self.code)
            if not (self.code and self.code.isdigit()):
                self.log('Bad approvals code detected.')
                return
            return FormRequest.from_response(
                response,
                formdata={'approvals_code': self.code},
                callback=self.parse_home,
            )
        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
            # Handle 'Save Browser' checkpoint.
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home,
                dont_filter=True,
            )
        elif response.css('button#checkpointSubmitButton'):
            # Handle 'Someone tried to log into your account' warning.
            return FormRequest.from_response(
                response, callback=self.parse_home, dont_filter=True,)
        # Else go to the user profile.
        href = response.urljoin(self.page)
        self.logger.info('Parse function called on %s', href)
        return scrapy.Request(
            url=href,
            callback=self.parse_page,
        )

    def parse_page(self, response):
        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):  # select all posts
            self.logger.info('Parsing post %s', post)

            new = ItemLoader(item=FbcrawlItem(), selector=post)
            new.add_xpath('comments', ".//div/a[contains(text(),'comment')]/text()")
            new.add_xpath('url', ".//a[contains(text(),'Notizia completa')]/@href")

            post = post.xpath(".//a[contains(text(),'Notizia completa')]/@href").extract()  # returns full post-link in a list
            temp_post = response.urljoin(post[0])
            yield scrapy.Request(temp_post, self.parse_post, dont_filter=True, meta={'item': new})

        next_page = response.xpath("//div/a[contains(text(),'Altri')]/@href")
        if len(next_page) > 0:
            next_page = response.urljoin(next_page[0].extract())
            yield scrapy.Request(next_page, callback=self.parse_page)
        else:
            next_page = response.xpath("//div/a[contains(text(),'2017')]/@href")
            if len(next_page) > 0:
                next_page = response.urljoin(next_page[0].extract())
                yield scrapy.Request(next_page, callback=self.parse_page)

    def parse_post(self, response):
        new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
        new.add_xpath('source', '//span/strong/a/text() | //div/a/strong/text() | //td/div/h3/strong/a/text()')
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath('text', '//div[@data-ft]//p//text()')
        new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")

        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions, callback=self.parse_reactions, dont_filter=True, meta={'item': new})

    def parse_reactions(self, response):
        new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
        new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()")
        new.add_xpath('ahah', "//a[contains(@href,'reaction_type=4')]/span/text()")
        new.add_xpath('love', "//a[contains(@href,'reaction_type=2')]/span/text()")
        new.add_xpath('wow', "//a[contains(@href,'reaction_type=3')]/span/text()")
        new.add_xpath('sigh', "//a[contains(@href,'reaction_type=7')]/span/text()")
        new.add_xpath('grrr', "//a[contains(@href,'reaction_type=8')]/span/text()")
        yield new.load_item()
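The spider takes the login credentials and the target page as spider arguments, so it is normally launched with something like scrapy crawl fb -a email=... -a password=... -a page=... -o posts.csv. The same crawl can also be started from a script; a minimal sketch with placeholder values, assuming it is run from the project root next to scrapy.cfg (the page value is joined onto https://mbasic.facebook.com by parse_home):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from fbcrawl.spiders.fbcrawl import FacebookSpider

process = CrawlerProcess(get_project_settings())
process.crawl(FacebookSpider,
              email='you@example.com',      # placeholder credentials
              password='your-password',
              page='/SomePublicPage')       # placeholder page path
process.start()  # blocks until the crawl finishes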
items.py (deleted file, 34 lines)
@@ -1,34 +0,0 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import TakeFirst, Join

class FbcrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    source = scrapy.Field()  # page that published the post

    date = scrapy.Field(
        output_processor=TakeFirst()
    )
    # when was the post published
    text = scrapy.Field(
        output_processor=Join(separator=u'')
    )  # full text of the post

    comments = scrapy.Field(
        output_processor=Join(separator=u'\n')
    )  # full text of the post
    commentators = scrapy.Field(
        output_processor=Join(separator=u'\n')
    )  # full text of the post

    like = scrapy.Field()  # num of likes
    share = scrapy.Field()  # num of shares
    num_id = scrapy.Field()  # progressive int associated to the entry in the final table, not present in the webpage
scrapy.cfg (new file, 11 lines)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = fbcrawl.settings

[deploy]
#url = http://localhost:6800/
project = fbcrawl
(deleted file, 134 lines)
@@ -1,134 +0,0 @@
import scrapy

from datetime import datetime

from scrapy.loader import ItemLoader
from scrapy.http import FormRequest
from fbcrawl.items import FbcrawlItem


class FacebookSpider(scrapy.Spider):
    """
    Parse FB pages (needs credentials)
    """
    name = "fb"

    def __init__(self, email='', password='', til='2004-1-1', **kwargs):
        super(FacebookSpider, self).__init__(**kwargs)

        til = til.split(sep='-')
        self.til = datetime(int(til[0]), int(til[1]), int(til[2]))

        self.email = email
        self.password = password
        self.start_urls = ['https://mbasic.facebook.com']

    def parse(self, response):
        return FormRequest.from_response(
            response,
            formxpath='//form[contains(@action, "login")]',
            formdata={'email': self.email, 'pass': self.password},
            callback=self.parse_home
        )

    def parse_home(self, response):
        '''Parse user news feed page'''
        if response.css('#approvals_code'):
            # Handle 'Approvals Code' checkpoint (ask user to enter code).
            if not self.code:
                # Show facebook messages via logs
                # and request user for approval code.
                message = response.css('._50f4::text').extract()[0]
                self.log(message)
                message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
                self.log(message)
                self.code = input('Enter the code: ')
            self.code = str(self.code)
            if not (self.code and self.code.isdigit()):
                self.log('Bad approvals code detected.')
                return
            return FormRequest.from_response(
                response,
                formdata={'approvals_code': self.code},
                callback=self.parse_home,
            )
        elif response.xpath("//div/input[@value='Ok' and @type='submit']"):
            # Handle 'Save Browser' checkpoint.
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home,
                dont_filter=True,
            )
        elif response.css('button#checkpointSubmitButton'):
            # Handle `Someone tried to log into your account` warning.
            return FormRequest.from_response(
                response, callback=self.parse_home, dont_filter=True,)
        # Else go to the user profile.
        href = 'https://mbasic.facebook.com/ivacciniealtricomplottileggendari'
        self.logger.info('Parse function called on %s', href)
        return scrapy.Request(
            url=href,
            callback=self.parse_page,
        )

    def parse_page(self, response):
        # from scrapy.utils.response import open_in_browser
        # open_in_browser(response)

        for post in response.xpath("//div[contains(@id,'u_0_')]"):
            # self.logger.info('Parse function called on %s', response.url)
            # self.logger.info('Parsing page number %d', i)
            # from scrapy.utils.response import open_in_browser
            # open_in_browser(response)
            post = post.xpath("//a[contains(text(),'Notizia completa')]/@href").extract()
            #
            for i in range(len(post)):
                temp_post = response.urljoin(post[i])
                yield scrapy.Request(temp_post, self.parse_post, dont_filter=True)

        # next_page = response.xpath("//div/a[contains(text(),'Altri')]/@href")
        # if len(next_page) > 0:
        #     next_page = response.urljoin(next_page[0].extract())
        #     yield scrapy.Request(next_page, callback=self.parse_page)
        #
        # else:
        #     next_page = response.xpath("//div/a[contains(text(),'2017')]/@href")
        #     if len(next_page) > 0:
        #         next_page = response.urljoin(next_page[0].extract())
        #         yield scrapy.Request(next_page, callback=self.parse_page)
        #
    def parse_post(self, response):
        new = ItemLoader(item=FbcrawlItem(), response=response)
        # from scrapy.utils.response import open_in_browser
        # open_in_browser(response)
        # # ("//div[string-length(@id)=15 or string-length(@id)=16]")
        # new.add_xpath('comments',"//div[string-length(@id)=15 or string-length(@id)=16]//div/text()")
        # {}' .format(next_comment_page))
        new.add_xpath('source', '//span/strong/a/text()')
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath('text', '//div[@data-ft]//p//text()')

        next_comment_page = response.xpath("//div/div[contains(@id,'see_next')]/a/@href")
        while len(next_comment_page) > 0:
            next_comment_page = response.urljoin(next_comment_page[0].extract())
            yield scrapy.Request(next_comment_page, callback=self.parse_comments, dont_filter=True, \
                                 meta={'new': new})
            # self.logger.info('Parsing page number %d', i)

        # from scrapy.utils.response import open_in_browser
        # open_in_browser(response)
        # new.load_item()

        #
        # yield new.load_item()

    def parse_comments(self, response):
        self.logger.info('\n\n PAGINA COMMENTI \n\n')
        new = response.meta['new']
        new.add_xpath('commentators', "//div[number(@id)>1]/div/h3/a[@href]/text()")
        yield new.load_item()