更新图片5

This commit is contained in:
ruben 2019-11-21 23:52:33 +08:00
parent 97252ff6b6
commit 47231ff9a8
2 changed files with 23 additions and 20 deletions

View File

@ -15,6 +15,8 @@ if not os.path.exists('logs/'):
os.mkdir('logs/')
logging.basicConfig(filename=f'logs/{logfile_name}.log', filemode='a+',
format='%(levelname)s - %(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
class BaikeSpider(scrapy.Spider):
name = 'baike'
allowed_domains = ['baike.baidu.com']
@ -23,8 +25,8 @@ class BaikeSpider(scrapy.Spider):
db_baike = db['db_baike']
db_triples = db['db_triples']
olds = set([item['_id'] for item in db_baike.find({}, {'_id': 1})])
if len(olds)>0:
start_urls=['https://baike.baidu.com/item/'+olds.pop()]
if len(olds) > 0:
start_urls = ['https://baike.baidu.com/item/'+olds.pop()]
driver = GraphDatabase.driver(
"bolt://localhost:7687", auth=("neo4j", "123"))
@ -69,30 +71,31 @@ class BaikeSpider(scrapy.Spider):
'//dt[contains(@class,"basicInfo-item name")]').getall()
values = response.xpath(
'//dd[contains(@class,"basicInfo-item value")]').getall()
if len(attrs)!= len(values):
if len(attrs) != len(values):
return
with self.driver.session() as session:
try:
for attr,value in zip(attrs,values):
for attr, value in zip(attrs, values):
# attr
temp = Selector(text=attr).xpath(
'//dt//text()').getall()
attr = ''.join(temp).replace('\xa0', '')
# value
values = Selector(text=value).xpath(
'//dd/text()|//dd/a//text()').getall()
for value in values:
try:
value=value.replace('\n','')
logging.warning(entity+'_'+attr+'_'+value)
self.db_triples.insert_one({
"_id": entity+'_'+attr+'_'+value,
"item_name": entity,
"attr": attr,
"value": value, }
)
except pymongo.errors.DuplicateKeyError:
pass
session.write_transaction(self.add_node, entity, attr, value)
value = ''.join(Selector(text=value).xpath(
'//dd/text()|//dd/a//text()').getall())
try:
value = value.replace('\n', '')
logging.warning(entity+'_'+attr+'_'+value)
self.db_triples.insert_one({
"_id": entity+'_'+attr+'_'+value,
"item_name": entity,
"attr": attr,
"value": value, }
)
except pymongo.errors.DuplicateKeyError:
pass
session.write_transaction(
self.add_node, entity, attr, value)
except Exception:
logging.error('\n---'.join(attrs)+'\n_________________'+'\n---'.join(values))
logging.error('\n---'.join(attrs) +
'\n_________________'+'\n---'.join(values))

Binary file not shown.

Before

Width:  |  Height:  |  Size: 312 KiB

After

Width:  |  Height:  |  Size: 334 KiB