Create a medical knowledge graph and a knowledge-graph-based question answering project
commit bbc60f3a96
build_medicalgraph.py | 242 lines (new file)
@@ -0,0 +1,242 @@
#!/usr/bin/env python3
# coding: utf-8
# File: MedicalGraph.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-10-3

import os
import json

from py2neo import Graph, Node


class MedicalGraph:
    def __init__(self):
        cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.data_path = os.path.join(cur_dir, 'medical.json')
        self.g = Graph(
            host="127.0.0.1",  # IP of the machine running neo4j (see ifconfig)
            http_port=7474,    # port the neo4j server listens on
            user="lhy",        # database user name; "neo4j" unless you changed it
            password="lhy123")

    '''Read the data file and collect nodes and relations'''
    def read_nodes(self):
        # 7 classes of nodes
        drugs = []        # drugs
        foods = []        # foods
        checks = []       # medical checks
        departments = []  # departments
        producers = []    # drug producers
        diseases = []     # diseases
        symptoms = []     # symptoms

        disease_infos = []  # detailed disease records

        # entity relations
        rels_department = []     # department -> parent department
        rels_noteat = []         # disease -> food to avoid
        rels_doeat = []          # disease -> suitable food
        rels_recommandeat = []   # disease -> recommended recipes
        rels_commonddrug = []    # disease -> common drugs
        rels_recommanddrug = []  # disease -> popular drugs
        rels_check = []          # disease -> required checks
        rels_drug_producer = []  # producer -> drug

        rels_symptom = []   # disease -> symptom
        rels_acompany = []  # disease -> accompanying disease (complication)
        rels_category = []  # disease -> department it belongs to

        count = 0
        for data in open(self.data_path):
            disease_dict = {}
            count += 1
            print(count)
            data_json = json.loads(data)
            disease = data_json['name']
            disease_dict['name'] = disease
            diseases.append(disease)
            disease_dict['desc'] = ''
            disease_dict['prevent'] = ''
            disease_dict['cause'] = ''
            disease_dict['easy_get'] = ''
            disease_dict['cure_department'] = ''
            disease_dict['cure_way'] = ''
            disease_dict['cure_lasttime'] = ''
            disease_dict['symptom'] = ''
            disease_dict['cured_prob'] = ''

            if 'symptom' in data_json:
                symptoms += data_json['symptom']
                for symptom in data_json['symptom']:
                    rels_symptom.append([disease, symptom])

            if 'acompany' in data_json:
                for acompany in data_json['acompany']:
                    rels_acompany.append([disease, acompany])

            if 'desc' in data_json:
                disease_dict['desc'] = data_json['desc']

            if 'prevent' in data_json:
                disease_dict['prevent'] = data_json['prevent']

            if 'cause' in data_json:
                disease_dict['cause'] = data_json['cause']

            if 'get_prob' in data_json:
                disease_dict['get_prob'] = data_json['get_prob']

            if 'easy_get' in data_json:
                disease_dict['easy_get'] = data_json['easy_get']

            if 'cure_department' in data_json:
                cure_department = data_json['cure_department']
                if len(cure_department) == 1:
                    rels_category.append([disease, cure_department[0]])
                if len(cure_department) == 2:
                    big = cure_department[0]
                    small = cure_department[1]
                    rels_department.append([small, big])
                    rels_category.append([disease, small])

                disease_dict['cure_department'] = cure_department
                departments += cure_department

            if 'cure_way' in data_json:
                disease_dict['cure_way'] = data_json['cure_way']

            if 'cure_lasttime' in data_json:
                disease_dict['cure_lasttime'] = data_json['cure_lasttime']

            if 'cured_prob' in data_json:
                disease_dict['cured_prob'] = data_json['cured_prob']

            if 'common_drug' in data_json:
                common_drug = data_json['common_drug']
                for drug in common_drug:
                    rels_commonddrug.append([disease, drug])
                drugs += common_drug

            if 'recommand_drug' in data_json:
                recommand_drug = data_json['recommand_drug']
                drugs += recommand_drug
                for drug in recommand_drug:
                    rels_recommanddrug.append([disease, drug])

            # the three food fields are checked independently: a record that has
            # 'not_eat' is not guaranteed to also carry 'do_eat' or 'recommand_eat'
            if 'not_eat' in data_json:
                not_eat = data_json['not_eat']
                for _not in not_eat:
                    rels_noteat.append([disease, _not])
                foods += not_eat

            if 'do_eat' in data_json:
                do_eat = data_json['do_eat']
                for _do in do_eat:
                    rels_doeat.append([disease, _do])
                foods += do_eat

            if 'recommand_eat' in data_json:
                recommand_eat = data_json['recommand_eat']
                for _recommand in recommand_eat:
                    rels_recommandeat.append([disease, _recommand])
                foods += recommand_eat

            if 'check' in data_json:
                check = data_json['check']
                for _check in check:
                    rels_check.append([disease, _check])
                checks += check

            if 'drug_detail' in data_json:
                # entries look like '厂商(药品名)': producer outside the
                # parentheses, drug name inside
                drug_detail = data_json['drug_detail']
                producer = [i.split('(')[0] for i in drug_detail]
                rels_drug_producer += [[i.split('(')[0], i.split('(')[-1].replace(')', '')] for i in drug_detail]
                producers += producer

            disease_infos.append(disease_dict)

        return set(drugs), set(foods), set(checks), set(departments), set(producers), set(symptoms), set(diseases), disease_infos, \
               rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, \
               rels_symptom, rels_acompany, rels_category

    '''Create plain nodes'''
    def create_node(self, label, nodes):
        count = 0
        for node_name in nodes:
            node = Node(label, name=node_name)
            self.g.create(node)
            count += 1
            print(count, len(nodes))
        return

    '''Create the central Disease nodes of the knowledge graph'''
    def create_diseases_nodes(self, disease_infos):
        count = 0
        for disease_dict in disease_infos:
            node = Node("Disease", name=disease_dict['name'], desc=disease_dict['desc'],
                        prevent=disease_dict['prevent'], cause=disease_dict['cause'],
                        easy_get=disease_dict['easy_get'], cure_lasttime=disease_dict['cure_lasttime'],
                        cure_department=disease_dict['cure_department'],
                        cure_way=disease_dict['cure_way'], cured_prob=disease_dict['cured_prob'])
            self.g.create(node)
            count += 1
            print(count)
        return

    '''Create all entity node types (the graph schema)'''
    def create_graphnodes(self):
        Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category = self.read_nodes()
        self.create_diseases_nodes(disease_infos)
        self.create_node('Drug', Drugs)
        print(len(Drugs))
        self.create_node('Food', Foods)
        print(len(Foods))
        self.create_node('Check', Checks)
        print(len(Checks))
        self.create_node('Department', Departments)
        print(len(Departments))
        self.create_node('Producer', Producers)
        print(len(Producers))
        self.create_node('Symptom', Symptoms)
        return

    '''Create the relation edges between entities'''
    def create_graphrels(self):
        Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category = self.read_nodes()
        self.create_relationship('Disease', 'Food', rels_recommandeat, 'recommand_eat', '推荐食谱')
        self.create_relationship('Disease', 'Food', rels_noteat, 'no_eat', '忌吃')
        self.create_relationship('Disease', 'Food', rels_doeat, 'do_eat', '宜吃')
        self.create_relationship('Department', 'Department', rels_department, 'belongs_to', '属于')
        self.create_relationship('Disease', 'Drug', rels_commonddrug, 'common_drug', '常用药品')
        self.create_relationship('Producer', 'Drug', rels_drug_producer, 'drugs_of', '生产药品')
        self.create_relationship('Disease', 'Drug', rels_recommanddrug, 'recommand_drug', '好评药品')
        self.create_relationship('Disease', 'Check', rels_check, 'need_check', '诊断检查')
        self.create_relationship('Disease', 'Symptom', rels_symptom, 'has_symptom', '症状')
        self.create_relationship('Disease', 'Disease', rels_acompany, 'acompany_with', '并发症')
        self.create_relationship('Disease', 'Department', rels_category, 'belongs_to', '所属科室')

    '''Create one type of relation edge'''
    def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
        count = 0
        # deduplicate; lists are unhashable, so join each pair into a string first
        set_edges = []
        for edge in edges:
            set_edges.append('###'.join(edge))
        total = len(set(set_edges))
        for edge in set(set_edges):
            edge = edge.split('###')
            p = edge[0]
            q = edge[1]
            query = "match (p:%s), (q:%s) where p.name='%s' and q.name='%s' create (p)-[rel:%s{name:'%s'}]->(q)" % (
                start_node, end_node, p, q, rel_type, rel_name)
            try:
                self.g.run(query)
                count += 1
                print(rel_type, count, total)
            except Exception as e:
                print(e)
        return


if __name__ == '__main__':
    handler = MedicalGraph()
    handler.create_graphrels()
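Note that create_relationship splices node names straight into the Cypher string, so a name containing a single quote will break the query. A minimal sketch of a safer variant, assuming py2neo's Graph.run accepts keyword parameters (true in py2neo v3/v4) and Neo4j's $param syntax; safe_create_relationship is a hypothetical name, not part of this commit:

def safe_create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
    # labels and relation types cannot be parameterized in Cypher, so they are
    # still interpolated; the node names and relation name travel as parameters
    query = ("MATCH (p:%s), (q:%s) WHERE p.name = $p AND q.name = $q "
             "CREATE (p)-[rel:%s {name: $rel_name}]->(q)") % (start_node, end_node, rel_type)
    for p, q in {tuple(edge) for edge in edges}:  # dedupe without the '###' round-trip
        self.g.run(query, p=p, q=q, rel_name=rel_name)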
data/medical.json | 8808 lines (new file)
File diff suppressed because one or more lines are too long
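For orientation, read_nodes above treats medical.json as line-delimited JSON, one object per disease; the field names below are exactly the ones the code reads, while the values are illustrative placeholders, not real data:

import json

# one line of medical.json, reconstructed for illustration only
line = ('{"name": "肺炎", "desc": "...", "prevent": "...", "cause": "...", '
        '"get_prob": "0.5%", "easy_get": "...", "cure_department": ["内科", "呼吸内科"], '
        '"cure_way": ["药物治疗"], "cure_lasttime": "7-10天", "cured_prob": "95%", '
        '"symptom": ["发热", "咳嗽"], "acompany": ["胸腔积液"], '
        '"common_drug": ["..."], "recommand_drug": ["..."], '
        '"not_eat": ["..."], "do_eat": ["..."], "recommand_eat": ["..."], '
        '"check": ["血常规"], "drug_detail": ["某厂商(某药品)"]}')
record = json.loads(line)
print(record['name'], record['cure_department'])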
data_spider.py | 166 lines (new file)
@@ -0,0 +1,166 @@
#!/usr/bin/env python3
# coding: utf-8
# File: data_spider.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-10-3

import urllib.request

import pymongo
from lxml import etree


'''Crawler originally built for crime-case collection from a judicial site; here it collects disease data from jib.xywy.com'''
class CrimeSpider:
    def __init__(self):
        self.conn = pymongo.MongoClient()
        self.db = self.conn['medical']
        self.col = self.db['data']

    '''request the html for a url'''
    def get_html(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/51.0.2704.63 Safari/537.36'}
        req = urllib.request.Request(url=url, headers=headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('gbk')  # the site serves gbk-encoded pages
        return html

    '''parse a listing page into detail-page urls'''
    def url_parser(self, content):
        selector = etree.HTML(content)
        urls = ['http://www.anliguan.com' + i for i in selector.xpath('//h2[@class="item-title"]/a/@href')]
        return urls

    '''main crawl loop: fetch every sub-page of each disease and store one document per disease'''
    def spider_main(self):
        for page in range(1, 11000):
            try:
                basic_url = 'http://jib.xywy.com/il_sii/gaishu/%s.htm' % page
                cause_url = 'http://jib.xywy.com/il_sii/cause/%s.htm' % page
                prevent_url = 'http://jib.xywy.com/il_sii/prevent/%s.htm' % page
                symptom_url = 'http://jib.xywy.com/il_sii/symptom/%s.htm' % page
                inspect_url = 'http://jib.xywy.com/il_sii/inspect/%s.htm' % page
                treat_url = 'http://jib.xywy.com/il_sii/treat/%s.htm' % page
                food_url = 'http://jib.xywy.com/il_sii/food/%s.htm' % page
                drug_url = 'http://jib.xywy.com/il_sii/drug/%s.htm' % page
                data = {}
                data['url'] = basic_url
                data['basic_info'] = self.basicinfo_spider(basic_url)
                data['cause_info'] = self.common_spider(cause_url)
                data['prevent_info'] = self.common_spider(prevent_url)
                data['symptom_info'] = self.symptom_spider(symptom_url)
                data['inspect_info'] = self.inspect_spider(inspect_url)
                data['treat_info'] = self.treat_spider(treat_url)
                data['food_info'] = self.food_spider(food_url)
                data['drug_info'] = self.drug_spider(drug_url)
                print(page, basic_url)
                self.col.insert_one(data)

            except Exception as e:
                print(e, page)
        return

    '''parse the basic-information page'''
    def basicinfo_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        title = selector.xpath('//title/text()')[0]
        category = selector.xpath('//div[@class="wrap mt10 nav-bar"]/a/text()')
        desc = selector.xpath('//div[@class="jib-articl-con jib-lh-articl"]/p/text()')
        ps = selector.xpath('//div[@class="mt20 articl-know"]/p')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
            infobox.append(info)
        basic_data = {}
        basic_data['category'] = category
        basic_data['name'] = title.split('的简介')[0]
        basic_data['desc'] = desc
        basic_data['attributes'] = infobox
        return basic_data

    '''parse the treatment page'''
    def treat_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//div[starts-with(@class,"mt20 articl-know")]/p')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
            infobox.append(info)
        return infobox

    '''parse the drug page'''
    def drug_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        drugs = [i.replace('\n', '').replace('\t', '').replace(' ', '') for i in selector.xpath('//div[@class="fl drug-pic-rec mr30"]/p/a/text()')]
        return drugs

    '''parse the food (diet) page'''
    def food_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        divs = selector.xpath('//div[@class="diet-img clearfix mt20"]')
        try:
            food_data = {}
            food_data['good'] = divs[0].xpath('./div/p/text()')
            food_data['bad'] = divs[1].xpath('./div/p/text()')
            food_data['recommand'] = divs[2].xpath('./div/p/text()')
        except Exception:
            return {}

        return food_data

    '''parse the symptom page'''
    def symptom_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        symptoms = selector.xpath('//a[@class="gre"]/text()')
        ps = selector.xpath('//p')
        detail = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
            detail.append(info)
        return symptoms, detail

    '''parse the inspection (check) page into check-detail urls'''
    def inspect_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        inspects = selector.xpath('//li[@class="check-item"]/a/@href')
        return inspects

    '''generic parser: collect all non-empty paragraph texts'''
    def common_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//p')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
            if info:
                infobox.append(info)
        return '\n'.join(infobox)

    '''crawl raw html of the check-detail pages into the jc collection'''
    def inspect_crawl(self):
        for page in range(1, 3685):
            try:
                url = 'http://jck.xywy.com/jc_%s.html' % page
                html = self.get_html(url)
                data = {}
                data['url'] = url
                data['html'] = html
                self.db['jc'].insert_one(data)
                print(url)
            except Exception as e:
                print(e)


if __name__ == '__main__':
    handler = CrimeSpider()
    handler.inspect_crawl()
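Once spider_main and inspect_crawl have run, the raw pages sit in the local MongoDB. A quick sanity check, assuming MongoDB on the default port with the collections populated as above:

import pymongo

conn = pymongo.MongoClient()                     # local MongoDB, default port
col = conn['medical']['data']
print(col.count_documents({}))                   # number of disease documents stored
doc = col.find_one({}, {'basic_info.name': 1, 'url': 1})
print(doc)                                       # name and source url of one record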
disease.txt | 8808 lines (new file)
File diff suppressed because it is too large
img/graph_summary.png | BIN (new file)
Binary file not shown. (PNG, 356 KiB)
prepare_data/build_data.py | 137 lines (new file)
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
# coding: utf-8
# File: build_data.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-10-3

import os

import pymongo
from lxml import etree

from max_cut import CutWords


class MedicalGraph:
    def __init__(self):
        self.conn = pymongo.MongoClient()
        cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.db = self.conn['medical']
        self.col = self.db['data']
        first_words = [i.strip() for i in open(os.path.join(cur_dir, 'first_name.txt'))]
        alphabets = list('abcdefghijklmnopqrstuvwxyz')
        nums = list('1234567890')
        self.stop_words = first_words + alphabets + nums
        # map the attribute labels crawled from the site to English field names
        self.key_dict = {
            '医保疾病': 'yibao_status',
            '患病比例': 'get_prob',
            '易感人群': 'easy_get',
            '传染方式': 'get_way',
            '就诊科室': 'cure_department',
            '治疗方式': 'cure_way',
            '治疗周期': 'cure_lasttime',
            '治愈率': 'cured_prob',
            '药品明细': 'drug_detail',
            '药品推荐': 'recommand_drug',
            '推荐': 'recommand_eat',
            '忌食': 'not_eat',
            '宜食': 'do_eat',
            '症状': 'symptom',
            '检查': 'check',
            '成因': 'cause',
            '预防措施': 'prevent',
            '所属类别': 'category',
            '简介': 'desc',
            '名称': 'name',
            '常用药品': 'common_drug',
            '治疗费用': 'cost_money',
            '并发症': 'acompany'
        }
        self.cuter = CutWords()

    def collect_medical(self):
        count = 0
        for item in self.col.find():
            data = {}
            basic_info = item['basic_info']
            name = basic_info['name']
            if not name:
                continue
            # basic information
            data['名称'] = name
            data['简介'] = '\n'.join(basic_info['desc']).replace('\r\n\t', '').replace('\r\n\n\n', '').replace(' ', '').replace('\r\n', '\n')
            data['所属类别'] = basic_info['category']
            # cause and prevention
            data['预防措施'] = item['prevent_info']
            data['成因'] = item['cause_info']
            # symptoms: first element of the (symptoms, detail) pair returned by
            # symptom_spider, filtered against the stop-word list
            data['症状'] = list(set([i for i in item['symptom_info'][0] if i[0] not in self.stop_words]))
            # 'label:value' attribute lines from the basic-info page
            for attr in basic_info['attributes']:
                attr_pair = attr.split(':')
                if len(attr_pair) == 2:
                    key = attr_pair[0]
                    value = attr_pair[1]
                    data[key] = value
            # checks: resolve each check-detail url to its name via the jc collection
            jcs = []
            for inspect in item['inspect_info']:
                jc_name = self.get_inspect(inspect)
                if jc_name:
                    jcs.append(jc_name)
            data['检查'] = jcs
            # foods
            food_info = item['food_info']
            if food_info:
                data['宜食'] = food_info['good']
                data['忌食'] = food_info['bad']
                data['推荐'] = food_info['recommand']
            # drugs
            drug_info = item['drug_info']
            data['药品推荐'] = list(set([i.split('(')[-1].replace(')', '') for i in drug_info]))
            data['药品明细'] = drug_info
            # translate keys to English and normalize the values
            data_modify = {}
            for attr, value in data.items():
                attr_en = self.key_dict.get(attr)
                if attr_en:
                    data_modify[attr_en] = value
                    if attr_en in ['yibao_status', 'get_prob', 'easy_get', 'get_way', 'cure_lasttime', 'cured_prob']:
                        data_modify[attr_en] = value.replace(' ', '').replace('\t', '')
                    elif attr_en in ['cure_department', 'cure_way', 'common_drug']:
                        data_modify[attr_en] = [i for i in value.split(' ') if i]
                    elif attr_en in ['acompany']:
                        # segment the complication string with bidirectional maximum matching
                        acompany = [i for i in self.cuter.max_biward_cut(data_modify[attr_en]) if len(i) > 1]
                        data_modify[attr_en] = acompany

            try:
                self.db['medical'].insert_one(data_modify)
                count += 1
                print(count)
            except Exception as e:
                print(e)

        return

    def get_inspect(self, url):
        res = self.db['jc'].find_one({'url': url})
        if not res:
            return ''
        else:
            return res['name']

    def modify_jc(self):
        # backfill name and desc onto the raw check pages crawled by inspect_crawl
        for item in self.db['jc'].find():
            url = item['url']
            content = item['html']
            selector = etree.HTML(content)
            name = selector.xpath('//title/text()')[0].split('结果分析')[0]
            desc = selector.xpath('//meta[@name="description"]/@content')[0].replace('\r\n\t', '')
            self.db['jc'].update_one({'url': url}, {'$set': {'name': name, 'desc': desc}})


if __name__ == '__main__':
    handler = MedicalGraph()
    handler.collect_medical()
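To make the attribute mapping concrete: a crawled line such as '治愈率:80%' from basic_info['attributes'] ends up as cured_prob with whitespace stripped. A minimal sketch of that path (the attribute line and its value are made up):

key_dict = {'治愈率': 'cured_prob'}   # excerpt of the mapping above

attr = '治愈率:80%'                   # one crawled attribute line (illustrative)
key, value = attr.split(':')
attr_en = key_dict.get(key)
if attr_en in ['yibao_status', 'get_prob', 'easy_get', 'get_way', 'cure_lasttime', 'cured_prob']:
    print(attr_en, value.replace(' ', '').replace('\t', ''))   # -> cured_prob 80%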
prepare_data/max_cut.py | 102 lines (new file)
@@ -0,0 +1,102 @@
#!/usr/bin/env python3
# coding: utf-8
# File: maxmatch.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-3-26


class CutWords:
    def __init__(self):
        dict_path = './disease.txt'  # resolved relative to the working directory
        self.word_dict, self.max_wordlen = self.load_words(dict_path)

    # load the dictionary: return the word set and the length of its longest entry
    def load_words(self, dict_path):
        words = set()  # a set gives O(1) membership tests in the matching loops
        max_len = 0
        for line in open(dict_path):
            wd = line.strip()
            if not wd:
                continue
            if len(wd) > max_len:
                max_len = len(wd)
            words.add(wd)
        return words, max_len

    # forward maximum matching
    def max_forward_cut(self, sent):
        # 1. take the next m characters from the left of the unsegmented text as the
        #    candidate, where m is the length of the longest dictionary entry
        # 2. look the candidate up in the dictionary; on a hit, cut it off as one
        #    word, otherwise drop the last character and retry
        cutlist = []
        index = 0
        while index < len(sent):
            matched = False
            for i in range(self.max_wordlen, 0, -1):
                cand_word = sent[index: index + i]
                if cand_word in self.word_dict:
                    cutlist.append(cand_word)
                    matched = True
                    break

            # no dictionary hit: cut off a single character
            if not matched:
                i = 1
                cutlist.append(sent[index])
            index += i
        return cutlist

    # backward maximum matching
    def max_backward_cut(self, sent):
        # same idea as forward matching, but candidates are taken from the right
        # end of the unsegmented text
        cutlist = []
        index = len(sent)
        while index > 0:
            matched = False
            for i in range(self.max_wordlen, 0, -1):
                tmp = i
                # clamp the slice start at 0 so it never wraps around to the end
                cand_word = sent[max(index - tmp, 0): index]
                # on a dictionary hit, cut the candidate off as one word
                if cand_word in self.word_dict:
                    cutlist.append(cand_word)
                    matched = True
                    break
            # no dictionary hit: cut off a single character
            if not matched:
                tmp = 1
                cutlist.append(sent[index - 1])

            index -= tmp

        # words were collected right-to-left, so reverse them
        return cutlist[::-1]

    # bidirectional maximum matching
    def max_biward_cut(self, sent):
        # compare the forward and backward segmentations and choose heuristically:
        # 1. if the word counts differ, take the segmentation with fewer words
        # 2. if the counts are equal: identical results mean no ambiguity, so either
        #    may be returned; otherwise return the one with fewer single-character words
        forward_cutlist = self.max_forward_cut(sent)
        backward_cutlist = self.max_backward_cut(sent)
        count_forward = len(forward_cutlist)
        count_backward = len(backward_cutlist)

        def compute_single(word_list):
            num = 0
            for word in word_list:
                if len(word) == 1:
                    num += 1
            return num

        if count_forward == count_backward:
            if compute_single(forward_cutlist) > compute_single(backward_cutlist):
                return backward_cutlist
            else:
                return forward_cutlist

        elif count_backward > count_forward:
            return forward_cutlist

        else:
            return backward_cutlist
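A toy run of the segmenter that bypasses __init__ so it does not need disease.txt on disk (the three dictionary entries below are made up):

cuter = CutWords.__new__(CutWords)                 # skip __init__ and its file read
cuter.word_dict = {'高血压', '高血压肾病', '肾病'}   # illustrative entries
cuter.max_wordlen = 5

print(cuter.max_forward_cut('高血压肾病'))          # ['高血压肾病'], the longest match wins
print(cuter.max_biward_cut('高血压肾病'))           # ['高血压肾病']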