Macropodus/macropodus/base/seg_basic.py
2020-01-19 18:46:08 +08:00

150 lines
6.2 KiB
Python

# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2019/11/28 20:17
# @author : Mo
# @function: base class for segmentation; loads and maintains the word-frequency dictionaries
from macropodus.preprocess.tools_common import load_json, save_json, txt_read
from macropodus.conf.path_config import path_dict_macropodus, path_dict_user
from macropodus.conf.path_config import path_macropodus_dict_freq_cache
from macropodus.conf.path_log import get_logger_root
from collections import defaultdict
import pickle
import time
import os
# Module-level logger, obtained from the project's get_logger_root().
logger = get_logger_root()
# Logged once at import time: the path used for the pickled dictionary cache.
logger.info("path of dict cache is {}!".format(path_macropodus_dict_freq_cache))
class SegBasic:
    """Base class for segmenters; owns the word-frequency dictionary.

    Attributes:
        dict_words_freq: word -> frequency mapping (the default dictionary
            merged with the user dictionary).
        num_words: sum of all frequencies in ``dict_words_freq``.
        dict_user: user dictionary (word -> frequency) loaded from JSON.
    """

    # Frequency assigned when neither the caller nor a dictionary line gives one.
    _DEFAULT_FREQ = 132
    # Field separator used by each plain-text user-dictionary format.
    _USER_DICT_SEPARATORS = {"txt": " ", "csv": ","}

    def __init__(self, use_cache=True):
        """Load the dictionaries, from the pickle cache when allowed.

        :param use_cache: bool, read the pickled cache if it exists and
            create it after a fresh load if it does not.
        """
        cache_path = path_macropodus_dict_freq_cache
        if use_cache and os.path.exists(cache_path):
            # NOTE(security): pickle.load can execute arbitrary code; the cache
            # file must only ever be one written by this class.
            with open(cache_path, "rb") as fpmc:
                self.dict_words_freq, self.num_words, self.dict_user = pickle.load(fpmc)
            return

        self.dict_words_freq = defaultdict()
        self.dict_user = {}
        self.num_words = 0
        self.load_macropodus_dict()  # default dictionary
        self.load_user_dict()  # user dictionary, also computes num_words
        if use_cache:
            # First run (no cache yet): serialize the merged state so that
            # later start-ups can skip the JSON parsing.
            with open(cache_path, "wb") as fpmc:
                pickle.dump([self.dict_words_freq, self.num_words, self.dict_user], fpmc)

    @staticmethod
    def _invalidate_cache():
        """Delete the pickle cache so the next load rebuilds it from the JSON files."""
        if os.path.exists(path_macropodus_dict_freq_cache):
            os.remove(path_macropodus_dict_freq_cache)

    @classmethod
    def _parse_word_freq_lines(cls, lines, sep):
        """Yield (word, freq) pairs parsed from ``word<sep>freq`` text lines.

        A line that does not split into exactly two fields is treated as a
        bare word (its first field) with the default frequency.

        :raises ValueError: if a frequency field is not an integer literal.
        """
        for line in lines:
            line = line.strip()
            if not line:
                continue
            fields = line.split(sep)
            if len(fields) == 2:
                # The frequency arrives as text; it must be numeric so it can
                # be added to existing counts and summed into num_words.
                yield fields[0], int(fields[1])
            else:
                yield fields[0], cls._DEFAULT_FREQ

    def _merge_word_freqs(self, word_freqs):
        """Add each (word, freq) pair into ``dict_words_freq`` and refresh ``num_words``."""
        for word, freq in word_freqs:
            self.dict_words_freq[word] = self.dict_words_freq.get(word, 0) + freq
        # Recompute once after all updates instead of once per word.
        self.num_words = sum(self.dict_words_freq.values())

    def load_macropodus_dict(self):
        """
        Load the default base dictionary into ``dict_words_freq``.
        :return: None
        """
        dict_macropodus = load_json(path_dict_macropodus)[0]  # JSON word -> freq dictionary
        # Kept as a defaultdict (without a factory) so the attribute type is
        # the same as the one already stored in existing pickle caches.
        self.dict_words_freq = defaultdict(None, dict_macropodus)

    def load_user_dict(self, path_user=path_dict_user, type_user="json"):
        """
        Load a user dictionary and merge it into ``dict_words_freq``.

        Frequencies of words that already exist are summed with the stored
        frequency. Only the "json" format replaces ``dict_user``.

        :param path_user: str, like '/home/user.dict'
        :param type_user: str, one of "json", "txt" (``word freq`` per line)
            or "csv" (``word,freq`` per line); the frequency is optional in
            the text formats.
        :return: None
        :raises RuntimeError: if ``path_user`` does not exist.
        :raises EOFError: if ``type_user`` is not a supported format.
        :raises ValueError: if a text line carries a non-integer frequency.
        """
        if not os.path.exists(path_user):
            raise RuntimeError("your path_user is not exist!")

        if type_user == "json":
            self.dict_user = load_json(path_user)[0]  # JSON word -> freq dictionary
            word_freqs = list(self.dict_user.items())
        elif type_user in self._USER_DICT_SEPARATORS:
            sep = self._USER_DICT_SEPARATORS[type_user]
            # Parse everything first so a malformed line fails before the
            # dictionary has been partially updated.
            word_freqs = list(self._parse_word_freq_lines(txt_read(path_user), sep))
        else:
            # EOFError is kept for backward compatibility with existing callers.
            raise EOFError("type_user must be one of 'json', 'txt' or 'csv'")

        self._merge_word_freqs(word_freqs)

    def add_word(self, word, freq=_DEFAULT_FREQ):
        """
        Add a word to the in-memory dictionary; not persisted, lost on reload.

        For a word that already exists, an explicit ``freq`` replaces the
        stored frequency, while the default ``freq`` leaves it untouched.
        ``num_words`` is kept equal to the sum of all frequencies.

        :param word: str, like '大漠帝国'
        :param freq: int, like 132
        :return: None
        """
        assert isinstance(word, str)
        old_freq = self.dict_words_freq.get(word)
        if old_freq is not None and freq == self._DEFAULT_FREQ:
            # Known word and no explicit frequency requested: nothing to change.
            return
        self.dict_words_freq[word] = freq
        # Adjust by the delta so a replaced frequency is not counted twice.
        self.num_words += freq - (old_freq or 0)

    def delete_word(self, word):
        """
        Remove a word from the in-memory dictionary; not persisted, lost on reload.

        :param word: str, like '大漠帝国'
        :return: None
        """
        assert isinstance(word, str)
        old_freq = self.dict_words_freq.pop(word, None)
        if old_freq is not None:
            self.num_words -= old_freq

    def save_add_words(self, word_freqs):
        """
        Add words to the user dictionary and persist them to ``path_dict_user``.

        :param word_freqs: dict, like {'大漠帝国':132}
        :return: None
        """
        assert isinstance(word_freqs, dict)
        for word, freq in word_freqs.items():
            self.add_word(word, freq)  # in-memory merged dictionary
            self.dict_user[word] = freq  # user dictionary that gets saved
        save_json([self.dict_user], path_dict_user)
        # The pickle cache takes precedence over the JSON files on load, so a
        # stale cache would hide the words that were just saved.
        self._invalidate_cache()

    def save_delete_words(self, words):
        """
        Delete words from the user dictionary and persist the result to ``path_dict_user``.

        :param words: list, like ['大漠帝国']
        :return: None
        """
        assert isinstance(words, list)
        for word in words:
            self.delete_word(word)  # in-memory merged dictionary
            self.dict_user.pop(word, None)  # user dictionary that gets saved
        save_json([self.dict_user], path_dict_user)
        # The pickle cache takes precedence over the JSON files on load, so a
        # stale cache would bring the deleted words back.
        self._invalidate_cache()