add use_cache, fix macropodus_cut of cut-block

yongzhuo 2020-10-21 15:46:50 +08:00
parent 9de38c06d3
commit e3b74cc561
8 changed files with 116 additions and 72 deletions

View File

@@ -10,9 +10,9 @@ from macropodus.tookit import calculate, chi2num, num2chi, Trie, roman2num, num2
from macropodus.segment import cut_bidirectional, cut_forward, cut_reverse, cut_search, cut_dag, cut, find
from macropodus.segment import load_user_dict, save_delete_words, save_add_words, delete_word, add_word
from macropodus.summarize import keyword, textrank, summarization
from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects
from macropodus.version import __version__ # version
from macropodus.similarity import sim
import os
# mechanical (dictionary-based) segmentation
cut_bidirectional = cut_bidirectional
@@ -49,3 +49,6 @@ num2roman = num2roman
han2zh = han2zh
zh2han = zh2han
pinyin = pinyin
if os.environ.get("macropodus_use_dl", False)=="1":
from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects
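The tf.keras components are now imported only when the `macropodus_use_dl` environment variable equals "1". A minimal opt-in sketch (assuming the deep-learning dependencies are installed):

```python
import os

# must be set before macropodus is first imported; any other value skips the tf.keras imports
os.environ["macropodus_use_dl"] = "1"

import macropodus  # __init_tf_keras is pulled in because the flag equals "1"
```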

View File

@@ -7,9 +7,13 @@
from macropodus.segment.seg_statistics.seg_statistics import SegStatistics
from macropodus.segment.word_discovery.word_discovery import WordDiscovery
import os
# mechanical segmentation
use_cache = True # use the cache
# mechanical segmentation; the cache is used by default
use_cache = True
if not os.environ.get("macropodus_use_seg_cache", True):
use_cache = False # do not use the cache; reload instead
segs = SegStatistics(use_cache)
cut_bidirectional = segs.cut_bidirectional
cut_forward = segs.cut_forward
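A similar switch, `macropodus_use_seg_cache`, controls the segmentation cache. Since environment values are strings and any non-empty string is truthy, the check above only turns the cache off for an empty value. A small sketch of forcing a fresh dictionary load:

```python
import os

# set before the first macropodus import; an empty value makes use_cache False
os.environ["macropodus_use_seg_cache"] = ""

from macropodus.segment import cut  # SegStatistics is constructed here without the cache
print(list(cut("大漠帝国是谁")))
```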

View File

@@ -8,7 +8,7 @@
from macropodus.preprocess.tools_common import re_continue
from macropodus.base.seg_basic import SegBasic
from math import log
import re
__all__ = ["cut_dag",
"cut_forward",
@@ -16,6 +16,9 @@ __all__ = ["cut_dag",
"cut_bidirectional",
"cut_search"]
re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
re_skip = re.compile("(\r\n|\s)", re.U)
class SegStatistics(SegBasic):
def __init__(self, use_cache):
@@ -170,22 +173,42 @@ class SegStatistics(SegBasic):
def cut(self, sentence, type_cut="cut_dag"):
"""
Top-level word segmentation function.
cut_block, code adapted from the jieba project
code from: https://github.com/fxsjy/jieba
:param sentence:str, like '大漠帝国, macropodus, 中国斗鱼'
:param type_cut: str, like 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search'
:return: list, like ['大漠帝国', ',', 'macropodus', ',', '中国斗鱼']
:return: yield, like ['大漠帝国', ',', 'macropodus', ',', '中国斗鱼']
"""
if type_cut=="cut_dag":
return list(self.cut_dag(sentence))
cut_block = self.cut_dag
elif type_cut=="cut_forward":
return list(self.cut_dag(sentence))
cut_block = self.cut_forward
elif type_cut=="cut_reverse":
return list(self.cut_dag(sentence))
cut_block = self.cut_reverse
elif type_cut=="cut_bidirectional":
return list(self.cut_dag(sentence))
cut_block = self.cut_bidirectional
elif type_cut=="cut_search":
return list(self.cut_dag(sentence))
cut_block = self.cut_search
else:
raise RuntimeError("type_cut must be 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search'")
blocks = re_han.split(sentence)
cut_all = False
for block in blocks:
if not block:
continue
if re_han.match(block):
for word in cut_block(block):
yield word
else:
tmp = re_skip.split(block)
for x in tmp:
if re_skip.match(x):
yield x
elif not cut_all:
for xx in x:
yield xx
else:
yield x
if __name__ == '__main__':
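`cut` now works as a generator: it maps `type_cut` to one of the cut_block implementations, splits the sentence with `re_han` into han/alphanumeric runs and everything else, sends those runs through `cut_block`, and yields whitespace and remaining characters as-is. A rough calling sketch (the exact tokens depend on the loaded dictionary):

```python
from macropodus.segment.seg_statistics.seg_statistics import SegStatistics

segs = SegStatistics(use_cache=True)
# supported modes: cut_dag, cut_forward, cut_reverse, cut_bidirectional, cut_search;
# anything else raises RuntimeError
for mode in ("cut_dag", "cut_forward", "cut_search"):
    print(mode, list(segs.cut("大漠帝国, macropodus, 中国斗鱼", type_cut=mode)))
```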

View File

@@ -25,6 +25,7 @@ class WordDiscovery:
self.total_words = 0
self.freq_min = 3
self.len_max = 7
self.round = 6
self.eps = 1e-9
self.empty_words = [sw for sw in stop_words.values() if len(sw)==1] # single-character function words
@@ -35,8 +36,10 @@
:param use_type: str, "text" or "file", file of "utf-8" of "txt"
:return: class<Counter>, word-freq
"""
import macropodus
self.words_count = Counter()
if use_type=="text": # the input is raw text
text = macropodus.han2zh(text)
texts = cut_sentence(use_type=self.algorithm,
text=text) # split into sentences, e.g. on Chinese/English commas, periods, exclamation marks
for text in texts:
@@ -50,6 +53,7 @@
fr8 = open(text, "r", encoding="utf-8")
for text in fr8:
if text.strip():
text = macropodus.han2zh(text)
texts = cut_sentence(use_type=self.algorithm,
text=text) # split into sentences, e.g. on Chinese/English commas, periods, exclamation marks
for text in texts:
@@ -108,9 +112,9 @@
if (k[0] in self.empty_words or k[-1] in self.empty_words):
entroy_boundary = entroy_boundary / len(k)
if boundary_type == "right":
self.right_entropy[k] = entroy_boundary
self.right_entropy[k] = round(entroy_boundary, self.round)
else:
self.left_entropy[k] = entroy_boundary
self.left_entropy[k] = round(entroy_boundary, self.round)
def compute_entropys(self):
"""
@@ -146,8 +150,38 @@
probability_chars = reduce(mul,([wf for wf in words_freq])) / (twl_1**(len(word)))
pmi = math.log(probability_word / probability_chars, 2)
# AMI = PMI / length_word; penalize function words (avoid words that start or end with "的", "得", "了")
self.aggregation[word] = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \
else pmi/len_word # pmi / len_word / len_word
word_aggregation = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \
else pmi/len_word # pmi / len_word / len_word
self.aggregation[word] = round(word_aggregation, self.round)
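To make the function-word penalty concrete, a standalone sketch of the two branches above; the PMI value and the words are illustrative only:

```python
empty_words = {"的", "得", "了"}  # single-character function words, as in self.empty_words

def ami(pmi, word):
    """PMI normalized by word length, with a heavier penalty when the word starts/ends with a function word."""
    len_word = len(word)
    if word[0] in empty_words or word[-1] in empty_words:
        return pmi / (len_word ** len_word)   # e.g. a 3-char word is divided by 27
    return pmi / len_word

print(ami(6.0, "新能源"))   # 6.0 / 3  = 2.0
print(ami(6.0, "漂亮的"))   # 6.0 / 27 ≈ 0.22
```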
def compute_score(self, word, value, a, r, l, rl, lambda_0, lambda_3):
"""
Compute the final score for one candidate word
:param word: str, word with prepare
:param value: float, word freq
:param a: float, aggregation of word
:param r: float, right entropy of word
:param l: float, left entropy of word
:param rl: float, right_entropy * left_entropy
:param lambda_0: lambda 0
:param lambda_3: lambda 3
:return:
"""
self.new_words[word] = {}
# math.log10(self.aggregation[word]) - math.log10(self.total_words)
self.new_words[word]["a"] = a
self.new_words[word]["r"] = r
self.new_words[word]["l"] = l
self.new_words[word]["f"] = value
# word-liberalization
m1 = lambda_0(r)
m2 = lambda_0(l)
m3 = lambda_0(a)
score_ns = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
self.new_words[word]["ns"] = round(score_ns, self.round)
# multiply by the word frequency (word-freq); the chained product is to avoid overly small terms
score_s = value * a * rl * score_ns
self.new_words[word]["s"] = round(score_s, self.round)
def find_word(self, text, use_type="text", freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2,
use_output=True, use_avg=False, use_filter=False):
@@ -175,66 +209,25 @@
lambda_0 = lambda x: -self.eps * x + self.eps if x <= 0 else x
# output
for word, value in self.words_select.items():
# filter out common dictionary words
if use_filter and word in self.dict_words_freq:
continue
# filter out stop words
if word in self.stop_words:
continue
if use_output:
# {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
self.new_words[word] = {}
# math.log10(self.aggregation[word]) - math.log10(self.total_words)
self.new_words[word]["a"] = self.aggregation[word]
self.new_words[word]["r"] = self.right_entropy[word]
self.new_words[word]["l"] = self.left_entropy[word]
self.new_words[word]["f"] = value
# word-liberalization
m1 = lambda_0(self.right_entropy[word])
m2 = lambda_0(self.left_entropy[word])
m3 = lambda_0(self.aggregation[word])
score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
self.new_words[word]["ns"] = score_3
# multiplying by freq is not as effective; the chained product is to avoid overly small terms
# self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
# self.right_entropy[word] * self.left_entropy[word]
self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
self.right_entropy[word] * self.left_entropy[word] * score_3
# {"aggregation":"a", "right_entropy":"r", "left_entropy":"l", "frequency":"f",
# "word-liberalization":"ns", "score":"s"}
a = self.aggregation[word]
r = self.right_entropy[word]
l = self.left_entropy[word]
rl = (r+l) / 2 if use_avg else r * l
if use_output or (use_avg and a > self.aggregation_min and rl > self.entropy_min) or \
(not use_avg and a > self.aggregation_min and r > self.entropy_min and l > self.entropy_min):
self.compute_score(word, value, a, r, l, rl, lambda_0, lambda_3)
elif not use_avg and self.aggregation[word] > self.aggregation_min \
and self.right_entropy[word] > self.entropy_min and self.left_entropy[word] > self.entropy_min:
self.new_words[word] = {}
# {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
self.new_words[word]["a"] = self.aggregation[word] # math.log10(self.aggregation[word]) - math.log10(self.total_words)
self.new_words[word]["r"] = self.right_entropy[word]
self.new_words[word]["l"] = self.left_entropy[word]
self.new_words[word]["f"] = value
# word-liberalization
m1 = lambda_0(self.right_entropy[word])
m2 = lambda_0(self.left_entropy[word])
m3 = lambda_0(self.aggregation[word])
score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
self.new_words[word]["ns"] = score_3
self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
(self.right_entropy[word] + self.left_entropy[word])/2 * score_3
elif use_avg and self.aggregation[word] > self.aggregation_min \
and (self.right_entropy[word] + self.left_entropy[word]) > 2 * self.entropy_min:
self.new_words[word] = {}
# {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
self.new_words[word]["a"] = self.aggregation[word]
self.new_words[word]["r"] = self.right_entropy[word]
self.new_words[word]["l"] = self.left_entropy[word]
self.new_words[word]["f"] = value
# word-liberalization
m1 = lambda_0(self.right_entropy[word])
m2 = lambda_0(self.left_entropy[word])
m3 = lambda_0(self.aggregation[word])
score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
self.new_words[word]["ns"] = score_3
self.new_words[word]["s"] = self.new_words[word]["a"] * (self.right_entropy[word] + self.left_entropy[word])/2
# mul, multiply
self.new_words[word]["s"] *= score_3
# sort by score
new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True)
self.new_words = OrderedDict(new_words)
self.new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True)
self.new_words = OrderedDict(self.new_words)
return self.new_words
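End to end, new-word discovery is reachable through the package-level `find` re-exported in the first file; a hedged usage sketch, assuming `find` forwards to `WordDiscovery.find_word` with its default thresholds:

```python
import macropodus

corpus = open("corpus.txt", encoding="utf-8").read()  # hypothetical input file
new_words = macropodus.find(corpus)  # assumed to call WordDiscovery.find_word
# each entry carries the fields written by compute_score:
#   a: aggregation (PMI-based), r / l: right / left entropy, f: frequency,
#   ns: word-liberalization score, s: final score (the sort key)
for word, feats in list(new_words.items())[:10]:
    print(word, feats["s"], feats["f"])
```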

View File

@@ -6,9 +6,12 @@
from macropodus.similarity.similarity_word2vec_char import SimW2vChar
import os
# word vectors; the cache is used by default
use_cache = True
if not os.environ.get("macropodus_use_w2v_cache", True):
use_cache = False # do not use the cache; reload instead
# text similarity
use_cache = True # use the cache
swc = SimW2vChar(use_cache)
sim = swc.similarity
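The same environment-variable pattern now guards the word2vec cache behind `sim`. A short sketch of the similarity entry point re-exported at the package root, assuming `similarity` takes the two texts directly:

```python
import macropodus

# sim is SimW2vChar(use_cache).similarity; the returned value is a float similarity score
print(macropodus.sim("大漠帝国", "中国斗鱼"))
```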

View File

@@ -9,7 +9,7 @@ from macropodus.base.word2vec import W2v
class SimW2vChar(W2v):
def __init__(self, use_cache):
def __init__(self, use_cache=True):
super().__init__(use_cache)
def encode(self, sent, type_encode="other"):

View File

@@ -8,8 +8,12 @@
from macropodus.summarize.graph_base.textrank_word2vec import TextrankWord2vec
from macropodus.summarize.graph_base.textrank_gensim import TextrankGensimSum
from macropodus.summarize.graph_base.textrank_sklearn import TextrankSklearn
import os
# word vectors; the cache is used by default
use_cache = True
if not os.environ.get("macropodus_use_w2v_cache", True):
use_cache = False # do not use the cache; reload instead
# textrank of gensim
trgs = TextrankGensimSum()
# textrank of word2vec
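The summarize package gets the same switch for its word2vec-backed TextRank. A minimal sketch of the functions this `__init__` exposes, assuming the text is the only required argument:

```python
from macropodus.summarize import keyword, summarization, textrank

doc = ("Macropodus是一个中文自然语言处理工具包。"
       "它提供分词、新词发现、文本摘要与相似度计算。"
       "文本摘要部分基于TextRank。")
print(keyword(doc))        # keyword extraction
print(textrank(doc))       # TextRank sentence ranking
print(summarization(doc))  # default summary
```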

View File

@@ -89,3 +89,17 @@ PaddleNLP|c++|3.4k|6/1/!|yes|yes|yes|yes|yes|yes|yes|yes|yes|yes|Apache-2.0
* ik-analyzer:[https://github.com/wks/ik-analyzer](https://github.com/wks/ik-analyzer)
* fnlp:[https://github.com/FudanNLP/fnlp](https://github.com/FudanNLP/fnlp)
* NLPIR:[https://github.com/NLPIR-team/NLPIR](https://github.com/NLPIR-team/NLPIR)
### New word discovery
1. The information-entropy approach from Matrix67: The Aha Moments: [Sociolinguistics in the Internet Age: Text Data Mining on SNS](http://www.matrix67.com/blog/archives/5044)
    1. word frequency and left/right entropy (abundance: how varied the characters adjacent to a candidate are, -p*log(p))
    2. mutual information (cohesion: how tightly the internal characters stick together, pmi = p(x,y)*log(p(x,y)/(p(x)*p(y)))); these are combined into a score function (see the sketch after this list)
2. HanLP's long-phrase construction method: [Phrase extraction and recognition based on mutual information and left/right information entropy](https://www.hankcs.com/nlp/extraction-and-identification-of-mutual-information-about-the-phrase-based-on-information-entropy.html)
    1. segment the text (counting dictionary words only) and collect word co-occurrence statistics (first-, second- and third-order)
    2. left/right entropy and mutual information; merge dictionary words to build phrases
3. SmoothNLP: [Discussion and optimization of "new word discovery" algorithms - SmoothNLP](https://zhuanlan.zhihu.com/p/80385615)
    1. left/right entropy weight: Ew = log((El*e^Er + Er*e^El) / |Er - El|)
    2. average mutual information: AMI = (1/n) * log(p(w) / (p(1)p(2)...p(n)))
    3. filtering: screen out candidate n-grams whose first or last character occurs extremely often, e.g. remove "XX的", "美丽的", "漂亮的" from the dictionary
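A toy sketch of the two core quantities these references build on, boundary entropy and PMI; the numbers are illustrative and this is not macropodus's implementation:

```python
import math
from collections import Counter

def boundary_entropy(neighbor_chars):
    """-sum(p * log2 p) over the characters seen next to a candidate word."""
    counts = Counter(neighbor_chars)
    total = sum(counts.values())
    return -sum((c / total) * math.log2(c / total) for c in counts.values())

def pmi(p_word, p_char_product):
    """log2( p(word) / (p(c1) * ... * p(cn)) ), the cohesion term."""
    return math.log2(p_word / p_char_product)

# "电影" seen 40 times in 10000 bigrams; "电" 200 and "影" 150 times in 10000 chars
print(pmi(40 / 10000, (200 / 10000) * (150 / 10000)))   # ≈ 3.74, fairly cohesive
print(boundary_entropy(list("的了与在看好的的")))        # varied right neighbors, ≈ 2.41
```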