add use_cache, fix macropodus_cut of cut-block
parent 9de38c06d3 · commit e3b74cc561
@@ -10,9 +10,9 @@ from macropodus.tookit import calculate, chi2num, num2chi, Trie, roman2num, num2
 from macropodus.segment import cut_bidirectional, cut_forward, cut_reverse, cut_search, cut_dag, cut, find
 from macropodus.segment import load_user_dict, save_delete_words, save_add_words, delete_word, add_word
 from macropodus.summarize import keyword, textrank, summarization
-from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects
 from macropodus.version import __version__ # 版本
 from macropodus.similarity import sim
+import os

 # 机械分词
 cut_bidirectional = cut_bidirectional
@@ -49,3 +49,6 @@ num2roman = num2roman
 han2zh = han2zh
 zh2han = zh2han
 pinyin = pinyin
+
+if os.environ.get("macropodus_use_dl", False)=="1":
+    from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects
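The deep-learning extras (tf.keras models) are now only imported when the `macropodus_use_dl` environment variable is exactly the string `"1"`. A minimal usage sketch (my own illustration, not part of the commit):

```python
import os

# must be set before the first `import macropodus`, and must be the string "1"
os.environ["macropodus_use_dl"] = "1"

import macropodus  # now also pulls in macropodus.__init_tf_keras
```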
@@ -7,9 +7,13 @@
 from macropodus.segment.seg_statistics.seg_statistics import SegStatistics
 from macropodus.segment.word_discovery.word_discovery import WordDiscovery
+import os

 # 机械分词
-use_cache = True # 使用缓存

+# 机械分词,默认使用缓存
+use_cache = True
+if not os.environ.get("macropodus_use_seg_cache", True):
+    use_cache = False # 不使用缓存,重新加载
 segs = SegStatistics(use_cache)
 cut_bidirectional = segs.cut_bidirectional
 cut_forward = segs.cut_forward
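Segmentation caching is now controlled by the `macropodus_use_seg_cache` environment variable. Note that `os.environ.get()` returns a string whenever the variable is set, so with the `if not ...` check as written only an empty value actually disables the cache; a value like `"0"` is still truthy and keeps it. A usage sketch (illustration, not from the commit):

```python
import os

# an empty string is falsy, so `not os.environ.get("macropodus_use_seg_cache", True)`
# becomes True and the segmentation dictionary is reloaded instead of read from cache
os.environ["macropodus_use_seg_cache"] = ""

import macropodus  # SegStatistics(use_cache=False) is constructed during this import
```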
@@ -8,7 +8,7 @@
 from macropodus.preprocess.tools_common import re_continue
 from macropodus.base.seg_basic import SegBasic
 from math import log

 import re

 __all__ = ["cut_dag",
            "cut_forward",
@@ -16,6 +16,9 @@ __all__ = ["cut_dag",
            "cut_bidirectional",
            "cut_search"]

+re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
+re_skip = re.compile("(\r\n|\s)", re.U)
+

 class SegStatistics(SegBasic):
     def __init__(self, use_cache):
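For reference, the two module-level patterns split a sentence into han/alphanumeric blocks (handed to `cut_block`) and the remaining punctuation/whitespace runs. A standalone illustration, independent of the library:

```python
import re

re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
re_skip = re.compile("(\r\n|\s)", re.U)

# the capturing group keeps the matched blocks in the split result
print(re_han.split("大漠帝国, macropodus, 中国斗鱼"))
# ['', '大漠帝国', ', ', 'macropodus', ', ', '中国斗鱼', '']
```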
@@ -170,22 +173,42 @@
     def cut(self, sentence, type_cut="cut_dag"):
         """
             切词总函数
+            cut_block, 代码来自jieba项目
+            code from: https://github.com/fxsjy/jieba
         :param sentence:str, like '大漠帝国, macropodus, 中国斗鱼'
         :param type_cut: str, like 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search'
-        :return: list, like ['大漠帝国', ',', 'macropodus', ',', '中国斗鱼']
+        :return: yield, like ['大漠帝国', ',', 'macropodus', ',', '中国斗鱼']
         """
         if type_cut=="cut_dag":
-            return list(self.cut_dag(sentence))
+            cut_block = self.cut_dag
         elif type_cut=="cut_forward":
-            return list(self.cut_dag(sentence))
+            cut_block = self.cut_forward
         elif type_cut=="cut_reverse":
-            return list(self.cut_dag(sentence))
+            cut_block = self.cut_reverse
         elif type_cut=="cut_bidirectional":
-            return list(self.cut_dag(sentence))
+            cut_block = self.cut_bidirectional
         elif type_cut=="cut_search":
-            return list(self.cut_dag(sentence))
+            cut_block = self.cut_search
         else:
             raise RuntimeError("type_cut must be 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search'")
+        blocks = re_han.split(sentence)
+        cut_all = False
+        for block in blocks:
+            if not block:
+                continue
+            if re_han.match(block):
+                for word in cut_block(block):
+                    yield word
+            else:
+                tmp = re_skip.split(block)
+                for x in tmp:
+                    if re_skip.match(x):
+                        yield x
+                    elif not cut_all:
+                        for xx in x:
+                            yield xx
+                    else:
+                        yield x


 if __name__ == '__main__':
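With this change every `type_cut` strategy actually selects its own `cut_block` (previously each branch returned `cut_dag`), and `cut()` is now a generator. A hedged usage sketch via the package-level `cut` re-export from the first hunk (whether that wrapper forwards `type_cut` is an assumption):

```python
import macropodus

# materialize the generator; han/alnum blocks go through the chosen cut_block,
# whitespace is yielded as-is and other punctuation character by character
print(list(macropodus.cut("大漠帝国, macropodus, 中国斗鱼")))
print(list(macropodus.cut("大漠帝国, macropodus, 中国斗鱼", type_cut="cut_forward")))
```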
@@ -25,6 +25,7 @@ class WordDiscovery:
         self.total_words = 0
         self.freq_min = 3
         self.len_max = 7
+        self.round = 6
         self.eps = 1e-9
         self.empty_words = [sw for sw in stop_words.values() if len(sw)==1] # 虚词

@@ -35,8 +36,10 @@
         :param use_type: str, "text" or "file", file of "utf-8" of "txt"
         :return: class<Counter>, word-freq
         """
+        import macropodus
         self.words_count = Counter()
         if use_type=="text": # 输入为文本形式
+            text = macropodus.han2zh(text)
             texts = cut_sentence(use_type=self.algorithm,
                                  text=text) # 切句子, 如中英文的逗号/句号/感叹号
             for text in texts:
@@ -50,6 +53,7 @@
             fr8 = open(text, "r", encoding="utf-8")
             for text in fr8:
                 if text.strip():
+                    text = macropodus.han2zh(text)
                     texts = cut_sentence(use_type=self.algorithm,
                                          text=text) # 切句子, 如中英文的逗号/句号/感叹号
                     for text in texts:
@@ -108,9 +112,9 @@
             if (k[0] in self.empty_words or k[-1] in self.empty_words):
                 entroy_boundary = entroy_boundary / len(k)
             if boundary_type == "right":
-                self.right_entropy[k] = entroy_boundary
+                self.right_entropy[k] = round(entroy_boundary, self.round)
             else:
-                self.left_entropy[k] = entroy_boundary
+                self.left_entropy[k] = round(entroy_boundary, self.round)

     def compute_entropys(self):
         """
@@ -146,8 +150,38 @@
             probability_chars = reduce(mul,([wf for wf in words_freq])) / (twl_1**(len(word)))
             pmi = math.log(probability_word / probability_chars, 2)
             # AMI=PMI/length_word. 惩罚虚词(避免"的", "得", "了"开头结尾的情况)
-            self.aggregation[word] = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \
-                else pmi/len_word # pmi / len_word / len_word
+            word_aggregation = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \
+                else pmi/len_word # pmi / len_word / len_word
+            self.aggregation[word] = round(word_aggregation, self.round)

+    def compute_score(self, word, value, a, r, l, rl, lambda_0, lambda_3):
+        """
+            计算最终得分
+        :param word: str, word with prepare
+        :param value: float, word freq
+        :param a: float, aggregation of word
+        :param r: float, right entropy of word
+        :param l: float, left entropy of word
+        :param rl: float, right_entropy * left_entropy
+        :param lambda_0: lambda 0
+        :param lambda_3: lambda 3
+        :return:
+        """
+        self.new_words[word] = {}
+        # math.log10(self.aggregation[word]) - math.log10(self.total_words)
+        self.new_words[word]["a"] = a
+        self.new_words[word]["r"] = r
+        self.new_words[word]["l"] = l
+        self.new_words[word]["f"] = value
+        # word-liberalization
+        m1 = lambda_0(r)
+        m2 = lambda_0(l)
+        m3 = lambda_0(a)
+        score_ns = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
+        self.new_words[word]["ns"] = round(score_ns, self.round)
+        # 乘以词频word-freq, 连乘是为了防止出现较小项
+        score_s = value * a * rl * score_ns
+        self.new_words[word]["s"] = round(score_s, self.round)
+
     def find_word(self, text, use_type="text", freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2,
                   use_output=True, use_avg=False, use_filter=False):
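`compute_score` factors the per-word scoring out of `find_word`. `lambda_3` is passed in and is not defined in this hunk, so the sketch below uses a hypothetical stand-in for it; only `lambda_0` matches the definition visible in `find_word`:

```python
eps = 1e-9
lambda_0 = lambda x: -eps * x + eps if x <= 0 else x   # clamp non-positive inputs, as in find_word
lambda_3 = lambda m, n: (m + n) / 2                    # hypothetical pairwise combiner, NOT from the repo

def score(value, a, r, l, use_avg=False, ndigits=6):
    """value=freq, a=aggregation, r/l=right/left entropy; mirrors compute_score's arithmetic."""
    rl = (r + l) / 2 if use_avg else r * l
    m1, m2, m3 = lambda_0(r), lambda_0(l), lambda_0(a)
    ns = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)  # word-liberalization
    return round(value * a * rl * ns, ndigits), round(ns, ndigits)

print(score(value=12, a=3.5, r=2.1, l=1.8))
```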
@@ -175,66 +209,25 @@
         lambda_0 = lambda x: -self.eps * x + self.eps if x <= 0 else x
         # 输出
         for word, value in self.words_select.items():
             # 过滤通用词
             if use_filter and word in self.dict_words_freq:
                 continue
             # 过滤停用词
             if word in self.stop_words:
                 continue
-            if use_output:
-                # {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
-                self.new_words[word] = {}
-                # math.log10(self.aggregation[word]) - math.log10(self.total_words)
-                self.new_words[word]["a"] = self.aggregation[word]
-                self.new_words[word]["r"] = self.right_entropy[word]
-                self.new_words[word]["l"] = self.left_entropy[word]
-                self.new_words[word]["f"] = value
-                # word-liberalization
-                m1 = lambda_0(self.right_entropy[word])
-                m2 = lambda_0(self.left_entropy[word])
-                m3 = lambda_0(self.aggregation[word])
-                score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
-                self.new_words[word]["ns"] = score_3
-                # 乘以freq效果没那么好, 连乘是为了防止出现较小项
-                # self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
-                #                             self.right_entropy[word] * self.left_entropy[word]
-                self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
-                                            self.right_entropy[word] * self.left_entropy[word] * score_3
+            # {"aggregation":"a", "right_entropy":"r", "left_entropy":"l", "frequency":"f",
+            #  "word-liberalization":"ns", "score":"s"}
+            a = self.aggregation[word]
+            r = self.right_entropy[word]
+            l = self.left_entropy[word]
+            rl = (r+l) / 2 if use_avg else r * l
+            if use_output or (use_avg and a > self.aggregation_min and rl > self.entropy_min) or \
+                    (not use_avg and a > self.aggregation_min and r > self.entropy_min and l > self.entropy_min):
+                self.compute_score(word, value, a, r, l, rl, lambda_0, lambda_3)
+
-            elif not use_avg and self.aggregation[word] > self.aggregation_min \
-                    and self.right_entropy[word] > self.entropy_min and self.left_entropy[word] > self.entropy_min:
-                self.new_words[word] = {}
-                # {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
-                self.new_words[word]["a"] = self.aggregation[word] # math.log10(self.aggregation[word]) - math.log10(self.total_words)
-                self.new_words[word]["r"] = self.right_entropy[word]
-                self.new_words[word]["l"] = self.left_entropy[word]
-                self.new_words[word]["f"] = value
-                # word-liberalization
-                m1 = lambda_0(self.right_entropy[word])
-                m2 = lambda_0(self.left_entropy[word])
-                m3 = lambda_0(self.aggregation[word])
-                score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
-                self.new_words[word]["ns"] = score_3
-                self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
-                                            (self.right_entropy[word] + self.left_entropy[word])/2 * score_3
-            elif use_avg and self.aggregation[word] > self.aggregation_min \
-                    and (self.right_entropy[word] + self.left_entropy[word]) > 2 * self.entropy_min:
-                self.new_words[word] = {}
-                # {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
-                self.new_words[word]["a"] = self.aggregation[word]
-                self.new_words[word]["r"] = self.right_entropy[word]
-                self.new_words[word]["l"] = self.left_entropy[word]
-                self.new_words[word]["f"] = value
-                # word-liberalization
-                m1 = lambda_0(self.right_entropy[word])
-                m2 = lambda_0(self.left_entropy[word])
-                m3 = lambda_0(self.aggregation[word])
-                score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
-                self.new_words[word]["ns"] = score_3
-                self.new_words[word]["s"] = self.new_words[word]["a"] * (self.right_entropy[word] + self.left_entropy[word])/2
-                # mul, 相乘
-                self.new_words[word]["s"] *= score_3
         # 排序
-        new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True)
-        self.new_words = OrderedDict(new_words)
+        self.new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True)
+        self.new_words = OrderedDict(self.new_words)
         return self.new_words

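After this refactor the three near-duplicate branches collapse into one `compute_score` call, gated by `use_output` or the aggregation/entropy thresholds. A hedged usage sketch via the package-level `find` re-export from the first hunk (whether `find` maps straight to `find_word` and which keyword arguments it accepts are assumptions):

```python
import macropodus

text = "大漠帝国是大漠帝国的简称, 大漠帝国真好玩"
new_words = macropodus.find(text)   # OrderedDict sorted by the combined score "s"
for word, info in new_words.items():
    print(word, info["f"], info["a"], info["ns"], info["s"])  # keys per the diff above
```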
@@ -6,9 +6,12 @@


 from macropodus.similarity.similarity_word2vec_char import SimW2vChar
+import os


+# 词向量, 默认使用缓存
+use_cache = True
+if not os.environ.get("macropodus_use_w2v_cache", True):
+    use_cache = False # 不使用缓存,重新加载
-# 文本相似度
-use_cache = True # 使用缓存
 swc = SimW2vChar(use_cache)
 sim = swc.similarity
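The similarity module follows the same convention as the segmenter: the word2vec model is cached unless `macropodus_use_w2v_cache` is set to an empty (falsy) value before the first import. Usage sketch (illustration, not from the commit; `sim`'s exact signature is assumed):

```python
import os
os.environ["macropodus_use_w2v_cache"] = ""   # force the word vectors to be reloaded

import macropodus
print(macropodus.sim("大漠帝国", "中国斗鱼"))   # float similarity score
```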
@@ -9,7 +9,7 @@ from macropodus.base.word2vec import W2v


 class SimW2vChar(W2v):
-    def __init__(self, use_cache):
+    def __init__(self, use_cache=True):
         super().__init__(use_cache)

     def encode(self, sent, type_encode="other"):
@@ -8,8 +8,12 @@
 from macropodus.summarize.graph_base.textrank_word2vec import TextrankWord2vec
 from macropodus.summarize.graph_base.textrank_gensim import TextrankGensimSum
 from macropodus.summarize.graph_base.textrank_sklearn import TextrankSklearn
+import os


+# 词向量, 默认使用缓存
+use_cache = True
+if not os.environ.get("macropodus_use_w2v_cache", True):
+    use_cache = False # 不使用缓存,重新加载
 # textrank of gensim
 trgs = TextrankGensimSum()
 # textrank of word2vec
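The summarize package gets the same env-driven cache switch for its word2vec-based rankers. A hedged example of the package-level helpers (`keyword`, `textrank`, `summarization` are re-exported in the first hunk; their argument lists are assumed):

```python
import macropodus

doc = "澳门回归二十年, 书写了一国两制的崭新篇章。澳门经济快速增长, 民生持续改善。"
print(macropodus.summarization(doc))   # ranked summary sentences
print(macropodus.keyword(doc))         # extracted keywords
```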
@@ -89,3 +89,17 @@ PaddleNLP|c++|3.4k|6/1/!|是|是|是|是|是|是|是|是|是|是|Apache-2.0
 * ik-analyzer:[https://github.com/wks/ik-analyzer](https://github.com/wks/ik-analyzer)
 * fnlp:[https://github.com/FudanNLP/fnlp](https://github.com/FudanNLP/fnlp)
 * NLPIR:[https://github.com/NLPIR-team/NLPIR](https://github.com/NLPIR-team/NLPIR)
+
+### 新词发现
+1. Matrix67: The Aha Moments的信息熵方法: [互联网时代的社会语言学:基于SNS的文本数据挖掘](http://www.matrix67.com/blog/archives/5044)
+    1.词频、左右熵(丰度, 字符组合左右邻字的丰富程度, -p*log(p))、
+    2.互信息(凝固度, 内部凝聚程度, pmi = p(x,y)*log(p(x,y)/(p(x)*p(y))))等构建得分函数
+2. HanLP的长短语构造方法: [基于互信息和左右信息熵的短语提取识别](https://www.hankcs.com/nlp/extraction-and-identification-of-mutual-information-about-the-phrase-based-on-information-entropy.html)
+    1.切词(只统计词典), 统计词语共现(一阶、二阶、三阶)
+    2.左右熵、互信息。合并词典词语, 构建短语
+3. SmoothNLP: ["新词发现"算法探讨与优化-SmoothNLP](https://zhuanlan.zhihu.com/p/80385615)
+    1.左右熵权重: Ew = log((El*e^Er + Er*e^El)/|Er-El|)
+    2.平均互信息AMI: (1/n) * log(p(w)/(p(1)p(2)...p(n)))
+    3.过滤条件: 对在candidate ngram中, 首字或者尾字出现次数特别多的进行筛选, 如"XX的, 美丽的, 漂亮的"剔出字典
+
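For reference, the quantities named in this README addition can be reproduced directly from the stated formulas; a standalone sketch on toy counts (not the library implementation):

```python
import math
from collections import Counter

def boundary_entropy(neighbour_counts):
    """左右熵: -sum(p * log(p)) over the characters adjacent to a candidate word."""
    total = sum(neighbour_counts.values())
    return -sum(c / total * math.log(c / total) for c in neighbour_counts.values())

# toy probabilities for a two-character candidate word
p_word = 8 / 1000                       # p(w)
p_chars = (20 / 1000) * (15 / 1000)     # p(c1) * p(c2)
pmi = math.log(p_word / p_chars, 2)     # 互信息/凝固度
ami = pmi / 2                           # 平均互信息, divided by word length n=2

er = boundary_entropy(Counter({"是": 3, "真": 2, "的": 3}))          # right entropy
el = boundary_entropy(Counter({"漠": 6, "个": 2}))                   # left entropy
ew = math.log((el * math.e**er + er * math.e**el) / abs(er - el))    # SmoothNLP weighting Ew

print(round(ami, 6), round(er, 6), round(el, 6), round(ew, 6))
```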