From e3b74cc5613f8a6d350ceaaa402e452d18f740ff Mon Sep 17 00:00:00 2001 From: yongzhuo <2714618994@qq.com> Date: Wed, 21 Oct 2020 15:46:50 +0800 Subject: [PATCH] add use_cache, fix macropodus_cut of cut-block --- macropodus/__init__.py | 5 +- macropodus/segment/__init__.py | 8 +- .../segment/seg_statistics/seg_statistics.py | 37 ++++-- .../segment/word_discovery/word_discovery.py | 109 ++++++++---------- macropodus/similarity/__init__.py | 7 +- .../similarity/similarity_word2vec_char.py | 2 +- macropodus/summarize/graph_base/textrank.py | 6 +- test/survey_report/nlp_platfom_survey.md | 14 +++ 8 files changed, 116 insertions(+), 72 deletions(-) diff --git a/macropodus/__init__.py b/macropodus/__init__.py index db158ea..a085302 100644 --- a/macropodus/__init__.py +++ b/macropodus/__init__.py @@ -10,9 +10,9 @@ from macropodus.tookit import calculate, chi2num, num2chi, Trie, roman2num, num2 from macropodus.segment import cut_bidirectional, cut_forward, cut_reverse, cut_search, cut_dag, cut, find from macropodus.segment import load_user_dict, save_delete_words, save_add_words, delete_word, add_word from macropodus.summarize import keyword, textrank, summarization -from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects from macropodus.version import __version__ # 版本 from macropodus.similarity import sim +import os # 机械分词 cut_bidirectional = cut_bidirectional @@ -49,3 +49,6 @@ num2roman = num2roman han2zh = han2zh zh2han = zh2han pinyin = pinyin + +if os.environ.get("macropodus_use_dl", False)=="1": + from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects diff --git a/macropodus/segment/__init__.py b/macropodus/segment/__init__.py index 08f8064..7784ea9 100644 --- a/macropodus/segment/__init__.py +++ b/macropodus/segment/__init__.py @@ -7,9 +7,13 @@ from macropodus.segment.seg_statistics.seg_statistics import SegStatistics from macropodus.segment.word_discovery.word_discovery import WordDiscovery +import os -# 机械分词 -use_cache = True # 使用缓存 + +# 机械分词,默认使用缓存 +use_cache = True +if not os.environ.get("macropodus_use_seg_cache", True): + use_cache = False # 不使用缓存,重新加载 segs = SegStatistics(use_cache) cut_bidirectional = segs.cut_bidirectional cut_forward = segs.cut_forward diff --git a/macropodus/segment/seg_statistics/seg_statistics.py b/macropodus/segment/seg_statistics/seg_statistics.py index 4c20714..597550e 100644 --- a/macropodus/segment/seg_statistics/seg_statistics.py +++ b/macropodus/segment/seg_statistics/seg_statistics.py @@ -8,7 +8,7 @@ from macropodus.preprocess.tools_common import re_continue from macropodus.base.seg_basic import SegBasic from math import log - +import re __all__ = ["cut_dag", "cut_forward", @@ -16,6 +16,9 @@ __all__ = ["cut_dag", "cut_bidirectional", "cut_search"] +re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U) +re_skip = re.compile("(\r\n|\s)", re.U) + class SegStatistics(SegBasic): def __init__(self, use_cache): @@ -170,22 +173,42 @@ class SegStatistics(SegBasic): def cut(self, sentence, type_cut="cut_dag"): """ 切词总函数 + cut_block, 代码来自jieba项目 + code from: https://github.com/fxsjy/jieba :param sentence:str, like '大漠帝国, macropodus, 中国斗鱼' :param type_cut: str, like 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search' - :return: list, like ['大漠帝国', ',', 'macropodus', ',', '中国斗鱼'] + :return: yield, like ['大漠帝国', ',', 'macropodus', ',', '中国斗鱼'] """ if type_cut=="cut_dag": - return list(self.cut_dag(sentence)) + cut_block = self.cut_dag elif type_cut=="cut_forward": - return 
list(self.cut_dag(sentence)) + cut_block = self.cut_forward elif type_cut=="cut_reverse": - return list(self.cut_dag(sentence)) + cut_block = self.cut_reverse elif type_cut=="cut_bidirectional": - return list(self.cut_dag(sentence)) + cut_block = self.cut_bidirectional elif type_cut=="cut_search": - return list(self.cut_dag(sentence)) + cut_block = self.cut_search else: raise RuntimeError("type_cut must be 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search'") + blocks = re_han.split(sentence) + cut_all = False + for block in blocks: + if not block: + continue + if re_han.match(block): + for word in cut_block(block): + yield word + else: + tmp = re_skip.split(block) + for x in tmp: + if re_skip.match(x): + yield x + elif not cut_all: + for xx in x: + yield xx + else: + yield x if __name__ == '__main__': diff --git a/macropodus/segment/word_discovery/word_discovery.py b/macropodus/segment/word_discovery/word_discovery.py index 4627e77..e8feb22 100644 --- a/macropodus/segment/word_discovery/word_discovery.py +++ b/macropodus/segment/word_discovery/word_discovery.py @@ -25,6 +25,7 @@ class WordDiscovery: self.total_words = 0 self.freq_min = 3 self.len_max = 7 + self.round = 6 self.eps = 1e-9 self.empty_words = [sw for sw in stop_words.values() if len(sw)==1] # 虚词 @@ -35,8 +36,10 @@ class WordDiscovery: :param use_type: str, "text" or "file", file of "utf-8" of "txt" :return: class, word-freq """ + import macropodus self.words_count = Counter() if use_type=="text": # 输入为文本形式 + text = macropodus.han2zh(text) texts = cut_sentence(use_type=self.algorithm, text=text) # 切句子, 如中英文的逗号/句号/感叹号 for text in texts: @@ -50,6 +53,7 @@ class WordDiscovery: fr8 = open(text, "r", encoding="utf-8") for text in fr8: if text.strip(): + text = macropodus.han2zh(text) texts = cut_sentence(use_type=self.algorithm, text=text) # 切句子, 如中英文的逗号/句号/感叹号 for text in texts: @@ -108,9 +112,9 @@ class WordDiscovery: if (k[0] in self.empty_words or k[-1] in self.empty_words): entroy_boundary = entroy_boundary / len(k) if boundary_type == "right": - self.right_entropy[k] = entroy_boundary + self.right_entropy[k] = round(entroy_boundary, self.round) else: - self.left_entropy[k] = entroy_boundary + self.left_entropy[k] = round(entroy_boundary, self.round) def compute_entropys(self): """ @@ -146,8 +150,38 @@ class WordDiscovery: probability_chars = reduce(mul,([wf for wf in words_freq])) / (twl_1**(len(word))) pmi = math.log(probability_word / probability_chars, 2) # AMI=PMI/length_word. 
惩罚虚词(避免"的", "得", "了"开头结尾的情况) - self.aggregation[word] = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \ - else pmi/len_word # pmi / len_word / len_word + word_aggregation = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \ + else pmi/len_word # pmi / len_word / len_word + self.aggregation[word] = round(word_aggregation, self.round) + + def compute_score(self, word, value, a, r, l, rl, lambda_0, lambda_3): + """ + 计算最终得分 + :param word: str, word with prepare + :param value: float, word freq + :param a: float, aggregation of word + :param r: float, right entropy of word + :param l: float, left entropy of word + :param rl: float, right_entropy * left_entropy + :param lambda_0: lambda 0 + :param lambda_3: lambda 3 + :return: + """ + self.new_words[word] = {} + # math.log10(self.aggregation[word]) - math.log10(self.total_words) + self.new_words[word]["a"] = a + self.new_words[word]["r"] = r + self.new_words[word]["l"] = l + self.new_words[word]["f"] = value + # word-liberalization + m1 = lambda_0(r) + m2 = lambda_0(l) + m3 = lambda_0(a) + score_ns = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3) + self.new_words[word]["ns"] = round(score_ns, self.round) + # 乘以词频word-freq, 连乘是为了防止出现较小项 + score_s = value * a * rl * score_ns + self.new_words[word]["s"] = round(score_s, self.round) def find_word(self, text, use_type="text", freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2, use_output=True, use_avg=False, use_filter=False): @@ -175,66 +209,25 @@ class WordDiscovery: lambda_0 = lambda x: -self.eps * x + self.eps if x <= 0 else x # 输出 for word, value in self.words_select.items(): + # 过滤通用词 if use_filter and word in self.dict_words_freq: continue + # 过滤停用词 if word in self.stop_words: continue - if use_output: - # {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"} - self.new_words[word] = {} - # math.log10(self.aggregation[word]) - math.log10(self.total_words) - self.new_words[word]["a"] = self.aggregation[word] - self.new_words[word]["r"] = self.right_entropy[word] - self.new_words[word]["l"] = self.left_entropy[word] - self.new_words[word]["f"] = value - # word-liberalization - m1 = lambda_0(self.right_entropy[word]) - m2 = lambda_0(self.left_entropy[word]) - m3 = lambda_0(self.aggregation[word]) - score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3) - self.new_words[word]["ns"] = score_3 - # 乘以freq效果没那么好, 连乘是为了防止出现较小项 - # self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \ - # self.right_entropy[word] * self.left_entropy[word] - self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \ - self.right_entropy[word] * self.left_entropy[word] * score_3 + # {"aggregation":"a", "right_entropy":"r", "left_entropy":"l", "frequency":"f", + # "word-liberalization":"ns", "score":"s"} + a = self.aggregation[word] + r = self.right_entropy[word] + l = self.left_entropy[word] + rl = (r+l) / 2 if use_avg else r * l + if use_output or (use_avg and a > self.aggregation_min and rl > self.entropy_min) or \ + (not use_avg and a > self.aggregation_min and r > self.entropy_min and l > self.entropy_min): + self.compute_score(word, value, a, r, l, rl, lambda_0, lambda_3) - elif not use_avg and self.aggregation[word] > self.aggregation_min \ - and self.right_entropy[word] > self.entropy_min and self.left_entropy[word] > self.entropy_min: - self.new_words[word] = {} - # 
{"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"} - self.new_words[word]["a"] = self.aggregation[word] # math.log10(self.aggregation[word]) - math.log10(self.total_words) - self.new_words[word]["r"] = self.right_entropy[word] - self.new_words[word]["l"] = self.left_entropy[word] - self.new_words[word]["f"] = value - # word-liberalization - m1 = lambda_0(self.right_entropy[word]) - m2 = lambda_0(self.left_entropy[word]) - m3 = lambda_0(self.aggregation[word]) - score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3) - self.new_words[word]["ns"] = score_3 - self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \ - (self.right_entropy[word] + self.left_entropy[word])/2 * score_3 - elif use_avg and self.aggregation[word] > self.aggregation_min \ - and (self.right_entropy[word] + self.left_entropy[word]) > 2 * self.entropy_min: - self.new_words[word] = {} - # {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"} - self.new_words[word]["a"] = self.aggregation[word] - self.new_words[word]["r"] = self.right_entropy[word] - self.new_words[word]["l"] = self.left_entropy[word] - self.new_words[word]["f"] = value - # word-liberalization - m1 = lambda_0(self.right_entropy[word]) - m2 = lambda_0(self.left_entropy[word]) - m3 = lambda_0(self.aggregation[word]) - score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3) - self.new_words[word]["ns"] = score_3 - self.new_words[word]["s"] = self.new_words[word]["a"] * (self.right_entropy[word] + self.left_entropy[word])/2 - # mul, 相乘 - self.new_words[word]["s"] *= score_3 # 排序 - new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True) - self.new_words = OrderedDict(new_words) + self.new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True) + self.new_words = OrderedDict(self.new_words) return self.new_words diff --git a/macropodus/similarity/__init__.py b/macropodus/similarity/__init__.py index 17042a2..32e4e12 100644 --- a/macropodus/similarity/__init__.py +++ b/macropodus/similarity/__init__.py @@ -6,9 +6,12 @@ from macropodus.similarity.similarity_word2vec_char import SimW2vChar +import os - +# 词向量, 默认使用缓存 +use_cache = True +if not os.environ.get("macropodus_use_w2v_cache", True): + use_cache = False # 不使用缓存,重新加载 # 文本相似度 -use_cache = True # 使用缓存 swc = SimW2vChar(use_cache) sim = swc.similarity diff --git a/macropodus/similarity/similarity_word2vec_char.py b/macropodus/similarity/similarity_word2vec_char.py index 8b095e7..d5ce447 100644 --- a/macropodus/similarity/similarity_word2vec_char.py +++ b/macropodus/similarity/similarity_word2vec_char.py @@ -9,7 +9,7 @@ from macropodus.base.word2vec import W2v class SimW2vChar(W2v): - def __init__(self, use_cache): + def __init__(self, use_cache=True): super().__init__(use_cache) def encode(self, sent, type_encode="other"): diff --git a/macropodus/summarize/graph_base/textrank.py b/macropodus/summarize/graph_base/textrank.py index 544ccc8..201f98d 100644 --- a/macropodus/summarize/graph_base/textrank.py +++ b/macropodus/summarize/graph_base/textrank.py @@ -8,8 +8,12 @@ from macropodus.summarize.graph_base.textrank_word2vec import TextrankWord2vec from macropodus.summarize.graph_base.textrank_gensim import TextrankGensimSum from macropodus.summarize.graph_base.textrank_sklearn import TextrankSklearn +import os - +# 词向量, 默认使用缓存 +use_cache = True +if not os.environ.get("macropodus_use_w2v_cache", True): + 
use_cache = False # 不使用缓存,重新加载
 # textrank of gensim
 trgs = TextrankGensimSum()
 # textrank of word2vec
diff --git a/test/survey_report/nlp_platfom_survey.md b/test/survey_report/nlp_platfom_survey.md
index 8051bfc..1ca7623 100644
--- a/test/survey_report/nlp_platfom_survey.md
+++ b/test/survey_report/nlp_platfom_survey.md
@@ -89,3 +89,17 @@ PaddleNLP|c++|3.4k|6/1/!|是|是|是|是|是|是|是|是|是|是|Apache-2.0
 * ik-analyzer:[https://github.com/wks/ik-analyzer](https://github.com/wks/ik-analyzer)
 * fnlp:[https://github.com/FudanNLP/fnlp](https://github.com/FudanNLP/fnlp)
 * NLPIR:[https://github.com/NLPIR-team/NLPIR](https://github.com/NLPIR-team/NLPIR)
+
+### 新词发现
+
+1. Matrix67 (The Aha Moments) 的信息熵方法: [互联网时代的社会语言学:基于SNS的文本数据挖掘](http://www.matrix67.com/blog/archives/5044)
+    1. 词频、左右熵(丰度, 即字符组合左右邻字的丰富程度, -p*log(p))
+    2. 互信息(凝固度, 即内部凝聚程度, pmi = p(x,y)*log(p(x,y)/(p(x)*p(y)))), 由以上指标构建得分函数
+2. HanLP的长短语构造方法: [基于互信息和左右信息熵的短语提取识别](https://www.hankcs.com/nlp/extraction-and-identification-of-mutual-information-about-the-phrase-based-on-information-entropy.html)
+    1. 切词(只统计词典词), 统计词语共现(一阶、二阶、三阶)
+    2. 计算左右熵、互信息, 合并词典词语, 构建短语
+3. SmoothNLP: ["新词发现"算法探讨与优化-SmoothNLP](https://zhuanlan.zhihu.com/p/80385615)
+    1. 左右熵权重: Ew = log((El*e^Er + Er*e^El) / |Er - El|)
+    2. 平均互信息: AMI = (1/n) * log(p(w) / (p(1)*p(2)*...*p(n)))
+    3. 过滤条件: 对candidate ngram中首字或尾字出现次数特别多的进行筛选, 如"XX的"、"美丽的"、"漂亮的", 剔出字典
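The two SmoothNLP formulas cited in the survey notes above can be written out directly. A minimal sketch in Python; the small `eps` guard is an assumption added here so the entropy weight stays defined when El == Er, and the log base is left natural since the reference does not fix one:

```python
import math

def entropy_weight(el, er, eps=1e-9):
    """SmoothNLP-style combined entropy weight: Ew = log((El*e^Er + Er*e^El) / |Er - El|)."""
    return math.log((el * math.exp(er) + er * math.exp(el)) / (abs(er - el) + eps))

def average_mutual_information(p_word, char_probs):
    """AMI = (1/n) * log(p(w) / (p(1)*p(2)*...*p(n))) for an n-character candidate."""
    n = len(char_probs)
    return math.log(p_word / math.prod(char_probs)) / n

# toy probabilities, purely illustrative
print(round(entropy_weight(1.2, 2.3), 4))                        # ~2.88
print(round(average_mutual_information(3e-4, [0.01, 0.02]), 4))  # ~0.2027
```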
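In the word_discovery.py hunks, each candidate now carries rounded aggregation ("a"), boundary entropies ("r"/"l"), frequency ("f"), the word-liberalization term ("ns") and the final score ("s"). The aggregation and entropy parts are standard statistics and can be sketched independently of the class; the counts below are made up, and the extra penalty for candidates starting or ending with a 虚词 is only noted in a comment:

```python
import math
from collections import Counter

def boundary_entropy(neighbors):
    """Shannon entropy (base 2) of the characters seen to one side of a candidate word."""
    counts = Counter(neighbors)
    total = sum(counts.values())
    return -sum(c / total * math.log2(c / total) for c in counts.values())

def aggregation(word_freq, char_freqs, total_words, total_chars, word_len):
    """PMI between the word and its characters, normalised by word length.

    The library additionally divides by word_len**word_len (instead of word_len)
    when the first or last character is a 虚词 such as "的" or "了".
    """
    p_word = word_freq / total_words
    p_chars = math.prod(f / total_chars for f in char_freqs)
    return math.log2(p_word / p_chars) / word_len

# made-up statistics for a two-character candidate
r = boundary_entropy(list("的了是在和"))   # right-neighbour sample -> log2(5) ≈ 2.32
l = boundary_entropy(list("我他她你们"))   # left-neighbour sample
a = aggregation(word_freq=30, char_freqs=[300, 250],
                total_words=10_000, total_chars=40_000, word_len=2)   # -> 3.0
score = 30 * a * r * l   # frequency * aggregation * right * left entropy, before the "ns" factor
print(round(score, 6))
```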
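Assuming the top-level `find` re-export shown in macropodus/__init__.py points at `WordDiscovery.find_word`, the refactored output can be consumed roughly like this (the corpus string is only a placeholder):

```python
import macropodus

corpus = "开放式基金 封闭式基金 开放式基金的收益 ..."  # placeholder; longer corpora give better statistics
new_words = macropodus.find(corpus)  # OrderedDict sorted by the combined score "s"
for word, info in list(new_words.items())[:10]:
    # keys as built in compute_score: a=aggregation, r/l=boundary entropy,
    # f=frequency, ns=word-liberalization, s=final score
    print(word, info["f"], info["a"], info["r"], info["l"], info["ns"], info["s"])
```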
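The cut() rewrite in seg_statistics.py (credited to jieba) turns the segmenter into a generator: the sentence is split into han/non-han blocks with re_han and re_skip, han blocks go through the selected cutter, and everything else is passed through. A condensed, standalone restatement of that control flow, with `cut_block` as a stand-in for the real cutters:

```python
import re

re_han = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)  # CJK / alphanumeric runs
re_skip = re.compile(r"(\r\n|\s)", re.U)                            # whitespace

def cut_demo(sentence, cut_block):
    """Yield tokens: matched blocks go through cut_block, whitespace is kept as-is,
    remaining punctuation is emitted one character at a time (the cut_all=False path)."""
    for block in re_han.split(sentence):
        if not block:
            continue
        if re_han.match(block):
            yield from cut_block(block)
        else:
            for piece in re_skip.split(block):
                if re_skip.match(piece):
                    yield piece
                else:
                    yield from piece

# trivial cut_block that returns the block unsegmented, just to show the flow
print(list(cut_demo("大漠帝国, macropodus, 中国斗鱼", lambda block: [block])))
# -> ['大漠帝国', ',', ' ', 'macropodus', ',', ' ', '中国斗鱼']
```

Because cut() now yields instead of returning, callers that relied on the old list result need to wrap it, e.g. list(macropodus.cut(sentence)).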
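All three new switches (macropodus_use_dl, macropodus_use_seg_cache, macropodus_use_w2v_cache) are read straight from os.environ, whose values are always strings; a check like `not os.environ.get(name, True)` therefore only fires when the variable is set to an empty string. A hedged sketch of explicit flag parsing; the helper name env_flag is not part of the library:

```python
import os

def env_flag(name, default):
    """Parse a boolean-like environment variable ("1"/"true"/"yes" -> True)."""
    value = os.environ.get(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes")

# switch names taken from the diff; defaults mirror the patched behaviour
use_dl = env_flag("macropodus_use_dl", False)               # opt in to the tf.keras models
use_seg_cache = env_flag("macropodus_use_seg_cache", True)  # segmentation cache
use_w2v_cache = env_flag("macropodus_use_w2v_cache", True)  # word2vec cache

if use_dl:
    # heavy TensorFlow import only on request, keeping `import macropodus` light
    # from macropodus.__init_tf_keras import *
    pass
```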