macropodus init
parent c8afbbf57b
commit 213b63a2ea

README.md (238 lines changed)
@@ -1,2 +1,236 @@
# Macropodus

Macropodus is a Chinese NLP toolkit (Albert+BiLSTM+CRF): Chinese word segmentation, named entity recognition, new word discovery, keyword extraction, text summarization, a calculator, and conversion between Chinese and Arabic numerals.

<p align="center">
    <img src="macropodus_images/macropodus_logo.png" width="480"/>
</p>


# [Macropodus](https://github.com/yongzhuo/Macropodus)

[![PyPI](https://img.shields.io/pypi/v/Macropodus)](https://pypi.org/project/Macropodus/)
[![Build Status](https://travis-ci.com/yongzhuo/Macropodus.svg?branch=master)](https://travis-ci.com/yongzhuo/Macropodus)
[![PyPI_downloads](https://img.shields.io/pypi/dm/Macropodus)](https://pypi.org/project/Macropodus/)
[![Stars](https://img.shields.io/github/stars/yongzhuo/Macropodus?style=social)](https://github.com/yongzhuo/Macropodus/stargazers)
[![Forks](https://img.shields.io/github/forks/yongzhuo/Macropodus.svg?style=social)](https://github.com/yongzhuo/Macropodus/network/members)
[![Join the chat at https://gitter.im/yongzhuo/Macropodus](https://badges.gitter.im/yongzhuo/Macropodus.svg)](https://gitter.im/yongzhuo/Macropodus?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

>>> Macropodus is a natural language processing toolkit built on an Albert+BiLSTM+CRF network and trained on large-scale Chinese corpora. It provides common NLP functions such as Chinese word segmentation, named entity recognition, keyword extraction, text summarization, new word discovery, text similarity, a calculator, and numeral conversion.


## Contents

* [Install](#install)
* [Usage](#usage)
* [References](#references)
* [FAQ](#faq)


# Install

1. Install from PyPI (model files are bundled):

```
pip install macropodus
```

2. Or install from a mirror, for example:

```
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple macropodus
```
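
A quick way to sanity-check the install (a minimal sketch; the printed tokens depend on the bundled dictionary and model files):

```python3
import macropodus

# segment a short phrase; a list of tokens is expected
print(macropodus.cut("Macropodus自然语言处理"))
```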


# Usage


## Quick start

```python3
import macropodus


sen_calculate = "23 + 13 * (25+(-9-2-5-2*3-6/3-40*4/(2-3)/5+6*3))加根号144你算得几多"
sen_chi2num = "三千零七十八亿三千零十五万零三百一十二点一九九四"
sen_num2chi = 1994.1994
sent1 = "PageRank算法简介"
sent2 = "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。"
summary = "PageRank算法简介。" \
          "是上世纪90年代末提出的一种计算网页权重的算法! " \
          "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
          "业界急需一种相对比较准确的网页重要性计算方法。 " \
          "是人们能够从海量互联网世界中找出自己需要的信息。 " \
          "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
          "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
          "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
          "和投票目标的等级来决定新的等级。简单的说, " \
          "一个高等级的页面可以使其他低等级页面的等级提升。 " \
          "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
          "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
          "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
          "总的来说就是一句话,从全局角度考虑,获取重要的信。 "

# word segmentation (dictionary DAG, maximum probability)
words = macropodus.cut(summary)
print(words)
# new word discovery
new_words = macropodus.find(summary)
print(new_words)
# text summarization
sum = macropodus.summarize(summary)
print(sum)
# keyword extraction
keyword = macropodus.keyword(summary)
print(keyword)
# text similarity
sim = macropodus.sim(sent1, sent2)
print(sim)
# tookit utilities
# calculator
score_calcul = macropodus.calculate(sen_calculate)
print(score_calcul)
# conversion between Chinese and Arabic numerals
res_chi2num = macropodus.chi2num(sen_chi2num)
print(res_chi2num)
res_num2chi = macropodus.num2chi(sen_num2chi)
print(res_num2chi)
```


## Chinese word segmentation

All segmentation methods:
```python3
import macropodus


# user dictionary
macropodus.add_word(word="斗鱼科")
macropodus.add_word(word="鲈形目")  # not persisted, only effective for the current session
macropodus.save_add_words(word_freqs={"喜斗":32, "护卵":64, "护幼":132})  # persisted to the user dictionary
sent = "斗鱼属,Macropodus (1801),鲈形目斗鱼科的一属鱼类。本属鱼类通称斗鱼。因喜斗而得名。分布于亚洲东南部。中国有2种,即叉尾斗鱼,分布于长江及以南各省;叉尾斗鱼,分布于辽河到珠江流域。其喜栖居于小溪、河沟、池塘、稻田等缓流或静水中。雄鱼好斗,产卵期集草成巢,雄鱼口吐粘液泡沫,雌鱼产卵其中,卵浮性,受精卵在泡沫内孵化。雄鱼尚有护卵和护幼现象。"

# segmentation
sents = macropodus.cut_bidirectional(sent)
print("cut_bidirectional: " + " ".join(sents))
sents = macropodus.cut_forward(sent)
print("cut_forward: " + " ".join(sents))
sents = macropodus.cut_reverse(sent)
print("cut_reverse: " + " ".join(sents))
sents = macropodus.cut_search(sent)
print("cut_search: " + " ".join(sents))
# DAG
sents = macropodus.cut_dag(sent)
print("cut_dag: " + " ".join(sents))
```
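
The user dictionary can also be loaded from a file. Judging from `SegBasic.load_user_dict` later in this commit, a plain-text dictionary holds one `word freq` pair per line separated by a space (the frequency is optional), and a CSV dictionary uses a comma instead. A minimal sketch with a hypothetical file path:

```python3
import macropodus

# hypothetical file my_words.txt, one entry per line, e.g.:
#   斗鱼科 32
#   鲈形目 64
macropodus.load_user_dict(path_user="my_words.txt", type_user="txt")
print(macropodus.cut("鲈形目斗鱼科"))
```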


## Text similarity

Text similarity is based on word vectors, scored with cosine similarity or Jaccard similarity.
```python3
import macropodus


sent1 = "叉尾斗鱼是一种观赏性动物"
sent2 = "中国斗鱼生性好斗,适应性强,能在恶劣的环境中生存"

# text similarity
sents = macropodus.sim(sent1, sent2, type_sim="total", type_encode="avg")
print(sents)
sents = macropodus.sim(sent1, sent2, type_sim="cosine", type_encode="single")
print(sents)
```
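
The example above exercises the cosine path. If `type_sim` also accepts `"jaccard"` (an assumption based on the description above, not verified against the API), the character-overlap score can be requested the same way:

```python3
import macropodus

sent1 = "叉尾斗鱼是一种观赏性动物"
sent2 = "中国斗鱼生性好斗,适应性强,能在恶劣的环境中生存"

# assumption: "jaccard" is a valid value for type_sim
print(macropodus.sim(sent1, sent2, type_sim="jaccard"))
```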


## Text summarization

Available summarization methods: text_pronouns, text_teaser, word_sign, textrank, lead3, mmr, lda, lsi, nmf.
```python3
import macropodus


summary = "PageRank算法简介。" \
          "是上世纪90年代末提出的一种计算网页权重的算法! " \
          "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
          "业界急需一种相对比较准确的网页重要性计算方法。 " \
          "是人们能够从海量互联网世界中找出自己需要的信息。 " \
          "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
          "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
          "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
          "和投票目标的等级来决定新的等级。简单的说, " \
          "一个高等级的页面可以使其他低等级页面的等级提升。 " \
          "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
          "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
          "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
          "总的来说就是一句话,从全局角度考虑,获取重要的信。 "

# text summarization (summarize, default interface)
sents = macropodus.summarize(summary)
print(sents)

# text summarization (summarization, selectable method; 9 methods available, e.g. 'lda', 'mmr', 'textrank', 'text_teaser')
sents = macropodus.summarization(text=summary, type_summarize="lda")
print(sents)
```
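
Because `summarization` selects the method by name, the nine methods listed above can be compared on one text in a loop (a sketch; it assumes each listed name is a valid `type_summarize` value):

```python3
import macropodus

text = "PageRank算法简介。百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。"

# the nine summarization methods named in this README
for method in ["text_pronouns", "text_teaser", "word_sign", "textrank",
               "lead3", "mmr", "lda", "lsi", "nmf"]:
    print(method, macropodus.summarization(text=text, type_summarize=method))
```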


## New word discovery

New word discovery combines cohesion (aggregation), left entropy, right entropy and word frequency.
```python3
import macropodus


summary = "PageRank算法简介。" \
          "是上世纪90年代末提出的一种计算网页权重的算法! " \
          "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
          "业界急需一种相对比较准确的网页重要性计算方法。 " \
          "是人们能够从海量互联网世界中找出自己需要的信息。 " \
          "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
          "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
          "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
          "和投票目标的等级来决定新的等级。简单的说, " \
          "一个高等级的页面可以使其他低等级页面的等级提升。 " \
          "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
          "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
          "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
          "总的来说就是一句话,从全局角度考虑,获取重要的信。 "

# new word discovery (findword, default interface)
sents = macropodus.find(text=summary, freq_min=2, len_max=7, entropy_min=1.2, aggregation_min=0.5, use_avg=True)
print(sents)
```
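
For reference, the usual definitions behind these thresholds are sketched below; the exact scoring inside `find` may combine them differently:

```
freq(w)        : raw count of candidate w, compared against freq_min
aggregation(w) = p(w) / ( p(w_left) * p(w_right) )    # cohesion of the best split of w, compared against aggregation_min
H_left(w)      = - sum_c p(c | c precedes w) * log p(c | c precedes w)   # left neighbour entropy
H_right(w)     = - sum_c p(c | c follows w)  * log p(c | c follows w)    # right neighbour entropy, both compared against entropy_min
```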


## Keywords

Keyword extraction uses TextRank; edges are built by 1. composing sentence vectors from character vectors, and 2. scoring edges with cosine similarity.
```python3
import macropodus


sent = "斗鱼属,Macropodus (1801),鲈形目斗鱼科的一属鱼类。本属鱼类通称斗鱼。因喜斗而得名。分布于亚洲东南部。中国有2种,即叉尾斗鱼,分布于长江及以南各省;叉尾斗鱼,分布于辽河到珠江流域。其喜栖居于小溪、河沟、池塘、稻田等缓流或静水中。雄鱼好斗,产卵期集草成巢,雄鱼口吐粘液泡沫,雌鱼产卵其中,卵浮性,受精卵在泡沫内孵化。雄鱼尚有护卵和护幼现象。"

# keywords (keyword)
sents = macropodus.keyword(sent)
print(sents)
```
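
The node scores come from the standard weighted TextRank (PageRank) iteration shown below; the damping factor d is conventionally 0.85, which is not something this README specifies:

```
WS(Vi) = (1 - d) + d * sum over Vj in In(Vi) of [ w_ji / sum over Vk in Out(Vj) of w_jk ] * WS(Vj)
```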


## Common utilities (tookit)

The toolkit includes a scientific calculator and Arabic/Chinese numeral conversion.
```python3
import macropodus


sen_calculate = "23 + 13 * (25+(-9-2-5-2*3-6/3-40*4/(2-3)/5+6*3))加根号144你算得几多"
sen_chi2num = "三千零七十八亿三千零十五万零三百一十二点一九九四"
sen_num2chi = 1994.1994
# tookit, scientific calculator
score_calcul = macropodus.calculate(sen_calculate)
print(score_calcul)
# tookit, Chinese numerals to Arabic
res_chi2num = macropodus.chi2num(sen_chi2num)
print(res_chi2num)
# tookit, Arabic numerals to Chinese
res_num2chi = macropodus.num2chi(sen_num2chi)
print(res_num2chi)
```


# References

* textrank_gensim: [https://github.com/RaRe-Technologies/gensim](https://github.com/RaRe-Technologies/gensim)
* Maximum-probability (DAG, dynamic programming) dictionary segmentation: [https://github.com/fxsjy/jieba](https://github.com/fxsjy/jieba)
* CRF (unresolved): [https://github.com/BrikerMan/Kashgari](https://github.com/BrikerMan/Kashgari)


# FAQ


__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/12/3 22:50
|
||||
# @author :Mo
|
||||
# @function :
|
macropodus/__init__.py (new file, 50 lines)
@@ -0,0 +1,50 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/12 22:57
|
||||
# @author : Mo
|
||||
# @function: init of macropodus (tookit, keras of tensorflow)
|
||||
|
||||
|
||||
# macropodus
|
||||
from macropodus.segment import cut_bidirectional, cut_forward, cut_reverse, cut_search, cut_dag, cut, find
|
||||
from macropodus.segment import load_user_dict, save_delete_words, save_add_words, delete_word, add_word
|
||||
from macropodus.summarize import keyword, textrank, summarization
|
||||
from macropodus.tookit import calculate, chi2num, num2chi, Trie
|
||||
from macropodus.version import __version__ # 版本
|
||||
from macropodus.similarity import sim
|
||||
|
||||
# 机械分词
|
||||
cut_bidirectional = cut_bidirectional
|
||||
cut_forward = cut_forward
|
||||
cut_reverse = cut_reverse
|
||||
cut_search = cut_search
|
||||
cut_dag = cut_dag
|
||||
cut = cut
|
||||
|
||||
# 用户词典操作
|
||||
load_user_dict = load_user_dict
|
||||
save_delete_words = save_delete_words # 保存到用户词典的
|
||||
save_add_words = save_add_words
|
||||
delete_word = delete_word
|
||||
add_word = add_word
|
||||
|
||||
# 新词发现
|
||||
find = find
|
||||
|
||||
# 文本相似度
|
||||
sim = sim
|
||||
|
||||
# 文本摘要, 关键词
|
||||
keyword = keyword
|
||||
summarize = textrank
|
||||
summarization = summarization
|
||||
|
||||
# 常用工具(tookit, 计算器, 中文与阿拉伯数字转化, 前缀树)
|
||||
calculate = calculate
|
||||
chi2num = chi2num
|
||||
num2chi = num2chi
|
||||
|
||||
# 是否使用深度学习模型
|
||||
use_dl=False
|
||||
if use_dl:
|
||||
from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects
|
macropodus/__init_tf_keras.py (new file, 54 lines)
@@ -0,0 +1,54 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/20 22:22
|
||||
# @author : Mo
|
||||
# @function: init of keras of tensorflow
|
||||
|
||||
|
||||
#####################(tensorflow, keras)############################
|
||||
import sys
|
||||
import os
|
||||
path_root = os.path.abspath(os.path.dirname(__file__))
|
||||
sys.path.append(path_root) # 环境引入根目录
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
||||
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
||||
os.environ['TF_KERAS'] = '1'
|
||||
|
||||
try:
|
||||
# tensorflow.python.keras
|
||||
import tensorflow.python.keras as keras
|
||||
import tensorflow as tf
|
||||
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
|
||||
except Exception:
|
||||
import keras
|
||||
|
||||
# custom_objects
|
||||
import keras_bert
|
||||
custom_objects = keras_bert.get_custom_objects()
|
||||
from keras_adaptive_softmax import AdaptiveEmbedding, AdaptiveSoftmax
|
||||
from macropodus.network.layers.non_mask_layer import NonMaskingLayer
|
||||
from macropodus.network.layers.crf import CRF
|
||||
custom_objects['AdaptiveEmbedding'] = AdaptiveEmbedding
|
||||
custom_objects['AdaptiveSoftmax'] = AdaptiveSoftmax
|
||||
custom_objects['NonMaskingLayer'] = NonMaskingLayer
|
||||
custom_objects['CRF'] = CRF
|
||||
|
||||
# layers
|
||||
preprocessing = keras.preprocessing
|
||||
applications = keras.applications
|
||||
regularizers = keras.regularizers
|
||||
initializers = keras.initializers
|
||||
activations = keras.activations
|
||||
constraints = keras.constraints
|
||||
optimizers = keras.optimizers
|
||||
callbacks = keras.callbacks
|
||||
datasets = keras.datasets
|
||||
wrappers = keras.wrappers
|
||||
metrics = keras.metrics
|
||||
backend = keras.backend
|
||||
engine = keras.engine
|
||||
layers = keras.layers
|
||||
models = keras.models
|
||||
losses = keras.losses
|
||||
utils = keras.utils
|
5
macropodus/base/__init__.py
Normal file
5
macropodus/base/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/12 23:04
|
||||
# @author : Mo
|
||||
# @function:
|
macropodus/base/seg_basic.py (new file, 148 lines)
@@ -0,0 +1,148 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/28 20:17
|
||||
# @author : Mo
|
||||
# @function: basic of segment, dictionary
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_common import load_json, save_json, txt_read
|
||||
from macropodus.conf.path_config import path_dict_macropodus, path_dict_user
|
||||
from macropodus.conf.path_config import path_macropodus_dict_freq_cache
|
||||
from macropodus.conf.path_log import get_logger_root
|
||||
from collections import defaultdict
|
||||
import pickle
|
||||
import time
|
||||
import os
|
||||
|
||||
|
||||
logger = get_logger_root()
|
||||
|
||||
|
||||
class SegBasic:
|
||||
def __init__(self, use_cache=True):
|
||||
# time_start = time.time()
|
||||
# 存在缓存则直接读取, 序列化加速缓存读取速度
|
||||
if use_cache and os.path.exists(path_macropodus_dict_freq_cache):
|
||||
with open(path_macropodus_dict_freq_cache, "rb") as fpmc:
|
||||
[self.dict_words_freq, self.num_words, self.dict_user] = pickle.load(fpmc)
|
||||
fpmc.close()
|
||||
# logger.info("seg: " + str(time.time()-time_start)) # 5.29, 5.26
|
||||
else:
|
||||
self.dict_words_freq = defaultdict()
|
||||
self.dict_user = {}
|
||||
self.load_macropodus_dict() # 默认字典
|
||||
self.load_user_dict() # 用户字典
|
||||
# logger.info("seg: " + str(time.time() - time_start)) # 10.13, 10.33
|
||||
# 第一次跑macropodus, 序列化需要的缓存
|
||||
if use_cache and not os.path.exists(path_macropodus_dict_freq_cache):
|
||||
with open(path_macropodus_dict_freq_cache, "wb") as fpmc:
|
||||
pickle.dump([self.dict_words_freq, self.num_words, self.dict_user], fpmc)
|
||||
|
||||
def load_macropodus_dict(self):
|
||||
"""
|
||||
加载默认的基础字典
|
||||
:return: None
|
||||
"""
|
||||
dict_macropodus = load_json(path_dict_macropodus)[0] # (path_dict_jiagu)[0] # (path_dict_macropodus)[0] # 加载json字典文件
|
||||
dict_macropodus_def = defaultdict() # 转为defaultdict
|
||||
for k,v in dict_macropodus.items():
|
||||
dict_macropodus_def[k] = v
|
||||
self.dict_words_freq = dict_macropodus_def # {}词-词频字典
|
||||
|
||||
def load_user_dict(self, path_user=path_dict_user, type_user="json"):
|
||||
"""
|
||||
加载用户词典
|
||||
:param path_user:str, like '/home/user.dict'
|
||||
:return: None
|
||||
"""
|
||||
if not os.path.exists(path_user):
|
||||
raise RuntimeError("your path_user is not exist!")
|
||||
if type_user == "json":
|
||||
self.dict_user = load_json(path_user)[0] # 加载json字典文件
|
||||
for k, v in self.dict_user.items():
|
||||
if k not in self.dict_words_freq:
|
||||
self.dict_words_freq[k] = v # 更新到总字典, words_freq
|
||||
else:
|
||||
self.dict_words_freq[k] = self.dict_words_freq[k] + v # 更新到总字典, words_freq
|
||||
self.num_words = sum(self.dict_words_freq.values())
|
||||
elif type_user == "txt":
|
||||
words_all = txt_read(path_user)
|
||||
for word_freq in words_all:
|
||||
wf = word_freq.split(" ") # 空格' '区分带不带词频的情况
|
||||
if len(wf) == 2:
|
||||
word = wf[0]
|
||||
freq = int(wf[1])  # keep frequencies numeric so they can be summed
|
||||
else:
|
||||
word = wf[0]
|
||||
freq = 132
|
||||
if word not in self.dict_words_freq:
|
||||
self.dict_words_freq[word] = freq # 更新到总字典, words_freq
|
||||
else:
|
||||
self.dict_words_freq[word] = self.dict_words_freq[word] + freq # 更新到总字典, words_freq
|
||||
self.num_words = sum(self.dict_words_freq.values())
|
||||
elif type_user == "csv":
|
||||
words_all = txt_read(path_user)
|
||||
for word_freq in words_all:
|
||||
wf = word_freq.split(",") # 逗号','区分带不带词频的情况
|
||||
if len(wf)==2:
|
||||
word = wf[0]
|
||||
freq = int(wf[1])  # keep frequencies numeric so they can be summed
|
||||
else:
|
||||
word = wf[0]
|
||||
freq = 132
|
||||
if word not in self.dict_words_freq:
|
||||
self.dict_words_freq[word] = freq # 更新到总字典, words_freq
|
||||
else:
|
||||
self.dict_words_freq[word] = self.dict_words_freq[word] + freq # 更新到总字典, words_freq
|
||||
self.num_words = sum(self.dict_words_freq.values())
|
||||
else:
|
||||
raise EOFError
|
||||
|
||||
def add_word(self, word, freq=132):
|
||||
"""
|
||||
新增词典到词语, 不可持久化, 重载消失
|
||||
:param word: str, like '大漠帝国'
|
||||
:param freq: int, like 132
|
||||
:return: None
|
||||
"""
|
||||
assert type(word) == str
|
||||
if word in self.dict_words_freq:
|
||||
self.dict_words_freq[word] = freq if freq != 132 else self.dict_words_freq[word]  # only overwrite the stored frequency when a non-default freq is passed
|
||||
else:
|
||||
self.dict_words_freq[word] = freq
|
||||
self.num_words += freq
|
||||
|
||||
def delete_word(self, word):
|
||||
"""
|
||||
删除词语, 不可持久化, 重载消失
|
||||
:param word_freqs: str, like '大漠帝国'
|
||||
:return: None
|
||||
"""
|
||||
assert type(word) == str
|
||||
if word in self.dict_words_freq:
|
||||
self.num_words -= self.dict_words_freq[word]
|
||||
self.dict_words_freq.pop(word)
|
||||
|
||||
def save_add_words(self, word_freqs):
|
||||
"""
|
||||
新增词语到用户词典, 可持久化, 重载有效
|
||||
:param word_freqs: dict, like {'大漠帝国':132}
|
||||
:return: None
|
||||
"""
|
||||
assert type(word_freqs) == dict
|
||||
for k, v in word_freqs.items():
|
||||
self.add_word(k, v) # 新增到总字典, 不持久化
|
||||
self.dict_user[k] = v # 新增到用户字典, 持久化
|
||||
save_json([self.dict_user], path_dict_user)
|
||||
|
||||
def save_delete_words(self, words):
|
||||
"""
|
||||
删除词语到用户词典, 可持久化, 重载有效
|
||||
:param word_freqs: list, like ['大漠帝国']
|
||||
:return: None
|
||||
"""
|
||||
assert type(words) == list
|
||||
for w in words:
|
||||
self.delete_word(w) # 删除到总字典, 不持久化
|
||||
if w in self.dict_user: self.dict_user.pop(w) # 删除到用户字典, 持久化
|
||||
save_json([self.dict_user], path_dict_user)
|
macropodus/base/word2vec.py (new file, 64 lines)
@@ -0,0 +1,64 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 22:52
|
||||
# @author : Mo
|
||||
# @function: word2vec of gensim
|
||||
|
||||
|
||||
from macropodus.conf.path_config import path_embedding_word2vec_char, path_macropodus_w2v_char_cache
|
||||
from macropodus.conf.path_log import get_logger_root
|
||||
import numpy as np
|
||||
import gensim
|
||||
import pickle
|
||||
import time
|
||||
import os
|
||||
|
||||
|
||||
logger = get_logger_root()
|
||||
gensim.logger.level=40
|
||||
|
||||
|
||||
class W2v:
|
||||
def __init__(self, use_cache=True):
|
||||
# time_start = time.time()
|
||||
# 存在缓存则直接读取, 序列化加速缓存读取速度
|
||||
if use_cache and os.path.exists(path_macropodus_w2v_char_cache):
|
||||
with open(path_macropodus_w2v_char_cache, "rb") as fpmc:
|
||||
self.w2v_char= pickle.load(fpmc)
|
||||
fpmc.close()
|
||||
# logger.info("word2vec: " + str(time.time() - time_start)) # 0.12
|
||||
else:
|
||||
# gensim加载词向量
|
||||
self.w2v_char = gensim.models.KeyedVectors.load_word2vec_format(path_embedding_word2vec_char)
|
||||
# logger.info("word2vec: " + str(time.time() - time_start)) # 0.99, 0.78
|
||||
# 第一次跑macropodus, 序列化需要的缓存
|
||||
if use_cache and not os.path.exists(path_macropodus_w2v_char_cache):
|
||||
with open(path_macropodus_w2v_char_cache, "wb") as fpmc:
|
||||
pickle.dump(self.w2v_char, fpmc)
|
||||
|
||||
def cosine(self, sen_1, sen_2):
|
||||
"""
|
||||
余弦距离
|
||||
:param sen_1: numpy.array
|
||||
:param sen_2: numpy.array
|
||||
:return: float, like 0.0
|
||||
"""
|
||||
if sen_1.all() and sen_2.all():
|
||||
return np.dot(sen_1, sen_2) / (np.linalg.norm(sen_1) * np.linalg.norm(sen_2))
|
||||
else:
|
||||
return 0.0
|
||||
|
||||
def jaccard(self, sen_1, sen_2):
|
||||
"""
|
||||
jaccard距离
|
||||
:param sen1: str, like "大漠帝国"
|
||||
:param sen2: str, like "Macropodus"
|
||||
:return: float, like 0.998
|
||||
"""
|
||||
try:
|
||||
sent_intersection = list(set(list(sen_1)).intersection(set(list(sen_2))))
|
||||
sent_union = list(set(list(sen_1)).union(set(list(sen_2))))
|
||||
score_jaccard = float(len(sent_intersection) / len(sent_union))
|
||||
except Exception:
|
||||
score_jaccard = 0.0
|
||||
return score_jaccard
|
5
macropodus/conf/__init__.py
Normal file
5
macropodus/conf/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/18 23:59
|
||||
# @author : Mo
|
||||
# @function:
|
38
macropodus/conf/path_config.py
Normal file
38
macropodus/conf/path_config.py
Normal file
@ -0,0 +1,38 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/18 23:59
|
||||
# @author : Mo
|
||||
# @function: path of macropodus
|
||||
|
||||
|
||||
import sys
|
||||
import os
|
||||
path_root = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
|
||||
sys.path.append(path_root)
|
||||
|
||||
|
||||
# path of basic
|
||||
path_dict_macropodus = os.path.join(path_root, "data/dict/macropodus.dict")
|
||||
path_dict_user = os.path.join(path_root, "data/dict/user.dict")
|
||||
path_log_basic = os.path.join(path_root, "logs")
|
||||
|
||||
# path of cache
|
||||
path_macropodus_w2v_char_cache = os.path.join(path_root, 'data/cache/word2vec_char.cache')
|
||||
path_macropodus_dict_freq_cache = os.path.join(path_root, 'data/cache/macropodus.cache')
|
||||
|
||||
# path of embedding
|
||||
path_embedding_word2vec_char = os.path.join(path_root, 'data/embedding/word2vec/w2v_model_wiki_char.vec')
|
||||
path_embedding_bert = os.path.join(path_root, 'data/embedding/chinese_L-12_H-768_A-12/')
|
||||
path_embedding_random_char = os.path.join(path_root, 'data/embedding/term_char.txt')
|
||||
path_embedding_random_word = os.path.join(path_root, 'data/embedding/term_word.txt')
|
||||
path_embedding_albert = os.path.join(path_root, 'data/embedding/albert_base_zh')
|
||||
|
||||
# path of train data of ner people 1998
|
||||
path_ner_people_1998_train = os.path.join(path_root, "data/corpus/ner_people_1998/train.json")
|
||||
path_ner_people_1998_valid = os.path.join(path_root, "data/corpus/ner_people_1998/dev.json")
|
||||
|
||||
# path of train data of seg pku 1998
|
||||
path_seg_pku_1998_train = os.path.join(path_root, "data/corpus/seg_pku_1998/train.json")
|
||||
|
||||
# path of training model save dir
|
||||
path_model_dir = os.path.join(path_root, "data/model")
|
50
macropodus/conf/path_log.py
Normal file
50
macropodus/conf/path_log.py
Normal file
@ -0,0 +1,50 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/18 23:59
|
||||
# @author : Mo
|
||||
# @function: logger of macropodus
|
||||
|
||||
|
||||
from macropodus.conf.path_config import path_log_basic
|
||||
from logging.handlers import RotatingFileHandler
|
||||
import logging
|
||||
import time
|
||||
import os
|
||||
|
||||
|
||||
logger_level = logging.INFO
|
||||
# log目录地址
|
||||
path_logs = path_log_basic # + '/logs'
|
||||
if not os.path.exists(path_logs):
|
||||
os.mkdir(path_logs)
|
||||
# 全局日志格式
|
||||
logging.basicConfig(level=logger_level,
|
||||
format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
|
||||
# 定义一个日志记录器
|
||||
logger = logging.getLogger("macropodus")
|
||||
logger.setLevel(level = logger_level)
|
||||
# 日志文件名,为启动时的日期
|
||||
log_file_name = time.strftime('macropodus-%Y-%m-%d', time.localtime(time.time())) + ".log"
|
||||
log_name_day = os.path.join(path_logs, log_file_name)
|
||||
# 文件输出, 定义一个RotatingFileHandler,最多备份32个日志文件,每个日志文件最大32K
|
||||
fHandler = RotatingFileHandler(log_name_day, maxBytes = 32*1024, backupCount = 32)
|
||||
fHandler.setLevel(logger_level)
|
||||
# 日志输出格式
|
||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
fHandler.setFormatter(formatter)
|
||||
# 控制台输出
|
||||
console = logging.StreamHandler()
|
||||
console.setLevel(logger_level)
|
||||
console.setFormatter(formatter)
|
||||
# logger加到handel里边
|
||||
logger.addHandler(fHandler)
|
||||
logger.addHandler(console)
|
||||
|
||||
|
||||
def get_logger_root(name="macropodus"):
|
||||
"""
|
||||
全局日志引用
|
||||
:param name: str, name of logger
|
||||
:return: object, logging
|
||||
"""
|
||||
return logging.getLogger(name)
|
5
macropodus/data/__init__.py
Normal file
5
macropodus/data/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/21 23:06
|
||||
# @author : Mo
|
||||
# @function:
|
5
macropodus/data/cache/__init__.py
vendored
Normal file
5
macropodus/data/cache/__init__.py
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/3 0:25
|
||||
# @author : Mo
|
||||
# @function:
|
1
macropodus/data/dict/macropodus.dict
Normal file
1
macropodus/data/dict/macropodus.dict
Normal file
File diff suppressed because one or more lines are too long
1
macropodus/data/dict/user.dict
Normal file
1
macropodus/data/dict/user.dict
Normal file
@ -0,0 +1 @@
|
||||
[{"大漠帝国": 132, "macropodus": 132, "思慧计算器": 132, "叉尾斗鱼": 132, "BBC": 132, "坑爹": 132, "喜斗": 32, "护卵": 64, "护幼": 132}]
|
5
macropodus/data/embedding/albert_base_zh/__init__.py
Normal file
5
macropodus/data/embedding/albert_base_zh/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/2 1:08
|
||||
# @author : Mo
|
||||
# @function:
|
15515
macropodus/data/embedding/term_char.txt
Normal file
15515
macropodus/data/embedding/term_char.txt
Normal file
File diff suppressed because it is too large
Load Diff
24382
macropodus/data/embedding/word2vec/w2v_model_wiki_char.vec
Normal file
24382
macropodus/data/embedding/word2vec/w2v_model_wiki_char.vec
Normal file
File diff suppressed because it is too large
Load Diff
5
macropodus/data/model/__init__.py
Normal file
5
macropodus/data/model/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 22:46
|
||||
# @author : Mo
|
||||
# @function:
|
5
macropodus/data/words_common/__init__.py
Normal file
5
macropodus/data/words_common/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 20:29
|
||||
# @author : Mo
|
||||
# @function:
|
125
macropodus/data/words_common/stop_words.py
Normal file
125
macropodus/data/words_common/stop_words.py
Normal file
@ -0,0 +1,125 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/26 22:02
|
||||
# @author :Mo
|
||||
# @function :stop_words of dict
|
||||
|
||||
stop_words = {0: "…………………………………………………③", 1: "...................", 2: "......", 3: "关于具体地说", 4: "ZXFITL", 5: ")÷(1-",
|
||||
6: "-[*]-", 7: "]∧′=[", 8: "~~~~", 9: "与此同时", 10: "具体地说", 11: "具体说来", 12: "反过来说", 13: "另一方面", 14: "如上所述",
|
||||
15: "尽管如此", 16: "总的来看", 17: "总的来说", 18: "总的说来", 19: "总而言之", 20: "恰恰相反", 21: "换句话说", 22: "由此可见",
|
||||
23: "相对而言", 24: "综上所述", 25: "这么点儿", 26: "这就是说", 27: "除此之外", 28: "2.3%", 29: "R.L.", 30: "[①①]",
|
||||
31: "[①②]", 32: "[①③]", 33: "[①④]", 34: "[①⑤]", 35: "[①⑥]", 36: "[①⑦]", 37: "[①⑧]", 38: "[①⑨]",
|
||||
39: "[①A]", 40: "[①B]", 41: "[①C]", 42: "[①D]", 43: "[①E]", 44: "[①a]", 45: "[①c]", 46: "[①d]",
|
||||
47: "[①e]", 48: "[①f]", 49: "[①g]", 50: "[①h]", 51: "[①i]", 52: "[①o]", 53: "[②①]", 54: "[②②]",
|
||||
55: "[②③]", 56: "[②⑤]", 57: "[②⑥]", 58: "[②⑦]", 59: "[②⑧]", 60: "[②⑩]", 61: "[②B]", 62: "[②G]",
|
||||
63: "[②a]", 64: "[②b]", 65: "[②c]", 66: "[②d]", 67: "[②e]", 68: "[②f]", 69: "[②g]", 70: "[②h]",
|
||||
71: "[②i]", 72: "[②j]", 73: "[③①]", 74: "[③⑩]", 75: "[③F]", 76: "[③a]", 77: "[③b]", 78: "[③c]",
|
||||
79: "[③d]", 80: "[③e]", 81: "[③g]", 82: "[③h]", 83: "[④a]", 84: "[④b]", 85: "[④c]", 86: "[④d]",
|
||||
87: "[④e]", 88: "[⑤]]", 89: "[⑤a]", 90: "[⑤b]", 91: "[⑤d]", 92: "[⑤e]", 93: "[⑤f]", 94: "...", 95: "://",
|
||||
96: "Lex", 97: "exp", 98: "sub", 99: "sup", 100: "×××", 101: "———", 102: "∪φ∈", 103: "》),", 104: "一方面",
|
||||
105: "一转眼", 106: "不外乎", 107: "不尽然", 108: "不至于", 109: "与其说", 110: "且不说", 111: "为什么", 112: "乃至于",
|
||||
113: "之所以", 114: "于是乎", 115: "什么样", 116: "他们们", 117: "加当期", 118: "中列明", 119: "以至于", 120: "借傥然",
|
||||
121: "先不先", 122: "再其次", 123: "再者说", 124: "反过来", 125: "就是了", 126: "就是说", 127: "怎么办", 128: "怎么样",
|
||||
129: "换言之", 130: "没奈何", 131: "甚至于", 132: "简言之", 133: "紧接着", 134: "自个儿", 135: "自各儿", 136: "莫不然",
|
||||
137: "要不是", 138: "要不然", 139: "这一来", 140: "这么些", 141: "这么样", 142: "这会儿", 143: "那么些", 144: "那么样",
|
||||
145: "那会儿", 146: "难道说", 147: "0:2", 148: "12%", 149: "5:0", 150: "[①]", 151: "[②④", 152: "[②]",
|
||||
153: "[③]", 154: "[④]", 155: "[⑤]", 156: "[⑥]", 157: "[⑦]", 158: "[⑧]", 159: "[⑨]", 160: "[⑩]",
|
||||
161: "[*]", 162: "ng昉", 163: "--", 164: "..", 165: "./", 166: ".一", 167: ".数", 168: ".日", 169: "//",
|
||||
170: "::", 171: ">>", 172: "φ.", 173: "——", 174: "’‘", 175: "”,", 176: "……", 177: "′∈", 178: "′|",
|
||||
179: "∈[", 180: "②c", 181: "③]", 182: "──", 183: "〕〔", 184: "一.", 185: "一些", 186: "一何", 187: "一切",
|
||||
188: "一则", 189: "一旦", 190: "一来", 191: "一样", 192: "一般", 193: "万一", 194: "上下", 195: "不仅", 196: "不但",
|
||||
197: "不光", 198: "不单", 199: "不只", 200: "不如", 201: "不妨", 202: "不尽", 203: "不得", 204: "不怕", 205: "不惟",
|
||||
206: "不成", 207: "不拘", 208: "不料", 209: "不是", 210: "不比", 211: "不然", 212: "不特", 213: "不独", 214: "不管",
|
||||
215: "不若", 216: "不论", 217: "不过", 218: "不问", 219: "与其", 220: "与否", 221: "且说", 222: "两者", 223: "个别",
|
||||
224: "为了", 225: "为何", 226: "为止", 227: "为此", 228: "为着", 229: "乃至", 230: "之一", 231: "之类", 232: "乌乎",
|
||||
233: "也好", 234: "也罢", 235: "二来", 236: "于是", 237: "云云", 238: "云尔", 239: "人们", 240: "人家", 241: "什么",
|
||||
242: "介于", 243: "仍旧", 244: "从此", 245: "从而", 246: "他人", 247: "他们", 248: "以上", 249: "以为", 250: "以便",
|
||||
251: "以免", 252: "以及", 253: "以故", 254: "以期", 255: "以来", 256: "以至", 257: "以致", 258: "任何", 259: "任凭",
|
||||
260: "似的", 261: "但凡", 262: "但是", 263: "何以", 264: "何况", 265: "何处", 266: "何时", 267: "余外", 268: "作为",
|
||||
269: "你们", 270: "使得", 271: "例如", 272: "依据", 273: "依照", 274: "便于", 275: "俺们", 276: "倘使", 277: "倘或",
|
||||
278: "倘然", 279: "倘若", 280: "假使", 281: "假如", 282: "假若", 283: "光是", 284: "全体", 285: "全部", 286: "关于",
|
||||
287: "其一", 288: "其中", 289: "其二", 290: "其他", 291: "其余", 292: "其它", 293: "其次", 294: "兼之", 295: "再则",
|
||||
296: "再有", 297: "再者", 298: "再说", 299: "况且", 300: "几时", 301: "凡是", 302: "凭借", 303: "出于", 304: "出来",
|
||||
305: "分别", 306: "则甚", 307: "别人", 308: "别处", 309: "别是", 310: "别的", 311: "别管", 312: "别说", 313: "前后",
|
||||
314: "前此", 315: "前者", 316: "加之", 317: "加以", 318: "即令", 319: "即使", 320: "即便", 321: "即如", 322: "即或",
|
||||
323: "即若", 324: "又及", 325: "及其", 326: "及至", 327: "反之", 328: "反而", 329: "受到", 330: "另外", 331: "另悉",
|
||||
332: "只当", 333: "只怕", 334: "只是", 335: "只有", 336: "只消", 337: "只要", 338: "只限", 339: "叮咚", 340: "可以",
|
||||
341: "可是", 342: "可见", 343: "各个", 344: "各位", 345: "各种", 346: "各自", 347: "同时", 348: "后者", 349: "向使",
|
||||
350: "向着", 351: "否则", 352: "吧哒", 353: "呜呼", 354: "呵呵", 355: "呼哧", 356: "咱们", 357: "哈哈", 358: "哎呀",
|
||||
359: "哎哟", 360: "哪个", 361: "哪些", 362: "哪儿", 363: "哪天", 364: "哪年", 365: "哪怕", 366: "哪样", 367: "哪边",
|
||||
368: "哪里", 369: "哼唷", 370: "唯有", 371: "啪达", 372: "啷当", 373: "喔唷", 374: "嗡嗡", 375: "嘎登", 376: "嘿嘿",
|
||||
377: "因为", 378: "因了", 379: "因此", 380: "因着", 381: "因而", 382: "固然", 383: "在下", 384: "在于", 385: "基于",
|
||||
386: "处在", 387: "多么", 388: "多少", 389: "大家", 390: "她们", 391: "如上", 392: "如下", 393: "如何", 394: "如其",
|
||||
395: "如同", 396: "如是", 397: "如果", 398: "如此", 399: "如若", 400: "始而", 401: "孰料", 402: "孰知", 403: "宁可",
|
||||
404: "宁愿", 405: "宁肯", 406: "它们", 407: "对于", 408: "对待", 409: "对方", 410: "对比", 411: "尔后", 412: "尔尔",
|
||||
413: "尚且", 414: "就是", 415: "就算", 416: "就要", 417: "尽管", 418: "岂但", 419: "已矣", 420: "巴巴", 421: "并且",
|
||||
422: "庶乎", 423: "庶几", 424: "开外", 425: "开始", 426: "归齐", 427: "当地", 428: "当然", 429: "当着", 430: "彼时",
|
||||
431: "彼此", 432: "得了", 433: "怎么", 434: "怎奈", 435: "怎样", 436: "总之", 437: "惟其", 438: "慢说", 439: "我们",
|
||||
440: "或则", 441: "或是", 442: "或曰", 443: "或者", 444: "截至", 445: "所以", 446: "所在", 447: "所幸", 448: "所有",
|
||||
449: "才能", 450: "打从", 451: "抑或", 452: "按照", 453: "据此", 454: "接着", 455: "故此", 456: "故而", 457: "旁人",
|
||||
458: "无宁", 459: "无论", 460: "既往", 461: "既是", 462: "既然", 463: "时候", 464: "是以", 465: "是的", 466: "替代",
|
||||
467: "有些", 468: "有关", 469: "有及", 470: "有时", 471: "有的", 472: "朝着", 473: "本人", 474: "本地", 475: "本着",
|
||||
476: "本身", 477: "来着", 478: "来自", 479: "来说", 480: "极了", 481: "果然", 482: "果真", 483: "某个", 484: "某些",
|
||||
485: "某某", 486: "根据", 487: "正值", 488: "正如", 489: "正巧", 490: "正是", 491: "此地", 492: "此处", 493: "此外",
|
||||
494: "此时", 495: "此次", 496: "此间", 497: "毋宁", 498: "每当", 499: "比及", 500: "比如", 501: "比方", 502: "沿着",
|
||||
503: "漫说", 504: "然则", 505: "然后", 506: "然而", 507: "照着", 508: "犹且", 509: "犹自", 510: "甚且", 511: "甚么",
|
||||
512: "甚或", 513: "甚而", 514: "甚至", 515: "用来", 516: "由于", 517: "由是", 518: "由此", 519: "的确", 520: "的话",
|
||||
521: "直到", 522: "省得", 523: "眨眼", 524: "着呢", 525: "矣乎", 526: "矣哉", 527: "竟而", 528: "等到", 529: "等等",
|
||||
530: "类如", 531: "纵令", 532: "纵使", 533: "纵然", 534: "经过", 535: "结果", 536: "继之", 537: "继后", 538: "继而",
|
||||
539: "罢了", 540: "而且", 541: "而况", 542: "而后", 543: "而外", 544: "而已", 545: "而是", 546: "而言", 547: "能否",
|
||||
548: "自从", 549: "自后", 550: "自家", 551: "自己", 552: "自打", 553: "自身", 554: "至于", 555: "至今", 556: "至若",
|
||||
557: "般的", 558: "若夫", 559: "若是", 560: "若果", 561: "若非", 562: "莫如", 563: "莫若", 564: "虽则", 565: "虽然",
|
||||
566: "虽说", 567: "要不", 568: "要么", 569: "要是", 570: "譬喻", 571: "譬如", 572: "许多", 573: "设使", 574: "设或",
|
||||
575: "设若", 576: "诚如", 577: "诚然", 578: "说来", 579: "诸位", 580: "诸如", 581: "谁人", 582: "谁料", 583: "谁知",
|
||||
584: "贼死", 585: "赖以", 586: "起见", 587: "趁着", 588: "越是", 589: "较之", 590: "还是", 591: "还有", 592: "还要",
|
||||
593: "这个", 594: "这么", 595: "这些", 596: "这儿", 597: "这时", 598: "这样", 599: "这次", 600: "这般", 601: "这边",
|
||||
602: "这里", 603: "进而", 604: "连同", 605: "逐步", 606: "通过", 607: "遵循", 608: "遵照", 609: "那个", 610: "那么",
|
||||
611: "那些", 612: "那儿", 613: "那时", 614: "那样", 615: "那般", 616: "那边", 617: "那里", 618: "鄙人", 619: "鉴于",
|
||||
620: "针对", 621: "除了", 622: "除外", 623: "除开", 624: "除非", 625: "随后", 626: "随时", 627: "随着", 628: "非但",
|
||||
629: "非徒", 630: "非特", 631: "非独", 632: "顺着", 633: "首先", 634: ")、", 635: "+ξ", 636: "++", 637: ",也",
|
||||
638: "-β", 639: "--", 640: "1.", 641: "<±", 642: "<Δ", 643: "<λ", 644: "<φ", 645: "<<", 646: "=″",
|
||||
647: "=☆", 648: "=(", 649: "=-", 650: "=[", 651: "={", 652: ">λ", 653: "LI", 654: "[②", 655: "[-",
|
||||
656: "[]", 657: "][", 658: "a]", 659: "b]", 660: "c]", 661: "e]", 662: "f]", 663: "{-", 664: "}>",
|
||||
665: "~±", 666: "~+", 667: """, 668: "#", 669: "$", 670: "%", 671: "&", 672: """, 673: "(", 674: ")",
|
||||
675: "*", 676: "+", 677: ",", 678: "-", 679: ".", 680: "/", 681: "0", 682: "1", 683: "2", 684: "3",
|
||||
685: "4", 686: "5", 687: "6", 688: "7", 689: "8", 690: "9", 691: ":", 692: ";", 693: "<", 694: "=",
|
||||
695: ">", 696: "?", 697: "@", 698: "A", 699: "[", 700: "\\", 701: "]", 702: "^", 703: "_", 704: "`",
|
||||
705: "|", 706: "}", 707: "~", 708: "·", 709: "×", 710: "Δ", 711: "Ψ", 712: "γ", 713: "μ", 714: "φ",
|
||||
715: "В", 716: "—", 717: "‘", 718: "’", 719: "“", 720: "”", 721: "…", 722: "℃", 723: "Ⅲ", 724: "↑",
|
||||
725: "→", 726: "≈", 727: "①", 728: "②", 729: "③", 730: "④", 731: "⑤", 732: "⑥", 733: "⑦", 734: "⑧",
|
||||
735: "⑨", 736: "⑩", 737: "■", 738: "▲", 739: "、", 740: "。", 741: "〈", 742: "〉", 743: "《", 744: "》",
|
||||
745: "」", 746: "『", 747: "』", 748: "【", 749: "】", 750: "〔", 751: "〕", 752: "㈧", 753: "一", 754: "、",
|
||||
755: "。", 756: "〈", 757: "〉", 758: "《", 759: "》", 760: "一", 761: "七", 762: "三", 763: "上", 764: "下",
|
||||
765: "不", 766: "与", 767: "且", 768: "个", 769: "中", 770: "临", 771: "为", 772: "乃", 773: "么", 774: "之",
|
||||
775: "乎", 776: "乘", 777: "九", 778: "也", 779: "了", 780: "二", 781: "于", 782: "五", 783: "些", 784: "亦",
|
||||
785: "人", 786: "什", 787: "今", 788: "仍", 789: "从", 790: "他", 791: "以", 792: "们", 793: "任", 794: "会",
|
||||
795: "但", 796: "何", 797: "你", 798: "使", 799: "依", 800: "俺", 801: "倘", 802: "借", 803: "做", 804: "像",
|
||||
805: "儿", 806: "八", 807: "六", 808: "兮", 809: "共", 810: "其", 811: "内", 812: "再", 813: "冒", 814: "冲",
|
||||
815: "几", 816: "凡", 817: "凭", 818: "分", 819: "则", 820: "别", 821: "到", 822: "即", 823: "却", 824: "去",
|
||||
825: "又", 826: "及", 827: "另", 828: "只", 829: "叫", 830: "可", 831: "各", 832: "同", 833: "后", 834: "向",
|
||||
835: "吓", 836: "吗", 837: "吧", 838: "含", 839: "吱", 840: "呀", 841: "呃", 842: "呕", 843: "呗", 844: "呜",
|
||||
845: "呢", 846: "呵", 847: "呸", 848: "咋", 849: "和", 850: "咚", 851: "咦", 852: "咧", 853: "咱", 854: "咳",
|
||||
855: "哇", 856: "哈", 857: "哉", 858: "哎", 859: "哗", 860: "哟", 861: "哦", 862: "哩", 863: "哪", 864: "哼",
|
||||
865: "唉", 866: "啊", 867: "啐", 868: "啥", 869: "啦", 870: "喂", 871: "喏", 872: "喽", 873: "嗡", 874: "嗬",
|
||||
875: "嗯", 876: "嗳", 877: "嘎", 878: "嘘", 879: "嘛", 880: "嘻", 881: "嘿", 882: "四", 883: "因", 884: "在",
|
||||
885: "地", 886: "多", 887: "大", 888: "她", 889: "好", 890: "如", 891: "宁", 892: "它", 893: "对", 894: "将",
|
||||
895: "小", 896: "尔", 897: "就", 898: "尽", 899: "己", 900: "已", 901: "巴", 902: "年", 903: "并", 904: "归",
|
||||
905: "当", 906: "彼", 907: "往", 908: "待", 909: "很", 910: "得", 911: "怎", 912: "您", 913: "我", 914: "或",
|
||||
915: "所", 916: "才", 917: "打", 918: "把", 919: "拿", 920: "按", 921: "据", 922: "故", 923: "无", 924: "既",
|
||||
925: "日", 926: "时", 927: "是", 928: "更", 929: "曾", 930: "替", 931: "最", 932: "月", 933: "有", 934: "望",
|
||||
935: "朝", 936: "本", 937: "来", 938: "某", 939: "欤", 940: "此", 941: "每", 942: "比", 943: "沿", 944: "焉",
|
||||
945: "照", 946: "用", 947: "由", 948: "的", 949: "看", 950: "着", 951: "矣", 952: "离", 953: "秒", 954: "第",
|
||||
955: "等", 956: "管", 957: "纵", 958: "经", 959: "给", 960: "者", 961: "而", 962: "能", 963: "腾", 964: "自",
|
||||
965: "至", 966: "致", 967: "若", 968: "虽", 969: "被", 970: "要", 971: "让", 972: "论", 973: "该", 974: "说",
|
||||
975: "请", 976: "诸", 977: "谁", 978: "赶", 979: "起", 980: "趁", 981: "距", 982: "跟", 983: "较", 984: "边",
|
||||
985: "过", 986: "还", 987: "这", 988: "连", 989: "那", 990: "都", 991: "阿", 992: "除", 993: "随", 994: "零",
|
||||
995: "非", 996: "靠", 997: "顺", 998: "︿", 999: "!", 1000: "#", 1001: "$", 1002: "%", 1003: "&", 1004: "(",
|
||||
1005: ")", 1006: "*", 1007: "+", 1008: ",", 1009: "0", 1010: "1", 1011: "2", 1012: "3", 1013: "4",
|
||||
1014: "5", 1015: "6", 1016: "7", 1017: "8", 1018: "9", 1019: ":", 1020: ";", 1021: "<", 1022: ">",
|
||||
1023: "?", 1024: "@", 1025: "[", 1026: "]", 1027: "{", 1028: "|", 1029: "}", 1030: "~", 1031: "¥",
|
||||
1032: "︿", 1033: "!", 1034: "#", 1035: "$", 1036: "%", 1037: "&", 1038: "'", 1039: "(", 1040: ")",
|
||||
1041: "*", 1042: "+", 1043: ",", 1044: "-", 1045: ".", 1046: "/", 1047: "0", 1048: "1", 1049: "2",
|
||||
1050: "3", 1051: "4", 1052: "5", 1053: "6", 1054: "7", 1055: "8", 1056: "9", 1057: ":", 1058: ";",
|
||||
1059: "<", 1060: "=", 1061: ">", 1062: "?", 1063: "@", 1064: "A", 1065: "[", 1066: "]", 1067: "_",
|
||||
1068: "{", 1069: "|", 1070: "}", 1071: "~", 1072: "¥", 1073: "\n", 1074: "", 1075: ",", 1076: " ",
|
||||
1077: ";", 1078: "!", 1079: "?", 1080: ". ", 1081: "'", 1082: "\"", 1083: ","}
|
5
macropodus/logs/__init__.py
Normal file
5
macropodus/logs/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/19 0:20
|
||||
# @author : Mo
|
||||
# @function:
|
6
macropodus/network/__init__.py
Normal file
6
macropodus/network/__init__.py
Normal file
@ -0,0 +1,6 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 22:26
|
||||
# @author : Mo
|
||||
# @function:
|
||||
|
5
macropodus/network/base/__init__.py
Normal file
5
macropodus/network/base/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 22:32
|
||||
# @author : Mo
|
||||
# @function:
|
macropodus/network/base/embedding.py (new file, 331 lines)
@@ -0,0 +1,331 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/3 21:29
|
||||
# @author :Mo
|
||||
# @function :embeddings of model, base embedding of random, word2vec or bert
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_common import load_json, save_json, txt_read, txt_write
|
||||
from macropodus.conf.path_config import path_embedding_bert, path_embedding_albert
|
||||
from macropodus.network.layers.non_mask_layer import NonMaskingLayer
|
||||
from macropodus.conf.path_config import path_embedding_word2vec_char
|
||||
from macropodus.conf.path_config import path_embedding_random_char
|
||||
from macropodus.preprocess.tools_ml import extract_chinese
|
||||
from macropodus.preprocess.tools_ml import macropodus_cut
|
||||
from macropodus.conf.path_config import path_model_dir
|
||||
from macropodus.conf.path_log import get_logger_root
|
||||
from gensim.models import KeyedVectors
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import codecs
|
||||
import os
|
||||
|
||||
|
||||
logger = get_logger_root()
|
||||
|
||||
|
||||
class BaseEmbedding:
|
||||
def __init__(self, hyper_parameters):
|
||||
self.len_max = hyper_parameters.get('len_max', 50) # 文本最大长度, 建议25-50
|
||||
self.embed_size = hyper_parameters.get('embed_size', 300) # 嵌入层尺寸
|
||||
self.vocab_size = hyper_parameters.get('vocab_size', 30000) # 字典大小, 这里随便填的,会根据代码里修改
|
||||
self.trainable = hyper_parameters.get('trainable', False) # 是否微调, 例如静态词向量、动态词向量、微调bert层等, random也可以
|
||||
self.level_type = hyper_parameters.get('level_type', 'char') # 还可以填'word'
|
||||
self.embedding_type = hyper_parameters.get('embedding_type', 'word2vec') # 词嵌入方式,可以选择'xlnet'、'bert'、'random'、'word2vec'
|
||||
self.path_model_dir = hyper_parameters.get('model', {}).get("path_model_dir", path_model_dir) # 模型目录, 提供给字/词典
|
||||
|
||||
# 自适应, 根据level_type和embedding_type判断corpus_path
|
||||
if self.level_type == "word":
|
||||
if self.embedding_type == "bert":
|
||||
raise RuntimeError("bert level_type is 'char', not 'word'")
|
||||
elif self.embedding_type == "xlnet":
|
||||
raise RuntimeError("xlnet level_type is 'char', not 'word'")
|
||||
elif self.embedding_type == "albert":
|
||||
raise RuntimeError("albert level_type is 'char', not 'word'")
|
||||
else:
|
||||
raise RuntimeError("embedding_type must be 'random', 'word2vec' or 'bert'")
|
||||
elif self.level_type == "char":
|
||||
if self.embedding_type == "random":
|
||||
self.corpus_path = hyper_parameters['embedding'].get('corpus_path', path_embedding_random_char)
|
||||
elif self.embedding_type == "word2vec":
|
||||
self.corpus_path = hyper_parameters['embedding'].get('corpus_path', path_embedding_word2vec_char)
|
||||
elif self.embedding_type == "bert":
|
||||
self.corpus_path = hyper_parameters['embedding'].get('corpus_path', path_embedding_bert)
|
||||
elif self.embedding_type == "albert":
|
||||
self.corpus_path = hyper_parameters['embedding'].get('corpus_path', path_embedding_albert)
|
||||
else:
|
||||
raise RuntimeError("embedding_type must be 'random', 'word2vec' or 'bert'")
|
||||
else:
|
||||
raise RuntimeError("level_type must be 'char' or 'word'")
|
||||
# 定义的符号
|
||||
self.ot_dict = {'[PAD]': 0,
|
||||
'[UNK]': 1,
|
||||
'[BOS]': 2,
|
||||
'[EOS]': 3, }
|
||||
self.deal_corpus()
|
||||
self.build()
|
||||
|
||||
def deal_corpus(self): # 处理语料
|
||||
pass
|
||||
|
||||
def build(self):
|
||||
self.token2idx = {}
|
||||
self.idx2token = {}
|
||||
|
||||
def sentence2idx(self, text):
|
||||
text = extract_chinese(str(text).upper())
|
||||
if self.level_type == 'char':
|
||||
text = list(text)
|
||||
elif self.level_type == 'word':
|
||||
text = macropodus_cut(text)
|
||||
else:
|
||||
raise RuntimeError("your input level_type is wrong, it must be 'word' or 'char'")
|
||||
text = [text_one for text_one in text]
|
||||
len_leave = self.len_max - len(text)
|
||||
if len_leave >= 0:
|
||||
text_index = [self.token2idx[text_char] if text_char in self.token2idx else self.token2idx['[UNK]'] for
|
||||
text_char in text] + [self.token2idx['[PAD]'] for i in range(len_leave)]
|
||||
else:
|
||||
text_index = [self.token2idx[text_char] if text_char in self.token2idx else self.token2idx['[UNK]'] for
|
||||
text_char in text[0:self.len_max]]
|
||||
input_mask = min(len(text), self.len_max)
|
||||
return text_index, input_mask
|
||||
|
||||
def idx2sentence(self, idx):
|
||||
assert type(idx) == list
|
||||
text_idx = [self.idx2token[id] if id in self.idx2token else self.idx2token['[UNK]'] for id in idx]
|
||||
return "".join(text_idx)
|
||||
|
||||
|
||||
class RandomEmbedding(BaseEmbedding):
|
||||
def __init__(self, hyper_parameters):
|
||||
super().__init__(hyper_parameters)
|
||||
# self.path = hyper_parameters.get('corpus_path', path_embedding_random_char)
|
||||
|
||||
def deal_corpus(self):
|
||||
token2idx = self.ot_dict.copy()
|
||||
count = 3
|
||||
if 'term' in self.corpus_path:
|
||||
with open(file=self.corpus_path, mode='r', encoding='utf-8') as fd:
|
||||
while True:
|
||||
term_one = fd.readline()
|
||||
if not term_one:
|
||||
break
|
||||
term_one = term_one.strip()
|
||||
if term_one not in token2idx:
|
||||
count = count + 1
|
||||
token2idx[term_one] = count
|
||||
|
||||
elif 'corpus' in self.corpus_path:
|
||||
with open(file=self.corpus_path, mode='r', encoding='utf-8') as fd:
|
||||
terms = fd.readlines()
|
||||
for term_one in terms:
|
||||
if self.level_type == 'char':
|
||||
text = list(term_one.replace(' ', '').strip())
|
||||
elif self.level_type == 'word':
|
||||
text = macropodus_cut(term_one)
|
||||
else:
|
||||
raise RuntimeError("your input level_type is wrong, it must be 'word' or 'char'")
|
||||
for text_one in text:
|
||||
if text_one not in token2idx:  # check the character/token, not the whole line
|
||||
count = count + 1
|
||||
token2idx[text_one] = count
|
||||
else:
|
||||
raise RuntimeError("your input corpus_path is wrong, it must be 'dict' or 'corpus'")
|
||||
self.token2idx = token2idx
|
||||
self.idx2token = {}
|
||||
for key, value in self.token2idx.items():
|
||||
self.idx2token[value] = key
|
||||
|
||||
def build(self, **kwargs):
|
||||
self.vocab_size = len(self.token2idx)
|
||||
self.input = tf.keras.layers.Input(shape=(self.len_max,), dtype='int32')
|
||||
self.output = tf.keras.layers.Embedding(self.vocab_size,
|
||||
self.embed_size,
|
||||
input_length=self.len_max,
|
||||
trainable=self.trainable,
|
||||
)(self.input)
|
||||
self.model = tf.keras.Model(self.input, self.output)
|
||||
save_json(json_lines=self.token2idx, json_path=os.path.join(self.path_model_dir, 'vocab.txt'))
|
||||
|
||||
|
||||
class WordEmbedding(BaseEmbedding):
|
||||
def __init__(self, hyper_parameters):
|
||||
super().__init__(hyper_parameters)
|
||||
# self.path = hyper_parameters.get('corpus_path', path_embedding_vector_word2vec)
|
||||
|
||||
def build(self, **kwargs):
|
||||
self.embedding_type = 'word2vec'
|
||||
# logger.info("load word2vec start!")
|
||||
self.key_vector = KeyedVectors.load_word2vec_format(self.corpus_path, **kwargs)
|
||||
# logger.info("load word2vec end!")
|
||||
self.embed_size = self.key_vector.vector_size
|
||||
|
||||
self.token2idx = self.ot_dict.copy()
|
||||
embedding_matrix = []
|
||||
# 首先加self.token2idx中的四个[PAD]、[UNK]、[BOS]、[EOS]
|
||||
embedding_matrix.append(np.zeros(self.embed_size))
|
||||
embedding_matrix.append(np.random.uniform(-0.5, 0.5, self.embed_size))
|
||||
embedding_matrix.append(np.random.uniform(-0.5, 0.5, self.embed_size))
|
||||
embedding_matrix.append(np.random.uniform(-0.5, 0.5, self.embed_size))
|
||||
|
||||
for word in self.key_vector.index2entity:
|
||||
self.token2idx[word] = len(self.token2idx)
|
||||
embedding_matrix.append(self.key_vector[word])
|
||||
|
||||
# self.token2idx = self.token2idx
|
||||
self.idx2token = {}
|
||||
for key, value in self.token2idx.items():
|
||||
self.idx2token[value] = key
|
||||
|
||||
self.vocab_size = len(self.token2idx)
|
||||
embedding_matrix = np.array(embedding_matrix)
|
||||
# self.input = Input(shape=(self.len_max,), dtype='int32')
|
||||
self.input = tf.keras.layers.Input(shape=(self.len_max,), dtype='int32')
|
||||
|
||||
self.output = tf.keras.layers.Embedding(self.vocab_size,
|
||||
self.embed_size,
|
||||
input_length=self.len_max,
|
||||
weights=[embedding_matrix],
|
||||
trainable=self.trainable)(self.input)
|
||||
self.model = tf.keras.Model(self.input, self.output)
|
||||
# 保存字/词典
|
||||
save_json(json_lines=self.token2idx, json_path=os.path.join(self.path_model_dir, 'vocab.txt'))
|
||||
|
||||
|
||||
class BertEmbedding(BaseEmbedding):
|
||||
def __init__(self, hyper_parameters):
|
||||
self.layer_indexes = hyper_parameters['embedding'].get('layer_indexes', [12])
|
||||
super().__init__(hyper_parameters)
|
||||
|
||||
def build(self):
|
||||
import keras_bert
|
||||
|
||||
self.embedding_type = 'bert'
|
||||
config_path = os.path.join(self.corpus_path, 'bert_config.json')
|
||||
check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
|
||||
dict_path = os.path.join(self.corpus_path, 'vocab.txt')
|
||||
# logger.info('load bert model start!')
|
||||
model = keras_bert.load_trained_model_from_checkpoint(config_path,
|
||||
check_point_path,
|
||||
seq_len=self.len_max,
|
||||
trainable=self.trainable)
|
||||
# logger.info('load bert model success!')
|
||||
# bert model all layers
|
||||
layer_dict = [6]
|
||||
layer_0 = 7
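# each transformer block in keras_bert's graph appears to span 8 keras layers, so block i's output layer sits at index 7 + 8*i (an assumption read off this indexing)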
|
||||
for i in range(12):
|
||||
layer_0 = layer_0 + 8
|
||||
layer_dict.append(layer_0)
|
||||
logger.info(layer_dict)
|
||||
# 输出它本身
|
||||
if len(self.layer_indexes) == 0:
|
||||
encoder_layer = model.output
|
||||
# 分类如果只有一层,就只取最后那一层的weight;取得不正确,就默认取最后一层
|
||||
elif len(self.layer_indexes) == 1:
|
||||
if self.layer_indexes[0] in [i + 1 for i in range(13)]:
|
||||
encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
|
||||
else:
|
||||
encoder_layer = model.get_layer(index=layer_dict[-1]).output
|
||||
# 否则遍历需要取的层,把所有层的weight取出来并拼接起来shape:768*层数
|
||||
else:
|
||||
all_layers = [model.get_layer(index=layer_dict[lay - 1]).output if lay in [i + 1 for i in range(13)]
|
||||
else model.get_layer(index=layer_dict[-1]).output # 如果给出不正确,就默认输出最后一层
|
||||
for lay in self.layer_indexes]
|
||||
all_layers_select = []
|
||||
for all_layers_one in all_layers:
|
||||
all_layers_select.append(all_layers_one)
|
||||
encoder_layer = tf.keras.layers.Add()(all_layers_select)
|
||||
self.output = NonMaskingLayer()(encoder_layer)
|
||||
self.input = model.inputs
|
||||
self.model = tf.keras.Model(self.input, self.output)
|
||||
|
||||
self.embedding_size = self.model.output_shape[-1]
|
||||
|
||||
# reader tokenizer
|
||||
self.token_dict = {}
|
||||
with codecs.open(dict_path, 'r', 'utf8') as reader:
|
||||
for line in reader:
|
||||
token = line.strip()
|
||||
self.token_dict[token] = len(self.token_dict)
|
||||
self.vocab_size = len(self.token_dict)
|
||||
self.tokenizer = keras_bert.Tokenizer(self.token_dict)
|
||||
|
||||
def sentence2idx(self, text, second_text=None):
|
||||
text = extract_chinese(str(text).upper())
|
||||
input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max)
|
||||
input_mask = len([1 for ids in input_id if ids == 1])
|
||||
return input_id, input_type_id, input_mask
|
||||
# input_mask = [0 if ids == 0 else 1 for ids in input_id]
|
||||
# return input_id, input_type_id, input_mask
|
||||
# return input_id, input_type_id
|
||||
|
||||
|
||||
class AlbertEmbedding(BaseEmbedding):
|
||||
def __init__(self, hyper_parameters):
|
||||
self.layer_indexes = hyper_parameters['embedding'].get('layer_indexes', [12])
|
||||
super().__init__(hyper_parameters)
|
||||
|
||||
def build(self):
|
||||
from macropodus.network.layers.albert import load_brightmart_albert_zh_checkpoint
|
||||
import keras_bert
|
||||
|
||||
self.embedding_type = 'albert'
|
||||
dict_path = os.path.join(self.corpus_path, 'vocab.txt')
|
||||
# logger.info('load albert model start!')
|
||||
layer_real = [i for i in range(25)] + [-i for i in range(25)]
|
||||
# 简要判别一下
|
||||
self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
|
||||
self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
|
||||
training=self.trainable,
|
||||
seq_len=self.len_max,
|
||||
output_layers = None) # self.layer_indexes)
|
||||
# model_l = self.model.layers
|
||||
# logger.info('load albert model success!')
|
||||
# albert model all layers
|
||||
layer_dict = [4, 8, 11, 13]
|
||||
layer_0 = 13
|
||||
for i in range(20):
|
||||
layer_0 = layer_0 + 1
|
||||
layer_dict.append(layer_0)
|
||||
layer_dict.append(34)
|
||||
logger.info(layer_dict)
|
||||
# 输出它本身
|
||||
if len(self.layer_indexes) == 0:
|
||||
encoder_layer = self.model.output
|
||||
# 分类如果只有一层,就只取最后那一层的weight;取得不正确,就默认取最后一层
|
||||
elif len(self.layer_indexes) == 1:
|
||||
if self.layer_indexes[0] in layer_real:
|
||||
encoder_layer = self.model.get_layer(index=layer_dict[self.layer_indexes[0]]).output
|
||||
else:
|
||||
encoder_layer = self.model.get_layer(index=layer_dict[-2]).output
|
||||
# 否则遍历需要取的层,把所有层的weight取出来并拼接起来shape:768*层数
|
||||
else:
|
||||
all_layers = [self.model.get_layer(index=layer_dict[lay]).output if lay in layer_real
|
||||
else self.model.get_layer(index=layer_dict[-2]).output # 如果给出不正确,就默认输出最后一层
|
||||
for lay in self.layer_indexes]
|
||||
all_layers_select = []
|
||||
for all_layers_one in all_layers:
|
||||
all_layers_select.append(all_layers_one)
|
||||
encoder_layer = tf.keras.layers.Add()(all_layers_select)
|
||||
output = NonMaskingLayer()(encoder_layer)
|
||||
self.output = [output]
|
||||
# self.output = [encoder_layer]
|
||||
self.input = self.model.inputs
|
||||
self.model = tf.keras.Model(self.input, self.output)
|
||||
|
||||
# reader tokenizer
|
||||
self.token_dict = {}
|
||||
with codecs.open(dict_path, 'r', 'utf8') as reader:
|
||||
for line in reader:
|
||||
token = line.strip()
|
||||
self.token_dict[token] = len(self.token_dict)
|
||||
self.vocab_size = len(self.token_dict)
|
||||
self.tokenizer = keras_bert.Tokenizer(self.token_dict)
|
||||
|
||||
def sentence2idx(self, text, second_text=None):
|
||||
text = extract_chinese(str(text).upper())
|
||||
input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max)
|
||||
input_mask = len([1 for ids in input_id if ids ==1])
|
||||
return input_id, input_type_id, input_mask
|
||||
# return [input_id, input_type_id]
|
macropodus/network/base/graph.py (new file, 238 lines)
@@ -0,0 +1,238 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/12/3 20:51
|
||||
# @author :Mo
|
||||
# @function :graph of base
|
||||
|
||||
|
||||
from macropodus.network.preprocess.preprocess_generator import PreprocessGenerator
|
||||
from macropodus.network.layers.keras_lookahead import Lookahead
|
||||
from macropodus.preprocess.tools_common import save_json
|
||||
from macropodus.network.layers.keras_radam import RAdam
|
||||
from macropodus.conf.path_config import path_model_dir
|
||||
from macropodus.conf.path_log import get_logger_root
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
|
||||
logger = get_logger_root()
|
||||
|
||||
|
||||
class graph:
|
||||
def __init__(self, hyper_parameters):
|
||||
"""
|
||||
模型初始化
|
||||
:param hyper_parameters:json, json["model"] and json["embedding"]
|
||||
"""
|
||||
self.len_max = hyper_parameters.get("len_max", 50) # 文本最大长度
|
||||
self.embed_size = hyper_parameters.get("embed_size", 300) # 嵌入层尺寸
|
||||
self.trainable = hyper_parameters.get("trainable", False) # 是否微调, 例如静态词向量、动态词向量、微调bert层等, random也可以
|
||||
self.embedding_type = hyper_parameters.get("embedding_type", "word2vec") # 词嵌入方式,可以选择"xlnet"、"bert"、"gpt-2"、"word2vec"或者"None"
|
||||
self.gpu_memory_fraction = hyper_parameters.get("gpu_memory_fraction", None) # gpu使用率, 默认不配置
|
||||
self.hyper_parameters = hyper_parameters
|
||||
hyper_parameters_model = hyper_parameters["model"]
|
||||
self.label = hyper_parameters_model.get("label", 2) # 类型
|
||||
self.batch_size = hyper_parameters_model.get("batch_size", 32) # 批向量
|
||||
self.filters = hyper_parameters_model.get("filters", [3, 4, 5]) # 卷积核大小
|
||||
self.filters_num = hyper_parameters_model.get("filters_num", 300) # 核数
|
||||
self.channel_size = hyper_parameters_model.get("channel_size", 1) # 通道数
|
||||
self.dropout = hyper_parameters_model.get("dropout", 0.5) # dropout层系数,舍弃
|
||||
self.decay_step = hyper_parameters_model.get("decay_step", 100) # 衰减步数
|
||||
self.decay_rate = hyper_parameters_model.get("decay_rate", 0.9) # 衰减系数
|
||||
self.epochs = hyper_parameters_model.get("epochs", 20) # 训练轮次
|
||||
self.vocab_size = hyper_parameters_model.get("vocab_size", 20000) # 字典词典大小
|
||||
self.lr = hyper_parameters_model.get("lr", 1e-3) # 学习率
|
||||
self.l2 = hyper_parameters_model.get("l2", 1e-6) # l2正则化系数
|
||||
self.activate_rnn = hyper_parameters_model.get("activate_rnn", "tanh") # RNN激活函数, tanh, relu, signmod
|
||||
self.activate_classify = hyper_parameters_model.get("activate_classify", "softmax")  # 分类激活函数, softmax或者sigmoid
|
||||
self.loss = hyper_parameters_model.get("loss", "categorical_crossentropy") # 损失函数, mse, categorical_crossentropy, sparse_categorical_crossentropy, binary_crossentropy等
|
||||
self.metrics = hyper_parameters_model.get("metrics", "accuracy") # acc, binary_accuracy, categorical_accuracy, sparse_categorical_accuracy, sparse_top_k_categorical_accuracy
|
||||
self.is_training = hyper_parameters_model.get("is_training", False)  # 是否训练, 保存时候为False, 方便预测
|
||||
self.patience = hyper_parameters_model.get("patience", 3) # 早停, 2-3就可以了
|
||||
self.optimizer_name = hyper_parameters_model.get("optimizer_name", "RAdam,Lookahead")  # 优化器, 可选 "Adam"、"RAdam" 或 "RAdam,Lookahead"
|
||||
self.path_model_dir = hyper_parameters_model.get("path_model_dir", path_model_dir) # 模型目录
|
||||
self.path_fineture = os.path.join(self.path_model_dir, "embedding_trainable.h5") # embedding层保存地址, 例如静态词向量、动态词向量、微调bert层等
|
||||
self.path_model = os.path.join(self.path_model_dir, "model.h5") # 模型weight绝对地址
|
||||
self.path_hyper_parameters = os.path.join(self.path_model_dir, "params.json") # 超参数保存绝对地址
|
||||
self.path_model_l2i_i2l = os.path.join(self.path_model_dir, "l2i_i2l.json") # 模型字典保存绝对地址
|
||||
self.path_model_graph = os.path.join(self.path_model_dir, "graph.json") # 模型图结构绝对地址
|
||||
if self.gpu_memory_fraction:
|
||||
# keras, tensorflow控制GPU使用率等
|
||||
import tensorflow.python.keras.backend as K
|
||||
import tensorflow as tf
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction
|
||||
# config.gpu_options.allow_growth = True
|
||||
sess = tf.Session(config=config)
|
||||
K.set_session(sess)
|
||||
self.create_model(hyper_parameters) # 模型初始化
|
||||
if self.is_training: # 是否是训练阶段, 与预测区分开
|
||||
self.create_compile()
|
||||
self.save_graph() # 保存图结构
|
||||
|
||||
def create_model(self, hyper_parameters):
|
||||
"""
|
||||
构建神经网络
|
||||
:param hyper_parameters: json,超参数
|
||||
:return:
|
||||
"""
|
||||
# embeddings选择
|
||||
if self.embedding_type == "albert":
|
||||
from macropodus.network.base.embedding import AlbertEmbedding as Embeddings
|
||||
elif self.embedding_type == "random":
|
||||
from macropodus.network.base.embedding import RandomEmbedding as Embeddings
|
||||
elif self.embedding_type == "word2vec":
|
||||
from macropodus.network.base.embedding import WordEmbedding as Embeddings
|
||||
elif self.embedding_type == "bert":
|
||||
from macropodus.network.base.embedding import BertEmbedding as Embeddings
|
||||
else:
|
||||
raise RuntimeError("your input embedding_type is wrong, it must be 'random'、 'bert'、 'albert' or 'word2vec'")
|
||||
# 构建网络层
|
||||
self.word_embedding = Embeddings(hyper_parameters=hyper_parameters)
|
||||
if os.path.exists(self.path_fineture) and self.trainable:
|
||||
self.word_embedding.model.load_weights(self.path_fineture)
|
||||
print("load path_fineture ok!")
|
||||
self.model = None
|
||||
|
||||
def callback(self):
|
||||
"""
|
||||
评价函数、早停
|
||||
:return: callback
|
||||
"""
|
||||
cb_em = [
|
||||
tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode="min", min_delta=1e-8, patience=self.patience),
|
||||
tf.keras.callbacks.ModelCheckpoint(monitor="val_loss", mode="min", filepath=self.path_model, verbose=1,
|
||||
save_best_only=True, save_weights_only=False), ]
|
||||
return cb_em
|
||||
|
||||
def create_compile(self):
|
||||
"""
|
||||
构建优化器、损失函数和评价函数
|
||||
:return:
|
||||
"""
|
||||
if self.optimizer_name.upper() == "ADAM":
|
||||
self.model.compile(optimizer=tf.keras.optimizers.Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
loss=self.loss,
|
||||
metrics=[self.metrics]) # Any optimize
|
||||
elif self.optimizer_name.upper() == "RADAM":
|
||||
self.model.compile(optimizer=RAdam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
loss=self.loss,
|
||||
metrics=[self.metrics]) # Any optimize
|
||||
else:
|
||||
self.model.compile(optimizer=RAdam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
loss=self.loss,
|
||||
metrics=[self.metrics]) # Any optimize
|
||||
lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead
|
||||
lookahead.inject(self.model) # add into model
|
||||
|
||||
def fit(self, x_train, y_train, x_dev, y_dev):
|
||||
"""
|
||||
训练
|
||||
:param x_train:
|
||||
:param y_train:
|
||||
:param x_dev:
|
||||
:param y_dev:
|
||||
:return:
|
||||
"""
|
||||
# 保存超参数
|
||||
self.hyper_parameters["model"]["is_training"] = False # 预测时候这些设为False
|
||||
self.hyper_parameters["model"]["trainable"] = False
|
||||
self.hyper_parameters["model"]["dropout"] = 1.0
|
||||
|
||||
save_json(json_lines=self.hyper_parameters, json_path=self.path_hyper_parameters)
|
||||
# 训练模型
|
||||
self.model.fit(x_train, y_train, batch_size=self.batch_size,
|
||||
epochs=self.epochs, validation_data=(x_dev, y_dev),
|
||||
shuffle=True,
|
||||
callbacks=self.callback())
|
||||
# 保存embedding, 动态的
|
||||
if self.trainable:
|
||||
self.word_embedding.model.save(self.path_fineture)
|
||||
|
||||
def fit_generator(self, embed, rate=1):
|
||||
"""
|
||||
|
||||
:param embed: class, 词向量嵌入对象(albert/bert/word2vec/random)
|
||||
:param rate: float, like 0.9, 读取训练数据的比例
|
||||
:return:
|
||||
"""
|
||||
# 保存超参数
|
||||
self.hyper_parameters["model"]["is_training"] = False # 预测时候这些设为False
|
||||
self.hyper_parameters["model"]["trainable"] = False
|
||||
self.hyper_parameters["model"]["dropout"] = 1.0
|
||||
|
||||
save_json(json_lines=self.hyper_parameters, json_path=self.path_hyper_parameters)
|
||||
|
||||
pg = PreprocessGenerator(self.path_model_l2i_i2l)
|
||||
_, len_train = pg.preprocess_label2set(self.hyper_parameters["data"]["train_data"])
|
||||
data_fit_generator = pg.preprocess_label_question_to_idx_fit_generator(embedding_type=self.hyper_parameters["embedding_type"],
|
||||
batch_size=self.batch_size,
|
||||
path=self.hyper_parameters["data"]["train_data"],
|
||||
embed=embed,
|
||||
rate=rate)
|
||||
_, len_val = pg.preprocess_label2set(self.hyper_parameters["data"]["val_data"])
|
||||
data_dev_generator = pg.preprocess_label_question_to_idx_fit_generator(embedding_type=self.hyper_parameters["embedding_type"],
|
||||
batch_size=self.batch_size,
|
||||
path=self.hyper_parameters["data"]["val_data"],
|
||||
embed=embed,
|
||||
rate=rate)
|
||||
steps_per_epoch = len_train // self.batch_size
|
||||
validation_steps = len_val // self.batch_size
|
||||
# 训练模型
|
||||
self.model.fit_generator(generator=data_fit_generator,
|
||||
validation_data=data_dev_generator,
|
||||
callbacks=self.callback(),
|
||||
epochs=self.epochs,
|
||||
steps_per_epoch=steps_per_epoch,
|
||||
validation_steps=validation_steps)
|
||||
# 保存embedding, 动态的
|
||||
if self.trainable:
|
||||
self.word_embedding.model.save(self.path_fineture)
|
||||
|
||||
def save_graph(self):
|
||||
"""
|
||||
模型图保存
|
||||
:return: None
|
||||
"""
|
||||
# # 序列化模型embidding
|
||||
# import pickle
|
||||
# file_fineture = open(self.path_fineture, "wb")
|
||||
# pickle.dumps(self.word_embedding.sentence2idx, file_fineture)
|
||||
# 序列化模型graph
|
||||
json_string = self.model.to_json()
|
||||
open(self.path_model_graph, "w", encoding="utf-8").write(json_string)
|
||||
|
||||
def load_model(self):
|
||||
"""
|
||||
模型加载
|
||||
:return: None
|
||||
"""
|
||||
logger.info("load_model start!")
|
||||
self.model.load_weights(self.path_model)
|
||||
logger.info("load_model end!")
|
||||
|
||||
def predict(self, sen):
|
||||
"""
|
||||
预测
|
||||
:param sen:
|
||||
:return:
|
||||
"""
|
||||
if self.embedding_type in ["bert", "albert"]:
|
||||
if type(sen) == np.ndarray:
|
||||
sen = sen.tolist()
|
||||
elif type(sen) == list:
|
||||
sen = sen
|
||||
else:
|
||||
raise RuntimeError("your input sen is wrong, it must be type of list or np.array")
|
||||
return self.model.predict(sen)
|
||||
else:
|
||||
if type(sen) == np.ndarray:
|
||||
sen = sen
|
||||
elif type(sen) == list:
|
||||
sen = np.array([sen])
|
||||
else:
|
||||
raise RuntimeError("your input sen is wrong, it must be type of list or np.array")
|
||||
return self.model.predict(sen)
|
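For reference, an assumed sketch (not part of the commit) of the smallest `hyper_parameters` dict the `graph` base class above reads; the keys mirror the `.get()` calls in `__init__`, and the values shown are only illustrative.
```python3
hyper_parameters = {
    "len_max": 128,                  # 文本最大长度
    "embed_size": 300,
    "embedding_type": "word2vec",    # or "random", "bert", "albert"
    "trainable": False,
    "model": {
        "label": 4,
        "batch_size": 32,
        "epochs": 20,
        "lr": 1e-3,
        "optimizer_name": "RAdam,Lookahead",
        "is_training": True,
    },
}
# e.g. graph_model = BiLSTMGraph(hyper_parameters)  # see bilstm.py below
```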
macropodus/network/graph/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 22:32
|
||||
# @author : Mo
|
||||
# @function:
|
macropodus/network/graph/bilstm.py (new file, 45 lines)
@@ -0,0 +1,45 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/19 22:30
|
||||
# @author : Mo
|
||||
# @function: Bi-LSTM
|
||||
|
||||
|
||||
from macropodus.network.base.graph import graph
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class BiLSTMGraph(graph):
|
||||
def __init__(self, hyper_parameters):
|
||||
"""
|
||||
初始化
|
||||
:param hyper_parameters: json,超参
|
||||
"""
|
||||
self.num_rnn_layers = hyper_parameters['model'].get('num_rnn_layers', 1)
|
||||
self.rnn_type = hyper_parameters['model'].get('rnn_type', 'LSTM')
|
||||
self.rnn_units = hyper_parameters['model'].get('rnn_units', 256)
|
||||
super().__init__(hyper_parameters)
|
||||
|
||||
def create_model(self, hyper_parameters):
|
||||
"""
|
||||
构建神经网络
|
||||
:param hyper_parameters:json, hyper parameters of network
|
||||
:return: tensor, model
|
||||
"""
|
||||
super().create_model(hyper_parameters)
|
||||
self.rnn_layer = {'LSTM':tf.keras.layers.LSTM, 'GRU':tf.keras.layers.GRU}[self.rnn_type]
|
||||
x = self.word_embedding.output
|
||||
# Bi-LSTM
|
||||
for nrl in range(self.num_rnn_layers):
|
||||
x = tf.keras.layers.Bidirectional(self.rnn_layer(units=self.rnn_units,
|
||||
return_sequences=True,
|
||||
activation=self.activate_rnn,
|
||||
kernel_regularizer=tf.keras.regularizers.l2(self.l2),
|
||||
recurrent_regularizer=tf.keras.regularizers.l2(self.l2)
|
||||
))(x)
|
||||
x = tf.keras.layers.Dropout(self.dropout)(x)
|
||||
x_time = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(self.label, name='layer_crf_dense'))(x)
|
||||
x_act = tf.keras.layers.Activation(activation=self.activate_classify)(x_time)
|
||||
self.output = x_act
|
||||
self.model = tf.keras.Model(self.word_embedding.input, self.output)
|
||||
self.model.summary(132)
|
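A sketch (assumed, not from the commit) of the bare tf.keras stack that `BiLSTMGraph.create_model` builds above, for a single RNN layer; `x` stands for the embedding output of shape (batch, len_max, embed_size) and the hyperparameter values are illustrative.
```python3
import tensorflow as tf

def bilstm_tagger_head(x, rnn_units=256, n_labels=4, dropout=0.5, l2=1e-6):
    # Bidirectional LSTM over the full sequence, with L2 on kernel and recurrent weights
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(rnn_units, return_sequences=True,
                             kernel_regularizer=tf.keras.regularizers.l2(l2),
                             recurrent_regularizer=tf.keras.regularizers.l2(l2)))(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    # per-token projection to label logits, then softmax
    x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_labels))(x)
    return tf.keras.layers.Activation("softmax")(x)  # (batch, len_max, n_labels)
```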
macropodus/network/graph/bilstm_crf.py (new file, 87 lines)
@@ -0,0 +1,87 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/6 20:45
|
||||
# @author : Mo
|
||||
# @function: Bi-LSTM-CRF
|
||||
|
||||
|
||||
from macropodus.network.layers.keras_lookahead import Lookahead
|
||||
from macropodus.network.layers.keras_radam import RAdam
|
||||
from macropodus.network.base.graph import graph
|
||||
from macropodus.network.layers.crf import CRF
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class BilstmCRFGraph(graph):
|
||||
def __init__(self, hyper_parameters):
|
||||
"""
|
||||
初始化
|
||||
:param hyper_parameters: json,超参
|
||||
"""
|
||||
self.num_rnn_layers = hyper_parameters['model'].get('num_rnn_layers', 1) # 1, 2, 3
|
||||
self.rnn_type = hyper_parameters['model'].get('rnn_type', 'LSTM') # 'LSTM', 'GRU'
|
||||
self.rnn_units = hyper_parameters['model'].get('rnn_units', 512) # 128, 256, 512, 768, 1024
|
||||
self.crf_mode = hyper_parameters['model'].get('crf_mode', 'reg') # "reg", pad
|
||||
self.supports_masking = hyper_parameters['model'].get('supports_masking', True) # True or False
|
||||
super().__init__(hyper_parameters)
|
||||
|
||||
def create_model(self, hyper_parameters):
|
||||
"""
|
||||
构建神经网络
|
||||
:param hyper_parameters:json, hyper parameters of network
|
||||
:return: tensor, model
|
||||
"""
|
||||
super().create_model(hyper_parameters)
|
||||
# LSTM or GRU
|
||||
self.rnn_layer = {'LSTM':tf.keras.layers.LSTM, 'GRU':tf.keras.layers.GRU}[self.rnn_type]
|
||||
x = self.word_embedding.output
|
||||
# Bi-LSTM
|
||||
for nrl in range(self.num_rnn_layers):
|
||||
x = tf.keras.layers.Bidirectional(self.rnn_layer(units=self.rnn_units,
|
||||
return_sequences=True,
|
||||
activation=self.activate_rnn,
|
||||
kernel_regularizer=tf.keras.regularizers.l2(self.l2 * 0.1),
|
||||
recurrent_regularizer=tf.keras.regularizers.l2(self.l2)
|
||||
))(x)
|
||||
x = tf.keras.layers.Dropout(self.dropout)(x)
|
||||
# crf, 'pad' or 'reg'
|
||||
if self.crf_mode == "pad":
|
||||
# length of real sentence
|
||||
x_mask = tf.keras.layers.Input(shape=(1), dtype=tf.int32)
|
||||
self.crf = CRF(self.label, mode='pad', supports_masking=True, name='crf')
|
||||
tensor = tf.keras.layers.Dense(self.label, name='crf_dense')(x)
|
||||
self.output = self.crf([tensor, x_mask])
|
||||
if self.embedding_type in ["bert", "albert"]:
|
||||
self.inputs = [self.word_embedding.input[0], self.word_embedding.input[1], x_mask]
|
||||
else:
|
||||
self.inputs = [self.word_embedding.input, x_mask]
|
||||
else:
|
||||
self.crf = CRF(self.label, mode='reg', name='crf')
|
||||
tensor = tf.keras.layers.Dense(self.label, name='crf_dense')(x)
|
||||
self.output = self.crf(tensor)
|
||||
if self.embedding_type in ["bert", "albert"]:
|
||||
self.inputs = self.word_embedding.input
|
||||
else:
|
||||
self.inputs = self.word_embedding.input
|
||||
self.model = tf.keras.Model(self.inputs, self.output)
|
||||
self.model.summary(132)
|
||||
|
||||
def create_compile(self):
|
||||
"""
|
||||
构建优化器、损失函数和评价函数
|
||||
:return:
|
||||
"""
|
||||
if self.optimizer_name.upper() == "ADAM":
|
||||
self.model.compile(optimizer=tf.keras.optimizers.Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
loss=self.crf.loss,
|
||||
metrics=[self.crf.viterbi_accuracy]) # Any optimize, [self.metrics])
|
||||
elif self.optimizer_name.upper() == "RADAM":
|
||||
self.model.compile(optimizer=RAdam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
loss=self.crf.loss,
|
||||
metrics=[self.crf.viterbi_accuracy]) # Any optimize
|
||||
else:
|
||||
self.model.compile(optimizer=RAdam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
loss=self.crf.loss,
|
||||
metrics=[self.crf.viterbi_accuracy]) # Any optimize
|
||||
lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead
|
||||
lookahead.inject(self.model) # add into model
|
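The two `crf_mode` settings above wire the CRF differently; below is an assumed summary sketch (parameter names follow the code above, values are illustrative).
```python3
# 'reg': the CRF consumes only the dense projection; sequence lengths are inferred.
#        inputs = word_embedding.input,            output = crf(Dense(label)(x))
# 'pad': an extra Input carries the real sentence length used for masking.
#        inputs = word_embedding.input + [x_mask], output = crf([Dense(label)(x), x_mask])
hyper_parameters_model = {
    "label": 4, "rnn_type": "LSTM", "rnn_units": 512,
    "crf_mode": "reg",            # or "pad"
    "supports_masking": True, "is_training": True,
}
```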
macropodus/network/layers/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/3 20:43
|
||||
# @author : Mo
|
||||
# @function:
|
macropodus/network/layers/albert.py (new file, 331 lines)
@@ -0,0 +1,331 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/22 20:35
|
||||
# @author :TinkerMob
|
||||
# @function :keras_albert_model
|
||||
# @codefrom :https://github.com/TinkerMob/keras_albert_model
|
||||
|
||||
|
||||
from keras_adaptive_softmax import AdaptiveEmbedding, AdaptiveSoftmax
|
||||
from keras_bert import get_custom_objects as get_bert_custom_objects
|
||||
from keras_position_wise_feed_forward import FeedForward
|
||||
from keras_layer_normalization import LayerNormalization
|
||||
from keras_bert.activations.gelu_fallback import gelu
|
||||
from keras_multi_head import MultiHeadAttention
|
||||
from keras_bert.layers import Masked, Extract
|
||||
from keras_pos_embd import PositionEmbedding
|
||||
from keras_bert.backend import keras
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
__all__ = [
|
||||
'get_custom_objects', 'build_albert',
|
||||
'load_brightmart_albert_zh_checkpoint',
|
||||
]
|
||||
|
||||
|
||||
def get_custom_objects():
|
||||
custom_objects = get_bert_custom_objects()
|
||||
custom_objects['AdaptiveEmbedding'] = AdaptiveEmbedding
|
||||
custom_objects['AdaptiveSoftmax'] = AdaptiveSoftmax
|
||||
return custom_objects
|
||||
|
||||
|
||||
def build_albert(token_num,
|
||||
pos_num=512,
|
||||
seq_len=512,
|
||||
embed_dim=128,
|
||||
hidden_dim=768,
|
||||
transformer_num=12,
|
||||
head_num=12,
|
||||
feed_forward_dim=3072,
|
||||
dropout_rate=0.1,
|
||||
attention_activation=None,
|
||||
feed_forward_activation='gelu',
|
||||
training=True,
|
||||
trainable=None,
|
||||
output_layers=None):
|
||||
"""Get ALBERT model.
|
||||
See: https://arxiv.org/pdf/1909.11942.pdf
|
||||
:param token_num: Number of tokens.
|
||||
:param pos_num: Maximum position.
|
||||
:param seq_len: Maximum length of the input sequence or None.
|
||||
:param embed_dim: Dimensions of embeddings.
|
||||
:param hidden_dim: Dimensions of hidden layers.
|
||||
:param transformer_num: Number of transformers.
|
||||
:param head_num: Number of heads in multi-head attention
|
||||
in each transformer.
|
||||
:param feed_forward_dim: Dimension of the feed forward layer
|
||||
in each transformer.
|
||||
:param dropout_rate: Dropout rate.
|
||||
:param attention_activation: Activation for attention layers.
|
||||
:param feed_forward_activation: Activation for feed-forward layers.
|
||||
:param training: A built model with MLM and NSP outputs will be returned
|
||||
if it is `True`, otherwise the input layers and the last
|
||||
feature extraction layer will be returned.
|
||||
:param trainable: Whether the model is trainable.
|
||||
:param output_layers: A list of indices of output layers.
|
||||
"""
|
||||
if attention_activation == 'gelu':
|
||||
attention_activation = gelu
|
||||
if feed_forward_activation == 'gelu':
|
||||
feed_forward_activation = gelu
|
||||
if trainable is None:
|
||||
trainable = training
|
||||
|
||||
def _trainable(_layer):
|
||||
if isinstance(trainable, (list, tuple, set)):
|
||||
for prefix in trainable:
|
||||
if _layer.name.startswith(prefix):
|
||||
return True
|
||||
return False
|
||||
return trainable
|
||||
|
||||
# Build inputs
|
||||
input_token = keras.layers.Input(shape=(seq_len,), name='Input-Token')
|
||||
input_segment = keras.layers.Input(shape=(seq_len,), name='Input-Segment')
|
||||
inputs = [input_token, input_segment]
|
||||
|
||||
# Build embeddings
|
||||
embed_token, embed_weights, embed_projection = AdaptiveEmbedding(
|
||||
input_dim=token_num,
|
||||
output_dim=hidden_dim,
|
||||
embed_dim=embed_dim,
|
||||
mask_zero=True,
|
||||
trainable=trainable,
|
||||
return_embeddings=True,
|
||||
return_projections=True,
|
||||
name='Embed-Token',
|
||||
)(input_token)
|
||||
embed_segment = keras.layers.Embedding(
|
||||
input_dim=2,
|
||||
output_dim=hidden_dim,
|
||||
trainable=trainable,
|
||||
name='Embed-Segment',
|
||||
)(input_segment)
|
||||
embed_layer = keras.layers.Add(name='Embed-Token-Segment')(
|
||||
[embed_token, embed_segment])
|
||||
embed_layer = PositionEmbedding(
|
||||
input_dim=pos_num,
|
||||
output_dim=hidden_dim,
|
||||
mode=PositionEmbedding.MODE_ADD,
|
||||
trainable=trainable,
|
||||
name='Embedding-Position',
|
||||
)(embed_layer)
|
||||
|
||||
if dropout_rate > 0.0:
|
||||
dropout_layer = keras.layers.Dropout(
|
||||
rate=dropout_rate,
|
||||
name='Embedding-Dropout',
|
||||
)(embed_layer)
|
||||
else:
|
||||
dropout_layer = embed_layer
|
||||
embed_layer = LayerNormalization(
|
||||
trainable=trainable,
|
||||
name='Embedding-Norm',
|
||||
)(dropout_layer)
|
||||
|
||||
# Build shared transformer
|
||||
attention_layer = MultiHeadAttention(
|
||||
head_num=head_num,
|
||||
activation=attention_activation,
|
||||
name='Attention',
|
||||
)
|
||||
attention_normal = LayerNormalization(name='Attention-Normal')
|
||||
feed_forward_layer = FeedForward(
|
||||
units=feed_forward_dim,
|
||||
activation=feed_forward_activation,
|
||||
name='Feed-Forward'
|
||||
)
|
||||
feed_forward_normal = LayerNormalization(name='Feed-Forward-Normal')
|
||||
|
||||
transformed = embed_layer
|
||||
transformed_layers = []
|
||||
for i in range(transformer_num):
|
||||
attention_input = transformed
|
||||
transformed = attention_layer(transformed)
|
||||
if dropout_rate > 0.0:
|
||||
transformed = keras.layers.Dropout(
|
||||
rate=dropout_rate,
|
||||
name='Attention-Dropout-{}'.format(i + 1),
|
||||
)(transformed)
|
||||
transformed = keras.layers.Add(
|
||||
name='Attention-Add-{}'.format(i + 1),
|
||||
)([attention_input, transformed])
|
||||
transformed = attention_normal(transformed)
|
||||
|
||||
feed_forward_input = transformed
|
||||
transformed = feed_forward_layer(transformed)
|
||||
if dropout_rate > 0.0:
|
||||
transformed = keras.layers.Dropout(
|
||||
rate=dropout_rate,
|
||||
name='Feed-Forward-Dropout-{}'.format(i + 1),
|
||||
)(transformed)
|
||||
transformed = keras.layers.Add(
|
||||
name='Feed-Forward-Add-{}'.format(i + 1),
|
||||
)([feed_forward_input, transformed])
|
||||
transformed = feed_forward_normal(transformed)
|
||||
transformed_layers.append(transformed)
|
||||
|
||||
if training:
|
||||
# Build tasks
|
||||
mlm_dense_layer = keras.layers.Dense(
|
||||
units=hidden_dim,
|
||||
activation=feed_forward_activation,
|
||||
name='MLM-Dense',
|
||||
)(transformed)
|
||||
mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
|
||||
mlm_pred_layer = AdaptiveSoftmax(
|
||||
input_dim=hidden_dim,
|
||||
output_dim=token_num,
|
||||
embed_dim=embed_dim,
|
||||
bind_embeddings=True,
|
||||
bind_projections=True,
|
||||
name='MLM-Sim',
|
||||
)([mlm_norm_layer, embed_weights, embed_projection])
|
||||
masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
|
||||
extract_layer = Extract(index=0, name='Extract')(transformed)
|
||||
nsp_dense_layer = keras.layers.Dense(
|
||||
units=hidden_dim,
|
||||
activation='tanh',
|
||||
name='SOP-Dense',
|
||||
)(extract_layer)
|
||||
nsp_pred_layer = keras.layers.Dense(
|
||||
units=2,
|
||||
activation='softmax',
|
||||
name='SOP',
|
||||
)(nsp_dense_layer)
|
||||
model = keras.models.Model(
|
||||
inputs=inputs,
|
||||
outputs=[masked_layer, nsp_pred_layer])
|
||||
for layer in model.layers:
|
||||
layer.trainable = _trainable(layer)
|
||||
return model
|
||||
if output_layers is not None:
|
||||
if isinstance(output_layers, list):
|
||||
output_layers = [
|
||||
transformed_layers[index] for index in output_layers]
|
||||
output = keras.layers.Concatenate(
|
||||
name='Output',
|
||||
)(output_layers)
|
||||
else:
|
||||
output = transformed_layers[output_layers]
|
||||
model = keras.models.Model(inputs=inputs, outputs=output)
|
||||
return model
|
||||
model = keras.models.Model(inputs=inputs, outputs=transformed)
|
||||
for layer in model.layers:
|
||||
layer.trainable = _trainable(layer)
|
||||
return inputs, transformed
|
||||
|
||||
|
||||
def load_brightmart_albert_zh_checkpoint(checkpoint_path, **kwargs):
|
||||
"""Load checkpoint from https://github.com/brightmart/albert_zh
|
||||
:param checkpoint_path: path to checkpoint folder.
|
||||
:param kwargs: arguments for albert model.
|
||||
:return:
|
||||
"""
|
||||
config = {}
|
||||
for file_name in os.listdir(checkpoint_path):
|
||||
if file_name.startswith('bert_config.json'):
|
||||
with open(os.path.join(checkpoint_path, file_name)) as reader:
|
||||
config = json.load(reader)
|
||||
break
|
||||
|
||||
def _set_if_not_existed(key, value):
|
||||
if key not in kwargs:
|
||||
kwargs[key] = value
|
||||
|
||||
# 修改部分, 必须在kwargs中传入training参数(len_max相关代码已注释)
|
||||
training = kwargs['training']
|
||||
# config['max_position_embeddings'] = config['max_position_embeddings'] = kwargs['len_max']
|
||||
_set_if_not_existed('training', True)
|
||||
_set_if_not_existed('token_num', config['vocab_size'])
|
||||
_set_if_not_existed('pos_num', config['max_position_embeddings'])
|
||||
_set_if_not_existed('seq_len', config['max_position_embeddings'])
|
||||
_set_if_not_existed('embed_dim', config['embedding_size'])
|
||||
_set_if_not_existed('hidden_dim', config['hidden_size'])
|
||||
_set_if_not_existed('transformer_num', config['num_hidden_layers'])
|
||||
_set_if_not_existed('head_num', config['num_attention_heads'])
|
||||
_set_if_not_existed('feed_forward_dim', config['intermediate_size'])
|
||||
_set_if_not_existed('dropout_rate', config['hidden_dropout_prob'])
|
||||
_set_if_not_existed('feed_forward_activation', config['hidden_act'])
|
||||
|
||||
model = build_albert(**kwargs)
|
||||
if not training:
|
||||
inputs, outputs = model
|
||||
model = keras.models.Model(inputs, outputs)
|
||||
|
||||
def _checkpoint_loader(checkpoint_file):
|
||||
def _loader(name):
|
||||
return tf.train.load_variable(checkpoint_file, name)
|
||||
return _loader
|
||||
|
||||
loader = _checkpoint_loader(
|
||||
os.path.join(checkpoint_path, 'bert_model.ckpt'))
|
||||
|
||||
model.get_layer(name='Embed-Token').set_weights([
|
||||
loader('bert/embeddings/word_embeddings'),
|
||||
loader('bert/embeddings/word_embeddings_2'),
|
||||
])
|
||||
model.get_layer(name='Embed-Segment').set_weights([
|
||||
loader('bert/embeddings/token_type_embeddings'),
|
||||
])
|
||||
model.get_layer(name='Embedding-Position').set_weights([
|
||||
loader('bert/embeddings/position_embeddings'),
|
||||
])
|
||||
model.get_layer(name='Embedding-Norm').set_weights([
|
||||
loader('bert/embeddings/LayerNorm/gamma'),
|
||||
loader('bert/embeddings/LayerNorm/beta'),
|
||||
])
|
||||
|
||||
model.get_layer(name='Attention').set_weights([
|
||||
loader('bert/encoder/layer_shared/attention/self/query/kernel'),
|
||||
loader('bert/encoder/layer_shared/attention/self/query/bias'),
|
||||
loader('bert/encoder/layer_shared/attention/self/key/kernel'),
|
||||
loader('bert/encoder/layer_shared/attention/self/key/bias'),
|
||||
loader('bert/encoder/layer_shared/attention/self/value/kernel'),
|
||||
loader('bert/encoder/layer_shared/attention/self/value/bias'),
|
||||
loader('bert/encoder/layer_shared/attention/output/dense/kernel'),
|
||||
loader('bert/encoder/layer_shared/attention/output/dense/bias'),
|
||||
])
|
||||
model.get_layer(name='Attention-Normal').set_weights([
|
||||
loader('bert/encoder/layer_shared/attention/output/LayerNorm/gamma'),
|
||||
loader('bert/encoder/layer_shared/attention/output/LayerNorm/beta'),
|
||||
])
|
||||
model.get_layer(name='Feed-Forward').set_weights([
|
||||
loader('bert/encoder/layer_shared/intermediate/dense/kernel'),
|
||||
loader('bert/encoder/layer_shared/intermediate/dense/bias'),
|
||||
loader('bert/encoder/layer_shared/output/dense/kernel'),
|
||||
loader('bert/encoder/layer_shared/output/dense/bias'),
|
||||
])
|
||||
model.get_layer(name='Feed-Forward-Normal').set_weights([
|
||||
loader('bert/encoder/layer_shared/output/LayerNorm/gamma'),
|
||||
loader('bert/encoder/layer_shared/output/LayerNorm/beta'),
|
||||
])
|
||||
|
||||
if training:
|
||||
model.get_layer(name='MLM-Dense').set_weights([
|
||||
loader('cls/predictions/transform/dense/kernel'),
|
||||
loader('cls/predictions/transform/dense/bias'),
|
||||
])
|
||||
model.get_layer(name='MLM-Norm').set_weights([
|
||||
loader('cls/predictions/transform/LayerNorm/gamma'),
|
||||
loader('cls/predictions/transform/LayerNorm/beta'),
|
||||
])
|
||||
model.get_layer(name='MLM-Sim').set_weights([
|
||||
loader('cls/predictions/output_bias'),
|
||||
])
|
||||
|
||||
model.get_layer(name='SOP-Dense').set_weights([
|
||||
loader('bert/pooler/dense/kernel'),
|
||||
loader('bert/pooler/dense/bias'),
|
||||
])
|
||||
model.get_layer(name='SOP').set_weights([
|
||||
np.transpose(loader('cls/seq_relationship/output_weights')),
|
||||
loader('cls/seq_relationship/output_bias'),
|
||||
])
|
||||
|
||||
return model
|
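For reference, a sketch of calling the loader above; the checkpoint folder path is a placeholder and must contain `bert_config.json` and `bert_model.ckpt` as read by the code, and `training` has to be passed explicitly because the loader reads `kwargs['training']` before applying defaults.
```python3
from macropodus.network.layers.albert import load_brightmart_albert_zh_checkpoint

albert = load_brightmart_albert_zh_checkpoint(
    "/path/to/albert_zh",   # placeholder: folder with bert_config.json + bert_model.ckpt
    training=False,         # return the feature-extraction model instead of MLM/SOP heads
)
albert.summary()
```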
macropodus/network/layers/crf.py (new file, 128 lines)
@@ -0,0 +1,128 @@
|
||||
# encoding: utf-8
|
||||
|
||||
# author: BrikerMan
|
||||
# contact: eliyar917@gmail.com
|
||||
# blog: https://eliyar.biz
|
||||
# code from:
|
||||
|
||||
# file: crf.py
|
||||
# time: 2019-06-28 14:33
|
||||
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class CRF(tf.keras.layers.Layer):
|
||||
"""
|
||||
Conditional Random Field layer (tf.keras)
|
||||
`CRF` can be used as the last layer in a network (as a classifier). Input shape (features)
|
||||
must be equal to the number of classes the CRF can predict (a linear layer is recommended).
|
||||
Note: the loss and accuracy functions of networks using `CRF` must
|
||||
use the provided loss and accuracy functions (denoted as loss and viterbi_accuracy)
|
||||
as the classification of sequences are used with the layers internal weights.
|
||||
Args:
|
||||
output_dim (int): the number of labels to tag each temporal input.
|
||||
Input shape:
|
||||
nD tensor with shape `(batch_size, sentence length, num_classes)`.
|
||||
Output shape:
|
||||
nD tensor with shape: `(batch_size, sentence length, num_classes)`.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
output_dim,
|
||||
mode='reg',
|
||||
supports_masking=False,
|
||||
transitions=None,
|
||||
**kwargs):
|
||||
self.transitions = None
|
||||
super(CRF, self).__init__(**kwargs)
|
||||
self.output_dim = int(output_dim)
|
||||
self.mode = mode
|
||||
if self.mode == 'pad':
|
||||
self.input_spec = [tf.keras.layers.InputSpec(min_ndim=3), tf.keras.layers.InputSpec(min_ndim=2)]
|
||||
elif self.mode == 'reg':
|
||||
self.input_spec = tf.keras.layers.InputSpec(min_ndim=3)
|
||||
else:
|
||||
raise ValueError
|
||||
self.supports_masking = supports_masking
|
||||
self.sequence_lengths = None
|
||||
|
||||
def get_config(self):
|
||||
config = {
|
||||
'output_dim': self.output_dim,
|
||||
'mode': self.mode,
|
||||
'supports_masking': self.supports_masking,
|
||||
'transitions': tf.keras.backend.eval(self.transitions)
|
||||
}
|
||||
base_config = super(CRF, self).get_config()
|
||||
return dict(list(base_config.items()) + list(config.items()))
|
||||
|
||||
def build(self, input_shape):
|
||||
if self.mode == 'pad':
|
||||
assert len(input_shape) == 2
|
||||
assert len(input_shape[0]) == 3
|
||||
assert len(input_shape[1]) == 2
|
||||
f_shape = tf.TensorShape(input_shape[0])
|
||||
input_spec = [tf.keras.layers.InputSpec(min_ndim=3, axes={-1: f_shape[-1]}),
|
||||
tf.keras.layers.InputSpec(min_ndim=2, axes={-1: 1}, dtype=tf.int32)]
|
||||
else:
|
||||
assert len(input_shape) == 3
|
||||
f_shape = tf.TensorShape(input_shape)
|
||||
input_spec = tf.keras.layers.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})
|
||||
|
||||
if f_shape[-1] is None:
|
||||
raise ValueError('The last dimension of the inputs to `CRF` should be defined. Found `None`.')
|
||||
if f_shape[-1] != self.output_dim:
|
||||
raise ValueError('The last dimension of the input shape must be equal to output shape. '
|
||||
'Use a linear layer if needed.')
|
||||
self.input_spec = input_spec
|
||||
self.transitions = self.add_weight(name='transitions',
|
||||
shape=[self.output_dim, self.output_dim],
|
||||
initializer='glorot_uniform',
|
||||
trainable=True)
|
||||
self.built = True
|
||||
|
||||
def call(self, inputs, **kwargs):
|
||||
if self.mode == 'pad':
|
||||
sequences = tf.convert_to_tensor(inputs[0], dtype=self.dtype)
|
||||
self.sequence_lengths = tf.keras.backend.flatten(inputs[-1])
|
||||
else:
|
||||
sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
|
||||
shape = tf.shape(inputs)
|
||||
self.sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
|
||||
viterbi_sequence, _ = tf.contrib.crf.crf_decode(sequences, self.transitions,
|
||||
self.sequence_lengths)
|
||||
output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
|
||||
return tf.keras.backend.in_train_phase(sequences, output)
|
||||
|
||||
def loss(self, y_true, y_pred):
|
||||
y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
|
||||
log_likelihood, self.transitions = tf.contrib.crf.crf_log_likelihood(y_pred,
|
||||
tf.cast(tf.keras.backend.argmax(y_true),
|
||||
dtype=tf.int32),
|
||||
self.sequence_lengths,
|
||||
transition_params=self.transitions)
|
||||
# loss_crf = tf.reduce_mean(-log_likelihood)
|
||||
return tf.reduce_mean(-log_likelihood)
|
||||
# return tf.math.log(loss_crf)
|
||||
|
||||
def compute_output_shape(self, input_shape):
|
||||
if self.mode == 'pad':
|
||||
data_shape = input_shape[0]
|
||||
else:
|
||||
data_shape = input_shape
|
||||
tf.TensorShape(data_shape).assert_has_rank(3)
|
||||
return data_shape[:2] + (self.output_dim,)
|
||||
|
||||
@property
|
||||
def viterbi_accuracy(self):
|
||||
def accuracy(y_true, y_pred):
|
||||
shape = tf.shape(y_pred)
|
||||
sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
|
||||
viterbi_sequence, _ = tf.contrib.crf.crf_decode(y_pred, self.transitions, sequence_lengths)
|
||||
output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
|
||||
return tf.keras.metrics.categorical_accuracy(y_true, output)
|
||||
|
||||
accuracy.func_name = 'viterbi_accuracy'
|
||||
return accuracy
|
||||
|
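A sketch of how the `CRF` layer above is wired in 'reg' mode, mirroring `bilstm_crf.py`; the input shape and label count are illustrative. Note the layer calls `tf.contrib.crf`, so it targets TensorFlow 1.x.
```python3
import tensorflow as tf
from macropodus.network.layers.crf import CRF

n_labels = 4                                                  # illustrative label count
inputs = tf.keras.layers.Input(shape=(50, 128))               # (len_max, feature_dim), illustrative
tensor = tf.keras.layers.Dense(n_labels, name='crf_dense')(inputs)
crf = CRF(n_labels, mode='reg', name='crf')
output = crf(tensor)
model = tf.keras.Model(inputs, output)
# the CRF layer supplies its own loss and viterbi-decoded accuracy
model.compile(optimizer='adam', loss=crf.loss, metrics=[crf.viterbi_accuracy])
```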
macropodus/network/layers/keras_lookahead.py (new file, 77 lines)
@@ -0,0 +1,77 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/12 21:14
|
||||
# @author : Mo
|
||||
# @function: lookahead of keras
|
||||
# @codefrom: https://github.com/bojone/keras_lookahead
|
||||
|
||||
|
||||
import tensorflow.python.keras.backend as K
|
||||
|
||||
|
||||
class Lookahead(object):
|
||||
"""Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/).
|
||||
"""
|
||||
|
||||
def __init__(self, k=5, alpha=0.5):
|
||||
self.k = k
|
||||
self.alpha = alpha
|
||||
self.count = 0
|
||||
|
||||
def inject(self, model):
|
||||
"""Inject the Lookahead algorithm for the given model.
|
||||
The following code is modified from keras's _make_train_function method.
|
||||
See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497
|
||||
"""
|
||||
if not hasattr(model, 'train_function'):
|
||||
raise RuntimeError('You must compile your model before using it.')
|
||||
|
||||
model._check_trainable_weights_consistency()
|
||||
|
||||
if model.train_function is None:
|
||||
inputs = (model._feed_inputs +
|
||||
model._feed_targets +
|
||||
model._feed_sample_weights)
|
||||
if model._uses_dynamic_learning_phase():
|
||||
inputs += [K.learning_phase()]
|
||||
fast_params = model._collected_trainable_weights
|
||||
|
||||
with K.name_scope('training'):
|
||||
with K.name_scope(model.optimizer.__class__.__name__):
|
||||
training_updates = model.optimizer.get_updates(
|
||||
params=fast_params,
|
||||
loss=model.total_loss)
|
||||
slow_params = [K.variable(p) for p in fast_params]
|
||||
fast_updates = (model.updates +
|
||||
training_updates +
|
||||
model.metrics_updates)
|
||||
|
||||
slow_updates, copy_updates = [], []
|
||||
for p, q in zip(fast_params, slow_params):
|
||||
slow_updates.append(K.update(q, q + self.alpha * (p - q)))
|
||||
copy_updates.append(K.update(p, q))
|
||||
|
||||
# Gets loss and metrics. Updates weights at each call.
|
||||
fast_train_function = K.function(
|
||||
inputs,
|
||||
[model.total_loss] + model.metrics_tensors,
|
||||
updates=fast_updates,
|
||||
name='fast_train_function',
|
||||
**model._function_kwargs)
|
||||
|
||||
def F(inputs):
|
||||
self.count += 1
|
||||
R = fast_train_function(inputs)
|
||||
if self.count % self.k == 0:
|
||||
K.batch_get_value(slow_updates)
|
||||
K.batch_get_value(copy_updates)
|
||||
return R
|
||||
|
||||
model.train_function = F
|
||||
|
||||
if __name__ == '__main__':
|
||||
gg = 0
|
||||
# # usage
|
||||
# model.compile(optimizer=Adam(1e-3), loss='mse') # Any optimizer
|
||||
# lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead
|
||||
# lookahead.inject(model) # add into model
|
macropodus/network/layers/keras_radam.py (new file, 96 lines)
@@ -0,0 +1,96 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/12 20:12
|
||||
# @author : Mo
|
||||
# @function: radam of keras
|
||||
# @codefrom: https://github.com/bojone/keras_radam
|
||||
|
||||
|
||||
from tensorflow.python.keras.optimizers import Optimizer
|
||||
# from tensorflow.python.keras.legacy import interfaces
|
||||
import tensorflow.python.keras.backend as K
|
||||
|
||||
|
||||
class RAdam(Optimizer):
|
||||
"""RAdam optimizer.
|
||||
Default parameters follow those provided in the original Adam paper.
|
||||
# Arguments
|
||||
lr: float >= 0. Learning rate.
|
||||
beta_1: float, 0 < beta < 1. Generally close to 1.
|
||||
beta_2: float, 0 < beta < 1. Generally close to 1.
|
||||
epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
|
||||
decay: float >= 0. Learning rate decay over each update.
|
||||
amsgrad: boolean. Whether to apply the AMSGrad variant of this
|
||||
algorithm from the paper "On the Convergence of Adam and
|
||||
Beyond".
|
||||
# References
|
||||
- [Adam - A Method for Stochastic Optimization]
|
||||
(https://arxiv.org/abs/1412.6980)
|
||||
- [On The Variance Of The Adaptive Learning Rate And Beyond]
|
||||
(https://arxiv.org/abs/1908.03265)
|
||||
"""
|
||||
|
||||
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
|
||||
epsilon=None, decay=0., **kwargs):
|
||||
super(RAdam, self).__init__(**kwargs)
|
||||
with K.name_scope(self.__class__.__name__):
|
||||
self.iterations = K.variable(0, dtype='int64', name='iterations')
|
||||
self.lr = K.variable(lr, name='lr')
|
||||
self.beta_1 = K.variable(beta_1, name='beta_1')
|
||||
self.beta_2 = K.variable(beta_2, name='beta_2')
|
||||
self.decay = K.variable(decay, name='decay')
|
||||
if epsilon is None:
|
||||
epsilon = K.epsilon()
|
||||
self.epsilon = epsilon
|
||||
self.initial_decay = decay
|
||||
|
||||
# @interfaces.legacy_get_updates_support
|
||||
def get_updates(self, loss, params):
|
||||
grads = self.get_gradients(loss, params)
|
||||
self.updates = [K.update_add(self.iterations, 1)]
|
||||
|
||||
lr = self.lr
|
||||
if self.initial_decay > 0:
|
||||
lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
|
||||
K.dtype(self.decay))))
|
||||
|
||||
t = K.cast(self.iterations, K.floatx()) + 1
|
||||
beta_1_t = K.pow(self.beta_1, t)
|
||||
beta_2_t = K.pow(self.beta_2, t)
|
||||
rho = 2 / (1 - self.beta_2) - 1
|
||||
rho_t = rho - 2 * t * beta_2_t / (1 - beta_2_t)
|
||||
r_t = K.sqrt(
|
||||
K.relu(rho_t - 4) * K.relu(rho_t - 2) * rho / ((rho - 4) * (rho - 2) * rho_t)
|
||||
)
|
||||
flag = K.cast(rho_t > 4, K.floatx())
|
||||
|
||||
ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
|
||||
vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
|
||||
self.weights = [self.iterations] + ms + vs
|
||||
|
||||
for p, g, m, v in zip(params, grads, ms, vs):
|
||||
m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
|
||||
v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
|
||||
mhat_t = m_t / (1 - beta_1_t)
|
||||
vhat_t = K.sqrt(v_t / (1 - beta_2_t))
|
||||
p_t = p - lr * mhat_t * (flag * r_t / (vhat_t + self.epsilon) + (1 - flag))
|
||||
|
||||
self.updates.append(K.update(m, m_t))
|
||||
self.updates.append(K.update(v, v_t))
|
||||
new_p = p_t
|
||||
|
||||
# Apply constraints.
|
||||
if getattr(p, 'constraint', None) is not None:
|
||||
new_p = p.constraint(new_p)
|
||||
|
||||
self.updates.append(K.update(p, new_p))
|
||||
return self.updates
|
||||
|
||||
def get_config(self):
|
||||
config = {'lr': float(K.get_value(self.lr)),
|
||||
'beta_1': float(K.get_value(self.beta_1)),
|
||||
'beta_2': float(K.get_value(self.beta_2)),
|
||||
'decay': float(K.get_value(self.decay)),
|
||||
'epsilon': self.epsilon}
|
||||
base_config = super(RAdam, self).get_config()
|
||||
return dict(list(base_config.items()) + list(config.items()))
|
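A sketch of pairing `RAdam` with the `Lookahead` wrapper from keras_lookahead.py, as `graph.create_compile` does above; the toy Sequential model here is only a placeholder.
```python3
import tensorflow as tf
from macropodus.network.layers.keras_radam import RAdam
from macropodus.network.layers.keras_lookahead import Lookahead

model = tf.keras.Sequential(
    [tf.keras.layers.Dense(2, activation='softmax', input_shape=(8,))])  # placeholder model
model.compile(optimizer=RAdam(lr=1e-3, beta_1=0.9, beta_2=0.999, decay=0.0),
              loss='categorical_crossentropy', metrics=['accuracy'])
lookahead = Lookahead(k=5, alpha=0.5)   # Initialize Lookahead
lookahead.inject(model)                 # wraps the compiled train function
```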
macropodus/network/layers/non_mask_layer.py (new file, 31 lines)
@@ -0,0 +1,31 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/10 21:35
|
||||
# @author :Mo
|
||||
# @function :NonMaskingLayer of bert
|
||||
# @codefrom :https://github.com/jacoxu
|
||||
|
||||
|
||||
from __future__ import print_function, division
|
||||
from tensorflow.python.keras.layers import Layer
|
||||
|
||||
|
||||
class NonMaskingLayer(Layer):
|
||||
"""
|
||||
fix convolutional 1D can't receive masked input,
|
||||
detail: https://github.com/keras-team/keras/issues/4978
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.supports_masking = True
|
||||
super(NonMaskingLayer, self).__init__(**kwargs)
|
||||
|
||||
def build(self, input_shape):
|
||||
pass
|
||||
|
||||
def compute_mask(self, inputs, input_mask=None):
|
||||
# do not pass the mask to the next layers
|
||||
return None
|
||||
|
||||
def call(self, x, mask=None):
|
||||
return x
|
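A sketch of the use case named in the docstring above: dropping the propagated mask before a layer that cannot consume it; the embedding here is only an illustrative mask-producing layer.
```python3
import tensorflow as tf
from macropodus.network.layers.non_mask_layer import NonMaskingLayer

inp = tf.keras.layers.Input(shape=(50,))
emb = tf.keras.layers.Embedding(1000, 64, mask_zero=True)(inp)   # produces a mask
x = NonMaskingLayer()(emb)                                       # mask is discarded here
x = tf.keras.layers.Conv1D(128, 3, activation='relu')(x)         # safe: no mask arrives
```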
macropodus/network/predict/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/20 23:08
|
||||
# @author : Mo
|
||||
# @function:
|
macropodus/network/predict/predict_albert_bilstm.py (new file, 81 lines)
@@ -0,0 +1,81 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/20 23:08
|
||||
# @author : Mo
|
||||
# @function: predict(albert+bilstm)
|
||||
|
||||
|
||||
from macropodus.conf.path_config import path_embedding_albert
|
||||
from macropodus.preprocess.tools_ml import extract_chinese
|
||||
from tensorflow.python.keras.models import model_from_json
|
||||
from macropodus.preprocess.tools_common import load_json
|
||||
from macropodus.conf.path_config import path_model_dir
|
||||
from keras_bert import Tokenizer
|
||||
import numpy as np
|
||||
import macropodus
|
||||
import codecs
|
||||
import pickle
|
||||
import os
|
||||
|
||||
|
||||
path_dir = path_model_dir # + "/ner_albert_bilstm_people_199801"
|
||||
# 加载模型结构
|
||||
model = model_from_json(open(path_dir+"/graph.json", "r", encoding="utf-8").read(),
|
||||
custom_objects=macropodus.custom_objects)
|
||||
# 加载模型权重
|
||||
model.load_weights(path_dir+"/model.h5")
|
||||
|
||||
# reader tokenizer
|
||||
token_dict = {}
|
||||
path_dict = os.path.join(path_embedding_albert, "vocab.txt")
|
||||
with codecs.open(path_dict, 'r', 'utf8') as reader:
|
||||
for line in reader:
|
||||
token = line.strip()
|
||||
token_dict[token] = len(token_dict)
|
||||
vocab_size = len(token_dict)
|
||||
tokenizer = Tokenizer(token_dict)
|
||||
# params
|
||||
path_params = path_dir + "/params.json"
|
||||
params = load_json(path_params)
|
||||
len_max = params["len_max"]
|
||||
# l2i_i2l
|
||||
path_l2i_i2l = path_dir + "/l2i_i2l.json"
|
||||
l2i_i2l = load_json(path_l2i_i2l)
|
||||
|
||||
def sentence2idx(text, second_text=None):
|
||||
text = extract_chinese(str(text).upper())
|
||||
input_id, input_type_id = tokenizer.encode(first=text, second=second_text, max_len=len_max)
|
||||
input_mask = len([1 for ids in input_id if ids != 0])
|
||||
# return input_id, input_type_id, input_mask
|
||||
# x_ = np.array((input_id, input_type_id, input_mask))
|
||||
x = [[input_id, input_type_id, input_mask]]
|
||||
x_ = np.array(x)
|
||||
x_1 = np.array([x[0] for x in x_])
|
||||
x_2 = np.array([x[1] for x in x_])
|
||||
x_3 = np.array([x[2] for x in x_])
|
||||
|
||||
return [x_1, x_2, x_3]
|
||||
|
||||
while True:
|
||||
print("请输入:")
|
||||
ques = input()
|
||||
mode_input = sentence2idx(ques)
|
||||
res = model.predict(mode_input)
|
||||
res_list = res.tolist()[0]
|
||||
res_idxs = [np.argmax(rl) for rl in res_list]
|
||||
res_label = [l2i_i2l["i2l"][str(ri)] if str(ri) in l2i_i2l["i2l"] else "O" for ri in res_idxs]
|
||||
print(res_label[:len(ques)])
|
||||
|
||||
# gg = 0
|
||||
|
||||
# # 保存模型的结构
|
||||
# json_string = model.to_json() # 方式1
|
||||
# open("model_architecture_1.json", "w").write(json_string)
|
||||
# yaml_string = model.to_yaml() # 方式2
|
||||
# open("model_arthitecture_2.yaml", "w").write(yaml_string)
|
||||
# # 加载模型结构
|
||||
# model = model_from_json(open("model_architecture_1.json", "r").read())
|
||||
# # 加载模型权重
|
||||
# model.load_weights("weights-improvement-40-0.96208.hdf5")
|
||||
# # 编译模型
|
||||
# model.compile(loss="categorical_crossentropy", optimizer=OPTIMIZER, metrics=["accuracy"])
|
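For reference, an assumed sketch of what the `l2i_i2l.json` read above holds (it is produced by `preprocess_generator.py` further below); the tag names are only illustrative, and the `i2l` keys become strings after the JSON round-trip, which is why the lookup uses `str(ri)`.
```python3
l2i_i2l = {
    "l2i": {"<PAD>": 0, "O": 1, "B-ORG": 2, "I-ORG": 3},
    "i2l": {"0": "<PAD>", "1": "O", "2": "B-ORG", "3": "I-ORG"},
}
```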
macropodus/network/predict/predict_w2v_bilstm.py (new file, 73 lines)
@@ -0,0 +1,73 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/20 23:08
|
||||
# @author : Mo
|
||||
# @function: predict(word2vec+bilstm)
|
||||
|
||||
|
||||
from macropodus.conf.path_config import path_embedding_albert
|
||||
from macropodus.preprocess.tools_ml import extract_chinese
|
||||
from tensorflow.python.keras.models import model_from_json
|
||||
from macropodus.preprocess.tools_common import load_json
|
||||
from macropodus.conf.path_config import path_model_dir
|
||||
from keras_bert import Tokenizer
|
||||
import numpy as np
|
||||
import macropodus
|
||||
import codecs
|
||||
import pickle
|
||||
import os
|
||||
|
||||
|
||||
path_dir = path_model_dir # + "/ner_albert_bilstm_people_199801"
|
||||
# 加载模型结构
|
||||
model = model_from_json(open(path_dir+"/graph.json", "r", encoding="utf-8").read(),
|
||||
custom_objects=macropodus.custom_objects)
|
||||
# 加载模型权重
|
||||
model.load_weights(path_dir+"/model.h5")
|
||||
|
||||
# reader tokenizer
|
||||
token_dict = {}
|
||||
path_dict = os.path.join(path_embedding_albert, "vocab.txt")
|
||||
with codecs.open(path_dict, 'r', 'utf8') as reader:
|
||||
for line in reader:
|
||||
token = line.strip()
|
||||
token_dict[token] = len(token_dict)
|
||||
vocab_size = len(token_dict)
|
||||
tokenizer = Tokenizer(token_dict)
|
||||
# params
|
||||
path_params = path_dir + "/params.json"
|
||||
params = load_json(path_params)
|
||||
len_max = params["len_max"]
|
||||
# l2i_i2l
|
||||
path_l2i_i2l = path_dir + "/l2i_i2l.json"
|
||||
l2i_i2l = load_json(path_l2i_i2l)
|
||||
|
||||
|
||||
def sentence2idx(text):
|
||||
text = extract_chinese(str(text).upper())
|
||||
text = list(text)
|
||||
text = [text_one for text_one in text]
|
||||
len_leave = len_max - len(text)
|
||||
if len_leave >= 0:
|
||||
text_index = [token_dict[text_char] if text_char in token_dict else token_dict['[UNK]'] for
|
||||
text_char in text] + [token_dict['[PAD]'] for i in range(len_leave)]
|
||||
else:
|
||||
text_index = [token_dict[text_char] if text_char in token_dict else token_dict['[UNK]'] for
|
||||
text_char in text[0:len_max]]
|
||||
input_mask = min(len(text), len_max)
|
||||
x = [[text_index, input_mask]]
|
||||
x_ = np.array(x)
|
||||
x_1 = np.array([x[0] for x in x_])
|
||||
x_2 = np.array([x[1] for x in x_])
|
||||
return [x_1, x_2]
|
||||
|
||||
while True:
|
||||
print("请输入:")
|
||||
ques = input()
|
||||
mode_input = sentence2idx(ques)
|
||||
res = model.predict(mode_input)
|
||||
res_list = res.tolist()[0]
|
||||
res_idxs = [np.argmax(rl) for rl in res_list]
|
||||
res_label = [l2i_i2l["i2l"][str(ri)] if str(ri) in l2i_i2l["i2l"] else "O" for ri in res_idxs]
|
||||
print(res_label[:len(ques)])
|
||||
|
macropodus/network/preprocess/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 22:35
|
||||
# @author : Mo
|
||||
# @function:
|
macropodus/network/preprocess/preprocess_generator.py (new file, 306 lines)
@@ -0,0 +1,306 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/2 21:08
|
||||
# @author : Mo
|
||||
# @function: preprocess of network
|
||||
|
||||
|
||||
from tensorflow.python.keras.utils import to_categorical
|
||||
from macropodus.preprocess.tools_common import load_json
|
||||
from macropodus.preprocess.tools_common import save_json
|
||||
import numpy as np
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
class PreprocessGenerator:
|
||||
"""
|
||||
数据预处理, 输入为csv格式, [label,ques]
|
||||
"""
|
||||
def __init__(self, path_model_l2i_i2l):
|
||||
self.path_model_l2i_i2l = path_model_l2i_i2l
|
||||
self.l2i_i2l = None
|
||||
if os.path.exists(self.path_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(self.path_model_l2i_i2l)
|
||||
|
||||
def prereocess_idx2label(self, pred):
|
||||
"""
|
||||
类标(idx)转类别(label)
|
||||
:param pred:
|
||||
:return:
|
||||
"""
|
||||
if os.path.exists(self.path_model_l2i_i2l):
|
||||
pred_i2l = {}
|
||||
i2l = self.l2i_i2l['i2l']
|
||||
for i in range(len(pred)):
|
||||
pred_i2l[i2l[str(i)]] = pred[i]
|
||||
pred_i2l_rank = [sorted(pred_i2l.items(), key=lambda k: k[1], reverse=True)]
|
||||
return pred_i2l_rank
|
||||
else:
|
||||
raise RuntimeError("path_model_l2i_i2l does not exist")
|
||||
|
||||
def prereocess_label2idx(self, pred):
|
||||
"""
|
||||
类别(label)转类标(idx)
|
||||
:param pred:
|
||||
:return:
|
||||
"""
|
||||
if os.path.exists(self.path_model_l2i_i2l):
|
||||
pred_l2i = {}
|
||||
l2i = self.l2i_i2l['l2i']
|
||||
for i in range(len(pred)):
|
||||
pred_l2i[pred[i]] = l2i[pred[i]]
|
||||
pred_l2i_rank = [sorted(pred_l2i.items(), key=lambda k: k[1], reverse=True)]
|
||||
return pred_l2i_rank
|
||||
else:
|
||||
raise RuntimeError("path_model_l2i_i2l does not exist")
|
||||
|
||||
def preprocess_label2set(self, path):
|
||||
"""
|
||||
统计label个数, 以及具体的存在
|
||||
:param path: str, like 'train.json'
|
||||
:return:
|
||||
"""
|
||||
# 首先获取label,set,即存在的具体类
|
||||
label_sets = set(["<PAD>"])
|
||||
len_all = 0
|
||||
file_csv = open(path, "r", encoding="utf-8")
|
||||
for line in file_csv:
|
||||
len_all += 1
|
||||
if line.strip():
|
||||
ques_label = json.loads(line.strip())
|
||||
label_org = ques_label["label"]
|
||||
label_sets = label_sets | set(label_org)
|
||||
|
||||
file_csv.close()
|
||||
return label_sets, len_all
|
||||
|
||||
def preprocess_label_question_to_idx_fit_generator(self, embedding_type, batch_size, path, embed, rate=1):
|
||||
"""
|
||||
fit_generator用, 将句子, 类标转化为数字idx
|
||||
:param embedding_type: str, like 'albert'
|
||||
:param batch_size: int, like 64
|
||||
:param path: str, like 'train.json'
|
||||
:param embed: class, like embed
|
||||
:param rate: float, like 0.9
|
||||
:return: yield
|
||||
"""
|
||||
# 首先获取label,set,即存在的具体类
|
||||
label_set, len_all = self.preprocess_label2set(path)
|
||||
# 获取label转index字典等, 如果label2index存在则不转换了, dev验证集合的时候用
|
||||
if not os.path.exists(self.path_model_l2i_i2l):
|
||||
count = 0
|
||||
label2index = {}
|
||||
index2label = {}
|
||||
for label_one in label_set:
|
||||
label2index[label_one] = count
|
||||
index2label[count] = label_one
|
||||
count = count + 1
|
||||
l2i_i2l = {}
|
||||
l2i_i2l['l2i'] = label2index
|
||||
l2i_i2l['i2l'] = index2label
|
||||
save_json(l2i_i2l, self.path_model_l2i_i2l)
|
||||
else:
|
||||
l2i_i2l = load_json(self.path_model_l2i_i2l)
|
||||
|
||||
# 读取数据的比例
|
||||
len_ql = int(rate * len_all)
|
||||
if len_ql <= 500: # sample时候不生效,使得语料足够训练
|
||||
len_ql = len_all
|
||||
|
||||
def process_line(line, embed, use_len_seq=True):
|
||||
"""
|
||||
关键:对每一条数据操作,获取label和问句index
|
||||
:param line: str, 一行json数据, 如 '{"question": [...], "label": [...]}'
|
||||
:param embed: class, like embed
|
||||
:param use_len_seq: boolean, True or False
|
||||
:return:
|
||||
"""
|
||||
|
||||
ques_label = json.loads(line.strip())
|
||||
label_org = ques_label["label"]
|
||||
label_index = [l2i_i2l["l2i"][lr] for lr in label_org]
|
||||
len_sequence = len(label_index)
|
||||
que_embed = embed.sentence2idx(ques_label["question"])
|
||||
# padding label
|
||||
len_leave = embed.len_max - len(label_index)
|
||||
if len_leave >= 0:
|
||||
label_index_leave = [li for li in label_index] + [l2i_i2l["l2i"]["O"] for i in range(len_leave)]
|
||||
else:
|
||||
label_index_leave = label_index[0:embed.len_max]
|
||||
if use_len_seq:
|
||||
return [que_embed[0], que_embed[1], len_sequence], label_index_leave
|
||||
else:
|
||||
return [que_embed, len_sequence], label_index_leave
|
||||
|
||||
while True:
|
||||
file_csv = open(path, "r", encoding="utf-8")
|
||||
cout_all_line = 0
|
||||
cnt = 0
|
||||
x, y = [], []
|
||||
# 跳出循环
|
||||
if len_ql < cout_all_line:
|
||||
break
|
||||
for line in file_csv:
|
||||
cout_all_line += 1
|
||||
if line.strip():
|
||||
x_line, y_line = process_line(line, embed, use_len_seq=True)
|
||||
x.append(x_line)
|
||||
y.append(y_line)
|
||||
cnt += 1
|
||||
if cnt == batch_size:
|
||||
if embedding_type in ['bert', 'albert']:
|
||||
x_, y_ = np.array(x), np.array(y)
|
||||
x_1 = np.array([x[0] for x in x_])
|
||||
x_2 = np.array([x[1] for x in x_])
|
||||
x_3 = np.array([x[2] for x in x_])
|
||||
x_all = [x_1, x_2, x_3]
|
||||
else:
|
||||
x_all, y_ = np.array(x), np.array(y)
|
||||
|
||||
cnt = 0
|
||||
yield (x_all, y_)
|
||||
x, y =[], []
|
||||
file_csv.close()
|
||||
print("preprocess_label_ques_to_idx ok")
|
||||
|
||||
def preprocess_label_question_to_idx_fit(self, embedding_type, path, embed, rate=1, batch_size=64, crf_mode='reg', fit_type='fit'):
|
||||
"""
|
||||
fit用, 关键:对每一条数据操作,获取label和问句index
|
||||
:param embedding_type: str, like 'albert'
|
||||
:param path: str, like 'train.json'
|
||||
:param embed: class, like embed
|
||||
:param rate: float, like 0.9
|
||||
:param batch_size: int, like 64
|
||||
:param crf_mode: str, like 'reg', 'pad'
|
||||
:param fit_type: str, like 'fit', 'fit_generator'
|
||||
:return: np.array
|
||||
"""
|
||||
# 首先获取label,set,即存在的具体类
|
||||
label_set, len_all = self.preprocess_label2set(path)
|
||||
# 获取label转index字典等, 如果label2index存在则不转换了, dev验证集合的时候用
|
||||
if not os.path.exists(self.path_model_l2i_i2l):
|
||||
count = 0
|
||||
label2index = {}
|
||||
index2label = {}
|
||||
for label_one in label_set:
|
||||
label2index[label_one] = count
|
||||
index2label[count] = label_one
|
||||
count = count + 1
|
||||
l2i_i2l = {}
|
||||
l2i_i2l['l2i'] = label2index
|
||||
l2i_i2l['i2l'] = index2label
|
||||
save_json(l2i_i2l, self.path_model_l2i_i2l)
|
||||
else:
|
||||
l2i_i2l = load_json(self.path_model_l2i_i2l)
|
||||
|
||||
# 读取数据的比例
|
||||
len_ql = int(rate * len_all)
|
||||
if len_ql <= 500: # sample时候不生效,使得语料足够训练
|
||||
len_ql = len_all
|
||||
|
||||
def process_line(line, embed, l2i_i2l):
|
||||
"""
|
||||
对每一条数据操作,获取label和问句index
|
||||
:param line:
|
||||
:param embed:
|
||||
:param l2i_i2l:
|
||||
:return:
|
||||
"""
|
||||
# 对每一条数据操作,对question和label进行padding
|
||||
ques_label = json.loads(line.strip())
|
||||
label_org = ques_label["label"]
|
||||
label_index = [l2i_i2l["l2i"][lr] for lr in label_org]
|
||||
# len_sequence = len(label_index)
|
||||
que_embed = embed.sentence2idx("".join(ques_label["question"]))
|
||||
# label padding
|
||||
if embedding_type in ['bert', 'albert']:
|
||||
# padding label
|
||||
len_leave = embed.len_max - len(label_index) -2
|
||||
if len_leave >= 0:
|
||||
label_index_leave = [l2i_i2l["l2i"]["<PAD>"]] + [li for li in label_index] + [l2i_i2l["l2i"]["<PAD>"]] + [l2i_i2l["l2i"]["<PAD>"] for i in range(len_leave)]
|
||||
else:
|
||||
label_index_leave = [l2i_i2l["l2i"]["<PAD>"]] + label_index[0:embed.len_max-2] + [l2i_i2l["l2i"]["<PAD>"]]
|
||||
else:
|
||||
# padding label
|
||||
len_leave = embed.len_max - len(label_index) # -2
|
||||
if len_leave >= 0:
|
||||
label_index_leave = [li for li in label_index] + [l2i_i2l["l2i"]["<PAD>"] for i in range(len_leave)]
|
||||
else:
|
||||
label_index_leave = label_index[0:embed.len_max]
|
||||
# 转为one-hot
|
||||
label_res = to_categorical(label_index_leave, num_classes=len(l2i_i2l["l2i"]))
|
||||
return que_embed, label_res
|
||||
|
||||
file_csv = open(path, "r", encoding="utf-8")
|
||||
cout_all_line = 0
|
||||
cnt = 0
|
||||
x, y = [], []
|
||||
for line in file_csv:
|
||||
# 跳出循环
|
||||
if len_ql < cout_all_line:
|
||||
break
|
||||
cout_all_line += 1
|
||||
if line.strip():
|
||||
# 一个json一个json处理
|
||||
# 备注:最好训练前先处理,使得ques长度小于等于len_max(word2vec), len_max-2(bert, albert)
|
||||
x_line, y_line = process_line(line, embed, l2i_i2l)
|
||||
x.append(x_line)
|
||||
y.append(y_line.tolist())
|
||||
cnt += 1
|
||||
# 使用fit_generator时候, 每个batch_size进行yield
|
||||
if fit_type=='fit_generator' and cnt == batch_size:
|
||||
# 通过两种方式处理: 1.嵌入类型(bert, word2vec, random), 2.条件随机场(CRF:'pad', 'reg')类型
|
||||
if embedding_type in ['bert', 'albert']:
|
||||
x_, y_ = np.array(x), np.array(y)
|
||||
x_1 = np.array([x[0] for x in x_])
|
||||
x_2 = np.array([x[1] for x in x_])
|
||||
x_3 = np.array([x[2] for x in x_])
|
||||
if crf_mode == 'pad':
|
||||
x_all = [x_1, x_2, x_3]
|
||||
elif crf_mode == 'reg':
|
||||
x_all = [x_1, x_2]
|
||||
else:
|
||||
x_all = [x_1, x_2]
|
||||
else:
|
||||
x_, y_ = np.array(x), np.array(y)
|
||||
x_1 = np.array([x[0] for x in x_])
|
||||
x_2 = np.array([x[1] for x in x_])
|
||||
if crf_mode == 'pad':
|
||||
x_all = [x_1, x_2]
|
||||
elif crf_mode == 'reg':
|
||||
x_all = [x_1]
|
||||
else:
|
||||
x_all = [x_1]
|
||||
|
||||
cnt = 0
|
||||
yield (x_all, y_)
|
||||
x, y =[], []
|
||||
# 使用fit的时候, return返回
|
||||
if fit_type=='fit':
|
||||
# 通过两种方式处理: 1.嵌入类型(bert, word2vec, random), 2.条件随机场(CRF:'pad', 'reg')类型
|
||||
if embedding_type in ['bert', 'albert']:
|
||||
x_, y_ = np.array(x), np.array(y)
|
||||
x_1 = np.array([x[0] for x in x_])
|
||||
x_2 = np.array([x[1] for x in x_])
|
||||
x_3 = np.array([x[2] for x in x_])
|
||||
if crf_mode=='pad':
|
||||
x_all = [x_1, x_2, x_3]
|
||||
elif crf_mode=='reg':
|
||||
x_all = [x_1, x_2]
|
||||
else:
|
||||
x_all = [x_1, x_2]
|
||||
else:
|
||||
x_, y_ = np.array(x), np.array(y)
|
||||
x_1 = np.array([x[0] for x in x_])
|
||||
x_2 = np.array([x[1] for x in x_])
|
||||
if crf_mode=='pad':
|
||||
x_all = [x_1, x_2]
|
||||
elif crf_mode=='reg':
|
||||
x_all = x_1
|
||||
else:
|
||||
x_all = x_1
|
||||
# 使用fit的时候, return返回
|
||||
return x_all, y_
|
||||
|
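# ---------------------------------------------------------------------------
# Minimal illustrative sketch (not part of this module's API) of the label
# handling above: pad a tag sequence to a fixed length with "<PAD>" and turn
# it into one-hot vectors, which is what process_line() does before batches
# are yielded/returned. The tag set and len_max below are made-up examples.
import numpy as np

def pad_and_onehot_demo(labels, l2i, len_max):
    """Pad `labels` (list of tag strings) to `len_max`, then one-hot encode."""
    pad_id = l2i["<PAD>"]
    ids = [l2i[tag] for tag in labels][:len_max]        # truncate if too long
    ids = ids + [pad_id] * (len_max - len(ids))         # pad if too short
    return np.eye(len(l2i), dtype=np.float32)[ids]      # shape: (len_max, n_tags)

# l2i_demo = {"<PAD>": 0, "B": 1, "M": 2, "E": 3, "S": 4}
# pad_and_onehot_demo(["B", "E", "S"], l2i_demo, len_max=6).shape  # -> (6, 5)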
||||
|
5
macropodus/network/train/__init__.py
Normal file
5
macropodus/network/train/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/20 22:18
|
||||
# @author : Mo
|
||||
# @function:
|
101
macropodus/network/train/train_bilstm.py
Normal file
101
macropodus/network/train/train_bilstm.py
Normal file
@ -0,0 +1,101 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/16 22:11
|
||||
# @author : Mo
|
||||
# @function:
|
||||
|
||||
|
||||
# 适配linux
|
||||
import pathlib
|
||||
import sys
|
||||
import os
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
||||
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
||||
os.environ['TF_KERAS'] = '1'
|
||||
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
|
||||
sys.path.append(project_path)
|
||||
# 模型图
|
||||
# from macropodus.network.graph.bilstm_crf import BilstmCRFGraph as Graph
|
||||
from macropodus.network.graph.bilstm import BiLSTMGraph as Graph
|
||||
# 数据预处理, 删除文件目录下文件
|
||||
from macropodus.preprocess.tools_common import delete_file
|
||||
# 地址
|
||||
# from macropodus.conf.path_config import path_model_dir, path_ner_people_1998_train, path_ner_people_1998_valid
|
||||
from macropodus.conf.path_config import path_model_dir, path_seg_pku_1998_train, path_ner_people_1998_train, path_ner_people_1998_valid
|
||||
# 计算时间
|
||||
import time
|
||||
|
||||
|
||||
|
||||
def train(hyper_parameters=None, rate=1.0):
|
||||
# path_ner_people_1998_train = "D:/soft_install/dataset/corpus/ner/china-people-daily-ner-corpus/example.train"
|
||||
# path_ner_people_1998_valid = "D:/soft_install/dataset/corpus/ner/china-people-daily-ner-corpus/example.dev"
|
||||
if not hyper_parameters:
|
||||
hyper_parameters = {
|
||||
'len_max': 128, # 句子最大长度, 固定推荐20-50, bert越长会越慢, 占用空间也会变大, 本地win10-4G设为20就好, 过大小心OOM
|
||||
'embed_size': 768, # 768, # 字/词向量维度, bert取768, word取300, char可以更小些
|
||||
'vocab_size': 20000, # 这里随便填的,会根据代码里修改
|
||||
'trainable': False, # embedding是静态的还是动态的, 即控制可不可以微调
|
||||
'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word', 注意:word2vec模式下训练语料要首先切好
|
||||
'embedding_type': 'albert', # 嵌入类型, 可填'xlnet'、'random'、'bert'、'albert' or 'word2vec'
|
||||
'gpu_memory_fraction': 0.76, #gpu使用率
|
||||
'model': {'label': 4, # 类别数
|
||||
'batch_size': 256, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'dropout': 0.5, # 随机失活, 概率
|
||||
'decay_step': 100, # 学习率衰减step, 每N个step衰减一次
|
||||
'decay_rate': 0.9, # 学习率衰减系数, 乘法
|
||||
'epochs': 132, # 训练最大轮次
|
||||
'patience': 3, # 早停,2-3就好
|
||||
'lr': 1e-3, # 学习率, bert取5e-5,其他取1e-3,如果acc较低或者一直不变,优先调这个, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
|
||||
'l2': 1e-9, # l2正则化
|
||||
'activate_classify': 'softmax', # 最后一个layer, 即分类激活函数
|
||||
'loss': 'categorical_crossentropy', # 损失函数
|
||||
'metrics': 'accuracy', # 保存更好模型的评价标准
|
||||
'optimizer_name': 'ADAM',
|
||||
'is_training': True, # 训练或者测试模型
|
||||
'path_model_dir': path_model_dir,
|
||||
'model_path': os.path.join(path_model_dir, "bilstm_crf.model"), # 模型地址, loss降低则保存的依据, save_best_only=True, save_weights_only=True
|
||||
'path_hyper_parameters': os.path.join(path_model_dir, "hyper_parameters.json"), # 模型(包括embedding),超参数地址,
|
||||
'path_fineture': os.path.join(path_model_dir, "embedding.model"), # embedding trainable地址, 例如字向量、词向量、bert向量等
|
||||
'path_l2i_i2l':os.path.join(path_model_dir, "l2i_i2l.json"),
|
||||
'num_rnn_layers': 1, # rnn层数
|
||||
'rnn_type': 'GRU', # rnn类型,可以填"LSTM","GRU","CuDNNLSTM","CuDNNGRU"
|
||||
'rnn_units': 256, # rnn隐藏元
|
||||
},
|
||||
'embedding': {'layer_indexes': [12], # bert取的层数
|
||||
# 'corpus_path': '', # embedding预训练数据地址,不配则会默认取conf里边默认的地址, keras-bert可以加载谷歌版bert,百度版ernie(需转换,https://github.com/ArthurRizar/tensorflow_ernie),哈工大版bert-wwm(tf框架,https://github.com/ymcui/Chinese-BERT-wwm)
|
||||
},
|
||||
'data':{'train_data': path_seg_pku_1998_train, # path_ner_people_1998_train, # 训练数据
|
||||
'val_data': path_seg_pku_1998_train # path_ner_people_1998_valid # 验证数据
|
||||
},
|
||||
}
|
||||
|
||||
# 删除先前存在的模型和embedding微调模型等
|
||||
delete_file(path_model_dir)
|
||||
time_start = time.time()
|
||||
# graph初始化
|
||||
graph = Graph(hyper_parameters)
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
|
||||
from macropodus.network.preprocess.preprocess_generator import PreprocessGenerator
|
||||
|
||||
pg = PreprocessGenerator(os.path.join(path_model_dir, "l2i_i2l.json"))
|
||||
_, len_train = pg.preprocess_label2set(hyper_parameters['data']['train_data'])
|
||||
x_train, y_train = pg.preprocess_label_question_to_idx_fit(embedding_type=hyper_parameters['embedding_type'],
|
||||
path=hyper_parameters['data']['train_data'],
|
||||
embed=ra_ed,
|
||||
rate=rate)
|
||||
|
||||
x_val, y_val = pg.preprocess_label_question_to_idx_fit(embedding_type=hyper_parameters['embedding_type'],
|
||||
path=hyper_parameters['data']['train_data'],
|
||||
embed=ra_ed,
|
||||
rate=rate)
|
||||
# 训练
|
||||
graph.fit(x_train, y_train, x_val, y_val)
|
||||
print("耗时:" + str(time.time()-time_start))
|
||||
|
||||
|
||||
if __name__=="__main__":
|
||||
train(rate=1)
|
114
macropodus/network/train/train_bilstm_crf.py
Normal file
114
macropodus/network/train/train_bilstm_crf.py
Normal file
@ -0,0 +1,114 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/16 22:11
|
||||
# @author : Mo
|
||||
# @function: train bilstm-crf with embedding
|
||||
|
||||
|
||||
# 适配linux
|
||||
import pathlib
|
||||
import sys
|
||||
import os
|
||||
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
||||
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
||||
os.environ['TF_KERAS'] = '1'
|
||||
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
|
||||
sys.path.append(project_path)
|
||||
# 模型图
|
||||
# from macropodus.network.graph.bilstm_crf import BilstmCRFGraph as Graph
|
||||
from macropodus.network.graph.bilstm_crf import BilstmCRFGraph as Graph
|
||||
# 数据预处理, 删除文件目录下文件
|
||||
from macropodus.preprocess.tools_common import delete_file
|
||||
# 地址
|
||||
from macropodus.conf.path_config import path_model_dir, path_seg_pku_1998_train, path_ner_people_1998_train, \
|
||||
path_ner_people_1998_valid
|
||||
# 计算时间
|
||||
import time
|
||||
|
||||
|
||||
def train(hyper_parameters=None, rate=1.0):
|
||||
# path_ner_people_1998_train = "D:/soft_install/dataset/corpus/ner/china-people-daily-ner-corpus/example.train"
|
||||
# path_ner_people_1998_valid = "D:/soft_install/dataset/corpus/ner/china-people-daily-ner-corpus/example.dev"
|
||||
if not hyper_parameters:
|
||||
hyper_parameters = {
|
||||
'len_max': 128, # 句子最大长度, 固定推荐20-50, bert越长会越慢, 占用空间也会变大, 本地win10-4G设为20就好, 过大小心OOM
|
||||
'embed_size': 300, # 768, # 字/词向量维度, bert取768, word取300, char可以更小些
|
||||
'vocab_size': 20000, # 这里随便填的,会根据代码里修改
|
||||
'trainable': False, # embedding是静态的还是动态的, 即控制可不可以微调
|
||||
'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word', 注意:word2vec模式下训练语料要首先切好
|
||||
'embedding_type': 'word2vec', # 嵌入类型, 可填'xlnet'、'random'、'bert'、'albert' or 'word2vec'
|
||||
'gpu_memory_fraction': 0.76, # gpu使用率
|
||||
'model': {'label': 5, # 类别数
|
||||
'batch_size': 256, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'dropout': 0.5, # 随机失活, 概率
|
||||
'decay_step': 100, # 学习率衰减step, 每N个step衰减一次
|
||||
'decay_rate': 0.9, # 学习率衰减系数, 乘法
|
||||
'epochs': 132, # 训练最大轮次
|
||||
'patience': 3, # 早停,2-3就好
|
||||
'lr': 1e-3, # 学习率, bert取5e-5,其他取1e-3,如果acc较低或者一直不变,优先调这个, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
|
||||
'l2': 0.32, # l2正则化
|
||||
'activate_rnn': 'tanh', # rnn-layer中的激活函数, 可填'tanh', 'relu', 'sigmoid'
|
||||
'activate_classify': 'softmax', # 最后一个layer, 即分类激活函数, 可填'softmax', 'sigmoid'
|
||||
'loss': 'categorical_crossentropy',
|
||||
# 损失函数, mse, categorical_crossentropy, sparse_categorical_crossentropy, binary_crossentropy等
|
||||
'metrics': 'accuracy',
|
||||
# 保存更好模型的评价标准, accuracy, binary_accuracy, categorical_accuracy, sparse_categorical_accuracy, sparse_top_k_categorical_accuracy
|
||||
'optimizer_name': 'ADAM', # 可填'ADAM', 'RADAM', 'RADAM,LOOKAHEAD'
|
||||
'is_training': True, # 训练或者测试模型, 训练时候是True, 测试时候是False
|
||||
'path_model_dir': path_model_dir, # 保存当前训练模型的根目录
|
||||
'model_path': os.path.join(path_model_dir, "bilstm_crf.model"),
|
||||
# 模型地址, loss降低则保存的依据, save_best_only=True, save_weights_only=True
|
||||
'path_hyper_parameters': os.path.join(path_model_dir, "hyper_parameters.json"),
|
||||
# 模型(包括embedding),超参数地址,
|
||||
'path_fineture': os.path.join(path_model_dir, "embedding.model"),
|
||||
# embedding trainable地址, 例如字向量、词向量、bert向量等
|
||||
'path_l2i_i2l': os.path.join(path_model_dir, "l2i_i2l.json"), # 类别转类标的字典
|
||||
'num_rnn_layers': 1, # rnn层数, 1, 2 or 3等
|
||||
'rnn_type': 'GRU', # rnn类型,可以填"LSTM","GRU","CuDNNLSTM","CuDNNGRU"
|
||||
'rnn_units': 256, # rnn隐藏元, 128, 256, 512, 768, 1024等
|
||||
'crf_mode': 'reg', # crf类型, 可填'reg', 'pad'(包括句子实际长度)
|
||||
},
|
||||
'embedding': {'layer_indexes': [12], # bert取的层数
|
||||
# 'corpus_path': '', # embedding预训练数据地址,不配则会默认取conf里边默认的地址, keras-bert可以加载谷歌版bert,百度版ernie(需转换,https://github.com/ArthurRizar/tensorflow_ernie),哈工大版bert-wwm(tf框架,https://github.com/ymcui/Chinese-BERT-wwm)
|
||||
},
|
||||
'data': {'train_data': path_seg_pku_1998_train, # path_ner_people_1998_train, # 训练数据
|
||||
'val_data': path_seg_pku_1998_train # path_ner_people_1998_valid # 验证数据
|
||||
},
|
||||
}
|
||||
|
||||
# 删除先前存在的模型和embedding微调模型等
|
||||
delete_file(path_model_dir)
|
||||
time_start = time.time()
|
||||
# graph初始化
|
||||
graph = Graph(hyper_parameters)
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
|
||||
from macropodus.network.preprocess.preprocess_generator import PreprocessGenerator
|
||||
|
||||
pg = PreprocessGenerator(os.path.join(path_model_dir, "l2i_i2l.json"))
|
||||
_, len_train = pg.preprocess_label2set(hyper_parameters['data']['train_data'])
|
||||
x_train, y_train = pg.preprocess_label_question_to_idx_fit(embedding_type=hyper_parameters['embedding_type'],
|
||||
path=hyper_parameters['data']['train_data'],
|
||||
embed=ra_ed,
|
||||
rate=rate,
|
||||
batch_size=hyper_parameters['model']['batch_size'],
|
||||
crf_mode=hyper_parameters['model']['crf_mode'],
|
||||
fit_type='fit')
|
||||
|
||||
x_val, y_val = pg.preprocess_label_question_to_idx_fit(embedding_type=hyper_parameters['embedding_type'],
|
||||
path=hyper_parameters['data']['train_data'],
|
||||
embed=ra_ed,
|
||||
rate=rate,
|
||||
batch_size=hyper_parameters['model']['batch_size'],
|
||||
crf_mode=hyper_parameters['model']['crf_mode'],
|
||||
fit_type='fit')
|
||||
# 训练
|
||||
graph.fit(x_train, y_train, x_val, y_val)
|
||||
print("耗时:" + str(time.time() - time_start))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
train(rate=0.1)
|
5
macropodus/preprocess/__init__.py
Normal file
5
macropodus/preprocess/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/19 10:39
|
||||
# @author : Mo
|
||||
# @function:
|
26
macropodus/preprocess/tools_clear.py
Normal file
26
macropodus/preprocess/tools_clear.py
Normal file
@ -0,0 +1,26 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 22:02
|
||||
# @author : Mo
|
||||
# @function: clear text
|
||||
|
||||
|
||||
def is_total_num(text):
|
||||
"""
|
||||
判断是否是数字的
|
||||
:param text: str
|
||||
:return: boolean, True or False
|
||||
"""
|
||||
try:
|
||||
text_clear = text.replace(" ", "").strip()
|
||||
number = 0
|
||||
for one in text_clear:
|
||||
if one.isdigit():
|
||||
number += 1
|
||||
if number == len(text_clear):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
except:
|
||||
return False
|
||||
|
139
macropodus/preprocess/tools_common.py
Normal file
139
macropodus/preprocess/tools_common.py
Normal file
@ -0,0 +1,139 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/19 0:15
|
||||
# @author : Mo
|
||||
# @function: common tools of macropodus
|
||||
|
||||
|
||||
from macropodus.conf.path_log import get_logger_root
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
re_continue = re.compile('[A-Za-z0-9.@_]', re.U)
|
||||
|
||||
|
||||
logger = get_logger_root()
|
||||
|
||||
|
||||
__all__ = ["txt_read",
|
||||
"txt_write",
|
||||
"save_json",
|
||||
"load_json",
|
||||
"delete_file"]
|
||||
|
||||
|
||||
def txt_read(path_file, encode_type='utf-8'):
|
||||
"""
|
||||
读取txt文件,默认utf8格式, 不能有空行
|
||||
:param path_file: str, 文件路径
|
||||
:param encode_type: str, 编码格式
|
||||
:return: list
|
||||
"""
|
||||
list_line = []
|
||||
try:
|
||||
file = open(path_file, 'r', encoding=encode_type)
|
||||
while True:
|
||||
line = file.readline().strip()
|
||||
if not line:
|
||||
break
|
||||
list_line.append(line)
|
||||
file.close()
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
finally:
|
||||
return list_line
|
||||
|
||||
|
||||
def txt_write(list_line, file_path, type='w', encode_type='utf-8'):
|
||||
"""
|
||||
txt写入list文件
|
||||
:param list_line: list, list文件, 写入要带"\n"
|
||||
:param file_path: str, 写入文件的路径
|
||||
:param type: str, 写入类型, w, a等
|
||||
:param encode_type:
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
file = open(file_path, type, encoding=encode_type)
|
||||
file.writelines(list_line)
|
||||
file.close()
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
|
||||
|
||||
def save_json(json_lines, json_path):
|
||||
"""
|
||||
保存json,
|
||||
:param json_lines: json
|
||||
:param path: str
|
||||
:return: None
|
||||
"""
|
||||
with open(json_path, 'w', encoding='utf-8') as fj:
|
||||
fj.write(json.dumps(json_lines, ensure_ascii=False))
|
||||
fj.close()
|
||||
|
||||
|
||||
def load_json(path):
|
||||
"""
|
||||
获取json, json存储为[{}]格式, like [{'大漠帝国':132}]
|
||||
:param path: str
|
||||
:return: json
|
||||
"""
|
||||
with open(path, 'r', encoding='utf-8') as fj:
|
||||
model_json = json.load(fj)
|
||||
return model_json
|
||||
|
||||
|
||||
def delete_file(path):
|
||||
"""
|
||||
递归删除一个目录下的所有.h5与.json文件
|
||||
:param path: str, dir path
|
||||
:return: None
|
||||
"""
|
||||
for i in os.listdir(path):
|
||||
# 取文件或者目录的绝对路径
|
||||
path_children = os.path.join(path, i)
|
||||
if os.path.isfile(path_children):
|
||||
if path_children.endswith(".h5") or path_children.endswith(".json"):
|
||||
os.remove(path_children)
|
||||
else:# 递归, 删除目录下的所有文件
|
||||
delete_file(path_children)
|
||||
|
||||
|
||||
def get_dir_files(path_dir):
|
||||
"""
|
||||
递归获取某个目录下的所有文件
|
||||
:param path_dir: str, like '/home/data'
|
||||
:return: list, like ['2019_12_5.txt']
|
||||
"""
|
||||
|
||||
def get_dir_files_func(file_list, dir_list, root_path=path_dir):
|
||||
"""
|
||||
递归获取某个目录下的所有文件
|
||||
:param root_path: str, like '/home/data'
|
||||
:param file_list: list, like []
|
||||
:param dir_list: list, like []
|
||||
:return: None
|
||||
"""
|
||||
# 获取该目录下所有的文件名称和目录名称
|
||||
dir_or_files = os.listdir(root_path)
|
||||
for dir_file in dir_or_files:
|
||||
# 获取目录或者文件的路径
|
||||
dir_file_path = os.path.join(root_path, dir_file)
|
||||
# 判断该路径为文件还是路径
|
||||
if os.path.isdir(dir_file_path):
|
||||
dir_list.append(dir_file_path)
|
||||
# 递归获取所有文件和目录的路径
|
||||
get_dir_files_func(file_list, dir_list, dir_file_path)
|
||||
else:
|
||||
file_list.append(dir_file_path)
|
||||
|
||||
# 用来存放所有的文件路径
|
||||
_files = []
|
||||
# 用来存放所有的目录路径
|
||||
dir_list = []
|
||||
get_dir_files_func(_files, dir_list, path_dir)
|
||||
return _files
|
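# ---------------------------------------------------------------------------
# Quick round-trip sketch for the json helpers above (file name is illustrative).
# Note that json serialization turns int dict keys into strings, which is why
# the "i2l" mapping saved by the training code reads back with string keys.
def _json_roundtrip_demo(path="l2i_i2l_demo.json"):
    data = {"l2i": {"<PAD>": 0, "B": 1}, "i2l": {"0": "<PAD>", "1": "B"}}
    save_json(data, path)
    return load_json(path) == data  # True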
||||
|
192
macropodus/preprocess/tools_ml.py
Normal file
192
macropodus/preprocess/tools_ml.py
Normal file
@ -0,0 +1,192 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 20:23
|
||||
# @author : Mo
|
||||
# @function: data utils of ml, text_summarization
|
||||
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfTransformer
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
import jieba.posseg as pseg
|
||||
import macropodus
|
||||
import re
|
||||
|
||||
|
||||
__all__ = ["extract_chinese",
|
||||
"macropodus_cut",
|
||||
"jieba_tag_cut",
|
||||
"cut_sentence",
|
||||
"remove_urls",
|
||||
"tfidf_fit",
|
||||
"tfidf_sim"
|
||||
]
|
||||
|
||||
|
||||
def extract_chinese(text):
|
||||
"""
|
||||
只提取出中文、字母和数字
|
||||
:param text: str, input of sentence
|
||||
:return: str
|
||||
"""
|
||||
chinese_extract = ''.join(re.findall(u"([\u4e00-\u9fa5A-Za-z0-9@. ])", text))
|
||||
return chinese_extract
|
||||
|
||||
|
||||
def jieba_tag_cut(text):
|
||||
"""
|
||||
jieba cut and tagged
|
||||
:param text:str
|
||||
:return: dict
|
||||
"""
|
||||
words = pseg.cut(text)
|
||||
return dict(words)
|
||||
|
||||
|
||||
def macropodus_cut(text):
|
||||
"""
|
||||
Macropodus cut
|
||||
:param text: input sentence
|
||||
:return: list
|
||||
"""
|
||||
return macropodus.cut_dag(text)
|
||||
|
||||
|
||||
def cut_sentence(text, use_type="summarize"):
|
||||
"""
|
||||
分句(文本摘要)
|
||||
:param sentence:str, like "大漠帝国"
|
||||
:param use_type:str, like "summarize" or "new-word-discovery"
|
||||
:return:list
|
||||
"""
|
||||
if use_type=="summarize":
|
||||
re_sen = re.compile('[:;!?。:;?!\n\r]') #.不加是因为不确定.是小数还是英文句号(中文省略号......)
|
||||
elif use_type=="new-word-discovery":
|
||||
re_sen = re.compile('[,,"“”、<>《》{}【】:;!?。:;?!\n\r]') #.不加是因为不确定.是小数还是英文句号(中文省略号......)
|
||||
else:
|
||||
raise RuntimeError("use_type must be 'summarize' or 'new-word-discovery'")
|
||||
sentences = re_sen.split(text)
|
||||
sen_cuts = []
|
||||
for sen in sentences:
|
||||
if sen and str(sen).strip():
|
||||
sen_cuts.append(sen)
|
||||
return sen_cuts
|
||||
|
||||
|
||||
def remove_urls(text):
|
||||
"""
|
||||
删除https/http等无用url
|
||||
:param text: str
|
||||
:return: str
|
||||
"""
|
||||
text_remove_url = re.sub(r'(全文:)?(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
|
||||
'', text, flags=re.MULTILINE)
|
||||
return text_remove_url
|
||||
|
||||
|
||||
def gram_uni_bi_tri(text):
|
||||
"""
|
||||
获取文本的unigram, bigram, trigram等特征
|
||||
:param text: str
|
||||
:return: list
|
||||
"""
|
||||
len_text = len(text)
|
||||
gram_uni = []
|
||||
gram_bi = []
|
||||
gram_tri = []
|
||||
for i in range(len_text):
|
||||
if i + 3 <= len_text:
|
||||
gram_uni.append(text[i])
|
||||
gram_bi.append(text[i:i+2])
|
||||
gram_tri.append(text[i:i+3])
|
||||
elif i + 2 <= len_text:
|
||||
gram_uni.append(text[i])
|
||||
gram_bi.append(text[i:i+2])
|
||||
elif i + 1 <= len_text:
|
||||
gram_uni.append(text[i])
|
||||
else:
|
||||
break
|
||||
return gram_uni, gram_bi, gram_tri
|
||||
|
||||
|
||||
def get_ngrams(text, ns=[1], use_type="summarize", len_max=7):
|
||||
"""
|
||||
获取文本的ngram等特征
|
||||
:param text: str, like "大漠帝国"
|
||||
:param ns: list, like [1, 2, 3]
|
||||
:param use_type: str, like "summarize" or "new-word-discovery"
|
||||
:param len_max: int, like 6, 7
|
||||
:return: list<list> or list
|
||||
"""
|
||||
if type(ns) != list:
|
||||
raise RuntimeError("ns of function get_ngram() must be list!")
|
||||
for n in ns:
|
||||
if n < 1:
|
||||
raise RuntimeError("enum of ns must '>1'!")
|
||||
len_text = len(text)
|
||||
ngrams = []
|
||||
if use_type == "summarize": # 分别返回uni, bi, tri...
|
||||
for n in ns:
|
||||
ngram_n = []
|
||||
for i in range(len_text):
|
||||
if i + n <= len_text:
|
||||
ngram_n.append(text[i:i + n])
|
||||
else:
|
||||
break
|
||||
if not ngram_n:
|
||||
ngram_n.append(text)
|
||||
ngrams += ngram_n
|
||||
else: # 只返回一个list
|
||||
for i in range(len_text):
|
||||
ngrams += [text[i: j + i]
|
||||
for j in range(1, min(len_max + 1, len_text - i + 1))]
|
||||
return ngrams
|
||||
|
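# Worked example (illustrative) of the two modes above:
#   get_ngrams("大漠帝国", ns=[1, 2], use_type="summarize")
#       -> ['大', '漠', '帝', '国', '大漠', '漠帝', '帝国']
#   get_ngrams("大漠", use_type="new-word-discovery", len_max=7)
#       -> ['大', '大漠', '漠']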
||||
|
||||
def tfidf_fit(sentences):
|
||||
"""
|
||||
tfidf相似度
|
||||
:param sentences: str
|
||||
:return: list, list, list
|
||||
"""
|
||||
# tfidf计算
|
||||
model = TfidfVectorizer(ngram_range=(1, 2), # 3,5
|
||||
stop_words=[' ', '\t', '\n'], # 停用词
|
||||
max_features=10000,
|
||||
token_pattern=r"(?u)\b\w+\b", # 过滤停用词
|
||||
min_df=1,
|
||||
max_df=0.9,
|
||||
use_idf=1, # 光滑
|
||||
smooth_idf=1, # 光滑
|
||||
sublinear_tf=1, ) # 光滑
|
||||
matrix = model.fit_transform(sentences)
|
||||
return matrix
|
||||
|
||||
|
||||
def tdidf_sim(sentences):
|
||||
"""
|
||||
tfidf相似度
|
||||
:param sentences:
|
||||
:return:
|
||||
"""
|
||||
# tfidf计算
|
||||
model = TfidfVectorizer(tokenizer=macropodus_cut,
|
||||
ngram_range=(1, 2), # 3,5
|
||||
stop_words=[' ', '\t', '\n'], # 停用词
|
||||
max_features=10000,
|
||||
token_pattern=r"(?u)\b\w+\b", # 过滤停用词
|
||||
min_df=1,
|
||||
max_df=0.9,
|
||||
use_idf=1, # 光滑
|
||||
smooth_idf=1, # 光滑
|
||||
sublinear_tf=1, ) # 光滑
|
||||
matrix = model.fit_transform(sentences)
|
||||
matrix_norm = TfidfTransformer().fit_transform(matrix)
|
||||
return matrix_norm
|
||||
|
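# ---------------------------------------------------------------------------
# Illustrative sketch: pairwise sentence similarity from the TF-IDF matrix
# returned by tfidf_fit(). The inputs must already be whitespace-joined
# tokens (e.g. produced with macropodus_cut); since TfidfVectorizer uses
# l2-normalized rows by default, the dot products below behave like cosine
# similarity. The example sentences are made up.
def _tfidf_sim_demo():
    sents = ["大漠 帝国 是 谁", "大漠 帝国 在 哪里", "今天 天气 很 好"]
    matrix = tfidf_fit(sents)        # sparse matrix, one row per sentence
    sim = (matrix * matrix.T).A      # sim[i, j]: similarity of sentence i and j
    return sim                       # expect sim[0, 1] > sim[0, 2]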
||||
|
||||
if __name__ == '__main__':
|
||||
text = "你喜欢谁,小老弟,你好烦哇。"
|
||||
# gg = jieba_tag_cut("我不再喜欢你,正如你的不喜欢我")
|
||||
grams = get_ngrams(text, use_type="new-word-discovery", len_max=7)
|
||||
# print(gg)
|
||||
print(grams)
|
30
macropodus/segment/__init__.py
Normal file
30
macropodus/segment/__init__.py
Normal file
@ -0,0 +1,30 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/18 22:00
|
||||
# @author : Mo
|
||||
# @function: segment of sent
|
||||
|
||||
|
||||
from macropodus.segment.seg_statistics.seg_statistics import SegStatistics
|
||||
from macropodus.segment.word_discovery.word_discovery import WordDiscovery
|
||||
|
||||
# 机械分词
|
||||
use_cache = True # 使用缓存
|
||||
segs = SegStatistics(use_cache)
|
||||
cut_bidirectional = segs.cut_bidirectional
|
||||
cut_forward = segs.cut_forward
|
||||
cut_reverse = segs.cut_reverse
|
||||
cut_search = segs.cut_search
|
||||
cut_dag = segs.cut_dag
|
||||
cut = segs.cut
|
||||
|
||||
# 用户词典增删改查
|
||||
load_user_dict = segs.load_user_dict
|
||||
save_delete_words = segs.save_delete_words
|
||||
save_add_words = segs.save_add_words
|
||||
delete_word = segs.delete_word
|
||||
add_word = segs.add_word
|
||||
|
||||
# 新词发现
|
||||
wd = WordDiscovery()
|
||||
find = wd.find_word
|
5
macropodus/segment/seg_statistics/__init__.py
Normal file
5
macropodus/segment/seg_statistics/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/19 9:25
|
||||
# @author : Mo
|
||||
# @function:
|
62
macropodus/segment/seg_statistics/seg_bidirectional.py
Normal file
62
macropodus/segment/seg_statistics/seg_bidirectional.py
Normal file
@ -0,0 +1,62 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/19 9:55
|
||||
# @author : Mo
|
||||
# @function: cut sentences of forward of reverse of maxlength
|
||||
|
||||
|
||||
from macropodus.segment.seg_statistics.seg_forward import SegForward
|
||||
from macropodus.segment.seg_statistics.seg_reverse import SegReverse
|
||||
|
||||
|
||||
class SegBidirectional(object):
|
||||
def __init__(self):
|
||||
self.seg_forward = SegForward()
|
||||
self.seg_reverse = SegReverse()
|
||||
|
||||
def cut(self, sentence):
|
||||
"""
|
||||
最大双向词典切词, 即最大正向切词与最大反向切词合并, 选择词数小的那个返回
|
||||
:param sentence: str
|
||||
:return:
|
||||
"""
|
||||
res_forward = self.seg_forward.cut(sentence)
|
||||
res_reverse = self.seg_reverse.cut(sentence)
|
||||
res_forward_list = list(res_forward)
|
||||
res_reverse_list = list(res_reverse)
|
||||
len_res_forward = len(res_forward_list)
|
||||
len_res_reverse = len(res_reverse_list)
|
||||
if len_res_forward >= len_res_reverse:
|
||||
for rrl in res_reverse_list:
|
||||
yield rrl
|
||||
else:
|
||||
for rfl in res_forward_list:
|
||||
yield rfl
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sb = SegBidirectional()
|
||||
sentence = "研究生命科学研究生命科学"
|
||||
print(list(sb.cut(sentence)))
|
||||
|
||||
# 测试性能
|
||||
from macropodus.preprocess.tools_common import txt_read, txt_write
|
||||
from macropodus.conf.path_config import path_root
|
||||
import time
|
||||
|
||||
path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt"
|
||||
sentences = txt_read(path_wordseg_a)
|
||||
|
||||
time_start = time.time()
|
||||
count = 0
|
||||
for i in range(10000):
|
||||
for sen in sentences:
|
||||
count += 1
|
||||
res = sb.cut(sen)
|
||||
# print(list(res))
|
||||
time_end = time.time()
|
||||
print(time_end - time_start)
|
||||
print(count/(time_end - time_start))
|
||||
# yield
|
||||
# 10000/0.17*50 = 2500*50 = 2896810(line/s)
|
||||
# 50000/0.90*50 = 2500000/20 = 2763600(line/s)
|
118
macropodus/segment/seg_statistics/seg_dag.py
Normal file
118
macropodus/segment/seg_statistics/seg_dag.py
Normal file
@ -0,0 +1,118 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/19 9:58
|
||||
# @author : Mo
|
||||
# @function: segmentation of maximum probability using dictionary
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_common import re_continue
|
||||
from macropodus.base.seg_basic import SegBasic
|
||||
from math import log
|
||||
|
||||
|
||||
class SegDAG(SegBasic):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def build_dag(self, sentence, len_word_max=105):
|
||||
"""
|
||||
构建句子的词典概率有向图;
|
||||
jieba使用的是前缀字典替代前缀树,内存比前缀树小,且比前缀树快;
|
||||
基本思想是构建'大漠帝国:132','大漠帝','大漠:640','大':1024等,没有则置为0,
|
||||
搜索时候前缀不存在就跳出,不用继续下去了
|
||||
:param sentence: str, like '大漠帝国是谁'
|
||||
:param len_word_max: int, 最大成词长度, like 105
|
||||
:return: dict, like {0:[0,1], 1:[1]}
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
dag_sen = {}
|
||||
for i in range(len_sen): # 前向遍历, 全切分
|
||||
enum_j = [i] # 单个字就是它本身
|
||||
for j in range(i+1, min(len_sen, i+len_word_max)): # 遍历从当前字到句子末尾可能成词的部分, 当前的不取, 最大成词长度为len_word_max
|
||||
word_maybe = sentence[i:j+1]
|
||||
if word_maybe in self.dict_words_freq:
|
||||
enum_j.append(j)
|
||||
dag_sen[i] = enum_j
|
||||
return dag_sen
|
||||
|
||||
def calculate_prob(self, sentence, DAG, route):
|
||||
"""
|
||||
动态规划求取最大概率, 代码来自jieba项目
|
||||
code from: https://github.com/fxsjy/jieba
|
||||
:param sentence: str, input of sentence, like "大漠帝国是谁?"
|
||||
:param DAG: dict,
|
||||
:param route: dict,
|
||||
:return: None
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
route[len_sen] = (0, 0)
|
||||
log_total = log(self.num_words)
|
||||
for index in range(len_sen - 1, -1, -1): # 动态规划
|
||||
route[index] = max((log(self.dict_words_freq.get(sentence[index:x + 1]) or 1)
|
||||
- log_total + route[x + 1][0], x) for x in DAG[index])
|
||||
|
||||
def cut(self, sentence):
|
||||
"""
|
||||
seg_dag字典最大概率切词, 代码来自jieba项目
|
||||
code from: https://github.com/fxsjy/jieba
|
||||
:param sentence: str, input of sentence, like "大漠帝国是谁?"
|
||||
:return: None
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
word_temp = ''
|
||||
route = {}
|
||||
i = 0
|
||||
DAG = self.build_dag(sentence) # 根据sentence构建有向图dag
|
||||
self.calculate_prob(sentence, DAG, route) # 动态规划计算概率最大的路径
|
||||
while i < len_sen:
|
||||
j = route[i][1] + 1 # 获取index, i为成词的begin, j为成词的end
|
||||
word_ch = sentence[i:j] # 概率成词
|
||||
if (j-i<2) and re_continue.match(word_ch): # 单个字判断是否为连续, 字母-数字-.-@等为连续
|
||||
word_temp += word_ch
|
||||
i = j
|
||||
else: # 成词后返回一个yield可迭代对象, yield后转list有点耗时
|
||||
if word_temp: # 有word_temp的情况下 word_ch也没有迭代返回
|
||||
yield word_temp
|
||||
word_temp = ''
|
||||
yield word_ch
|
||||
i = j
|
||||
if word_temp: # 最后一个成词为"字母-数字-.-@等为连续"的情况
|
||||
yield word_temp
|
||||
|
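# ---------------------------------------------------------------------------
# Illustrative sketch with a toy dictionary (hypothetical frequencies, not the
# shipped one) of the DAG + dynamic programming above. For "研究生命" the
# build_dag-style edges are {0: [0, 1, 2], 1: [1], 2: [2, 3], 3: [3]} and the
# backward DP picks the split with the highest sum of log-probabilities.
from math import log  # already imported above; repeated so the sketch stands alone

def _toy_dag_cut(sentence, freq):
    """Toy re-implementation of the DAG + DP idea, for demonstration only."""
    total = sum(freq.values())
    n = len(sentence)
    dag = {i: [j for j in range(i, n) if j == i or sentence[i:j + 1] in freq]
           for i in range(n)}
    route = {n: (0.0, 0)}
    for i in range(n - 1, -1, -1):  # same backward recursion as calculate_prob
        route[i] = max((log(freq.get(sentence[i:j + 1], 1)) - log(total)
                        + route[j + 1][0], j) for j in dag[i])
    i, words = 0, []
    while i < n:  # follow the best path forward, as cut() does
        j = route[i][1] + 1
        words.append(sentence[i:j])
        i = j
    return words

# _toy_dag_cut("研究生命", {"研究": 40, "研究生": 10, "生命": 30, "生": 5, "命": 5})
# -> ['研究', '生命']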
||||
|
||||
if __name__ == '__main__':
|
||||
sd = SegDAG()
|
||||
sd.add_word(str('知识图谱'))
|
||||
|
||||
# for i in range(50000):
|
||||
sd_enum = sd.cut(sentence='apple_pir大漠帝国我再也找不到了')
|
||||
print(list(sd_enum))
|
||||
|
||||
# 测试性能
|
||||
from macropodus.preprocess.tools_common import txt_read, txt_write
|
||||
from macropodus.conf.path_config import path_root
|
||||
import time
|
||||
path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt"
|
||||
sentences = txt_read(path_wordseg_a)
|
||||
|
||||
time_start = time.time()
|
||||
count = 0
|
||||
for i in range(10000):
|
||||
for sen in sentences:
|
||||
# print("原句:"+sen)
|
||||
count += 1
|
||||
res = sd.cut(sen)
|
||||
# print(list(res))
|
||||
time_end = time.time()
|
||||
print(time_end-time_start)
|
||||
print(count/(time_end - time_start))
|
||||
|
||||
while True:
|
||||
print("请输入:")
|
||||
sen = input()
|
||||
print(list(sd.cut(sen)))
|
||||
# win10测试, i7 8th + 16G RAM
|
||||
# 10000/0.17*50 = 2864136(line/s)
|
||||
# 50000/0.87*50 = 2872092(line/s)
|
||||
|
||||
|
65
macropodus/segment/seg_statistics/seg_forward.py
Normal file
65
macropodus/segment/seg_statistics/seg_forward.py
Normal file
@ -0,0 +1,65 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/19 9:54
|
||||
# @author : Mo
|
||||
# @function: cut sentences of forward of maxlength
|
||||
|
||||
|
||||
from macropodus.base.seg_basic import SegBasic
|
||||
|
||||
|
||||
class SegForward(SegBasic):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def cut(self, sentence, len_max=7):
|
||||
"""
|
||||
正向最大切词
|
||||
:param sentence: str, like '大漠帝国'
|
||||
:param len_max: int, like 32
|
||||
:return: yield
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
i = 0
|
||||
while i < len_sen: # while判断条件
|
||||
flag = False # flag标志位,确定有没有在字典里边的单字词或多字词
|
||||
for j in range(min(len_sen+1, i+len_max), -i, -1): # 遍历从当前字到句子末尾可能成词的部分, 从最后i+len_max算起
|
||||
word_maybe = sentence[i:j] # 正向可能成词的词语
|
||||
if word_maybe in self.dict_words_freq: # 是否在字典里边
|
||||
i = j # 成词前标志i向后移动
|
||||
flag = True # flag标志位变化
|
||||
yield word_maybe
|
||||
break # 成词则跳出循环
|
||||
if not flag: # 未选中后单个字的情况
|
||||
yield sentence[i]
|
||||
i += 1
|
||||
|
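# ---------------------------------------------------------------------------
# Illustrative sketch with a toy word set (not the shipped dictionary) of the
# forward maximum matching above: try the longest candidate first at every
# position and fall back to a single character when nothing matches.
def _toy_forward_cut(sentence, words, len_max=7):
    i, out = 0, []
    while i < len(sentence):
        for j in range(min(len(sentence), i + len_max), i, -1):  # longest first
            if sentence[i:j] in words:
                out.append(sentence[i:j])
                i = j
                break
        else:  # no dictionary word starts here, emit the single character
            out.append(sentence[i])
            i += 1
    return out

# _toy_forward_cut("研究生命科学", {"研究生", "研究", "生命", "科学", "命"})
# -> ['研究生', '命', '科学']  (greedy forward matching commits to "研究生" first)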
||||
if __name__ == '__main__':
|
||||
sf = SegForward()
|
||||
sentence = "macropodus是啥子呢"
|
||||
sentence = "方程的解除了零以外还有…"
|
||||
print(list(sf.cut(sentence)))
|
||||
|
||||
# 测试性能
|
||||
from macropodus.preprocess.tools_common import txt_read, txt_write
|
||||
from macropodus.conf.path_config import path_root
|
||||
import time
|
||||
|
||||
path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt"
|
||||
sentences = txt_read(path_wordseg_a)
|
||||
|
||||
time_start = time.time()
|
||||
count = 0
|
||||
for i in range(10000):
|
||||
for sen in sentences:
|
||||
# print(sen)
|
||||
count += 1
|
||||
res = sf.cut(sen)
|
||||
# print(list(res))
|
||||
time_end = time.time()
|
||||
print(time_end - time_start)
|
||||
print(count/(time_end - time_start))
|
||||
|
||||
# 10000/0.17*50 = 2831272(line/s)
|
||||
|
||||
|
72
macropodus/segment/seg_statistics/seg_reverse.py
Normal file
72
macropodus/segment/seg_statistics/seg_reverse.py
Normal file
@ -0,0 +1,72 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/19 9:54
|
||||
# @author : Mo
|
||||
# @function: cut sentences of reverse of maxlength
|
||||
|
||||
|
||||
from macropodus.base.seg_basic import SegBasic
|
||||
|
||||
|
||||
class SegReverse(SegBasic):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def cut(self, sentence, len_max=7):
|
||||
"""
|
||||
反向最大切词
|
||||
:param sentence: str, like '大漠帝国'
|
||||
:param len_max: int, like 32
|
||||
:return: yield
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
i = len_sen
|
||||
res = []
|
||||
while i > 0: # while判断条件
|
||||
flag = False # flag标志位,确定有没有在字典里边的单字词或多字词
|
||||
for j in range(max(0, i - len_max), i): # 遍历从句子末尾向前可能成词的部分, 从最后i-len_max算起
|
||||
word_maybe = sentence[j:i] # 反向可能成词的词语
|
||||
if word_maybe in self.dict_words_freq: # 是否在字典里边
|
||||
i = j # 成词前标志i向后移动
|
||||
flag = True # flag标志位变化
|
||||
res.append(word_maybe)
|
||||
# yield word_maybe
|
||||
break # 成词则跳出循环
|
||||
if not flag: # 未选中后单个字的情况
|
||||
i -= 1
|
||||
# yield sentence[i]
|
||||
res.append(sentence[i])
|
||||
for i in range(len(res)-1, 0, -1):
|
||||
yield res[i]
|
||||
# return res
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
a = max(0,5)
|
||||
sf = SegReverse()
|
||||
sentence = "研究生命科学\t研究 生命 科学"
|
||||
print(list(sf.cut(sentence)))
|
||||
print(list(sf.cut("")))
|
||||
|
||||
# 测试性能
|
||||
from macropodus.preprocess.tools_common import txt_read, txt_write
|
||||
from macropodus.conf.path_config import path_root
|
||||
import time
|
||||
path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt"
|
||||
sentences = txt_read(path_wordseg_a)
|
||||
|
||||
time_start = time.time()
|
||||
count = 0
|
||||
for i in range(50000):
|
||||
for sen in sentences:
|
||||
# print(sen)
|
||||
count += 1
|
||||
res = (sf.cut(sen))
|
||||
# print(res)
|
||||
time_end = time.time()
|
||||
print(time_end-time_start)
|
||||
print(count/(time_end - time_start))
|
||||
|
||||
# 10000/0.18*50 = 2500*50 = 2784226(line/s)
|
||||
# 50000/0.98*50 = 2500000/20 = 2550109(line/s)
|
||||
|
222
macropodus/segment/seg_statistics/seg_statistics.py
Normal file
222
macropodus/segment/seg_statistics/seg_statistics.py
Normal file
@ -0,0 +1,222 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/3 20:01
|
||||
# @author : Mo
|
||||
# @function: segment of statistics
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_common import re_continue
|
||||
from macropodus.base.seg_basic import SegBasic
|
||||
from math import log
|
||||
|
||||
|
||||
__all__ = ["cut_dag",
|
||||
"cut_forward",
|
||||
"cut_reverse",
|
||||
"cut_bidirectional",
|
||||
"cut_search"]
|
||||
|
||||
|
||||
class SegStatistics(SegBasic):
|
||||
def __init__(self, use_cache):
|
||||
self.algorithm = "chinese-word-segnment"
|
||||
super().__init__(use_cache)
|
||||
|
||||
def build_dag(self, sentence, len_word_max=105):
|
||||
"""
|
||||
构建句子的词典概率有向图;
|
||||
jieba使用的是前缀字典替代前缀树,内存比前缀树小,且比前缀树快;
|
||||
基本思想是构建'大漠帝国:132','大漠帝','大漠:640','大':1024等,没有则置为0,
|
||||
搜索时候前缀不存在就跳出,不用继续下去了
|
||||
:param sentence: str, like '大漠帝国是谁'
|
||||
:param len_word_max: int, 最大成词长度, like 105
|
||||
:return: dict, like {0:[0,1], 1:[1]}
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
dag_sen = {}
|
||||
for i in range(len_sen): # 前向遍历, 全切分
|
||||
enum_j = [i] # 单个字就是它本身
|
||||
for j in range(i+1, min(len_sen, i+len_word_max)): # 遍历从当前字到句子末尾可能成词的部分, 当前的不取, 最大成词长度为len_word_max
|
||||
word_maybe = sentence[i:j+1]
|
||||
if word_maybe in self.dict_words_freq:
|
||||
enum_j.append(j)
|
||||
dag_sen[i] = enum_j
|
||||
return dag_sen
|
||||
|
||||
def calculate_prob(self, sentence, DAG, route):
|
||||
"""
|
||||
动态规划求取最大概率, 代码来自jieba项目
|
||||
code from: https://github.com/fxsjy/jieba
|
||||
:param sentence: str, input of sentence, like "大漠帝国是谁?"
|
||||
:param DAG: dict,
|
||||
:param route: dict,
|
||||
:return: None
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
route[len_sen] = (0, 0)
|
||||
log_total = log(self.num_words)
|
||||
for index in range(len_sen - 1, -1, -1): # 动态规划
|
||||
route[index] = max((log(self.dict_words_freq.get(sentence[index:x + 1]) or 1)
|
||||
- log_total + route[x + 1][0], x) for x in DAG[index])
|
||||
|
||||
def cut_dag(self, sentence):
|
||||
"""
|
||||
seg_dag字典最大概率切词, 代码来自jieba项目
|
||||
code from: https://github.com/fxsjy/jieba
|
||||
:param sentence: str, input of sentence, like "大漠帝国是谁?"
|
||||
:return: None
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
word_temp = ''
|
||||
route = {}
|
||||
i = 0
|
||||
DAG = self.build_dag(sentence) # 根据sentence构建有向图dag
|
||||
self.calculate_prob(sentence, DAG, route) # 动态规划计算概率最大的路径
|
||||
while i < len_sen:
|
||||
j = route[i][1] + 1 # 获取index, i为成词的begin, j为成词的end
|
||||
word_ch = sentence[i:j] # 概率成词
|
||||
if (j-i<2) and re_continue.match(word_ch): # 单个字判断是否为连续, 字母-数字-.-@等为连续
|
||||
word_temp += word_ch
|
||||
i = j
|
||||
else: # 成词后返回一个yield可迭代对象, yield后转list有点耗时
|
||||
if word_temp: # 有word_temp的情况下 word_ch也没有迭代返回
|
||||
yield word_temp
|
||||
word_temp = ''
|
||||
yield word_ch
|
||||
i = j
|
||||
if word_temp: # 最后一个成词为"字母-数字-.-@等为连续"的情况
|
||||
yield word_temp
|
||||
|
||||
def cut_forward(self, sentence, len_max=7):
|
||||
"""
|
||||
正向最大切词
|
||||
:param sentence: str, like '大漠帝国'
|
||||
:param len_max: int, like 32
|
||||
:return: yield
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
i = 0
|
||||
while i < len_sen: # while判断条件
|
||||
flag = False # flag标志位,确定有没有在字典里边的单字词或多字词
|
||||
for j in range(min(len_sen+1, i+len_max), -i, -1): # 遍历从当前字到句子末尾可能成词的部分, 从最后i+len_max算起
|
||||
word_maybe = sentence[i:j] # 正向可能成词的词语
|
||||
if word_maybe in self.dict_words_freq: # 是否在字典里边
|
||||
i = j # 成词前标志i向后移动
|
||||
flag = True # flag标志位变化
|
||||
yield word_maybe
|
||||
break # 成词则跳出循环
|
||||
if not flag: # 未选中后单个字的情况
|
||||
yield sentence[i]
|
||||
i += 1
|
||||
|
||||
def cut_reverse(self, sentence, len_max=7):
|
||||
"""
|
||||
反向最大切词
|
||||
:param sentence: str, like '大漠帝国'
|
||||
:param len_max: int, like 32
|
||||
:return: yield
|
||||
"""
|
||||
len_sen = len(sentence)
|
||||
i = len_sen
|
||||
res = []
|
||||
while i > 0: # while判断条件
|
||||
flag = False # flag标志位,确定有没有在字典里边的单字词或多字词
|
||||
for j in range(max(0, i - len_max), i): # 遍历从句子末尾向前可能成词的部分, 从最后i-len_max算起
|
||||
word_maybe = sentence[j:i] # 反向可能成词的词语
|
||||
if word_maybe in self.dict_words_freq: # 是否在字典里边
|
||||
i = j # 成词前标志i向后移动
|
||||
flag = True # flag标志位变化
|
||||
res.append(word_maybe)
|
||||
# yield word_maybe
|
||||
break # 成词则跳出循环
|
||||
if not flag: # 未选中后单个字的情况
|
||||
i -= 1
|
||||
# yield sentence[i]
|
||||
res.append(sentence[i])
|
||||
for i in range(len(res)-1, 0, -1):
|
||||
yield res[i]
|
||||
# return res
|
||||
|
||||
def cut_bidirectional(self, sentence):
|
||||
"""
|
||||
最大双向词典切词, 即最大正向切词与最大反向切词合并, 选择词数小的那个返回
|
||||
:param sentence: str
|
||||
:return:
|
||||
"""
|
||||
res_forward = self.cut_forward(sentence)
|
||||
res_reverse = self.cut_reverse(sentence)
|
||||
res_forward_list = list(res_forward)
|
||||
res_reverse_list = list(res_reverse)
|
||||
len_res_forward = len(res_forward_list)
|
||||
len_res_reverse = len(res_reverse_list)
|
||||
if len_res_forward >= len_res_reverse:
|
||||
for rrl in res_reverse_list:
|
||||
yield rrl
|
||||
else:
|
||||
for rfl in res_forward_list:
|
||||
yield rfl
|
||||
|
||||
def cut_search(self, sentence):
|
||||
"""
|
||||
搜索引擎切词, 全切词
|
||||
:param sentence: str, like "大漠帝国"
|
||||
:return: yield
|
||||
"""
|
||||
DAG = self.build_dag(sentence) # 根据sentence构建有向图dag
|
||||
for k, v in DAG.items():
|
||||
for vi in v:
|
||||
yield sentence[k:vi+1] # 遍历无向图, 返回可能存在的所有切分
|
||||
|
||||
def cut(self, sentence, type_cut="cut_dag"):
|
||||
"""
|
||||
切词总函数
|
||||
:param sentence:str, like '大漠帝国, macropodus, 中国斗鱼'
|
||||
:param type_cut: str, like 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search'
|
||||
:return: list, like ['大漠帝国', ',', 'macropodus', ',', '中国斗鱼']
|
||||
"""
|
||||
if type_cut=="cut_dag":
|
||||
return list(self.cut_dag(sentence))
|
||||
elif type_cut=="cut_forward":
|
||||
return list(self.cut_forward(sentence))
|
||||
elif type_cut=="cut_reverse":
|
||||
return list(self.cut_reverse(sentence))
|
||||
elif type_cut=="cut_bidirectional":
|
||||
return list(self.cut_bidirectional(sentence))
|
||||
elif type_cut=="cut_search":
|
||||
return list(self.cut_search(sentence))
|
||||
else:
|
||||
raise RuntimeError("type_cut must be 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search'")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sd = SegStatistics(True)
|
||||
sd.add_word(str('知识图谱'))
|
||||
sd_search = sd.cut_search("已结婚的和尚未结婚的青年都要实行计划生育")
|
||||
print(list(sd_search))
|
||||
# for i in range(50000):
|
||||
sd_enum = sd.cut_dag(sentence="已结婚的和尚未结婚的青年都要实行计划生育")
|
||||
print(list(sd_enum))
|
||||
sd_enum = sd.cut_dag(sentence='what‘syournamesirareyouok!')
|
||||
print(list(sd_enum))
|
||||
# 测试性能
|
||||
from macropodus.preprocess.tools_common import txt_read, txt_write
|
||||
from macropodus.conf.path_config import path_root
|
||||
import time
|
||||
path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt"
|
||||
sentences = txt_read(path_wordseg_a)
|
||||
|
||||
time_start = time.time()
|
||||
count = 0
|
||||
for i in range(50000):
|
||||
for sen in sentences:
|
||||
count += 1
|
||||
res = sd.cut_search(sen)
|
||||
# print(list(res))
|
||||
time_end = time.time()
|
||||
print(time_end-time_start)
|
||||
print(count/(time_end - time_start))
|
||||
|
||||
|
||||
# win10测试, i7 8th + 16G RAM
|
||||
# 10000/0.17*50 = 2864136(line/s)
|
||||
# 50000/0.87*50 = 2872092(line/s)
|
5
macropodus/segment/word_discovery/__init__.py
Normal file
5
macropodus/segment/word_discovery/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/19 15:36
|
||||
# @author : Mo
|
||||
# @function:
|
217
macropodus/segment/word_discovery/word_discovery.py
Normal file
217
macropodus/segment/word_discovery/word_discovery.py
Normal file
@ -0,0 +1,217 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/19 16:44
|
||||
# @author : Mo
|
||||
# @function: chinese word discovery
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
from macropodus.preprocess.tools_ml import get_ngrams
|
||||
from collections import Counter
|
||||
import math
|
||||
import os
|
||||
|
||||
|
||||
class WordDiscovery:
|
||||
def __init__(self):
|
||||
self.algorithm = "new-word-discovery"
|
||||
self.total_words = 0
|
||||
self.freq_min = 3
|
||||
self.len_max = 7
|
||||
|
||||
def count_word(self, text, use_type="text"):
|
||||
"""
|
||||
词频统计(句子/段落/文章)
|
||||
:param text: str, path or doc, like "大漠帝国。" or "/home/data/doc.txt"
|
||||
:param use_type: str, "text" or "file", file of "utf-8" of "txt"
|
||||
:return: class<Counter>, word-freq
|
||||
"""
|
||||
self.words_count = Counter()
|
||||
if use_type=="text": # 输入为文本形式
|
||||
texts = cut_sentence(use_type=self.algorithm,
|
||||
text=text) # 切句子, 如中英文的逗号/句号/感叹号
|
||||
for text in texts:
|
||||
n_grams = get_ngrams(use_type=self.algorithm,
|
||||
len_max=self.len_max,
|
||||
text=text) # 获取一个句子的所有n-gram
|
||||
self.words_count.update(n_grams)
|
||||
elif use_type=="file": # 输入为文件形式
|
||||
if not os.path.exists(text):
|
||||
raise RuntimeError("path of text must exist!")
|
||||
fr8 = open(text, "r", encoding="utf-8")
|
||||
for text in fr8:
|
||||
if text.strip():
|
||||
texts = cut_sentence(use_type=self.algorithm,
|
||||
text=text) # 切句子, 如中英文的逗号/句号/感叹号
|
||||
for text in texts:
|
||||
n_grams = get_ngrams(use_type=self.algorithm,
|
||||
len_max=self.len_max,
|
||||
text=text) # 获取一个句子的所有n-gram
|
||||
self.words_count.update(n_grams)
|
||||
fr8.close()
|
||||
else:
|
||||
raise RuntimeError("use_type must be 'text' or 'file'")
|
||||
self.total_words = sum(self.words_count.values())
|
||||
|
||||
def calculate_entropy(self, boundary_type="left"):
|
||||
"""
|
||||
计算左熵和右熵
|
||||
:param boundary_type: str, like "left" or "right"
|
||||
:return: None
|
||||
"""
|
||||
# 获取成词的最左边和最右边的一个字
|
||||
one_collect = {}
|
||||
for k, v in self.words_count.items():
|
||||
len_k = len(k)
|
||||
if len_k >= 3: # 含边界字的片段长度不小于3, 即候选词长度不小于2
|
||||
if boundary_type == "right":
|
||||
k_boundary = k[:-1]
|
||||
else:
|
||||
k_boundary = k[1:]
|
||||
if k_boundary in self.words_select: # 左右边, 保存为dict
|
||||
if k_boundary not in one_collect:
|
||||
one_collect[k_boundary] = [v]
|
||||
else:
|
||||
one_collect[k_boundary] = one_collect[k_boundary] + [v]
|
||||
|
||||
# 计算候选词的左/右边界信息熵
|
||||
for k, v in self.words_select.items():
|
||||
# 从字典获取
|
||||
boundary_v = one_collect.get(k, None)
|
||||
# 计算候选词的左右凝固度, 取最小的那个
|
||||
if boundary_v:
|
||||
sum_boundary = sum(boundary_v) # 求和
|
||||
# 计算信息熵
|
||||
entroy_boundary = sum([-(enum_bo / sum_boundary) * math.log(enum_bo / sum_boundary)
|
||||
for enum_bo in boundary_v])
|
||||
else:
|
||||
entroy_boundary = 0.0
|
||||
if boundary_type == "right":
|
||||
self.right_entropy[k] = entroy_boundary
|
||||
else:
|
||||
self.left_entropy[k] = entroy_boundary
|
||||
|
||||
def compute_entropys(self):
|
||||
"""
|
||||
计算左右信息熵(左熵与右熵)
|
||||
:param words_count:dict, like {"我":32, "你们":12}
|
||||
:param len_max: int, like 6
|
||||
:param freq_min: int, like 32
|
||||
:return: None
|
||||
"""
|
||||
# 提取词频不小于freq_min的词语, 以及长度在2-len_max的词语
|
||||
self.words_select = {word: count for word, count in self.words_count.items()
|
||||
if count >= self.freq_min and " " not in word
|
||||
and 1 < len(word) <= self.len_max
|
||||
}
|
||||
# 计算左右信息熵, 左边界与右边界
|
||||
self.right_entropy = {}
|
||||
self.left_entropy = {}
|
||||
self.calculate_entropy(boundary_type="left")
|
||||
self.calculate_entropy(boundary_type="right")
|
||||
# self.words_count.clear() # 清除变量
|
||||
|
||||
def compute_aggregation(self):
|
||||
"""
|
||||
计算凝固度
|
||||
:return: None
|
||||
"""
|
||||
self.aggregation = {}
|
||||
for word, value in self.words_select.items():
|
||||
len_word = len(word)
|
||||
score_aggs = []
|
||||
for i in range(1, len_word): # 候选词的左右两边各取一个字
|
||||
word_right = word[i:]
|
||||
word_left = word[:i]
|
||||
value_right = self.words_select.get(word_right, self.freq_min)
|
||||
value_left = self.words_select.get(word_left, self.freq_min)
|
||||
# score_agg_single = math.log(value) - math.log(value_right * value_left)
|
||||
score_agg_single = value / (value_right * value_left)
|
||||
# score_agg_single = math.log10(value) - math.log10(self.total_words) -math.log10((value_right * value_left))
|
||||
score_aggs.append(score_agg_single)
|
||||
self.aggregation[word] = min(score_aggs)
|
||||
|
||||
def find_word(self, text, use_type="text", freq_min=2, len_max=7, entropy_min=1.2, aggregation_min=0.5, use_avg=False):
|
||||
"""
|
||||
新词发现与策略
|
||||
:param text: str, path or doc, like "大漠帝国。" or "/home/data/doc.txt"
|
||||
:param use_type: str, "text" or "file", file of "utf-8" of "txt"
|
||||
:param freq_min: int, 最小词频, 大于1
|
||||
:param len_max: int, 最大成词长度, 一般为5, 6, 7
|
||||
:param entropy_min: float, 最小左右信息熵阈值
|
||||
:param aggregation_min: float, 最小凝固度阈值
|
||||
:return: dict, 新词及其指标(凝固度a/右熵r/左熵l/频率f/总分s)
|
||||
"""
|
||||
self.aggregation_min = aggregation_min
|
||||
self.entropy_min = entropy_min
|
||||
self.freq_min = freq_min
|
||||
self.len_max = len_max
|
||||
self.count_word(text=text, use_type=use_type)
|
||||
self.compute_entropys()
|
||||
self.compute_aggregation()
|
||||
self.new_words = {}
|
||||
# 输出
|
||||
for word,value in self.words_select.items():
|
||||
if not use_avg and self.aggregation[word] > self.aggregation_min \
|
||||
and self.right_entropy[word] > self.entropy_min and self.left_entropy[word] > self.entropy_min:
|
||||
self.new_words[word] = {}
|
||||
# {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
|
||||
self.new_words[word]["a"] = self.aggregation[word] # math.log10(self.aggregation[word]) - math.log10(self.total_words)
|
||||
self.new_words[word]["r"] = self.right_entropy[word]
|
||||
self.new_words[word]["l"] = self.left_entropy[word]
|
||||
self.new_words[word]["f"] = value / self.total_words
|
||||
self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
|
||||
(self.right_entropy[word] + self.left_entropy[word])
|
||||
elif use_avg and self.aggregation[word] > self.aggregation_min \
|
||||
and (self.right_entropy[word] + self.left_entropy[word]) > 2 * self.entropy_min:
|
||||
self.new_words[word] = {}
|
||||
# {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
|
||||
self.new_words[word]["a"] = self.aggregation[word]
|
||||
self.new_words[word]["r"] = self.right_entropy[word]
|
||||
self.new_words[word]["l"] = self.left_entropy[word]
|
||||
self.new_words[word]["f"] = value / self.total_words
|
||||
self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
|
||||
(self.right_entropy[word] + self.left_entropy[word])
|
||||
|
||||
return self.new_words
|
||||
|
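# ---------------------------------------------------------------------------
# Illustrative sketch (made-up counts) of the two scores combined above:
# boundary entropy measures how varied the characters next to a candidate are,
# aggregation (cohesion) measures how much more frequent the candidate is than
# the product of its parts; find_word() keeps candidates where both are high.
import math

def _boundary_entropy_demo(neighbour_counts):
    """Entropy of the left- or right-neighbour frequency distribution."""
    total = sum(neighbour_counts)
    return sum(-(c / total) * math.log(c / total) for c in neighbour_counts)

def _aggregation_demo(freq_word, freq_left_part, freq_right_part):
    """Cohesion of one split point: freq(word) / (freq(left) * freq(right))."""
    return freq_word / (freq_left_part * freq_right_part)

# e.g. a candidate seen 12 times whose parts are seen 15 and 40 times, with
# right neighbours occurring [5, 4, 3] times:
# _aggregation_demo(12, 15, 40)      -> 0.02
# _boundary_entropy_demo([5, 4, 3])  -> about 1.08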
||||
|
||||
if __name__ == '__main__':
|
||||
text = "PageRank算法简介。" \
|
||||
"是上世纪90年代末提出的一种计算网页权重的算法! " \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法。 " \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
|
||||
"和投票目标的等级来决定新的等级。简单的说, " \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信。 "
|
||||
# wc = count_word(text)
|
||||
# path = "data/poet_tangsong.csv"
|
||||
# wd = WordDiscovery()
|
||||
# res = wd.find_word(text=path, use_type="file", freq_min=2, len_max=6, entropy_min=1.2, aggregation_min=0.4)
|
||||
# from macropodus.preprocess.tools_common import txt_write
|
||||
# import json
|
||||
# res_s = json.dumps(res)
|
||||
# txt_write([res_s], "res_s.txt")
|
||||
# print(res)
|
||||
# with open("res_s.txt", "r", encoding="utf-8") as fd:
|
||||
# ff = fd.readlines()[0]
|
||||
# res_ = json.loads(ff)
|
||||
# res_soft = sorted(res_.items(), key=lambda d: d[1]['score'], reverse=True)
|
||||
wd = WordDiscovery()
|
||||
res = wd.find_word(text=text, use_type="text", use_avg=True, freq_min=2, len_max=7, entropy_min=0.4, aggregation_min=1.2)
|
||||
for k, v in res.items():
|
||||
print(k, v)
|
||||
while True:
|
||||
print("请输入:")
|
||||
ques = input()
|
||||
res = wd.find_word(text=ques, use_type="text", use_avg=True, freq_min=2, len_max=7, entropy_min=0.52, aggregation_min=1.2)
|
||||
for k, v in res.items():
|
||||
print(k, v)
|
||||
# gg = 0
|
14
macropodus/similarity/__init__.py
Normal file
14
macropodus/similarity/__init__.py
Normal file
@ -0,0 +1,14 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/18 22:04
|
||||
# @author : Mo
|
||||
# @function:
|
||||
|
||||
|
||||
from macropodus.similarity.similarity_word2vec_char import SimW2vChar
|
||||
|
||||
|
||||
# 文本相似度
|
||||
use_cache = True # 使用缓存
|
||||
swc = SimW2vChar(use_cache)
|
||||
sim = swc.similarity
|
69
macropodus/similarity/similarity_word2vec_char.py
Normal file
69
macropodus/similarity/similarity_word2vec_char.py
Normal file
@ -0,0 +1,69 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/17 14:50
|
||||
# @author : Mo
|
||||
# @function: similarity of sentence of word2vec
|
||||
|
||||
|
||||
from macropodus.base.word2vec import W2v
|
||||
|
||||
|
||||
class SimW2vChar(W2v):
|
||||
def __init__(self, use_cache):
|
||||
super().__init__(use_cache)
|
||||
|
||||
def encode(self, sent, type_encode="other"):
|
||||
"""
|
||||
生成句向量, 字符级别, char
|
||||
:param sent: str, like "大漠帝国"
|
||||
:param type_encode: str, like "avg", "other"
|
||||
:return: vector
|
||||
"""
|
||||
sentence_vec = self.w2v_char.wv[self.w2v_char.index2word[1]] * 0
|
||||
len_sent = len(sent)
|
||||
for i in range(len_sent):
|
||||
word = sent[i]
|
||||
try:
|
||||
sentence_vec = sentence_vec + self.w2v_char.wv[word]
|
||||
except Exception as e:
|
||||
sentence_vec = sentence_vec + 0.01 # 未登录词(OOV)加0.01
|
||||
if type_encode == "avg":
|
||||
sentence_vec = sentence_vec / len_sent
|
||||
return sentence_vec
|
||||
|
||||
def similarity(self, sent1, sent2, type_sim="total", type_encode="avg"):
|
||||
"""
|
||||
相似度计算, 默认余弦相似度+jaccard相似度
|
||||
:param sent1: str, like "大漠帝国"
|
||||
:param sent2: str, like "Macropodus"
|
||||
:param type_sim: str, like "total" or "cosine"
|
||||
:param type_encode: str, like "other" or "avg"
|
||||
:return: float, like 0.998
|
||||
"""
|
||||
if sent1 and sent2:
|
||||
encode_sen1 = self.encode(sent1, type_encode)
|
||||
encode_sen2 = self.encode(sent2, type_encode)
|
||||
score_res = self.cosine(encode_sen1, encode_sen2)
|
||||
else:
|
||||
score_res = 0.0
|
||||
if type_sim=="total":
|
||||
score_jaccard = self.jaccard(sent1, sent2)
|
||||
score_res = (score_res + score_jaccard)/2
|
||||
return score_res
|
||||
|
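# ---------------------------------------------------------------------------
# Minimal sketch of the two measures averaged in similarity() above; the real
# self.cosine / self.jaccard presumably come from the W2v base class (not
# shown here), so these stand-alone versions only illustrate the formulas.
import numpy as np

def _cosine_demo(v1, v2):
    """cos(v1, v2) = v1 . v2 / (||v1|| * ||v2||)"""
    denom = float(np.linalg.norm(v1) * np.linalg.norm(v2))
    return float(np.dot(v1, v2)) / denom if denom else 0.0

def _jaccard_demo(sent1, sent2):
    """|chars(s1) & chars(s2)| / |chars(s1) | chars(s2)| on character sets."""
    s1, s2 = set(sent1), set(sent2)
    return len(s1 & s2) / len(s1 | s2) if (s1 | s2) else 0.0

# _jaccard_demo("大漠帝国", "大漠王国") -> 0.6  (3 shared chars, 5 distinct)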
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
sent1 = "大漠帝国"
|
||||
sent2 = "macropodus"
|
||||
swc = SimW2vChar(use_cache=True)
|
||||
sen_encede = swc.encode(sent1)
|
||||
score = swc.similarity(sent1, sent2)
|
||||
print(score)
|
||||
gg = 0
|
||||
while True:
|
||||
print("请输入sent1:")
|
||||
sent1 = input()
|
||||
print("请输入sent2:")
|
||||
sent2 = input()
|
||||
print(swc.similarity(sent1, sent2))
|
86
macropodus/summarize/__init__.py
Normal file
86
macropodus/summarize/__init__.py
Normal file
@ -0,0 +1,86 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/18 22:10
|
||||
# @author : Mo
|
||||
# @function: text summarize
|
||||
|
||||
|
||||
# text_summarize of extractive
|
||||
from macropodus.summarize.feature_base.word_significance import WordSignificanceSum
|
||||
from macropodus.summarize.feature_base.text_pronouns import TextPronounsSum
|
||||
from macropodus.summarize.graph_base.textrank import TextRankSum, TextRankKey
|
||||
from macropodus.summarize.feature_base.text_teaser import TextTeaserSum
|
||||
from macropodus.summarize.feature_base.mmr import MMRSum
|
||||
|
||||
from macropodus.summarize.topic_base.topic_lda import LDASum
|
||||
from macropodus.summarize.topic_base.topic_lsi import LSISum
|
||||
from macropodus.summarize.topic_base.topic_nmf import NMFSum
|
||||
|
||||
from macropodus.summarize.nous_base.lead_3 import Lead3Sum
|
||||
|
||||
|
||||
# feature
|
||||
wss = WordSignificanceSum()
|
||||
tps = TextPronounsSum()
|
||||
tts = TextTeaserSum()
|
||||
mms = MMRSum()
|
||||
|
||||
# graph-3
|
||||
trs = TextRankSum()
|
||||
trk = TextRankKey()
|
||||
|
||||
# nous
|
||||
l3s = Lead3Sum()
|
||||
|
||||
# topic
|
||||
lds = LDASum()
|
||||
lss = LSISum()
|
||||
nms = NMFSum()
|
||||
|
||||
# summarization
|
||||
text_pronouns = tps.summarize
|
||||
text_teaser = tts.summarize
|
||||
word_sign = wss.summarize
|
||||
textrank = trs.summarize
|
||||
lead3 = l3s.summarize
|
||||
mmr = mms.summarize
|
||||
lda = lds.summarize
|
||||
lsi = lss.summarize
|
||||
nmf = nms.summarize
|
||||
|
||||
# keyword
|
||||
keyword = trk.keyword
|
||||
|
||||
|
||||
def summarization(text, num=320, type_summarize="lda", topic_min=6, judge_topic=False, alpha=0.6, type_l='mix', model_type="textrank_sklearn", title=None):
|
||||
"""
|
||||
文本摘要汇总
|
||||
:param text: str, like "你是。大漠帝国。不是吧错了。哈哈。我的。"
|
||||
:param num: int, like 32
|
||||
:param type_summarize: str, like "lda", must in ['text_pronouns', 'text_teaser', 'word_sign', 'textrank', 'lead3', 'mmr', 'lda', 'lsi', 'nmf']
|
||||
:return:
|
||||
"""
|
||||
|
||||
if type_summarize=="text_pronouns": # title, str, 可填标题, like "震惊,MacropodusXXX"
|
||||
res = text_pronouns(text, num, title)
|
||||
elif type_summarize=="text_teaser": # title, str, 可填标题, like "震惊,MacropodusXXX"
|
||||
res = text_teaser(text, num, title)
|
||||
elif type_summarize=="word_sign": #
|
||||
res = word_sign(text, num)
|
||||
elif type_summarize=="textrank": # model_type 可填 'textrank_textrank4zh', 'text_rank_sklearn' or 'textrank_gensim'
|
||||
res = textrank(text, num)
|
||||
elif type_summarize=="lead3":
|
||||
res = lead3(text, num, type_l) # type_l 可填 'begin', 'end' or 'mix'
|
||||
elif type_summarize=="mmr":
|
||||
res = mmr(text, num, alpha) # alpha 可填 0-1
|
||||
elif type_summarize=="lda": # topic_min>1, judge_topic=True or False
|
||||
res = lda(text, num, topic_min, judge_topic)
|
||||
elif type_summarize=="lsi": # topic_min>1, judge_topic=True or False
|
||||
res = lsi(text, num, topic_min, judge_topic)
|
||||
elif type_summarize=="nmf": # topic_min>1, judge_topic=True or False
|
||||
res = nmf(text, num, topic_min, judge_topic)
|
||||
else:
|
||||
raise RuntimeError("your input type_summarize is wrong, it must be in "
|
||||
"['text_pronouns', 'text_teaser', 'word_sign', "
|
||||
"'textrank', 'lead3', 'mmr', 'lda', 'lsi', 'nmf']")
|
||||
return res
|
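# ---------------------------------------------------------------------------
# Usage sketch (illustrative; `doc` is any Chinese document string): the
# dispatcher above only routes to one of the summarizers bound earlier in this
# module, so the two calls below are expected to be equivalent.
def _summarize_demo(doc):
    res_dispatch = summarization(doc, num=3, type_summarize="textrank")
    res_direct = textrank(doc, 3)
    return res_dispatch, res_direct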
5
macropodus/summarize/feature_base/__init__.py
Normal file
5
macropodus/summarize/feature_base/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/12/25 21:41
|
||||
# @author :Mo
|
||||
# @function :
|
90
macropodus/summarize/feature_base/mmr.py
Normal file
90
macropodus/summarize/feature_base/mmr.py
Normal file
@ -0,0 +1,90 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/28 20:16
|
||||
# @author :Mo
|
||||
# @function :mmr
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_ml import extract_chinese, cut_sentence
|
||||
from macropodus.preprocess.tools_ml import macropodus_cut, tfidf_fit
|
||||
from macropodus.data.words_common.stop_words import stop_words
|
||||
import copy
|
||||
|
||||
|
||||
class MMRSum:
|
||||
def __init__(self):
|
||||
self.stop_words = stop_words.values()
|
||||
self.algorithm = 'mmr'
|
||||
|
||||
def summarize(self, text, num=8, alpha=0.6):
|
||||
"""
|
||||
|
||||
:param text: str
|
||||
:param num: int
|
||||
:return: list
|
||||
"""
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
self.sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
self.sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
# 切词
|
||||
sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
|
||||
if word.strip()] for sentence in self.sentences]
|
||||
# 去除停用词等
|
||||
self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
|
||||
self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
|
||||
# # 计算每个句子的词语个数
|
||||
# sen_word_len = [len(sc)+1 for sc in sentences_cut]
|
||||
# 计算每个句子的tfidf
|
||||
sen_tfidf = tfidf_fit(self.sentences_cut)
|
||||
# 矩阵中两两句子相似度
|
||||
SimMatrix = (sen_tfidf * sen_tfidf.T).A # 例如: SimMatrix[1, 3] # "第2篇与第4篇的相似度"
|
||||
# 输入文本句子长度
|
||||
len_sen = len(self.sentences)
|
||||
# 句子标号
|
||||
sen_idx = [i for i in range(len_sen)]
|
||||
summary_set = []
|
||||
mmr = {}
|
||||
for i in range(len_sen):
|
||||
if not self.sentences[i] in summary_set:
|
||||
sen_idx_pop = copy.deepcopy(sen_idx)
|
||||
sen_idx_pop.pop(i)
|
||||
# 两两句子相似度
|
||||
sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop]
|
||||
score_tfidf = sen_tfidf[i].toarray()[0].sum() # / sen_word_len[i], 如果除以词语个数就不准确
|
||||
mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j)
|
||||
summary_set.append(self.sentences[i])
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)]
|
||||
if len(mmr) > num:
|
||||
score_sen = score_sen[0:num]
|
||||
return score_sen
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
mmr_sum = MMRSum()
|
||||
doc = "PageRank算法简介。" \
|
||||
"是上世纪90年代末提出的一种计算网页权重的算法! " \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法。 " \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
|
||||
"和投票目标的等级来决定新的等级。简单的说, " \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信。 "
|
||||
sum = mmr_sum.summarize(doc)
|
||||
for i in sum:
|
||||
print(i)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
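The score assigned to each sentence in MMRSum.summarize follows the usual relevance-versus-redundancy trade-off; a tiny standalone illustration with made-up numbers:

```python3
# alpha * relevance - (1 - alpha) * redundancy, as in mmr[...] above (illustrative numbers only).
alpha = 0.6
score_tfidf = 0.8            # sum of the sentence's tf-idf weights (relevance proxy)
sim_to_others = [0.1, 0.5]   # cosine similarities to the other sentences
mmr_score = alpha * score_tfidf - (1 - alpha) * max(sim_to_others)
print(round(mmr_score, 3))   # 0.28
```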
235
macropodus/summarize/feature_base/text_pronouns.py
Normal file
@ -0,0 +1,235 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/8/25 20:51
|
||||
# @author :Mo
|
||||
# @paper :Sentence Extraction Based Single Document Summarization(2005)
|
||||
# @function :text summary of feature-base
|
||||
# @evaluate :bad, it is designed for English and the formula is not clearly explained in the paper
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_ml import macropodus_cut, jieba_tag_cut
|
||||
from macropodus.data.words_common.stop_words import stop_words
|
||||
from macropodus.preprocess.tools_ml import extract_chinese
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
from macropodus.preprocess.tools_ml import get_ngrams
|
||||
# import jieba.analyse as analyse
|
||||
from collections import Counter
|
||||
|
||||
|
||||
# # jieba预训练好的idf值
|
||||
# default_tfidf = analyse.default_tfidf
|
||||
# # 引入TF-IDF关键词抽取接口
|
||||
# tfidf = analyse.extract_tags
|
||||
# # 引入TextRank关键词抽取接口
|
||||
# textrank = analyse.textrank
|
||||
|
||||
|
||||
CHAR_PUMCTUATION = ',.:;?!`\'"[]{}<>。?!,、;:“” ‘’「」『』《》()[]〔〕【】——……—-~·《》〈〉﹏﹏.___'
|
||||
CHAR_ENGLISH = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||
CHAR_NUMBER = "0123456789零一二两三四五六七八九"
|
||||
CHAR_CHINESE = "\u4e00-\u9fa5"
|
||||
ES_MIN = 1e-9
|
||||
|
||||
|
||||
class TextPronounsSum:
|
||||
def __init__(self):
|
||||
self.algorithm = 'text_pronouns'
|
||||
self.stop_words = stop_words.values()
|
||||
self.len_ideal = 18 # 中心句子长度, 默认
|
||||
|
||||
def score_position(self):
|
||||
"""
|
||||
        Sentence position score
|
||||
:param sentence:
|
||||
:return:
|
||||
"""
|
||||
score_position = []
|
||||
for i, _ in enumerate(self.sentences):
|
||||
score_standard = i / (len(self.sentences))
|
||||
if score_standard >= 0 and score_standard <= 0.1:
|
||||
score_position.append(0.17)
|
||||
elif score_standard > 0.1 and score_standard <= 0.2:
|
||||
score_position.append(0.23)
|
||||
elif score_standard > 0.2 and score_standard <= 0.3:
|
||||
score_position.append(0.14)
|
||||
elif score_standard > 0.3 and score_standard <= 0.4:
|
||||
score_position.append(0.08)
|
||||
elif score_standard > 0.4 and score_standard <= 0.5:
|
||||
score_position.append(0.05)
|
||||
elif score_standard > 0.5 and score_standard <= 0.6:
|
||||
score_position.append(0.04)
|
||||
elif score_standard > 0.6 and score_standard <= 0.7:
|
||||
score_position.append(0.06)
|
||||
elif score_standard > 0.7 and score_standard <= 0.8:
|
||||
score_position.append(0.04)
|
||||
elif score_standard > 0.8 and score_standard <= 0.9:
|
||||
score_position.append(0.04)
|
||||
elif score_standard > 0.9 and score_standard <= 1.0:
|
||||
score_position.append(0.15)
|
||||
else:
|
||||
score_position.append(0)
|
||||
return score_position
|
||||
|
||||
def score_length(self):
|
||||
"""
|
||||
文本长度得分
|
||||
:param sentence:
|
||||
:return:
|
||||
"""
|
||||
score_length = []
|
||||
for i, sentence in enumerate(self.sentences):
|
||||
score_len = 1 - abs(self.len_ideal - len(sentence)) / self.len_ideal
|
||||
score_length.append(score_len)
|
||||
return score_length
|
||||
|
||||
def score_tag(self):
|
||||
"""
|
||||
词性打分名词-动词-代词(n,v,r)
|
||||
:return:
|
||||
"""
|
||||
score_tag = []
|
||||
for i, sen_tag_score in enumerate(self.sentences_tag_cut):
|
||||
sen_tag = sen_tag_score.values()
|
||||
tag_dict = dict(Counter(sen_tag))
|
||||
tag_n = tag_dict.get('n', 0) + tag_dict.get('nr', 0) + tag_dict.get('ns', 0) + \
|
||||
tag_dict.get('nt', 0) + tag_dict.get('nz', 0) + tag_dict.get('ng', 0)
|
||||
tag_v = tag_dict.get('v', 0) + tag_dict.get('vd', 0) + tag_dict.get('vn', 0) + tag_dict.get('vg', 0)
|
||||
tag_p = tag_dict.get('r', 0)
|
||||
score_sen_tag = (1.2 * tag_n + 1.0 * tag_v + 0.8 * tag_p)/(len(sen_tag_score) + 1)
|
||||
score_tag.append(score_sen_tag)
|
||||
return score_tag
|
||||
|
||||
def score_title(self, words):
|
||||
"""
|
||||
与标题重合部分词语
|
||||
:param words:
|
||||
:return:
|
||||
"""
|
||||
mix_word = [word for word in words if word in self.title]
|
||||
len_mix_word = len(mix_word)
|
||||
len_title_word = len(self.title)
|
||||
return (len_mix_word + 1.0) / (len_mix_word + 2.0) / len_title_word
|
||||
|
||||
def summarize(self, text, num=320, title=None):
|
||||
"""
|
||||
文本句子排序
|
||||
:param docs: list
|
||||
:return: list
|
||||
"""
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
self.sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
self.sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
self.title = title
|
||||
if self.title:
|
||||
self.title = macropodus_cut(title)
|
||||
# 切词,含词性标注
|
||||
self.sentences_tag_cut = [jieba_tag_cut(extract_chinese(sentence)) for sentence in self.sentences]
|
||||
# 词语,不含词性标注
|
||||
sentences_cut = [[jc for jc in jtc.keys() ] for jtc in self.sentences_tag_cut]
|
||||
# 去除停用词等
|
||||
self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
|
||||
# 词频统计
|
||||
self.words = []
|
||||
for sen in self.sentences_cut:
|
||||
self.words = self.words + sen
|
||||
self.word_count = dict(Counter(self.words))
|
||||
# 按频次计算词语的得分, 得到self.word_freq=[{'word':, 'freq':, 'score':}]
|
||||
self.word_freqs = {}
|
||||
self.len_words = len(self.words)
|
||||
for k, v in self.word_count.items():
|
||||
self.word_freqs[k] = v * 0.5 / self.len_words
|
||||
# uni_bi_tri_gram特征
|
||||
[gram_uni, gram_bi, gram_tri] = get_ngrams("".join(self.sentences), ns=[1, 2, 3])
|
||||
ngrams = gram_uni + gram_bi + gram_tri
|
||||
self.ngrams_count = dict(Counter(ngrams))
|
||||
# 句子位置打分
|
||||
scores_posi = self.score_position()
|
||||
# 句子长度打分
|
||||
scores_length = self.score_length()
|
||||
# 句子词性打分, 名词(1.2)-代词(0.8)-动词(1.0)
|
||||
scores_tag = self.score_tag()
|
||||
|
||||
res_rank = {}
|
||||
self.res_score = []
|
||||
for i in range(len(sentences_cut)):
|
||||
sen_cut = self.sentences_cut[i] # 句子中的词语
|
||||
# ngram得分
|
||||
[gram_uni_, gram_bi_, gram_tri_] = get_ngrams(self.sentences[i], ns=[1, 2, 3]) # gram_uni_bi_tri(self.sentences[i])
|
||||
n_gram_s = gram_uni_ + gram_bi_ + gram_tri_
|
||||
score_ngram = sum([self.ngrams_count[ngs] if ngs in self.ngrams_count else 0 for ngs in n_gram_s]) / (len(n_gram_s) + 1)
|
||||
# 句子中词语的平均长度
|
||||
score_word_length_avg = sum([len(sc) for sc in sen_cut])/(len(sen_cut)+1)
|
||||
score_posi = scores_posi[i]
|
||||
score_length = scores_length[i]
|
||||
score_tag = scores_tag[i]
|
||||
if self.title: # 有标题的文本打分合并
|
||||
score_title = self.score_title(sen_cut)
|
||||
score_total = (score_title * 0.5 + score_ngram * 2.0 + score_word_length_avg * 0.5 +
|
||||
score_length * 0.5 + score_posi * 1.0 + score_tag * 0.6) / 6.0
|
||||
# 可查阅各部分得分统计
|
||||
self.res_score.append(["score_title", "score_ngram", "score_word_length_avg",
|
||||
"score_length", "score_posi", "score_tag"])
|
||||
self.res_score.append([score_title, score_ngram, score_word_length_avg,
|
||||
score_length, score_posi, score_tag, self.sentences[i]])
|
||||
else: # 无标题的文本打分合并
|
||||
score_total = (score_ngram * 2.0 + score_word_length_avg * 0.5 + score_length * 0.5 +
|
||||
score_posi * 1.0 + score_tag * 0.6) / 5.0
|
||||
# 可查阅各部分得分统计
|
||||
self.res_score.append(["score_ngram", "score_word_length_avg",
|
||||
"score_length", "score_posi", "score_tag"])
|
||||
self.res_score.append([score_ngram, score_word_length_avg,
|
||||
score_length, score_posi, score_tag, self.sentences[i]])
|
||||
res_rank[self.sentences[i].strip()] = score_total
|
||||
# 最小句子数
|
||||
num_min = min(num, int(len(self.word_count) * 0.6))
|
||||
res_rank_sort = sorted(res_rank.items(), key=lambda rr: rr[1], reverse=True)
|
||||
res_rank_sort_reverse = [(rrs[1], rrs[0]) for rrs in res_rank_sort][0:num_min]
|
||||
return res_rank_sort_reverse
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sen = "自然语言理解(NLU,Natural Language Understanding): 使计算机理解自然语言(人类语言文字)等,重在理解。"
|
||||
tp = TextPronounsSum()
|
||||
docs ="和投票目标的等级来决定新的等级.简单的说。" \
|
||||
"是上世纪90年代末提出的一种计算网页权重的算法! " \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法。" \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。" \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票。" \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。" \
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设。" \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重)。" \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。" \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信。"
|
||||
|
||||
docs1 = "/article/details/98530760。" \
|
||||
"CSDN\n。" \
|
||||
"文本生成NLG,不同于文本理解NLU(例如分词、词向量、分类、实体提取。" \
|
||||
"是重在文本生成的另一种关键技术(常用的有翻译、摘要、同义句生成等)。" \
|
||||
"传统的文本生成NLG任务主要是抽取式的,生成式的方法看起来到现在使用也没有那么普遍。" \
|
||||
"现在,我记录的是textrank,一种使用比较广泛的抽取式关键句提取算法。" \
|
||||
"版权声明:本文为CSDN博主「大漠帝国」的原创文章,遵循CC 4.0 by-sa版权协议," \
|
||||
"转载请附上原文出处链接及本声明。原文链接:https://blog.csdn.net/rensihui" \
|
||||
"CSDN是神"
|
||||
sums = tp.summarize(docs)
|
||||
for sum_ in sums:
|
||||
print(sum_)
|
||||
|
||||
# ran_20 = range(20)
|
||||
# print(type(ran_20))
|
||||
# print(ran_20)
|
||||
# idx = [1,2,3]
|
||||
# idx.pop(1)
|
||||
# print(idx)
|
||||
# print(max([1,2,3,4]))
|
||||
|
||||
|
||||
|
||||
|
193
macropodus/summarize/feature_base/text_teaser.py
Normal file
@ -0,0 +1,193 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/26 20:02
|
||||
# @author :Mo
|
||||
# @function :text summary of feature-base of TextTeaser
|
||||
# @paper :Automatic Text Summarization for Indonesian Language Using TextTeaser(2013)
|
||||
# @url :using Google Scholar
|
||||
|
||||
|
||||
from macropodus.data.words_common.stop_words import stop_words
|
||||
from macropodus.preprocess.tools_ml import extract_chinese
|
||||
from macropodus.preprocess.tools_ml import macropodus_cut
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
from collections import Counter
|
||||
|
||||
|
||||
class TextTeaserSum:
|
||||
def __init__(self):
|
||||
self.algorithm = 'text_teaser'
|
||||
self.stop_words = stop_words.values()
|
||||
self.len_ideal = 18 # 中心句子长度, 默认
|
||||
|
||||
def score_position(self):
|
||||
"""
|
||||
文本句子位置得分
|
||||
:param sentence:
|
||||
:return:
|
||||
"""
|
||||
score_position = []
|
||||
for i, sen in enumerate(self.sentences):
|
||||
score_standard = i / (len(self.sentences))
|
||||
if score_standard >= 0 and score_standard <= 0.1:
|
||||
score_position.append(0.17)
|
||||
elif score_standard > 0.1 and score_standard <= 0.2:
|
||||
score_position.append(0.23)
|
||||
elif score_standard > 0.2 and score_standard <= 0.3:
|
||||
score_position.append(0.14)
|
||||
elif score_standard > 0.3 and score_standard <= 0.4:
|
||||
score_position.append(0.08)
|
||||
elif score_standard > 0.4 and score_standard <= 0.5:
|
||||
score_position.append(0.05)
|
||||
elif score_standard > 0.5 and score_standard <= 0.6:
|
||||
score_position.append(0.04)
|
||||
elif score_standard > 0.6 and score_standard <= 0.7:
|
||||
score_position.append(0.06)
|
||||
elif score_standard > 0.7 and score_standard <= 0.8:
|
||||
score_position.append(0.04)
|
||||
elif score_standard > 0.8 and score_standard <= 0.9:
|
||||
score_position.append(0.04)
|
||||
elif score_standard > 0.9 and score_standard <= 1.0:
|
||||
score_position.append(0.15)
|
||||
else:
|
||||
score_position.append(0)
|
||||
return score_position
|
||||
|
||||
def score_length(self, sentence):
|
||||
"""
|
||||
文本长度得分
|
||||
:param sentence:
|
||||
:return:
|
||||
"""
|
||||
score_length = 1 - min(abs(self.len_ideal - len(sentence)), self.len_ideal) / self.len_ideal
|
||||
return score_length
|
||||
|
||||
def score_sbs(self, words):
|
||||
"""
|
||||
单个句子的sbs分数
|
||||
:param words:
|
||||
:return:
|
||||
"""
|
||||
score_sbs = 0.0
|
||||
for word in words:
|
||||
if word in self.word_freqs:
|
||||
score_sbs += self.word_freqs[word]
|
||||
return ((1.0 / abs(len(words))) if len(words) else 1e-9) * score_sbs
|
||||
|
||||
def score_dbs(self, words):
|
||||
"""
|
||||
单个句子的dbs分数
|
||||
:param words:
|
||||
:return:
|
||||
"""
|
||||
words_all = list(self.word_freqs.keys())
|
||||
pun = len(set(words)&set(words_all)) + 1
|
||||
score_dbs = 0.0
|
||||
wf_first = []
|
||||
for i, word in enumerate(words):
|
||||
if word in words_all:
|
||||
index = words_all.index(word)
|
||||
if not wf_first:
|
||||
wf_first = [index, self.word_freqs[word]]
|
||||
else:
|
||||
score_dbs += wf_first[1]*self.word_freqs[word] / (((wf_first[0] - index) if (wf_first[0] - index)!=0 else self.len_words)**2)
|
||||
score_dbs = score_dbs if score_dbs !=0 else 1e-9
|
||||
return (1.0 / pun * (pun + 1.0)) * score_dbs
|
||||
|
||||
def score_title(self, words):
|
||||
"""
|
||||
与标题重合部分词语
|
||||
:param words:
|
||||
:return:
|
||||
"""
|
||||
mix_word = [word for word in words if word in self.title]
|
||||
len_mix_word = len(mix_word)
|
||||
len_title_word = len(self.title)
|
||||
return (len_mix_word + 1.0) / (len_mix_word + 2.0) / len_title_word
|
||||
|
||||
def summarize(self, text, num=320, title=None):
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
self.sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
self.sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
self.title = title
|
||||
if self.title:
|
||||
self.title = macropodus_cut(title)
|
||||
# 切词
|
||||
sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
|
||||
if word.strip()] for sentence in self.sentences]
|
||||
# 去除停用词等
|
||||
self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
|
||||
# 词频统计
|
||||
self.words = []
|
||||
for sen in self.sentences_cut:
|
||||
self.words = self.words + sen
|
||||
self.word_count = dict(Counter(self.words))
|
||||
# word_count_rank = sorted(word_count.items(), key=lambda f:f[1], reverse=True)
|
||||
# self.word_freqs = [{'word':wcr[0], 'freq':wcr[1]} for wcr in word_count_rank]
|
||||
# 按频次计算词语的得分, 得到self.word_freq=[{'word':, 'freq':, 'score':}]
|
||||
self.word_freqs = {}
|
||||
self.len_words = len(self.words)
|
||||
for k, v in self.word_count.items():
|
||||
self.word_freqs[k] = v * 0.5 / self.len_words
|
||||
# 句子位置打分
|
||||
scores_posi = self.score_position()
|
||||
res_rank = {}
|
||||
self.res_score = []
|
||||
for i in range(len(sentences_cut)):
|
||||
sen = self.sentences[i] # 句子
|
||||
sen_cut = self.sentences_cut[i] # 句子中的词语
|
||||
score_sbs = self.score_sbs(sen_cut) # 句子中的词语打分1
|
||||
score_dbs = self.score_dbs(sen_cut) # 句子中的词语打分2
|
||||
score_word = (score_sbs + score_dbs) * 10.0 / 2.0 # 句子中的词语打分mix
|
||||
score_length = self.score_length(sen) # 句子文本长度打分
|
||||
score_posi = scores_posi[i]
|
||||
if self.title: # 有标题的文本打分合并
|
||||
score_title = self.score_title(sen_cut)
|
||||
score_total = (score_title * 0.5 + score_word * 2.0 + score_length * 0.5 + score_posi * 1.0) / 4.0
|
||||
# 可查阅各部分得分统计
|
||||
self.res_score.append(["score_total", "score_sbs", "score_dbs", "score_word", "score_length", "score_posi", "score_title", "sentences"])
|
||||
self.res_score.append([score_total, score_sbs, score_dbs, score_word, score_length, score_posi, score_title, self.sentences[i]])
|
||||
else: # 无标题的文本打分合并
|
||||
score_total = (score_word * 2.0 + score_length * 0.5 + score_posi * 1.0) / 3.5
|
||||
self.res_score.append(["score_total", "score_sbs", "score_dbs", "score_word", "score_length", "score_posi", "sentences"])
|
||||
self.res_score.append([score_total, score_sbs, score_dbs, score_word, score_length, score_posi, self.sentences[i].strip()])
|
||||
res_rank[self.sentences[i].strip()] = score_total
|
||||
# 最小句子数
|
||||
num_min = min(num, int(len(self.word_count) * 0.6))
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(res_rank.items(), key=lambda d: d[1], reverse=True)][0:num_min]
|
||||
return score_sen
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
doc1 = "PageRank算法简介。" \
|
||||
"是上世纪90年代末提出的一种计算网页权重的算法! " \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法。 " \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
|
||||
"和投票目标的等级来决定新的等级。简单的说, " \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信。 "
|
||||
title = "方直科技等公司合伙设立教育投资基金"
|
||||
doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \
|
||||
"与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \
|
||||
"该基金认缴出资总规模为人民币3.01亿元。" \
|
||||
"基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \
|
||||
"各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \
|
||||
"截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \
|
||||
"公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \
|
||||
"方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"
|
||||
tt = TextTeaserSum()
|
||||
res_ = tt.summarize(doc)
|
||||
for res in res_:
|
||||
print(res)
|
||||
gg = 0
|
112
macropodus/summarize/feature_base/word_significance.py
Normal file
@ -0,0 +1,112 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/8/26 23:42
|
||||
# @author :Mo
|
||||
# @function :text summarize of extraction of word significance
|
||||
# @paper :The Automatic Creation of Literature Abstracts*
|
||||
# @url :http://courses.ischool.berkeley.edu/i256/f06/papers/luhn58.pdf
|
||||
|
||||
|
||||
from macropodus.data.words_common.stop_words import stop_words
|
||||
from macropodus.preprocess.tools_ml import extract_chinese
|
||||
from macropodus.preprocess.tools_ml import macropodus_cut
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
from collections import Counter
|
||||
|
||||
|
||||
class WordSignificanceSum:
|
||||
def __init__(self):
|
||||
"""
|
||||
features:
|
||||
1. words mix in title and sentence
|
||||
2. keywords in sentence
|
||||
3. Position of sentence
|
||||
4. Length of sentence
|
||||
"""
|
||||
self.algorithm = 'word_significance'
|
||||
self.stop_words = stop_words.values()
|
||||
self.num = 0
|
||||
|
||||
def summarize(self, text, num=320):
|
||||
"""
|
||||
根据词语意义确定中心句
|
||||
:param text: str
|
||||
:param num: int
|
||||
:return: list
|
||||
"""
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
self.sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
self.sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
# 切词
|
||||
sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
|
||||
if word.strip()] for sentence in self.sentences]
|
||||
# 去除停用词等
|
||||
self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
|
||||
# 词频统计
|
||||
self.words = []
|
||||
for sen in self.sentences_cut:
|
||||
self.words = self.words + sen
|
||||
self.word_count = dict(Counter(self.words))
|
||||
self.word_count_rank = sorted(self.word_count.items(), key=lambda f: f[1], reverse=True)
|
||||
# 最小句子数
|
||||
num_min = min(num, int(len(self.word_count)*0.6))
|
||||
# 词语排序, 按照词频
|
||||
self.word_rank = [wcr[0] for wcr in self.word_count_rank][0:num_min]
|
||||
res_sentence = []
|
||||
# 抽取句子, 顺序, 如果词频高的词语在句子里, 则抽取
|
||||
for word in self.word_rank:
|
||||
for i in range(0, len(self.sentences)):
|
||||
# 当返回关键句子到达一定量, 则结束返回
|
||||
if len(res_sentence) < num_min:
|
||||
added = False
|
||||
for sent in res_sentence:
|
||||
if sent == self.sentences[i]: added = True
|
||||
if (added == False and word in self.sentences[i]):
|
||||
res_sentence.append(self.sentences[i])
|
||||
break
|
||||
# 只是计算各得分,没什么用
|
||||
len_sentence = len(self.sentences)
|
||||
res_sentence = [(1-1/(len_sentence+len_sentence/(k+1)), rs) for k, rs in enumerate(res_sentence)]
|
||||
return res_sentence
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
doc = "PageRank算法简介。" \
|
||||
"是上世纪90年代末提出的一种计算网页权重的算法! " \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法。 " \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
|
||||
"和投票目标的等级来决定新的等级。简单的说, " \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信。 "
|
||||
|
||||
doc1 = "多知网. "\
|
||||
"多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \
|
||||
"与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \
|
||||
"该基金认缴出资总规模为人民币3.01亿元。" \
|
||||
"基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \
|
||||
"各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \
|
||||
"截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \
|
||||
"公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \
|
||||
"方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"
|
||||
|
||||
ws = WordSignificanceSum()
|
||||
res = ws.summarize(doc, num=6)
|
||||
for r in res:
|
||||
print(r)
|
||||
|
||||
|
||||
"多知网. 多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元,与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。该基金认缴出资总规模为人民币3.01亿元。基金的出资方式具体如下:出资进度方面,基金合伙人的出资" \
|
||||
"应于基金成立之日起四年内分四期缴足,每期缴付7525万元;各基金合伙人每期按其出资比例缴付。" \
|
||||
"合伙期限为11年,投资目标为教育领域初创期或成长期企业。截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"
|
||||
|
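WordSignificanceSum follows Luhn's idea of picking, for each high-frequency term, the first not-yet-selected sentence that contains it; a self-contained sketch of that loop (character-level counting stands in for the real tokenizer and stop-word list):

```python3
from collections import Counter

sentences = ["深度学习是机器学习的分支。", "机器学习需要大量数据。", "数据是模型的燃料。"]
chars = [c for s in sentences for c in s if c not in "。的是"]   # crude stop-character filter
top_terms = [c for c, _ in Counter(chars).most_common(3)]        # highest-frequency terms
picked = []
for term in top_terms:
    for s in sentences:
        if term in s and s not in picked:
            picked.append(s)                                     # first unseen sentence containing the term
            break
print(picked)
```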
5
macropodus/summarize/graph_base/__init__.py
Normal file
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/11/25 21:42
# @author :Mo
# @function :
84
macropodus/summarize/graph_base/textrank.py
Normal file
@ -0,0 +1,84 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/29 22:39
|
||||
# @author :Mo
|
||||
# @function :textrank of textrank4zh, sklearn or gensim
|
||||
|
||||
|
||||
from macropodus.summarize.graph_base.textrank_word2vec import TextrankWord2vec
|
||||
from macropodus.summarize.graph_base.textrank_gensim import TextrankGensimSum
|
||||
from macropodus.summarize.graph_base.textrank_sklearn import TextrankSklearn
|
||||
|
||||
|
||||
# textrank of gensim
|
||||
trgs = TextrankGensimSum()
|
||||
# textrank of word2vec
|
||||
trwv = TextrankWord2vec()
|
||||
# textrank of sklearn
|
||||
trsk = TextrankSklearn()
|
||||
|
||||
|
||||
class TextRankSum:
|
||||
def __init__(self):
|
||||
self.algorithm = 'textrank'
|
||||
|
||||
def summarize(self, text, num=6, model_type="textrank_word2vec"):
|
||||
"""
|
||||
文本摘要
|
||||
:param text:str, like "你好!大漠帝国!"
|
||||
:param num: int, like 3
|
||||
:param model_type: str, like "textrank_sklearn"
|
||||
:return: list
|
||||
"""
|
||||
if model_type=="textrank_sklearn":
|
||||
res = trsk.summarize(text, num=num)
|
||||
elif model_type=="textrank_gensim":
|
||||
res = trgs.summarize(text, num=num)
|
||||
elif model_type=="textrank_word2vec":
|
||||
res = trwv.summarize(text, num=num)
|
||||
else:
|
||||
raise RuntimeError(" model_type must be 'textrank_textrank4zh', 'text_rank_sklearn' or 'textrank_gensim' ")
|
||||
|
||||
return res
|
||||
|
||||
class TextRankKey:
|
||||
def __init__(self):
|
||||
self.algorithm = 'keyword'
|
||||
|
||||
def keyword(self, text, num=6, score_min=0.025, model_type="keywor_word2vec"):
|
||||
if model_type=="keywor_word2vec":
|
||||
res = trwv.keyword(text, num=num, score_min=score_min)
|
||||
else:
|
||||
raise RuntimeError(" model_type must be 'keywor_word2vec'")
|
||||
|
||||
return res
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
doc = "和投票目标的等级来决定新的等级.简单的说。" \
|
||||
"是上世纪90年代末提出的一种计算网页权重的算法!" \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法。" \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。" \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票。" \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。" \
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设。" \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重)。" \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。" \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信。"
|
||||
|
||||
text = doc.encode('utf-8').decode('utf-8')
|
||||
|
||||
tr = TextRankSum()
|
||||
kw = TextRankKey()
|
||||
score_ques = tr.summarize(text, num=100, model_type="textrank_gensim") # "text_rank_sklearn")
|
||||
for sq in score_ques:
|
||||
print(sq)
|
||||
|
||||
score_ques = kw.keyword(text, num=100, model_type="keywor_word2vec") # "text_rank_sklearn")
|
||||
for sq in score_ques:
|
||||
print(sq)
|
362
macropodus/summarize/graph_base/textrank_gensim.py
Normal file
@ -0,0 +1,362 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/8/21 22:01
|
||||
# @author :Mo
|
||||
# @function :textrank using gensim summarization of chinese. (split is '. ', '! ', '. ' and so on)
|
||||
# @code from:most code from https://github.com/RaRe-Technologies/gensim
|
||||
|
||||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
||||
# """This module provides functions for summarizing texts. Summarizing is based on
|
||||
# ranks of text sentences using a variation of the TextRank algorithm [1]_.
|
||||
#
|
||||
# .. [1] Federico Barrios, Federico L´opez, Luis Argerich, Rosita Wachenchauzer (2016).
|
||||
# Variations of the Similarity Function of TextRank for Automated Summarization,
|
||||
# https://arxiv.org/abs/1602.03606
|
||||
|
||||
|
||||
from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
|
||||
from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank
|
||||
from gensim.summarization.commons import build_graph as _build_graph
|
||||
from gensim.summarization.bm25 import iter_bm25_bow as _bm25_weights
|
||||
from gensim.corpora import Dictionary
|
||||
from gensim.utils import deprecated
|
||||
from math import log10 as _log10
|
||||
from six.moves import range
|
||||
import logging
|
||||
|
||||
from macropodus.data.words_common.stop_words import stop_words
|
||||
from macropodus.preprocess.tools_ml import macropodus_cut
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
WEIGHT_THRESHOLD = 1.e-3
|
||||
INPUT_MIN_LENGTH = 2
|
||||
|
||||
|
||||
class TextrankGensimSum:
|
||||
def __init__(self):
|
||||
self.algorithm = 'textrank_gensim'
|
||||
self.stop_words = stop_words.values()
|
||||
self.len_ideal = 18 # 中心句子长度, 默认
|
||||
|
||||
def summarize(self, text, num=320):
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
# str of sentence >>> index
|
||||
corpus = _build_corpus(sentences)
|
||||
# pagerank and so on
|
||||
most_important_docs = summarize_corpus(corpus)
|
||||
|
||||
count = 0
|
||||
sentences_score = {}
|
||||
for cor in corpus:
|
||||
tuple_cor = tuple(cor)
|
||||
sentences_score[sentences[count]] = most_important_docs[tuple_cor]
|
||||
count += 1
|
||||
# 最小句子数
|
||||
num_min = min(num, int(len(sentences) * 0.6))
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(sentences_score.items(),
|
||||
key=lambda d: d[1], reverse=True)][0:num_min]
|
||||
return score_sen
|
||||
|
||||
|
||||
|
||||
def _set_graph_edge_weights(graph):
|
||||
"""Sets weights using BM25 algorithm. Leaves small weights as zeroes. If all weights are fairly small,
|
||||
forces all weights to 1, inplace.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
graph : :class:`~gensim.summarization.graph.Graph`
|
||||
Given graph.
|
||||
|
||||
"""
|
||||
documents = graph.nodes()
|
||||
weights = _bm25_weights(documents)
|
||||
|
||||
for i, doc_bow in enumerate(weights):
|
||||
if i % 1000 == 0 and i > 0:
|
||||
logger.info('PROGRESS: processing %s/%s doc (%s non zero elements)', i, len(documents), len(doc_bow))
|
||||
|
||||
for j, weight in doc_bow:
|
||||
if i == j or weight < WEIGHT_THRESHOLD:
|
||||
continue
|
||||
|
||||
edge = (documents[i], documents[j])
|
||||
|
||||
if not graph.has_edge(edge):
|
||||
graph.add_edge(edge, weight)
|
||||
|
||||
# Handles the case in which all similarities are zero.
|
||||
# The resultant summary will consist of random sentences.
|
||||
if all(graph.edge_weight(edge) == 0 for edge in graph.iter_edges()):
|
||||
_create_valid_graph(graph)
|
||||
|
||||
|
||||
def _create_valid_graph(graph):
|
||||
"""Sets all weights of edges for different edges as 1, inplace.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
graph : :class:`~gensim.summarization.graph.Graph`
|
||||
Given graph.
|
||||
|
||||
"""
|
||||
nodes = graph.nodes()
|
||||
|
||||
for i in range(len(nodes)):
|
||||
for j in range(len(nodes)):
|
||||
if i == j:
|
||||
continue
|
||||
|
||||
edge = (nodes[i], nodes[j])
|
||||
|
||||
if graph.has_edge(edge):
|
||||
graph.del_edge(edge)
|
||||
|
||||
graph.add_edge(edge, 1)
|
||||
|
||||
|
||||
@deprecated("Function will be removed in 4.0.0")
|
||||
def _get_doc_length(doc):
|
||||
"""Get length of (tokenized) document.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
doc : list of (list of (tuple of int))
|
||||
Given document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int
|
||||
Length of document.
|
||||
|
||||
"""
|
||||
return sum(item[1] for item in doc)
|
||||
|
||||
|
||||
@deprecated("Function will be removed in 4.0.0")
|
||||
def _get_similarity(doc1, doc2, vec1, vec2):
|
||||
"""Returns similarity of two documents.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
doc1 : list of (list of (tuple of int))
|
||||
First document.
|
||||
doc2 : list of (list of (tuple of int))
|
||||
Second document.
|
||||
vec1 : array
|
||||
? of first document.
|
||||
    vec2 : array
        ? of second document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
Similarity of two documents.
|
||||
|
||||
"""
|
||||
numerator = vec1.dot(vec2.transpose()).toarray()[0][0]
|
||||
length_1 = _get_doc_length(doc1)
|
||||
length_2 = _get_doc_length(doc2)
|
||||
|
||||
denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0
|
||||
|
||||
return numerator / denominator if denominator != 0 else 0
|
||||
|
||||
|
||||
def _build_corpus(sentences):
|
||||
"""Construct corpus from provided sentences.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
|
||||
Given sentences.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list of list of (int, int)
|
||||
Corpus built from sentences.
|
||||
|
||||
"""
|
||||
split_tokens = [macropodus_cut(sentence) for sentence in sentences]
|
||||
dictionary = Dictionary(split_tokens)
|
||||
return [dictionary.doc2bow(token) for token in split_tokens]
|
||||
|
||||
|
||||
def _get_important_sentences(sentences, corpus, important_docs):
|
||||
"""Get most important sentences.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
|
||||
Given sentences.
|
||||
corpus : list of list of (int, int)
|
||||
Provided corpus.
|
||||
important_docs : list of list of (int, int)
|
||||
Most important documents of the corpus.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
|
||||
Most important sentences.
|
||||
|
||||
"""
|
||||
hashable_corpus = _build_hasheable_corpus(corpus)
|
||||
sentences_by_corpus = dict(zip(hashable_corpus, sentences))
|
||||
return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs]
|
||||
|
||||
|
||||
def _get_sentences_with_word_count(sentences, word_count):
|
||||
"""Get list of sentences. Total number of returned words close to specified `word_count`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
|
||||
Given sentences.
|
||||
word_count : int or None
|
||||
Number of returned words. If None full most important sentences will be returned.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
|
||||
Most important sentences.
|
||||
|
||||
"""
|
||||
length = 0
|
||||
selected_sentences = []
|
||||
|
||||
# Loops until the word count is reached.
|
||||
for sentence in sentences:
|
||||
words_in_sentence = len(sentence.text.split())
|
||||
|
||||
# Checks if the inclusion of the sentence gives a better approximation
|
||||
# to the word parameter.
|
||||
if abs(word_count - length - words_in_sentence) > abs(word_count - length):
|
||||
return selected_sentences
|
||||
|
||||
selected_sentences.append(sentence)
|
||||
length += words_in_sentence
|
||||
|
||||
return selected_sentences
|
||||
|
||||
|
||||
def _extract_important_sentences(sentences, corpus, important_docs, word_count):
|
||||
"""Get most important sentences of the `corpus`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
|
||||
Given sentences.
|
||||
corpus : list of list of (int, int)
|
||||
Provided corpus.
|
||||
important_docs : list of list of (int, int)
|
||||
Most important docs of the corpus.
|
||||
word_count : int
|
||||
Number of returned words. If None full most important sentences will be returned.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
|
||||
Most important sentences.
|
||||
|
||||
"""
|
||||
important_sentences = _get_important_sentences(sentences, corpus, important_docs)
|
||||
|
||||
# If no "word_count" option is provided, the number of sentences is
|
||||
# reduced by the provided ratio. Else, the ratio is ignored.
|
||||
return important_sentences \
|
||||
if word_count is None \
|
||||
else _get_sentences_with_word_count(important_sentences, word_count)
|
||||
|
||||
|
||||
def _format_results(extracted_sentences, split):
|
||||
"""Returns `extracted_sentences` in desired format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
extracted_sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit
|
||||
Given sentences.
|
||||
split : bool
|
||||
If True sentences will be returned as list. Otherwise sentences will be merged and returned as string.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list of str
|
||||
If `split` **OR**
|
||||
str
|
||||
Formatted result.
|
||||
|
||||
"""
|
||||
if split:
|
||||
return [sentence for sentence in extracted_sentences]
|
||||
return "\n".join(sentence.text for sentence in extracted_sentences)
|
||||
|
||||
|
||||
def _build_hasheable_corpus(corpus):
|
||||
"""Hashes and get `corpus`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
corpus : list of list of (int, int)
|
||||
Given corpus.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list of list of (int, int)
|
||||
Hashable corpus.
|
||||
|
||||
"""
|
||||
return [tuple(doc) for doc in corpus]
|
||||
|
||||
|
||||
def summarize_corpus(corpus):
|
||||
"""Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_.
|
||||
Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer`
|
||||
|
||||
Note
|
||||
----
|
||||
The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary
|
||||
to make sense.
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
corpus : list of list of (int, int)
|
||||
Given corpus.
|
||||
ratio : float, optional
|
||||
Number between 0 and 1 that determines the proportion of the number of
|
||||
sentences of the original text to be chosen for the summary, optional.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list of str
|
||||
Most important documents of given `corpus` sorted by the document score, highest first.
|
||||
|
||||
"""
|
||||
hashable_corpus = _build_hasheable_corpus(corpus)
|
||||
|
||||
logger.info('Building graph')
|
||||
graph = _build_graph(hashable_corpus)
|
||||
|
||||
logger.info('Filling graph')
|
||||
_set_graph_edge_weights(graph)
|
||||
|
||||
logger.info('Removing unreachable nodes of graph')
|
||||
_remove_unreachable_nodes(graph)
|
||||
|
||||
logger.info('Pagerank graph')
|
||||
pagerank_scores = _pagerank(graph)
|
||||
return pagerank_scores
|
||||
|
||||
# logger.info('Sorting pagerank scores')
|
||||
# hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)
|
||||
#
|
||||
# return [list(doc) for doc in hashable_corpus]
|
||||
|
60
macropodus/summarize/graph_base/textrank_sklearn.py
Normal file
@ -0,0 +1,60 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/8/21 22:01
|
||||
# @author :Mo
|
||||
# @function : textrank using tfidf of sklearn, pagerank of networkx
|
||||
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfTransformer
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
from macropodus.preprocess.tools_ml import tdidf_sim
|
||||
import networkx as nx
|
||||
|
||||
|
||||
class TextrankSklearn:
|
||||
def __init__(self):
|
||||
self.algorithm = 'textrank_sklearn'
|
||||
|
||||
def summarize(self, text, num=320):
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
# tf-idf相似度
|
||||
matrix = tdidf_sim(sentences)
|
||||
matrix_norm = TfidfTransformer().fit_transform(matrix)
|
||||
# 构建相似度矩阵
|
||||
tfidf_sim = nx.from_scipy_sparse_matrix(matrix_norm * matrix_norm.T)
|
||||
# nx.pagerank
|
||||
sens_scores = nx.pagerank(tfidf_sim)
|
||||
# 得分排序
|
||||
sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True)
|
||||
# 保留topk个, 防止越界
|
||||
topk = min(len(sentences), num)
|
||||
# 返回原句子和得分
|
||||
return [(sr[1], sentences[sr[0]]) for sr in sen_rank][0:topk]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
doc = "是上世纪90年代末提出的一种计算网页权重的算法。" \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法," \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。" \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票," \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面" \
|
||||
"和投票目标的等级来决定新的等级。简单的说," \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。" \
|
||||
"PageRank The PageRank Citation Ranking: Bringing Order to the Web,"\
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设," \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重);" \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。" \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信息。"
|
||||
doc = doc.encode('utf-8').decode('utf-8')
|
||||
ts = TextrankSklearn()
|
||||
textrank_tfidf = ts.summarize(doc, 32)
|
||||
for score_sen in textrank_tfidf:
|
||||
print(score_sen)
|
156
macropodus/summarize/graph_base/textrank_word2vec.py
Normal file
@ -0,0 +1,156 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/12/20 20:39
|
||||
# @author :Mo
|
||||
# @function :textrank of word2vec, keyword and sentence
|
||||
|
||||
from macropodus.similarity.similarity_word2vec_char import SimW2vChar
|
||||
from macropodus.data.words_common.stop_words import stop_words
|
||||
from macropodus.preprocess.tools_ml import macropodus_cut
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
import networkx as nx
|
||||
import numpy as np
|
||||
|
||||
|
||||
class TextrankWord2vec(SimW2vChar):
|
||||
def __init__(self, use_cache=True):
|
||||
self.algorithm = 'textrank_word2vec'
|
||||
self.stop_words = stop_words
|
||||
super().__init__(use_cache) # self.w2v_char
|
||||
|
||||
def cut_window(self, sent_words, win_size=2):
|
||||
"""
|
||||
滑动窗口切词
|
||||
:param sent_words: list, like ["我", "是", "大漠帝国"]
|
||||
:param win_size: int, like 3
|
||||
:return: yield
|
||||
"""
|
||||
if win_size < 2:
|
||||
win_size = 2
|
||||
for i in range(1, win_size):
|
||||
if i >= len(sent_words):
|
||||
break
|
||||
sent_terms = sent_words[i:] # 后面的
|
||||
sent_zip = zip(sent_words, sent_terms) # 候选词对
|
||||
for sz in sent_zip:
|
||||
yield sz
|
||||
|
||||
def keyword(self, text, num=6, score_min=0.025, win_size=3, type_sim="total", type_encode="avg", config={"alpha": 0.86, "max_iter":100}):
|
||||
"""
|
||||
关键词抽取, textrank of word2vec cosine
|
||||
:param text: str, doc. like "大漠帝国是历史上存在的国家吗?你知不知道?嗯。"
|
||||
:param num: int, length of sentence like 6
|
||||
:param win_size: int, windows size of combine. like 2
|
||||
        :param type_sim: str, type of similarity. like "total", "cosine"
|
||||
:param config: dict, config of pagerank. like {"alpha": 0.86, "max_iter":100}
|
||||
:return: list, result of keyword. like [(0.020411696169510562, '手机'), (0.016149784106276977, '夏普')]
|
||||
"""
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
self.sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
self.sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
# macropodus_cut 切词
|
||||
self.macropodus_word = [macropodus_cut(sentence) for sentence in self.sentences]
|
||||
# 去除停用词等
|
||||
self.sentences_word = [[w for w in mw if w not in self.stop_words.values()] for mw in self.macropodus_word]
|
||||
# 构建图的顶点
|
||||
word2index = {}
|
||||
index2word = {}
|
||||
word_index = 0
|
||||
for sent_words in self.sentences_word:
|
||||
for word in sent_words:
|
||||
if not word in word2index: # index
|
||||
word2index[word] = word_index
|
||||
index2word[word_index] = word
|
||||
word_index += 1
|
||||
graph_words = np.zeros((word_index, word_index))
|
||||
# 构建图的边, 以两个词语的余弦相似度为基础
|
||||
for sent_words in self.sentences_word:
|
||||
for cw_1, cw_2 in self.cut_window(sent_words, win_size=win_size):
|
||||
if cw_1 in word2index and cw_2 in word2index:
|
||||
idx_1, idx_2 = word2index[cw_1], word2index[cw_2]
|
||||
score_w2v_cosine = self.similarity(cw_1, cw_2, type_sim=type_sim,
|
||||
type_encode=type_encode)
|
||||
graph_words[idx_1][idx_2] = score_w2v_cosine
|
||||
graph_words[idx_2][idx_1] = score_w2v_cosine
|
||||
# 构建相似度矩阵
|
||||
w2v_cosine_sim = nx.from_numpy_matrix(graph_words)
|
||||
# nx.pagerank
|
||||
sens_scores = nx.pagerank(w2v_cosine_sim, **config)
|
||||
# 得分排序
|
||||
sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True)
|
||||
# 保留topk个, 防止越界
|
||||
topk = min(len(sen_rank), num)
|
||||
# 返回原句子和得分
|
||||
return [(sr[1], index2word[sr[0]]) for sr in sen_rank if len(index2word[sr[0]])>1 and score_min<=sr[1]][0:topk]
|
||||
|
||||
def summarize(self, text, num=320, type_sim="cosine", type_encode="avg", config={"alpha": 0.86, "max_iter":100}):
|
||||
"""
|
||||
文本摘要抽取, textrank of word2vec cosine
|
||||
:param text: str, doc. like "大漠帝国是历史上存在的国家吗?你知不知道?嗯。"
|
||||
:param num: int, length of sentence like 6
|
||||
        :param type_sim: str, type of similarity. like "total", "cosine"
|
||||
:param config: dict, config of pagerank. like {"alpha": 0.86, "max_iter":100}
|
||||
:return: list, result of keyword. like [(0.06900223298930287, 'PageRank The PageRank Citation Ranking'), (0.08698940285163381, 'PageRank通过网络浩瀚的超链接关系来确定一个页面的等级')]
|
||||
"""
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
self.sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
self.sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
# 输入文本句子长度
|
||||
len_sen = len(self.sentences)
|
||||
# 构建图的顶点
|
||||
sent2idx = {}
|
||||
idx2sent = {}
|
||||
sent_idx = 0
|
||||
for sent in self.sentences:
|
||||
sent2idx[sent] = sent_idx
|
||||
idx2sent[sent_idx] = sent
|
||||
sent_idx += 1
|
||||
graph_sents = np.zeros((sent_idx, sent_idx))
|
||||
# 构建图的边, 以两个句子的余弦相似度为基础
|
||||
for i in range(len_sen):
|
||||
for j in range(len_sen):
|
||||
score_w2v_cosine = self.similarity(self.sentences[i], self.sentences[j],
|
||||
type_sim=type_sim, type_encode=type_encode)
|
||||
graph_sents[i][j] = score_w2v_cosine
|
||||
graph_sents[j][i] = score_w2v_cosine
|
||||
# 构建相似度矩阵
|
||||
w2v_cosine_sim = nx.from_numpy_matrix(graph_sents)
|
||||
# nx.pagerank
|
||||
sens_scores = nx.pagerank(w2v_cosine_sim, **config)
|
||||
# 得分排序
|
||||
sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True)
|
||||
# 保留topk个, 防止越界
|
||||
topk = min(len(sen_rank), num)
|
||||
# 返回原句子和得分
|
||||
return [(sr[1], self.sentences[sr[0]]) for sr in sen_rank][0:topk]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
text = "是上世纪90年代末提出的一种计算网页权重的算法。" \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法," \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。" \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票," \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面" \
|
||||
"和投票目标的等级来决定新的等级。简单的说," \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。" \
|
||||
"PageRank The PageRank Citation Ranking: Bringing Order to the Web," \
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设," \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重);" \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。" \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信息。"
|
||||
trww = TextrankWord2vec()
|
||||
keyword = trww.keyword(text, num=8)
|
||||
summary = trww.summarize(text, num=32)
|
||||
print(keyword)
|
||||
print(summary)
|
||||
|
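The co-occurrence edges in TextrankWord2vec.keyword come from the cut_window generator above; a standalone re-implementation of the same pairing logic makes the produced word pairs explicit:

```python3
# Same pairing logic as cut_window above: each word is paired with the next win_size-1 words.
def cut_window(sent_words, win_size=2):
    win_size = max(win_size, 2)
    for i in range(1, win_size):
        if i >= len(sent_words):
            break
        for pair in zip(sent_words, sent_words[i:]):
            yield pair

print(list(cut_window(["我", "喜欢", "自然", "语言"], win_size=3)))
# [('我', '喜欢'), ('喜欢', '自然'), ('自然', '语言'), ('我', '自然'), ('喜欢', '语言')]
```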
5
macropodus/summarize/nous_base/__init__.py
Normal file
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/11/25 21:44
# @author :Mo
# @function :
65
macropodus/summarize/nous_base/lead_3.py
Normal file
@ -0,0 +1,65 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/8/24 22:43
|
||||
# @author :Mo
|
||||
# @function :text_summary with lead-3
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
|
||||
|
||||
class Lead3Sum:
|
||||
def __init__(self):
|
||||
self.algorithm = 'lead_3'
|
||||
|
||||
def summarize(self, text, type_l='mix', num=320):
|
||||
"""
|
||||
        lead-3
        :param text: str or list
        :param type_l: str, one of 'begin', 'end' or 'mix'
        :param num: int, maximum number of sentences to return
        :return: list
|
||||
"""
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
# 最小句子数
|
||||
num_min = min(num, len(sentences))
|
||||
if type_l=='begin':
|
||||
summers = sentences[0:num]
|
||||
elif type_l=='end':
|
||||
summers = sentences[-num:]
|
||||
else:
|
||||
summers = [sentences[0]] + [sentences[-1]] + sentences[1:num-1]
|
||||
summers_s = {}
|
||||
for i in range(len(summers)): # 得分计算
|
||||
if len(summers) - i == 1:
|
||||
summers_s[summers[i]] = (num - 0.75) / (num + 1)
|
||||
else:
|
||||
summers_s[summers[i]] = (num - i - 0.5) / (num + 1)
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(summers_s.items(), key=lambda d: d[1], reverse=True)][0:num_min]
|
||||
return score_sen
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
doc = "是上世纪90年代末提出的一种计算网页权重的算法。" \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法," \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。" \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票," \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面" \
|
||||
"和投票目标的等级来决定新的等级。简单的说," \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。" \
|
||||
"PageRank The PageRank Citation Ranking: Bringing Order to the Web,"\
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设," \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重);" \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。" \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信息。"
|
||||
text = doc.encode('utf-8').decode('utf-8')
|
||||
l3 = Lead3Sum()
|
||||
for score_sen in l3.summarize(text, type_l='mix', num=320):
|
||||
print(score_sen)
|
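The scores produced by Lead3Sum.summarize come from a fixed position formula; a worked example with three sentences (same arithmetic as the scoring loop above):

```python3
# (num - i - 0.5) / (num + 1) for each kept sentence, (num - 0.75) / (num + 1) for the last one.
num = 3
sentences = ["第一句。", "第二句。", "第三句。"]
scores = []
for i in range(len(sentences)):
    if len(sentences) - i == 1:
        scores.append((num - 0.75) / (num + 1))
    else:
        scores.append((num - i - 0.5) / (num + 1))
print([round(s, 4) for s in scores])  # [0.625, 0.375, 0.5625]
```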
5
macropodus/summarize/topic_base/__init__.py
Normal file
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/11/29 20:35
# @author :Mo
# @function :
124
macropodus/summarize/topic_base/topic_lda.py
Normal file
@ -0,0 +1,124 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/31 21:33
|
||||
# @author :Mo
|
||||
# @function :topic model of LDA
|
||||
# @paper :Latent Dirichlet Allocation
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit
|
||||
from macropodus.data.words_common.stop_words import stop_words
|
||||
from macropodus.preprocess.tools_ml import macropodus_cut
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
# sklearn
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.decomposition import LatentDirichletAllocation
|
||||
import numpy as np
|
||||
|
||||
|
||||
class LDASum:
|
||||
def __init__(self):
|
||||
self.stop_words = stop_words.values()
|
||||
self.algorithm = 'lda'
|
||||
|
||||
def summarize(self, text, num=8, topic_min=6, judge_topic=None):
|
||||
"""
|
||||
LDA
|
||||
:param text: str
|
||||
:param num: int
|
||||
:param topic_min: int
|
||||
:param judge_topic: boolean
|
||||
:return:
|
||||
"""
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
self.sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
self.sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
len_sentences_cut = len(self.sentences)
|
||||
# 切词
|
||||
sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
|
||||
if word.strip()] for sentence in self.sentences]
|
||||
# 去除停用词等
|
||||
self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
|
||||
self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
|
||||
# # 计算每个句子的tf
|
||||
# vector_c = CountVectorizer(ngram_range=(1, 2), stop_words=self.stop_words)
|
||||
# tf_ngram = vector_c.fit_transform(self.sentences_cut)
|
||||
# 计算每个句子的tfidf
|
||||
tf_ngram = tfidf_fit(self.sentences_cut)
|
||||
# 主题数, 经验判断
|
||||
        topic_num = min(topic_min, int(len(sentences_cut) / 2))  # cap the number of topics at half the sentence count
|
||||
        lda = LatentDirichletAllocation(n_components=topic_num, max_iter=32,
|
||||
learning_method='online',
|
||||
learning_offset=50.,
|
||||
random_state=2019)
|
||||
res_lda_u = lda.fit_transform(tf_ngram.T)
|
||||
res_lda_v = lda.components_
|
||||
|
||||
if judge_topic:
|
||||
### 方案一, 获取最大那个主题的k个句子
|
||||
##################################################################################
|
||||
topic_t_score = np.sum(res_lda_v, axis=-1)
|
||||
# 对每列(一个句子topic_num个主题),得分进行排序,0为最大
|
||||
res_nmf_h_soft = res_lda_v.argsort(axis=0)[-topic_num:][::-1]
|
||||
# 统计为最大每个主题的句子个数
|
||||
exist = (res_nmf_h_soft <= 0) * 1.0
|
||||
factor = np.ones(res_nmf_h_soft.shape[1])
|
||||
topic_t_count = np.dot(exist, factor)
|
||||
# 标准化
|
||||
topic_t_count /= np.sum(topic_t_count, axis=-1)
|
||||
topic_t_score /= np.sum(topic_t_score, axis=-1)
|
||||
# 主题最大个数占比, 与主题总得分占比选择最大的主题
|
||||
topic_t_tc = topic_t_count + topic_t_score
|
||||
topic_t_tc_argmax = np.argmax(topic_t_tc)
|
||||
# 最后得分选择该最大主题的
|
||||
res_nmf_h_soft_argmax = res_lda_v[topic_t_tc_argmax].tolist()
|
||||
res_combine = {}
|
||||
for l in range(len_sentences_cut):
|
||||
res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
|
||||
#####################################################################################
|
||||
else:
|
||||
### 方案二, 获取最大主题概率的句子, 不分主题
|
||||
res_combine = {}
|
||||
for i in range(len_sentences_cut):
|
||||
res_row_i = res_lda_v[:, i]
|
||||
res_row_i_argmax = np.argmax(res_row_i)
|
||||
res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
|
||||
num_min = min(num, int(len_sentences_cut * 0.6))
|
||||
return score_sen[0:num_min]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
lda = LDASum()
|
||||
doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \
|
||||
"与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \
|
||||
"该基金认缴出资总规模为人民币3.01亿元。" \
|
||||
"基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \
|
||||
"各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \
|
||||
"截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \
|
||||
"公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \
|
||||
"方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"
|
||||
|
||||
doc = "PageRank算法简介。" \
|
||||
"是上世纪90年代末提出的一种计算网页权重的算法! " \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法。 " \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
|
||||
"和投票目标的等级来决定新的等级。简单的说, " \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信。 "
|
||||
|
||||
sum = lda.summarize(doc, num=8)
|
||||
for i in sum:
|
||||
print(i)
|
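The judge_topic branch in LDASum picks one dominant topic (by how many sentences it wins plus its total weight) and then ranks sentences by that topic's row; a simplified numpy stand-in of that idea (not the exact counting scheme used above, illustrative matrix only):

```python3
import numpy as np

res_lda_v = np.array([[0.7, 0.2, 0.6],    # topic 0 weight per sentence
                      [0.3, 0.8, 0.4]])   # topic 1 weight per sentence
wins = np.bincount(res_lda_v.argmax(axis=0), minlength=res_lda_v.shape[0])  # sentences won per topic
score = res_lda_v.sum(axis=-1)                                              # total weight per topic
dominant = int(np.argmax(wins / wins.sum() + score / score.sum()))
ranking = np.argsort(-res_lda_v[dominant])   # sentence order under the dominant topic
print(dominant, ranking)                     # 0 [0 2 1]
```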
98
macropodus/summarize/topic_base/topic_lsi.py
Normal file
@ -0,0 +1,98 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/2 21:03
|
||||
# @author :Mo
|
||||
# @function :topic model of LSI
|
||||
# @paper :Text summarization using Latent Semantic Analysis
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_ml import cut_sentence, macropodus_cut
|
||||
from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit
|
||||
from macropodus.data.words_common.stop_words import stop_words
|
||||
# sklearn
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
import numpy as np
|
||||
|
||||
|
||||
class LSISum:
|
||||
def __init__(self):
|
||||
self.stop_words = stop_words.values()
|
||||
self.algorithm = 'lsi'
|
||||
|
||||
def summarize(self, text, num=320, topic_min=5, judge_topic='all'):
|
||||
"""
|
||||
|
||||
:param text:
|
||||
:param num:
|
||||
:return:
|
||||
"""
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
self.sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
self.sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
len_sentences_cut = len(self.sentences)
|
||||
# 切词
|
||||
sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
|
||||
if word.strip()] for sentence in self.sentences]
|
||||
# 去除停用词等
|
||||
self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
|
||||
self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
|
||||
# 计算每个句子的tfidf
|
||||
sen_tfidf = tfidf_fit(self.sentences_cut)
|
||||
# 主题数, 经验判断
|
||||
topic_num = min(topic_min, int(len(sentences_cut)/2))  # topic number: at most topic_min, and at most half the number of sentences
|
||||
svd_tfidf = TruncatedSVD(n_components=topic_num, n_iter=32)
|
||||
res_svd_u = svd_tfidf.fit_transform(sen_tfidf.T)
|
||||
res_svd_v = svd_tfidf.components_
|
||||
|
||||
if judge_topic:
|
||||
### 方案一, 获取最大那个主题的k个句子
|
||||
##################################################################################
|
||||
topic_t_score = np.sum(res_svd_v, axis=-1)
|
||||
# 对每列(一个句子topic_num个主题),得分进行排序,0为最大
|
||||
res_nmf_h_soft = res_svd_v.argsort(axis=0)[-topic_num:][::-1]
|
||||
# 统计为最大每个主题的句子个数
|
||||
exist = (res_nmf_h_soft <= 0) * 1.0
|
||||
factor = np.ones(res_nmf_h_soft.shape[1])
|
||||
topic_t_count = np.dot(exist, factor)
|
||||
# 标准化
|
||||
topic_t_count /= np.sum(topic_t_count, axis=-1)
|
||||
topic_t_score /= np.sum(topic_t_score, axis=-1)
|
||||
# 主题最大个数占比, 与主题总得分占比选择最大的主题
|
||||
topic_t_tc = topic_t_count + topic_t_score
|
||||
topic_t_tc_argmax = np.argmax(topic_t_tc)
|
||||
# 最后得分选择该最大主题的
|
||||
res_nmf_h_soft_argmax = res_svd_v[topic_t_tc_argmax].tolist()
|
||||
res_combine = {}
|
||||
for l in range(len_sentences_cut):
|
||||
res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
|
||||
#####################################################################################
|
||||
else:
|
||||
### 方案二, 获取最大主题概率的句子, 不分主题
|
||||
res_combine = {}
|
||||
for i in range(len_sentences_cut):
|
||||
res_row_i = res_svd_v[:, i]
|
||||
res_row_i_argmax = np.argmax(res_row_i)
|
||||
res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
|
||||
num_min = min(num, int(len_sentences_cut * 0.6))
|
||||
return score_sen[0:num_min]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
lsi = LSISum()
|
||||
doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \
|
||||
"与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \
|
||||
"该基金认缴出资总规模为人民币3.01亿元。" \
|
||||
"基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \
|
||||
"各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \
|
||||
"截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \
|
||||
"公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \
|
||||
"方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"
|
||||
sum = lsi.summarize(doc, num=8)
|
||||
for i in sum:
|
||||
print(i)
|
137  macropodus/summarize/topic_base/topic_nmf.py  Normal file
@ -0,0 +1,137 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/12/2 20:33
|
||||
# @author :Mo
|
||||
# @function :topic model of NMF
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit
|
||||
from macropodus.data.words_common.stop_words import stop_words
|
||||
from macropodus.preprocess.tools_ml import macropodus_cut
|
||||
from macropodus.preprocess.tools_ml import cut_sentence
|
||||
# sklearn
|
||||
from sklearn.decomposition import NMF
|
||||
import numpy as np
|
||||
|
||||
|
||||
class NMFSum:
|
||||
def __init__(self):
|
||||
self.stop_words = stop_words.values()
|
||||
self.algorithm = 'nmf'
|
||||
|
||||
def summarize(self, text, num=320, topic_min=5, judge_topic="all"):
|
||||
"""
|
||||
|
||||
:param text: text or list, input docs
|
||||
:param num: int, number or amount of return
|
||||
:param topic_min: int, topic number
|
||||
:param judge_topic: str, calculate ways of topic
|
||||
:return:
|
||||
"""
|
||||
# 切句
|
||||
if type(text) == str:
|
||||
self.sentences = cut_sentence(text)
|
||||
elif type(text) == list:
|
||||
self.sentences = text
|
||||
else:
|
||||
raise RuntimeError("text type must be list or str")
|
||||
# 切词
|
||||
sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
|
||||
if word.strip()] for sentence in self.sentences]
|
||||
len_sentences_cut = len(sentences_cut)
|
||||
# 去除停用词等
|
||||
self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
|
||||
self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
|
||||
# 计算每个句子的tfidf
|
||||
sen_tfidf = tfidf_fit(self.sentences_cut)
|
||||
# 主题数, 经验判断
|
||||
topic_num = min(topic_min, int(len(sentences_cut) / 2))  # topic number: at most topic_min, and at most half the number of sentences
|
||||
nmf_tfidf = NMF(n_components=topic_num, max_iter=320)
|
||||
res_nmf_w = nmf_tfidf.fit_transform(sen_tfidf.T) # 基矩阵 or 权重矩阵
|
||||
res_nmf_h = nmf_tfidf.components_ # 系数矩阵 or 降维矩阵
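# Shape note (assuming tfidf_fit() returns a sentences-by-terms matrix): after the transpose,
# res_nmf_w is (n_terms, topic_num) and res_nmf_h is (topic_num, n_sentences), so each column
# of res_nmf_h scores one sentence against every topic.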
|
||||
|
||||
if judge_topic:
|
||||
### 方案一, 获取最大那个主题的k个句子
|
||||
##################################################################################
|
||||
topic_t_score = np.sum(res_nmf_h, axis=-1)
|
||||
# 对每列(一个句子topic_num个主题),得分进行排序,0为最大
|
||||
res_nmf_h_soft = res_nmf_h.argsort(axis=0)[-topic_num:][::-1]
|
||||
# 统计为最大每个主题的句子个数
|
||||
exist = (res_nmf_h_soft <= 0) * 1.0
|
||||
factor = np.ones(res_nmf_h_soft.shape[1])
|
||||
topic_t_count = np.dot(exist, factor)
|
||||
# 标准化
|
||||
topic_t_count /= np.sum(topic_t_count, axis=-1)
|
||||
topic_t_score /= np.sum(topic_t_score, axis=-1)
|
||||
# 主题最大个数占比, 与主题总得分占比选择最大的主题
|
||||
topic_t_tc = topic_t_count + topic_t_score
|
||||
topic_t_tc_argmax = np.argmax(topic_t_tc)
|
||||
# 最后得分选择该最大主题的
|
||||
res_nmf_h_soft_argmax = res_nmf_h[topic_t_tc_argmax].tolist()
|
||||
res_combine = {}
|
||||
for l in range(len_sentences_cut):
|
||||
res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
|
||||
#####################################################################################
|
||||
else:
|
||||
### 方案二, 获取最大主题概率的句子, 不分主题
|
||||
res_combine = {}
|
||||
for i in range(len_sentences_cut):
|
||||
res_row_i = res_nmf_h[:, i]
|
||||
res_row_i_argmax = np.argmax(res_row_i)
|
||||
res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
|
||||
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
|
||||
num_min = min(num, int(len_sentences_cut * 0.6))
|
||||
return score_sen[0:num_min]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
nmf = NMFSum()
|
||||
doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \
|
||||
"与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \
|
||||
"该基金认缴出资总规模为人民币3.01亿元。" \
|
||||
"基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \
|
||||
"各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \
|
||||
"截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \
|
||||
"公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \
|
||||
"方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"
|
||||
|
||||
doc = "和投票目标的等级来决定新的等级.简单的说。" \
|
||||
"是上世纪90年代末提出的一种计算网页权重的算法! " \
|
||||
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \
|
||||
"业界急需一种相对比较准确的网页重要性计算方法。" \
|
||||
"是人们能够从海量互联网世界中找出自己需要的信息。" \
|
||||
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \
|
||||
"Google把从A页面到B页面的链接解释为A页面给B页面投票。" \
|
||||
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \
|
||||
"一个高等级的页面可以使其他低等级页面的等级提升。" \
|
||||
"具体说来就是,PageRank有两个基本思想,也可以说是假设。" \
|
||||
"即数量假设:一个网页被越多的其他页面链接,就越重)。" \
|
||||
"质量假设:一个网页越是被高质量的网页链接,就越重要。" \
|
||||
"总的来说就是一句话,从全局角度考虑,获取重要的信。"
|
||||
|
||||
doc = '早年林志颖带kimi上《爸爸去哪儿》的时候,当时遮遮掩掩的林志颖老婆低调探班,总让人觉得格外神秘,大概是特别不喜欢' \
|
||||
'在公众面前曝光自己日常的那种人。可能这么些年过去,心态不断调整过了,至少在微博上,陈若仪越来越放得开,晒自己带' \
|
||||
'娃照顾双子星的点滴,也晒日常自己的护肤心得,时不时安利一些小东西。都快晚上十点半,睡美容觉的最佳时候,结果才带' \
|
||||
'完一天娃的陈若仪还是不忘先保养自己,敷起了面膜。泡完澡,这次用的是一个稍微平价的面膜,脸上、甚至仔细到脖子上都' \
|
||||
'抹上了。陈若仪也是多此一举,特别说自己不是裸体,是裹着浴巾的,谁在意这个呀,目光完全被你那又长又扑闪的睫毛给吸' \
|
||||
'引住了。这也太吓人吧,怎么能够长那么长那么密那么翘。嫉妒地说一句,真的很像种的假睫毛呐。陈若仪的睫毛应该是天生' \
|
||||
'的基础好吧,要不然也不会遗传给小孩,一家子都是睫毛精,几个儿子现在这么小都是长睫毛。只是陈若仪现在这个完美状态,' \
|
||||
'一定是后天再经过悉心的呵护培养。网友已经迫不及待让她教教怎么弄睫毛了,陈若仪也是答应地好好的。各种私人物品主动' \
|
||||
'揭秘,安利一些品牌给大家,虽然一再强调是自己的日常小物,还是很让人怀疑,陈若仪是不是在做微商当网红呐,网友建议' \
|
||||
'她开个店,看这回复,也是很有意愿了。她应该不缺这个钱才对。隔三差五介绍下自己用的小刷子之类,陈若仪乐于向大家传' \
|
||||
'授自己的保养呵护之道。她是很容易就被晒出斑的肤质,去海岛参加婚礼,都要必备这几款超爱用的防晒隔离。日常用的、太' \
|
||||
'阳大时候用的,好几个种类,活得相当精致。你们按照自己的需要了解一下。画眉毛,最爱用的是intergrate的眉笔。也是个' \
|
||||
'念旧的人,除了Dior,陈若仪的另一个眉粉其中一个是她高中就开始用的Kate。一般都是大学才开始化妆修饰自己,感受得到' \
|
||||
'陈若仪从小就很爱美。各种小零小碎的化妆品,已经买过七八次的粉红胡椒抛光美体油,每天洗完澡陈若仪都会喷在肚子、大' \
|
||||
'腿、屁股和膝盖手肘,说是能保持肌肤的平滑紧致程度。每安利一样东西,总有网友要在下面问其他问题咋个办,真是相当信' \
|
||||
'任陈若仪了。每次她也很耐心的解答,"去黑头我用的是SUQQU洁面去角质按摩膏磨砂洁面洗面奶,"一定要先按摩再用。她自己' \
|
||||
'已经回购过好几次,意思是你们再了解一下。了解归了解,买不买随意。毕竟像她另一个爱用的达尔肤面膜,效果好是好,价' \
|
||||
'格据说比sk2都还要贵,不是大多数人日常能够消费得起的,大家就看个热闹就好了,还是多买多试多用才能找到最适合自己的' \
|
||||
'护肤方法。'
|
||||
|
||||
sum = nmf.summarize(doc, num=320)
|
||||
for i in sum:
|
||||
print(i)
|
||||
|
||||
|
20  macropodus/tookit/__init__.py  Normal file
@ -0,0 +1,20 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/11/28 20:49
|
||||
# @author : Mo
|
||||
# @function:
|
||||
|
||||
|
||||
# tookit
|
||||
from macropodus.tookit.chinese2number.chinese2number import Chi2Num, Num2Chi
|
||||
from macropodus.tookit.calculator_sihui.calcultor_sihui import Calculator
|
||||
from macropodus.tookit.trie_tree.trie_tree import TrieTree
|
||||
|
||||
# 常用工具(tookit, 计算器, 中文与阿拉伯数字转化, 前缀树)
|
||||
Calcul = Calculator()
|
||||
Chi2num = Chi2Num()
|
||||
Num2chi = Num2Chi()
|
||||
Trie = TrieTree()
|
||||
calculate = Calcul.calculator_sihui
|
||||
chi2num = Chi2num.compose_decimal
|
||||
num2chi = Num2chi.decimal_chinese
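# Usage sketch (illustrative comments only; the aliases are the ones defined above):
#   from macropodus.tookit import calculate, chi2num, num2chi
#   calculate("23+13*2")            # text calculator, returns the result as a numeric string
#   chi2num("一千零九十九点六六")      # Chinese number -> 1099.66
#   num2chi(1994.1994)              # Arabic number -> its Chinese reading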
|
5  macropodus/tookit/calculator_sihui/__init__.py  Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/3 20:25
|
||||
# @author : Mo
|
||||
# @function:
|
246  macropodus/tookit/calculator_sihui/calcultor_formula.py  Normal file
@ -0,0 +1,246 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/21 23:38
|
||||
# @author :Mo
|
||||
# @function :calculator of text, not filter and redundancy
|
||||
|
||||
|
||||
from macropodus.conf.path_log import get_logger_root
|
||||
import re
|
||||
|
||||
|
||||
logger = get_logger_root()
|
||||
|
||||
|
||||
def change_symbol(formula):
|
||||
"""
|
||||
提取负号
|
||||
eg:-9-2-5-2*3-5/3-40*4/-1.0/5+6*3 ===> -(9+2+5+2*3+5/3+40*4/1.0/5-6*3)
|
||||
:param formula:
|
||||
:return:
|
||||
"""
|
||||
def primary_change(for_str):  # swap "+" and "-" inside the formula
|
||||
temp = for_str.split("+")
|
||||
new_formula = []
|
||||
for value in temp:
|
||||
value = value.replace("-", "+")
|
||||
new_formula.append(value)
|
||||
return "-".join(new_formula)
|
||||
|
||||
if formula.startswith("-"):
|
||||
formula = formula.replace("-", "", 1)
|
||||
formula = primary_change(formula)
|
||||
formula = formula.join(["-(", ")"])
|
||||
elif formula.startswith("+"):
|
||||
formula = primary_change(formula)
|
||||
formula = formula.join(["-(", ")"])
|
||||
else:
|
||||
formula = primary_change(formula)
|
||||
formula = formula.join(["-(-", ")"])
|
||||
return formula
|
||||
|
||||
|
||||
def remove_repeat(formula):
|
||||
"""
|
||||
去掉连续的重复的运算符
|
||||
:param formula: str, like: "1++2"
|
||||
:return: str, like:"1+2"
|
||||
"""
|
||||
temp = formula.replace("++", "+")
|
||||
temp = temp.replace("+-", "-")
|
||||
temp = temp.replace("-+", "-")
|
||||
temp = temp.replace("--", "+")
|
||||
temp = temp.replace("*+", "*")
|
||||
temp = temp.replace("+*", "*")
|
||||
temp = temp.replace("/+", "/")
|
||||
temp = temp.replace("+/", "/")
|
||||
return temp
|
||||
|
||||
|
||||
def has_special_operator(formula, special_operator):
|
||||
"""
|
||||
判断是否有 *+ +- /- 之类的运算符
|
||||
:param formula:
|
||||
:param special_operator:
|
||||
:return:
|
||||
"""
|
||||
for operator in special_operator:
|
||||
if formula.find(operator) != -1:
|
||||
return operator
|
||||
return ""
|
||||
|
||||
|
||||
def handle_special_operator(formula, operator):
|
||||
"""
|
||||
如果有 "*-", "-*", "/-", "-/" 这些运算符,
|
||||
提取负号,去掉重复的运算符
|
||||
:param formula:
|
||||
:param operator:
|
||||
:return:
|
||||
"""
|
||||
temp = ""
|
||||
regex = r"\d*[.]?\d+"
|
||||
opera = operator.replace("*", "[*]")
|
||||
ret = re.compile(opera.join([regex, regex]))
|
||||
while ret.search(formula):
|
||||
search_res = ret.search(formula).group()
|
||||
if operator.find("*") != -1:
|
||||
temp = search_res.replace(operator, "*")
|
||||
elif operator.find("/") != -1:
|
||||
temp = search_res.replace(operator, "/")
|
||||
temp = "-".join(["", temp])
|
||||
formula = formula.replace(search_res, temp, 1)
|
||||
return formula
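# Example of the two helpers above: handle_special_operator("4+3*-2", "*-") rewrites the matched
# span to give "4+-3*2", and remove_repeat() then folds "+-" into "-", yielding "4-3*2".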
|
||||
|
||||
|
||||
def has_parentheses(formula):
|
||||
"""
|
||||
判断是否还有括号
|
||||
:param formula: str
|
||||
:return: boolean
|
||||
"""
|
||||
if re.search("[()]", formula):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def judge_illegal(formula):
|
||||
"""
|
||||
判断括号是否匹配完全,运算符是否合法
|
||||
没有考虑 ** // 的计算
|
||||
:param formula: str
|
||||
:return: str
|
||||
"""
|
||||
if len(re.findall("[(]", formula)) != len(re.findall("[)]", formula)):
|
||||
return True
|
||||
if formula.startswith("*") or formula.startswith("/"):
|
||||
return True
|
||||
if has_special_operator(formula, ["*/", "/*", "**", "//"]):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def calculator_formula(formula):
|
||||
"""
|
||||
计算算式,这里计算的是不带括号的算式
|
||||
计算次序是 / * - +
|
||||
计算过程中出现括号则停止计算,返回当前的算式
|
||||
:param formula:
|
||||
:return:
|
||||
"""
|
||||
def primary_operator(for_str, operation):
|
||||
try:
|
||||
primary_result = 0
|
||||
regex = r"\d*[.]?\d*"
|
||||
ret = re.compile(operation.join(["[", "]"]).join([regex, regex]))
|
||||
while ret.search(for_str):
|
||||
ret_opera = has_special_operator(for_str, ["*-", "-*", "/-", "-/"])
|
||||
while ret_opera:
|
||||
for_str = handle_special_operator(for_str, ret_opera)
|
||||
ret_opera = has_special_operator(for_str, ["*-", "-*", "/-", "-/"])
|
||||
while has_special_operator(for_str, ["+-", "-+", "++", "--", "+*", "*+", "+/", "/+"]):
|
||||
for_str = remove_repeat(for_str)
|
||||
# print("primary_operator:", for_str)
|
||||
if has_parentheses(for_str):
|
||||
return for_str
|
||||
if for_str.startswith("-"):
|
||||
temp = re.findall(r"^-\d*[.]?\d*$", for_str)
|
||||
if temp:
|
||||
return temp[0]
|
||||
return change_symbol(for_str)
|
||||
if for_str.startswith("+"):
|
||||
for_str = for_str.replace("+", "", 1)
|
||||
if not ret.search(for_str):
|
||||
continue
|
||||
search_res = ret.search(for_str).group()
|
||||
operand_list = search_res.split(operation)
|
||||
if operation == "/":
|
||||
primary_result = float(operand_list[0]) / float(operand_list[1])
|
||||
elif operation == "*":
|
||||
primary_result = float(operand_list[0]) * float(operand_list[1])
|
||||
elif operation == "-":
|
||||
primary_result = float(operand_list[0]) - float(operand_list[1])
|
||||
elif operation == "+":
|
||||
primary_result = float(operand_list[0]) + float(operand_list[1])
|
||||
for_str = for_str.replace(search_res, '%f' % (primary_result), 1)
|
||||
return for_str
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return None
|
||||
try:
|
||||
formula = primary_operator(formula, "/")
|
||||
formula = primary_operator(formula, "*")
|
||||
formula = primary_operator(formula, "-")
|
||||
formula = primary_operator(formula, "+")
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return None
|
||||
return formula
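# Worked example of the evaluation order (/ then * then - then +):
#   calculator_formula("6/3*4-1+2") -> "2.000000*4-1+2" -> "8.000000-1+2" -> "7.000000+2" -> "9.000000"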
|
||||
|
||||
|
||||
def remove_parentheses(formula):
|
||||
"""
|
||||
去掉算式的括号,计算括号里算式
|
||||
:param formula:
|
||||
:return:
|
||||
"""
|
||||
parentheses = re.compile(r"\([^()]+\)")
|
||||
while parentheses.search(formula):
|
||||
search_res = parentheses.search(formula).group()
|
||||
for_str = re.sub("[()]", "", search_res)
|
||||
if judge_illegal(for_str):
|
||||
return ""
|
||||
for_str = calculator_formula(for_str)
|
||||
formula = formula.replace(search_res, for_str, 1)
|
||||
"""
|
||||
会有去掉所有括号算式还没算完的情况
|
||||
eg:1-2*65
|
||||
需要再计算一遍算式
|
||||
"""
|
||||
formula = calculator_formula(formula)
|
||||
return formula
|
||||
|
||||
|
||||
def result_formula(formula):
|
||||
"""
|
||||
简单计算器, 纯粹四则运算
|
||||
去完括号后额外计算的那一次若再次出现括号,
|
||||
则重复去括号运算,直至再没有括号
|
||||
:param formula: str
|
||||
:return: str
|
||||
"""
|
||||
|
||||
def remove_space(formula):
|
||||
"""
|
||||
去掉算式的空格
|
||||
:param formula: str
|
||||
:return: str
|
||||
"""
|
||||
return formula.replace(" ", "")
|
||||
|
||||
def first_calculator(for_str):
|
||||
"""
|
||||
先计算括号里边的
|
||||
:param for_str:
|
||||
:return:
|
||||
"""
|
||||
if judge_illegal(for_str):
|
||||
return None
|
||||
return remove_parentheses(for_str)
|
||||
|
||||
formula = remove_space(formula)
|
||||
|
||||
formula = first_calculator(formula)
|
||||
if not formula:
|
||||
return None
|
||||
while has_parentheses(formula):
|
||||
formula = first_calculator(formula)
|
||||
# print("calculator_result:", formula)
|
||||
if not formula:
|
||||
return None
|
||||
return formula
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
cal = result_formula("1+1+2+3*(35+1-5*7-10/5)/2*2")
|
||||
print(cal)
|
246  macropodus/tookit/calculator_sihui/calcultor_function.py  Normal file
@ -0,0 +1,246 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/21 23:36
|
||||
# @author :Mo
|
||||
# @function :basic scientific computing helpers (roots, powers, logarithms, fractions, factorial)
|
||||
|
||||
|
||||
from macropodus.tookit.calculator_sihui.calcultor_number import extract_number
|
||||
from macropodus.conf.path_log import get_logger_root
|
||||
import math
|
||||
import re
|
||||
|
||||
|
||||
logger = get_logger_root()
|
||||
|
||||
|
||||
|
||||
def rackets_replace(rackets_char, myformula):
|
||||
"""
|
||||
将2(3换成2*(3, 3)4换成3)*4
|
||||
:param rackets_char:
|
||||
:param myformula:
|
||||
:return:
|
||||
"""
|
||||
if rackets_char in myformula: # "("在算式里边
|
||||
if rackets_char =="(":
|
||||
rackets_re = r'\('
|
||||
else:
|
||||
rackets_re = r'\)'
|
||||
pos_rackets = re.finditer(rackets_re, myformula)
|
||||
count = 0
|
||||
for pos in pos_rackets:
|
||||
pos_single = pos.start() + count
|
||||
if pos_single != 0 and rackets_char =="(":
|
||||
if myformula[pos_single-1] in '零一二两三四五六七八九0123456789百十千万亿':
|
||||
myformula = myformula[:pos_single] + "*" + myformula[pos_single:]
|
||||
count += 1
|
||||
if pos_single != len(myformula)-1 and rackets_char ==")":
|
||||
if myformula[pos_single+1] in '零一二两三四五六七八九0123456789百十千万亿':
|
||||
myformula = myformula[:pos_single+1] + "*" + myformula[pos_single+1:]
|
||||
count += 1
|
||||
return myformula
|
||||
else:
|
||||
return myformula
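# Example: rackets_replace("(", "2(3+4)") -> "2*(3+4)"; rackets_replace(")", "(3+4)5") -> "(3+4)*5".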
|
||||
|
||||
|
||||
|
||||
def reagan(words, wordsminus):
|
||||
"""
|
||||
求平方根,立方根,n次方根
|
||||
:param words: str, 原句
|
||||
:param wordsminus:str , 处理后的句子
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
if '根号' in words:
|
||||
reagan = wordsminus.replace("开", "").replace("根号", "").replace("的", "")
|
||||
radicalaa = float(extract_number(reagan)[0])
|
||||
if radicalaa < 0.0:
|
||||
return 'illegal math'
|
||||
radicalbb = math.sqrt(radicalaa)
|
||||
results = str(radicalbb)
|
||||
elif "平方根" in words:
|
||||
reagan = wordsminus.replace("开", "").replace("平方根", "").replace("平方", "").replace("的", "")
|
||||
reagan = extract_number(reagan)[0]
|
||||
squarerootaa = float(reagan)
|
||||
if squarerootaa < 0.0:
|
||||
return 'illegal math'
|
||||
squarerootbb = math.sqrt(squarerootaa)
|
||||
results = str(squarerootbb)
|
||||
elif "立方根" in words:
|
||||
reagan = wordsminus.replace("开", "").replace("立方根", "").replace("立方", "").replace("的", "")
|
||||
reagan = extract_number(reagan)[0]
|
||||
squarerootaa = float(reagan)
|
||||
squarerootbb = math.pow(squarerootaa, 1.0 / 3)
|
||||
results = str(squarerootbb)
|
||||
elif "次方根" in words:
|
||||
reagan = wordsminus.replace("开", "").replace("次方根", "").replace("次方", "")
|
||||
squareroot = reagan.split("的")
|
||||
squarerootaa = float(extract_number(squareroot[0])[0])
|
||||
squarerootbb = float(extract_number(squareroot[1])[0])
|
||||
if squarerootaa % 2 == 0 and squarerootbb < 0.0:
|
||||
return 'illegal math'
|
||||
squarerootcc = math.pow(squarerootaa, 1.0 / squarerootbb)
|
||||
results = str(squarerootcc)
|
||||
else:
|
||||
results = words
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return words
|
||||
|
||||
|
||||
def power(words, wordsminus):
|
||||
"""
|
||||
求指数,求平方
|
||||
:param words:
|
||||
:param wordsminus:
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
if "平方根" not in words and "平方" in words:
|
||||
reagan = wordsminus.replace("平方", "").replace("开", "").replace("的", "")
|
||||
reagan = extract_number(reagan)[0]
|
||||
square = float(reagan)
|
||||
radicalbb = math.pow(square, 2)
|
||||
results = str(radicalbb)
|
||||
elif "立方根" not in words and "立方" in words:
|
||||
reagan = wordsminus.replace("立方", "").replace("开", "").replace("的", "")
|
||||
reagan = extract_number(reagan)[0]
|
||||
square = float(reagan)
|
||||
radicalbb = math.pow(square, 3)
|
||||
results = str(radicalbb)
|
||||
elif (("次方" in words or "次幂" in words) and "次方根" not in words and "次幂根" not in words):
|
||||
reagan = wordsminus.replace("次方", "").replace("开", "").replace("次幂", "")
|
||||
squareroot = reagan.split("的")
|
||||
squarerootaa = float(extract_number(squareroot[0])[0])
|
||||
squarerootbb = float(extract_number(squareroot[1])[0])
|
||||
squarerootcc = math.pow(squarerootaa, squarerootbb)
|
||||
results = str(squarerootcc)
|
||||
else:
|
||||
results = words
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return words
|
||||
|
||||
|
||||
def logarithm(words, wordsminus):
|
||||
"""
|
||||
求对数
|
||||
:param words:
|
||||
:param wordsminus:
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
if "LG" in words or "LOG" in words:
|
||||
Lg = wordsminus.replace("LOG", "").replace("LG", "").replace(" ", "").replace("的", "")
|
||||
Lg = float(extract_number(Lg)[0])
|
||||
if Lg <= 0.0:
|
||||
return 'illegal math'
|
||||
lgbb = math.log(Lg)
|
||||
results = str(lgbb)
|
||||
elif "对数" in words:
|
||||
Logg = wordsminus.replace("以", "").replace("对数", "").replace("的对数", "").replace(" ", "").replace("的", "")
|
||||
root = Logg.split("为底")
|
||||
rootaa = float(extract_number(root[0])[0])
|
||||
rootbb = float(extract_number(root[1])[0])
|
||||
if rootaa <= 0.0 or rootbb <= 0.0:
|
||||
return 'illegal math'
|
||||
rootcc = math.log(rootbb) / math.log(rootaa)
|
||||
results = str(rootcc)
|
||||
else:
|
||||
results = words
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return words
|
||||
|
||||
|
||||
def fraction(words, wordsminus):
|
||||
"""
|
||||
求分数
|
||||
:param words:
|
||||
:param wordsminus:
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
if "fenzhi" in words:
|
||||
fenzhi = wordsminus.replace("fenzhi", "/").replace(" ", "").replace("的", "")
|
||||
root = fenzhi.split("/")
|
||||
rootaa = float(extract_number(root[0])[0])
|
||||
rootbb = float(extract_number(root[1])[0])
|
||||
rootcc = rootbb / rootaa
|
||||
results = str(rootcc)
|
||||
else:
|
||||
results = words
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return words
|
||||
|
||||
|
||||
def fractiontwo(words, wordsminus):
|
||||
"""
|
||||
取分数
|
||||
:param words:
|
||||
:param wordsminus:
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
if "fenzhi" in words:
|
||||
fenzhi = wordsminus.replace("fenzhi", "/").replace(" ", "").replace("的", "")
|
||||
root = fenzhi.split("/")
|
||||
rootaa = float(extract_number(root[0])[0])
|
||||
rootbb = float(extract_number(root[1])[0])
|
||||
results = str(rootaa/rootbb)
|
||||
else:
|
||||
results = words
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return words
|
||||
|
||||
|
||||
def factorial(words, wordsminus):
|
||||
"""
|
||||
求阶乘
|
||||
:param words:
|
||||
:param wordsminus:
|
||||
:return:
|
||||
"""
|
||||
results = words
|
||||
try:
|
||||
if "jiecheng的" in words:
|
||||
factory = wordsminus.replace("jiecheng的", "").replace("的", "").replace(" ", "")
|
||||
fact = float(extract_number(factory)[0])
|
||||
if fact <= 10000:
|
||||
results = str(math.factorial(fact))
|
||||
else:
|
||||
results = words
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return words
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
res = reagan("根号4", "根号4")
|
||||
print(res)
|
||||
res = reagan("27的3次方根是多少", "27的3次方根")
|
||||
print(res)
|
||||
res = power("9的平方", "9的平方")
|
||||
print(res)
|
||||
res = power("27的立方是几", "9的立方")
|
||||
print(res)
|
||||
res = power("3的3次方是几", "3的3次方实")
|
||||
print(res)
|
||||
res = logarithm("LG8", "LG8")
|
||||
print(res)
|
||||
res = logarithm("以2为底64的对数", "以2为底64的对数")
|
||||
print(res)
|
||||
res = fraction("1fenzhi6是多少", "1fenzhi6")
|
||||
print(res)
|
||||
res = factorial("10jiecheng的", "10jiecheng的")
|
||||
print(res)
|
210  macropodus/tookit/calculator_sihui/calcultor_number.py  Normal file
@ -0,0 +1,210 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/22 20:09
|
||||
# @author :Mo
|
||||
# @function :extract number from sentence of chinese or mix。提取数字,中文,或者混合中文-阿拉伯数字
|
||||
|
||||
|
||||
import regex as re
|
||||
# import re
|
||||
|
||||
|
||||
# * 字符串预处理模块,为分析器TimeNormalizer提供相应的字符串预处理服务
|
||||
class StringPreHandler:
|
||||
# @Author : zhm
|
||||
# @codes : code from github: https://github.com/zhanzecheng/Time_NLP
|
||||
# @function :StringPreHandler.py
|
||||
@classmethod
|
||||
def delKeyword(cls, target, rules):
|
||||
"""
|
||||
该方法删除一字符串中所有匹配某一规则字串
|
||||
可用于清理一个字符串中的空白符和语气助词
|
||||
:param target: 待处理字符串
|
||||
:param rules: 删除规则
|
||||
:return: 清理工作完成后的字符串
|
||||
"""
|
||||
pattern = re.compile(rules)
|
||||
res = pattern.sub('', target)
|
||||
# print res
|
||||
return res
|
||||
|
||||
|
||||
@classmethod
|
||||
def numberTranslator(cls, target):
|
||||
"""
|
||||
该方法可以将字符串中所有的用汉字表示的数字转化为用阿拉伯数字表示的数字
|
||||
如"这里有一千两百个人,六百零五个来自中国"可以转化为
|
||||
"这里有1200个人,605个来自中国"
|
||||
此外添加支持了部分不规则表达方法
|
||||
如两万零六百五可转化为20650
|
||||
两百一十四和两百十四都可以转化为214
|
||||
一六零加一五八可以转化为160+158
|
||||
该方法目前支持的正确转化范围是0-99999999
|
||||
该功能模块具有良好的复用性
|
||||
:param target: 待转化的字符串
|
||||
:return: 转化完毕后的字符串
|
||||
"""
|
||||
pattern = re.compile(u"[一二两三四五六七八九123456789]万[一二两三四五六七八九123456789](?!(千|百|十))")
|
||||
match = pattern.finditer(target)
|
||||
for m in match:
|
||||
group = m.group()
|
||||
s = group.split(u"万")
|
||||
s = list(filter(None, s))
|
||||
num = 0
|
||||
if len(s) == 2:
|
||||
num += cls.wordToNumber(s[0]) * 10000 + cls.wordToNumber(s[1]) * 1000
|
||||
target = pattern.sub(str(num), target, 1)
|
||||
|
||||
pattern = re.compile(u"[一二两三四五六七八九123456789]千[一二两三四五六七八九123456789](?!(百|十))")
|
||||
match = pattern.finditer(target)
|
||||
for m in match:
|
||||
group = m.group()
|
||||
s = group.split(u"千")
|
||||
s = list(filter(None, s))
|
||||
num = 0
|
||||
if len(s) == 2:
|
||||
num += cls.wordToNumber(s[0]) * 1000 + cls.wordToNumber(s[1]) * 100
|
||||
target = pattern.sub(str(num), target, 1)
|
||||
|
||||
pattern = re.compile(u"[一二两三四五六七八九123456789]百[一二两三四五六七八九123456789](?!十)")
|
||||
match = pattern.finditer(target)
|
||||
for m in match:
|
||||
group = m.group()
|
||||
s = group.split(u"百")
|
||||
s = list(filter(None, s))
|
||||
num = 0
|
||||
if len(s) == 2:
|
||||
num += cls.wordToNumber(s[0]) * 100 + cls.wordToNumber(s[1]) * 10
|
||||
target = pattern.sub(str(num), target, 1)
|
||||
|
||||
pattern = re.compile(u"[零一二两三四五六七八九]")
|
||||
match = pattern.finditer(target)
|
||||
for m in match:
|
||||
target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1)
|
||||
|
||||
pattern = re.compile(u"(?<=(周|星期))[末天日]")
|
||||
match = pattern.finditer(target)
|
||||
for m in match:
|
||||
target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1)
|
||||
|
||||
pattern = re.compile(u"(?<!(周|星期))0?[0-9]?十[0-9]?")
|
||||
match = pattern.finditer(target)
|
||||
for m in match:
|
||||
group = m.group()
|
||||
s = group.split(u"十")
|
||||
num = 0
|
||||
ten = cls.strToInt(s[0])
|
||||
if ten == 0:
|
||||
ten = 1
|
||||
unit = cls.strToInt(s[1])
|
||||
num = ten * 10 + unit
|
||||
target = pattern.sub(str(num), target, 1)
|
||||
|
||||
pattern = re.compile(u"0?[1-9]百[0-9]?[0-9]?")
|
||||
match = pattern.finditer(target)
|
||||
for m in match:
|
||||
group = m.group()
|
||||
s = group.split(u"百")
|
||||
s = list(filter(None, s))
|
||||
num = 0
|
||||
if len(s) == 1:
|
||||
hundred = int(s[0])
|
||||
num += hundred * 100
|
||||
elif len(s) == 2:
|
||||
hundred = int(s[0])
|
||||
num += hundred * 100
|
||||
num += int(s[1])
|
||||
target = pattern.sub(str(num), target, 1)
|
||||
|
||||
pattern = re.compile(u"0?[1-9]千[0-9]?[0-9]?[0-9]?")
|
||||
match = pattern.finditer(target)
|
||||
for m in match:
|
||||
group = m.group()
|
||||
s = group.split(u"千")
|
||||
s = list(filter(None, s))
|
||||
num = 0
|
||||
if len(s) == 1:
|
||||
thousand = int(s[0])
|
||||
num += thousand * 1000
|
||||
elif len(s) == 2:
|
||||
thousand = int(s[0])
|
||||
num += thousand * 1000
|
||||
num += int(s[1])
|
||||
target = pattern.sub(str(num), target, 1)
|
||||
|
||||
pattern = re.compile(u"[0-9]+万[0-9]?[0-9]?[0-9]?[0-9]?")
|
||||
match = pattern.finditer(target)
|
||||
for m in match:
|
||||
group = m.group()
|
||||
s = group.split(u"万")
|
||||
s = list(filter(None, s))
|
||||
num = 0
|
||||
if len(s) == 1:
|
||||
tenthousand = int(s[0])
|
||||
num += tenthousand * 10000
|
||||
elif len(s) == 2:
|
||||
tenthousand = int(s[0])
|
||||
num += tenthousand * 10000
|
||||
num += int(s[1])
|
||||
target = pattern.sub(str(num), target, 1)
|
||||
|
||||
return target
|
||||
|
||||
@classmethod
|
||||
def wordToNumber(cls, s):
|
||||
"""
|
||||
方法numberTranslator的辅助方法,可将[零-九]正确翻译为[0-9]
|
||||
:param s: 大写数字
|
||||
:return: 对应的整形数,如果不是数字返回-1
|
||||
"""
|
||||
if (s == u'零') or (s == '0'):
|
||||
return 0
|
||||
elif (s == u'一') or (s == '1'):
|
||||
return 1
|
||||
elif (s == u'二') or (s == u'两') or (s == '2'):
|
||||
return 2
|
||||
elif (s == u'三') or (s == '3'):
|
||||
return 3
|
||||
elif (s == u'四') or (s == '4'):
|
||||
return 4
|
||||
elif (s == u'五') or (s == '5'):
|
||||
return 5
|
||||
elif (s == u'六') or (s == '6'):
|
||||
return 6
|
||||
elif (s == u'七') or (s == u'天') or (s == u'日') or (s == u'末') or (s == '7'):
|
||||
return 7
|
||||
elif (s == u'八') or (s == '8'):
|
||||
return 8
|
||||
elif (s == u'九') or (s == '9'):
|
||||
return 9
|
||||
else:
|
||||
return -1
|
||||
|
||||
@classmethod
|
||||
def strToInt(cls, s):
|
||||
try:
|
||||
res = int(s)
|
||||
except:
|
||||
res = 0
|
||||
return res
|
||||
|
||||
|
||||
sph = StringPreHandler()
|
||||
|
||||
|
||||
def extract_number(sentence):
|
||||
"""
|
||||
提取数字,纯数字
|
||||
:param sentence: str
|
||||
:return: list<str>
|
||||
"""
|
||||
res = sph.numberTranslator(target=sentence)
|
||||
find_list = []
|
||||
for i in re.finditer(r'(\d+(\.\d+)?)', res):
|
||||
find_list.append(i.group())
|
||||
return find_list
|
||||
|
||||
if __name__ == '__main__':
|
||||
sen = "1000.一加1等于几"
|
||||
res = extract_number(sen)
|
||||
print(res)
|
224  macropodus/tookit/calculator_sihui/calcultor_sihui.py  Normal file
@ -0,0 +1,224 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/21 20:22
|
||||
# @author :Mo
|
||||
# @function :an AI text calculator of xiaomo
|
||||
|
||||
|
||||
from macropodus.tookit.calculator_sihui.calcultor_function import rackets_replace, reagan, power, logarithm, fraction, factorial, fractiontwo
|
||||
from macropodus.tookit.calculator_sihui.calcultor_number import extract_number, sph
|
||||
from macropodus.tookit.calculator_sihui.calcultor_formula import result_formula
|
||||
from macropodus.conf.path_log import get_logger_root
|
||||
import re
|
||||
|
||||
|
||||
logger = get_logger_root()
|
||||
|
||||
|
||||
def StringToCalculateZero(words=''):
|
||||
"""
|
||||
混合运算去除非计算式等无用词
|
||||
:param words: str
|
||||
:return: str
|
||||
"""
|
||||
wordsspot = words.replace("点", ".")
|
||||
wordsmark = wordsspot.replace("分之", "fenzhi")
|
||||
wordsin = wordsmark.replace("正切", "zheng切").replace("正弦", "zheng弦").replace("正割", "zheng割").replace("正矢", "zheng矢")
|
||||
wordsadd = wordsin.replace("加上", "+").replace("加到", "+").replace("加", "+").replace("+", "+").replace("正", "+")
|
||||
wordsminus = wordsadd.replace("减去", "-").replace("减", "-").replace("-", "-").replace("负", "-")
|
||||
wordsmult = wordsminus.replace("阶乘", "jiecheng的").replace("乘上", "*").replace("乘以", "*").replace("乘于","*").replace("乘", "*").replace("×", "*")
|
||||
wordsdivis01 = wordsmult.replace("除去", "/").replace("除以", "/").replace("除于", "/").replace("除","/").replace("÷", "/")
|
||||
wordsdivis02 = wordsdivis01.replace("从", "").replace("再", "").replace("在", "").replace("然后", "").replace("直", "").replace("到", "")
|
||||
wordbrackets = wordsdivis02.replace("(", "(").replace(")", ")").replace("=", "").replace("=", "")
|
||||
formula = wordbrackets.replace("左括号", "(").replace("右括号", ")").replace("的和", "").replace("的差", "").replace("的商", "").replace("的积", "")
|
||||
myformula_1 = formula.replace("*-", "*(-1)*").replace("*+", "*").replace("/-", "/(-1)/")  # str.replace is literal, no regex escaping needed
myformula_2 = myformula_1.replace(" ", "").replace("+-", "-").replace("++", "+").replace("-+", "-").replace("--", "+")
|
||||
myformula_2 = rackets_replace("(", myformula_2)
|
||||
myformula_2 = rackets_replace(")", myformula_2)
|
||||
|
||||
return myformula_2
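# Example: StringToCalculateZero("23加13乘以2") -> "23+13*2";
#          StringToCalculateZero("二分之一加1") -> "二fenzhi一+1" (the "fenzhi" marker is consumed later by fraction()).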
|
||||
|
||||
|
||||
def StringToCalculateOne(words):
|
||||
"""
|
||||
简单句总调用
|
||||
求乘方,阶乘,指数,根式,三角函数,对数,最大最小公约数公倍数
|
||||
:param words: str
|
||||
:return: str
|
||||
"""
|
||||
try:
|
||||
res_reagan = reagan(words, words) # 报错或不执行返回原来的数据
|
||||
res_power = power(words, words)
|
||||
# aa22 = triangle(complex[i], complex[i])
|
||||
res_logarithm = logarithm(words, words)
|
||||
rees_factorial = factorial(words, words)
|
||||
res_fraction = fraction(words, words)
|
||||
if (res_reagan != words):
|
||||
goal = res_reagan
|
||||
elif (res_power != words):
|
||||
goal = res_power
|
||||
# elif (aa22 != complex[i]):
|
||||
# goal = aa22
|
||||
elif (res_logarithm != words):
|
||||
goal = res_logarithm
|
||||
elif (rees_factorial != words):
|
||||
goal = rees_factorial
|
||||
elif (res_fraction != words):
|
||||
goal = res_fraction
|
||||
else:
|
||||
oldwords = words.replace("的", "")
|
||||
oldwords = extract_number(oldwords)[0]
|
||||
goal = oldwords
|
||||
return goal
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return words
|
||||
|
||||
|
||||
def StringToCalculateTwo(sentence=''):
|
||||
"""
|
||||
复杂算式, 总调用, 分步计算,先计算三角函数,指数,对数
|
||||
1.取出操作符与数据(注意--,++,-,+开头这种)
|
||||
2.计算中间的,比如说根号12,2的7次方这种
|
||||
:param sentence:
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
if sentence[0] == '+' or sentence[0] == '-':
|
||||
sentence = '0' + sentence
|
||||
minus = 0
|
||||
operators = []
|
||||
complex = re.split("[+*/-]", sentence)
|
||||
for s in sentence:
|
||||
if (s == '+' or s == '-' or s == '*' or s == '/') and minus != 0 and minus != 2:
|
||||
operators.append("" + s)
|
||||
minus = minus + 1
|
||||
else:
|
||||
minus = 1
|
||||
# complex.append(float(formula[prePos:].strip()))
|
||||
formula = ""
|
||||
for i in range(len(complex)):
|
||||
if "" == complex[i]:
|
||||
complex[i] = " "
|
||||
formula = formula + complex[i] + operators[i]
|
||||
continue
|
||||
res_reagan = reagan(complex[i], complex[i]) #报错或不执行返回原来的数据
|
||||
res_power = power(complex[i], complex[i])
|
||||
# aa22 = triangle(complex[i], complex[i])
|
||||
res_logarithm = logarithm(complex[i], complex[i])
|
||||
res_factorial = factorial(complex[i], complex[i])
|
||||
res_fraction = fraction(complex[i], complex[i])
|
||||
|
||||
if (res_reagan != complex[i]):
|
||||
goal = res_reagan
|
||||
elif (res_power != complex[i]):
|
||||
goal = res_power
|
||||
# elif (aa22 != complex[i]):
|
||||
# goal = aa22
|
||||
elif (res_logarithm != complex[i]):
|
||||
goal = res_logarithm
|
||||
elif (res_factorial != complex[i]):
|
||||
goal = res_factorial
|
||||
elif (res_fraction != complex[i]):
|
||||
goal = res_fraction
|
||||
elif "(" in complex[i] or ")" in complex[i]:
|
||||
goal = sph.numberTranslator(target=complex[i].replace("的", ""))
|
||||
else:
|
||||
oldwords = complex[i].replace("的", "")
|
||||
oldwords = extract_number(oldwords)[0]
|
||||
goal = oldwords
|
||||
if goal == 'illegal math': #非法抛出
|
||||
return 'illegal math'
|
||||
if (i < len(complex) - 1):
|
||||
rest = goal + operators[i]
|
||||
else:
|
||||
rest = goal
|
||||
formula = formula + rest
|
||||
myformula = formula.replace("*-", "*(-1)*").replace("*+", "*").replace("/-", "/(-1)/")
|
||||
formulalast = myformula.replace(" ", "").replace("+-", "-").replace("++", "+").replace("-+", "-").replace("--","+")
|
||||
except Exception as e:
|
||||
logger.info(str(e))
|
||||
return sentence
|
||||
|
||||
return formulalast
|
||||
|
||||
|
||||
class Calculator:
|
||||
def __init__(self):
|
||||
self.tookit = "calculator_sihui"
|
||||
|
||||
def calculator_sihui(self, sentence = ''):
|
||||
"""
|
||||
思慧计算器总调用接口
|
||||
:param sentence:str, 输入句子,TEXT
|
||||
:return:
|
||||
"""
|
||||
# 运算符转换
|
||||
sentence_wise = StringToCalculateZero(sentence)
|
||||
if not sentence_wise:
|
||||
return sentence
|
||||
# 混合运算
|
||||
sentence_replace = StringToCalculateTwo(sentence_wise)
|
||||
if ('/0' in sentence_replace and '/0.' not in sentence_replace) or sentence_replace == 'illegal math':
|
||||
return 'illegal math'
|
||||
for char in sentence_replace:
|
||||
if char not in '+-*/().0123456789':
|
||||
return 'could not calculate'
|
||||
#
|
||||
result = result_formula(sentence_replace)
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cal = Calculator()
|
||||
equation_sample = [
|
||||
'',
|
||||
'2(3*4)6+4(4)4',
|
||||
'(1+2)=',
|
||||
'1+2等于几',
|
||||
'100+30',
|
||||
'111+90-9等于几',
|
||||
"23 + 13 * ((25+(-9-2-5-2*3-6/3-40*4/(2-3)/5+6*3) * (9-2*6/3 + 5 *3*9/9*5 +10 * 56/(-14) )) - (-4*3)/ (3+3*3) )",
|
||||
'1-2-3-4-5-6',
|
||||
'134+123*898*123456789212310',
|
||||
'4*5*6/6/5',
|
||||
'(1+(2+(3-(4*6/4/3)-1)-1)-3)+(6-7)',
|
||||
'1+1+1',
|
||||
'1*1*2',
|
||||
'1+2+3+4+6',
|
||||
'1/2+2*3*4*5/5/4-1',
|
||||
'1+2(12/13)',
|
||||
'1+2+3+4+5+6+7+8+9+10',
|
||||
'-1+1+2+(-2)',
|
||||
'((-3)*2+1/2-3*4/4) +1',
|
||||
'LG0',
|
||||
'LOG100',
|
||||
'以2为底4的对数',
|
||||
'根号一百二十三加上1',
|
||||
'1加2的根号',
|
||||
'根号1加2的和',
|
||||
'以2为底4的对数',
|
||||
'2的六次方',
|
||||
'2的3次幂',
|
||||
'二的一次方',
|
||||
'四的平方',
|
||||
'十一的立方',
|
||||
'开11的立方根',
|
||||
'开3的7次方根',
|
||||
'13的阶乘',
|
||||
'根号四',
|
||||
'1除以0',
|
||||
|
||||
'负一加上100加上50000',
|
||||
'2的8次方减6',
|
||||
'根号5乘以90',
|
||||
'2的8次方减6',
|
||||
'1的平方加根号2',
|
||||
'30的阶乘加90',
|
||||
'二分之一加1/3',
|
||||
''
|
||||
]
|
||||
|
||||
for es in equation_sample:
|
||||
print('ff算式: ' + es)
|
||||
print('思慧计算器结果: ' + str(cal.calculator_sihui(es)))
|
5  macropodus/tookit/chinese2number/__init__.py  Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 22:00
|
||||
# @author : Mo
|
||||
# @function:
|
283  macropodus/tookit/chinese2number/chinese2number.py  Normal file
@ -0,0 +1,283 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/17 22:17
|
||||
# @author :Mo
|
||||
# @function :change chinese digit to Arab or reversed
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_clear import is_total_num
|
||||
import random
|
||||
|
||||
|
||||
# chinese_to_number, 单位-数字
|
||||
unit_dict = {"十": 10, "百": 100, "千": 1000, "万": 10000, "亿": 100000000}
|
||||
unit_dict_keys = unit_dict.keys()
|
||||
digit_dict = {"零": 0, "一": 1, "二": 2, "两": 2, "俩": 2, "三": 3,
|
||||
"四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
|
||||
|
||||
# number_to_chinese, 单位-数字
|
||||
num_dict = { 0: "零", 1: "一", 2: "二", 3: "三", 4: "四",
|
||||
5: "五", 6: "六", 7: "七", 8: "八", 9: "九" }
|
||||
unit_map = [ ["", "十", "百", "千"], ["万", "十万", "百万", "千万"],
|
||||
["亿", "十亿", "百亿", "千亿"], ["兆", "十兆", "百兆", "千兆"] ]
|
||||
unit_step = ["万", "亿", "兆"]
|
||||
|
||||
|
||||
class Chi2Num():
|
||||
def __init__(self):
|
||||
self.result = 0.0
|
||||
self.result_last = 0.0
|
||||
# 字符串分离
|
||||
self.str_billion = "" # 亿
|
||||
self.str_billion_hundred = "" # 亿万
|
||||
self.str_billion_one = ""
|
||||
self.str_thousand_ten = "" # 万
|
||||
self.str_single = "" # one
|
||||
|
||||
def free_zero_and_split_three_parts(self, text):
|
||||
"""
|
||||
去零切分成三部分
|
||||
:param text:str
|
||||
:return:
|
||||
"""
|
||||
assert type(text) == str
|
||||
if "零" in text:
|
||||
text = text.replace("零", "")
|
||||
# 分切成三部分
|
||||
index = 0
|
||||
flag_billion = True # 亿
|
||||
flag_billion_hundred = True # 万亿
|
||||
flag_thousand_ten = True #万
|
||||
len_text = len(text)
|
||||
for i in range(len_text):
|
||||
if "亿" == text[i]:
|
||||
# 存在亿前面也有万的情况,小分节
|
||||
self.str_billion = text[0:i]
|
||||
if text.find("亿") > text.find("万"):
|
||||
for j in range(len(self.str_billion)):
|
||||
if "万" == self.str_billion[j]:
|
||||
flag_billion_hundred = False
|
||||
self.str_billion_hundred = self.str_billion[0:j]
|
||||
self.str_billion_one = self.str_billion[j+1:]
|
||||
# 如果亿分节中没有万, 直接赋值
|
||||
if flag_billion_hundred:
|
||||
self.str_billion_one = self.str_billion
|
||||
index = i + 1
|
||||
flag_billion = False
|
||||
# 分节完毕
|
||||
self.str_single = text[i + 1:]
|
||||
if "万" == text[i] and text.find("亿") < text.find("万"):
|
||||
self.str_thousand_ten = text[index:i]
|
||||
self.str_single = text[i+1:]
|
||||
flag_thousand_ten = False
|
||||
if flag_billion and flag_thousand_ten:
|
||||
self.str_single = text
|
||||
|
||||
def str_to_number(self, text):
|
||||
"""
|
||||
string change to number
|
||||
:param text: str
|
||||
:return:
|
||||
"""
|
||||
assert type(text) == str
|
||||
number_res = 0
|
||||
number_1 = 0
|
||||
number_2 = 0
|
||||
number_3 = 0
|
||||
if not text:
|
||||
return 0
|
||||
len_text = len(text)
|
||||
for i in range(len_text):
|
||||
# 数字
|
||||
if text[i] in digit_dict:
|
||||
number_1 = digit_dict[text[i]]
|
||||
if i == len_text - 1:
|
||||
number_res += number_1
|
||||
# 单位
|
||||
elif text[i] in unit_dict:
|
||||
number_2 = unit_dict[text[i]]
|
||||
if number_1==0 and number_2==10:
|
||||
number_3 = number_2
|
||||
else:
|
||||
number_3 = number_1 * number_2
|
||||
# 清零避免重复读取
|
||||
number_1 = 0
|
||||
number_res += number_3
|
||||
# 处理形如 "二点13亿", "1.56万" 这样的情况
|
||||
else:
|
||||
try:
|
||||
text_else_str = [str(digit_dict[tet]) if tet in digit_dict else tet for tet in text]
|
||||
number_res = float("".join(text_else_str))
|
||||
except:
|
||||
number_res = 0
|
||||
return number_res
|
||||
|
||||
def compose_integer(self, text):
|
||||
"""
|
||||
整数转数字, 合并
|
||||
:param text:str, input of chinese, eg.["一百", "三千零七十八亿三千零十五万零三百一十二"]
|
||||
:return: float, result of change chinese to digit
|
||||
"""
|
||||
assert type(text) == str
|
||||
self.result = 0.0
|
||||
self.result_last = 0.0
|
||||
|
||||
text = text.replace("兆", "万亿").replace("点", ".").strip(".").strip()
|
||||
len_text = len(text)
|
||||
# 判断十百千万在不在text里边,在的话就走第二个
|
||||
flag_pos = True
|
||||
for unit_dict_key in unit_dict_keys:
|
||||
if unit_dict_key in text:
|
||||
flag_pos = False
|
||||
break
|
||||
# 分三种情况,全数字返回原值,有中文unit_dict_keys就组合, 没有中文unit_dict_keys整合
|
||||
if is_total_num(text):
|
||||
digit_float = float(text)
|
||||
return digit_float
|
||||
elif flag_pos:
|
||||
result_pos = ""
|
||||
for i in range(len_text):
|
||||
if "."!=text[i] and not text[i].isdigit():
|
||||
result_pos += str(digit_dict[text[i]])
|
||||
else:
|
||||
result_pos += text[i]
|
||||
self.result_last = float(result_pos)
|
||||
else:
|
||||
self.free_zero_and_split_three_parts(text)
|
||||
float_billion_hundred = self.str_to_number(self.str_billion_hundred)
|
||||
float_billion_one = self.str_to_number(self.str_billion_one)
|
||||
float_thousand_ten = self.str_to_number(self.str_thousand_ten)
|
||||
float_single = self.str_to_number(self.str_single)
|
||||
|
||||
self.result = float((float_billion_hundred * 10000 + float_billion_one) * 100000000 + float_thousand_ten * 10000 + float_single)
|
||||
self.result_last = self.result
|
||||
# 重置
|
||||
self.str_billion = "" # 亿
|
||||
self.str_billion_hundred = "" # 亿万
|
||||
self.str_billion_one = ""
|
||||
self.str_thousand_ten = "" # 万
|
||||
self.str_single = "" # one
|
||||
return self.result_last
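# Worked example (comment only): compose_integer("三千零十五万零三百一十二")
#   -> "零" removed, then split at "万" into "三千十五" (=3015) and "三百一十二" (=312)
#   -> 3015 * 10000 + 312 = 30150312.0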
|
||||
|
||||
def compose_decimal(self, text):
|
||||
"""
|
||||
中文小数转数字
|
||||
:param text:str, input of chinese, eg.["一百", "三千零七十八亿三千零十五万零三百一十二"]
|
||||
:return: float, result of change chinese to digit
|
||||
"""
|
||||
assert type(text) == str
|
||||
self.result = 0.0
|
||||
self.result_last = 0.0
|
||||
self.result_start = 0.0
|
||||
|
||||
text = text.replace("兆", "万亿").replace("点", ".").strip()
|
||||
if "." in text:
|
||||
# 判断十百千万在不在.号后边,在的话就走compose_integer(),并且返回
|
||||
pos_point = text.find(".")
|
||||
for unit_dict_key in unit_dict_keys:
|
||||
if unit_dict_key in text:
|
||||
if pos_point < text.find(unit_dict_key):
|
||||
return self.compose_integer(text)
|
||||
# 否则就是有小数
|
||||
texts = text.split(".")
|
||||
text_start = texts[0]
|
||||
text_end = texts[1]
|
||||
|
||||
# 处理整数部分
|
||||
if "0"==text_start or "零"==text_start:
|
||||
self.result_start = "0."
|
||||
else:
|
||||
self.result_start = str(int(self.compose_integer(text_start))) + "."
|
||||
# 处理尾部,就是后边小数部分
|
||||
result_pos = ""
|
||||
len_text = len(text_end)
|
||||
for i in range(len_text):
|
||||
if "."!=text_end[i] and not text_end[i].isdigit():
|
||||
result_pos += str(digit_dict[text_end[i]])
|
||||
else:
|
||||
result_pos += text_end[i]
|
||||
# 拼接
|
||||
self.result_last = float(self.result_start + result_pos) if result_pos.isdigit() else self.result_start
|
||||
|
||||
return self.result_last
|
||||
|
||||
|
||||
class Num2Chi():
|
||||
"""
|
||||
codes reference: https://github.com/tyong920/a2c
|
||||
"""
|
||||
def __init__(self):
|
||||
self.result = ""
|
||||
|
||||
def number_to_str(self, data):
|
||||
assert isinstance(data, (int, float))
|
||||
res = []
|
||||
count = 0
|
||||
# 倒转
|
||||
str_rev = reversed(str(data)) # seq -- 要转换的序列,可以是 tuple, string, list 或 range。返回一个反转的迭代器。
|
||||
for i in str_rev:
|
||||
if i != "0":
|
||||
count_cos = count // 4 # 行
|
||||
count_col = count % 4 # 列
|
||||
res.append(unit_map[count_cos][count_col])
|
||||
res.append(num_dict[int(i)])
|
||||
count += 1
|
||||
else:
|
||||
count += 1
|
||||
if not res:
|
||||
res.append("零")
|
||||
elif res[-1] != "零":
|
||||
res.append("零")
|
||||
# 再次倒序,这次变为正序了
|
||||
res.reverse()
|
||||
# 去掉"一十零"这样整数的“零”
|
||||
if res[-1] == "零" and len(res) != 1:
|
||||
res.pop()
|
||||
|
||||
return "".join(res)
|
||||
|
||||
def decimal_chinese(self, data):
|
||||
assert isinstance(data, (int, float))
|
||||
data_str = str(data)
|
||||
if "." not in data_str:
|
||||
res = self.number_to_str(data_str)
|
||||
else:
|
||||
data_str_split = data_str.split(".")
|
||||
if len(data_str_split) == 2:
|
||||
res_start = self.number_to_str(data_str_split[0])
|
||||
res_end = "".join([num_dict[int(number)] for number in data_str_split[1]])
|
||||
res = res_start + random.sample(["点", "."], 1)[0] + res_end
|
||||
else:
|
||||
res = str(data)
|
||||
return res
|
||||
|
||||
|
||||
if __name__=="__main__":
|
||||
###### 1.测试阿拉伯数字转中文 ######################################################
|
||||
ntc = Num2Chi()
|
||||
for i in range(1945, 2100):
|
||||
print(ntc.decimal_chinese(i))
|
||||
print(ntc.decimal_chinese(0.112354))
|
||||
print(ntc.decimal_chinese(1024.112354))
|
||||
|
||||
|
||||
|
||||
###### 2.测试中文转阿拉伯 ########################################################
|
||||
ctn = Chi2Num()
|
||||
tet_base = [ "1.3", "一.12", "一", "一十五", "十五", "二十", "二十三", "一百","一百零一",
|
||||
"一百一十", "一百一十一", "一千", "一千零一","一千零三十一",
|
||||
"一万零二十一", "一万零三百二十一", "一万一千三百二十一", "三千零十五万",
|
||||
"三千零一十五万", "三千五百六十八万零一百零一", "五十亿三千零七十五万零六百二十二",
|
||||
"十三亿三千零十五万零三百一十二", "一千二百五十八亿","三千零十五万零三百一十二",
|
||||
"一千二百五十八万亿", "三千三百二十一", "三百三十一", "二十一", "三百二十一",
|
||||
"一千二百五十八亿零三千三百二十一", "两百", "两千两百二十二", "两亿两千万两百万两百二十二",
|
||||
"三千零七十八亿三千零十五万零三百一十二"]
|
||||
|
||||
tet_decimal = ["1.3", "三千零七十八亿三千零十五万零三百一十二", "二点13亿", "1.56万", "十八.12", "一十八点九一", "零点一", "零点123", "十八点一", "零", "一千零九十九点六六"]
|
||||
# 测试是小数
|
||||
for tet_d in tet_decimal:
|
||||
print(ctn.compose_decimal(tet_d))
|
||||
# 测试是整数
|
||||
for tet_b in tet_base:
|
||||
print(ctn.compose_integer(tet_b))
|
||||
|
5  macropodus/tookit/trie_tree/__init__.py  Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/5 22:06
|
||||
# @author : Mo
|
||||
# @function:
|
150  macropodus/tookit/trie_tree/trie_tree.py  Normal file
@ -0,0 +1,150 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/11/19 20:40
|
||||
# @author :Mo
|
||||
# @function :TrieTree of keywords find, 只返回查全的情况, 查找句子中的关键词(例如影视名、人名、关键词、实体等)
|
||||
|
||||
|
||||
class TrieNode:
|
||||
"""
|
||||
前缀树节点-链表
|
||||
"""
|
||||
def __init__(self):
|
||||
self.child = {}
|
||||
|
||||
|
||||
class TrieTree:
|
||||
"""
|
||||
前缀树构建, 新增关键词, 关键词词语查找等
|
||||
"""
|
||||
def __init__(self):
|
||||
self.root = TrieNode()
|
||||
|
||||
def add_keyword(self, keyword):
|
||||
"""
|
||||
新增一个关键词
|
||||
:param keyword: str, 构建的关键词
|
||||
:return: None
|
||||
"""
|
||||
node_curr = self.root
|
||||
for word in keyword:
|
||||
if node_curr.child.get(word) is None:
|
||||
node_next = TrieNode()
|
||||
node_curr.child[word] = node_next
|
||||
node_curr = node_curr.child[word]
|
||||
# 每个关键词词后边, 加入end标志位
|
||||
if node_curr.child.get('[END]') is None:
|
||||
node_next = TrieNode()
|
||||
node_curr.child['[END]'] = node_next
|
||||
node_curr = node_curr.child['[END]']
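# Note: after add_keyword("英雄") the tree holds root -> '英' -> '雄' -> '[END]'.
# The '[END]' sentinel is what lets find_keyword()/match_keyword() distinguish a complete
# keyword from a mere prefix such as "英".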
|
||||
|
||||
# def delete_keyword(self, keyword):
|
||||
# """
|
||||
# 删除一个关键词
|
||||
# :param keyword: str, 构建的关键词
|
||||
# :return: None
|
||||
# """
|
||||
# node_curr = self.root
|
||||
# for word in keyword:
|
||||
# if node_curr.child.get(word) is not None:
|
||||
# node_curr = node_curr.child[word]
|
||||
# # 每个关键词词后边, 加入end标志位
|
||||
# if node_curr.child.get('[END]') is not None:
|
||||
|
||||
|
||||
def add_keywords_from_list(self, keywords):
|
||||
"""
|
||||
新增关键词s, 格式为list
|
||||
:param keyword: list, 构建的关键词
|
||||
:return: None
|
||||
"""
|
||||
for keyword in keywords:
|
||||
self.add_keyword(keyword)
|
||||
|
||||
def find_keyword(self, sentence):
|
||||
"""
|
||||
从句子中提取关键词, 可提取多个
|
||||
:param sentence: str, 输入的句子
|
||||
:return: list, 提取到的关键词
|
||||
"""
|
||||
assert type(sentence) == str
|
||||
if not sentence: # 空格字符不取
|
||||
return []
|
||||
|
||||
node_curr = self.root # 关键词的头, 每遍历完一遍后需要重新初始化
|
||||
index_last = len(sentence)
|
||||
keyword_list = []
|
||||
keyword = ''
|
||||
count = 0
|
||||
for word in sentence:
|
||||
count += 1
|
||||
if node_curr.child.get(word) is None: # 查看有无后缀, 即匹配到一个关键词最后一个字符的时候
|
||||
if keyword: # 提取到的关键词(也可能是前面的几位)
|
||||
if node_curr.child.get('[END]') is not None: # 取以end结尾的关键词
|
||||
keyword_list.append(keyword)
|
||||
if self.root.child.get(word) is not None: # 处理连续的关键词情况, 如"第九区流浪地球"
|
||||
keyword = word
|
||||
node_curr = self.root.child[word]
|
||||
else: #
|
||||
keyword = ''
|
||||
node_curr = self.root # 重新初始化
|
||||
else: # 有后缀就加到name里边
|
||||
keyword = keyword + word
|
||||
node_curr = node_curr.child[word]
|
||||
if count == index_last: # 实体结尾的情况
|
||||
if node_curr.child.get('[END]') is not None:
|
||||
keyword_list.append(keyword)
|
||||
return keyword_list
|
||||
|
||||
def match_keyword(self, keyword):
|
||||
"""
|
||||
判断keyword在不在trietree里边
|
||||
:param keyword: str, input word
|
||||
:return: boolean, True or False
|
||||
"""
|
||||
node = self.root
|
||||
for kw in keyword:
|
||||
if not node.child.get(kw):
|
||||
return False
|
||||
node = node.child[kw]
|
||||
if not node.child.get('[END]'):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def get_trie_tree_class(keywords):
|
||||
"""
|
||||
根据list关键词,初始化trie树
|
||||
:param keywords: list, input
|
||||
:return: objext, 返回实例化的trie
|
||||
"""
|
||||
trie = TrieTree()
|
||||
trie.add_keywords_from_list(keywords)
|
||||
return trie
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 测试1, class实例
|
||||
trie = TrieTree()
|
||||
keywords = ['英雄', '人在囧途', '那些年,我们一起追过的女孩', '流浪地球', '华娱',
|
||||
'犬夜叉', '火影', '名侦探柯南', '约会大作战', '名作之壁', '动漫',
|
||||
'乃木坂46', 'akb48', '飘', '最后的武士', '约会', '英雄2', '日娱',
|
||||
'2012', '第九区', '星球大战', '侏罗纪公园', '泰坦尼克号', 'Speed']
|
||||
keywords = [list(keyword.strip()) for keyword in keywords]
|
||||
trie.add_keywords_from_list(keywords) # 创建树
|
||||
keyword = trie.find_keyword('第九区约会, 侏罗纪公园和泰坦尼克号泰坦尼克号')
|
||||
print(keyword)
|
||||
|
||||
keyword = trie.match_keyword('第九')
|
||||
print(keyword)
|
||||
|
||||
|
||||
# 测试2, get树
|
||||
trie_tree = get_trie_tree_class(keywords) # 创建树并返回实例化class
|
||||
while True:
|
||||
print("sihui请你输入:")
|
||||
input_ques = input()
|
||||
keywords = trie_tree.find_keyword(input_ques)
|
||||
print(keywords)
|
||||
|
||||
|
8  macropodus/version.py  Normal file
@ -0,0 +1,8 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/21 22:24
|
||||
# @author : Mo
|
||||
# @function: version of Macropodus
|
||||
|
||||
|
||||
__version__ = "0.0.2"
|
5  macropodus_images/__init__.py  Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/20 9:54
|
||||
# @author : Mo
|
||||
# @function:
|
BIN  macropodus_images/macropodus_logo.png  Normal file (binary file not shown; 37 KiB)
5  macropodus_preprocess/__init__.py  Normal file
@ -0,0 +1,5 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/21 23:11
|
||||
# @author : Mo
|
||||
# @function:
|
33  macropodus_preprocess/create_segnment_data.py  Normal file
@ -0,0 +1,33 @@
|
||||
# !/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2019/12/21 23:11
|
||||
# @author : Mo
|
||||
# @function:
|
||||
|
||||
|
||||
from macropodus.preprocess.tools_common import load_json, save_json
|
||||
from macropodus.preprocess.tools_common import txt_write, txt_read
|
||||
import json
|
||||
|
||||
pku_training = txt_read("pku_training.utf8")
|
||||
file = open("pku_train.json", "w", encoding="utf-8")
|
||||
pku_ = []
|
||||
for pku in pku_training:
|
||||
pkus = pku.split(" ")
|
||||
label_pkus = ""
|
||||
for pku_sig in pkus:
|
||||
len_pku = len(pku_sig)
|
||||
if len_pku==1:
|
||||
label_pkus += "S"
|
||||
elif len_pku==2:
|
||||
label_pkus += "BE"
|
||||
else:
|
||||
label_pkus += "B" + "M"*(len_pku-2) + "E"
|
||||
label_pkus_l = list(label_pkus)
|
||||
pku_res = {}
|
||||
pku_res["question"] = list("".join(pkus))
|
||||
pku_res["label"] = label_pkus_l
|
||||
p_json = json.dumps(pku_res, ensure_ascii=False)
|
||||
file.write(p_json + "\n")
|
||||
# pku_.append(pku_res)
|
||||
# save_json(pku_, "pku_train.json")
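# Labelling scheme (BMES-style) used above: a 1-char word -> "S", a 2-char word -> "BE",
# an n-char word -> "B" + "M"*(n-2) + "E"; e.g. the segmented words
# ["迈向", "充满", "的", "世纪"] yield the label sequence B E B E S B E.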
|
5  macropodus_preprocess/pku_training.utf8  Normal file
@ -0,0 +1,5 @@
迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ( 附 图片 1 张 )
中共中央 总书记 、 国家 主席 江 泽民
( 一九九七年 十二月 三十一日 )
12月 31日 , 中共中央 总书记 、 国家 主席 江 泽民 发表 1998年 新年 讲话 《 迈向 充满 希望 的 新 世纪 》 。
同胞 们 、 朋友 们 、 女士 们 、 先生 们 :
macropodus_survey_report/nlp_platfom_survey.md (Normal file, 91 lines)
@@ -0,0 +1,91 @@
# Survey of Chinese NLP toolkits (as of 2019.11.16)


## 1. Common toolkits and features

Toolkit|Language|Stars|Year|Segmentation|POS tagging|Dependency parsing|NER|Keyword extraction|Summarization|Clustering|Sentiment|Similarity|Relation extraction|License
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
jieba|python|20.8k|7/0.5|yes|yes|no|no|yes|no|no|yes|no|no|MIT
THULAC-Python|python|1.2k|4/1|yes|yes|no|no|no|no|no|no|no|no|MIT
pkuseg-python|python|4.3k|0.9/0.5|yes|yes|no|no|no|no|no|no|no|no|MIT
snownlp|python|4.4k|6/3/*|yes|yes|no|no|yes|yes|no|yes|yes|no|MIT
deepnlp|python|1.3k|2/2/!|yes|yes|yes|yes|yes|yes|no|no|no|no|MIT
fastNLP|python|0.9k|2/0|yes|yes|no|yes|no|no|no|yes|no|no|MIT
Jiagu|python|0.97k|0.9/0|yes|yes|yes|yes|yes|yes|yes|yes|no|yes|MIT
YaYaNLP|python|0.05k|4/4/!|yes|yes|no|yes|no|no|no|no|no|no|MIT
HanLP|java|16.4k|0.9/0|yes|yes|yes|yes|yes|yes|yes|yes|no|no|MIT
ansj-seg|java|5.2k|3/0.4|yes|yes|yes|yes|yes|yes|no|yes|no|no|Apache-2.0
word|java|1.4k|5/1|yes|yes|no|yes|no|no|no|no|yes|no|Apache-2.0
Jcseg|java|0.69k|3/0|yes|yes|yes|yes|yes|yes|no|no|no|no|Apache-2.0
ik-analyzer|java|0.53k|9/9/!|yes|yes|yes|no|no|no|no|no|no|no|LGPL-3.0
CoreNLP|java|6.7k|9/9/!|yes|yes|yes|yes|yes|no|no|no|no|no|GNU GPL-2.0
fnlp|java|2.2k|6/0.9/!|yes|yes|yes|yes|yes|yes|yes|no|no|no|LGPL-3.0
NLPIR|java|2.5k|?/1/!|yes|yes|no|no|no|no|yes|no|no|no|not open
sego|go|1.2k|6/1/!|yes|yes|no|no|no|no|yes|no|no|no|Apache-2.0
ltp|c++|2.3k|6/1/!|yes|yes|yes|yes|yes|yes|yes|no|no|no|LGPL-3.0
PaddleNLP|c++|3.4k|6/1/!|yes|yes|yes|yes|yes|yes|yes|yes|yes|yes|Apache-2.0

## Notes
* 1. In the Year column, "6/3/*" means "started N years ago / last updated N years ago / still maintained"; "!" means unmaintained (no updates for over a year and unanswered issues are treated as abandonment).
* 2. Additional features
    * snownlp: pinyin conversion, simplified/traditional conversion, tf-idf, sentence splitting
    * deepnlp: assorted models trained with tensorflow 1.4
    * NLPIR: retrieval, sensitive-information detection, document deduplication, encoding conversion
    * Ltp: event extraction, SRL, time extraction
    * HanLP: People's Daily 2014 segmentation model, text recommendation (similarity), index segmentation
    * ansj-seg: rather messy; the home page gives no usage instructions and the dictionary is a hodgepodge
    * word: word-frequency statistics, POS tagging, synonym/antonym tagging, pinyin tagging
    * ltp: feature-pruning strategy, semantic role labeling
    * PaddleNLP: Paddle-trained models, basic packages, ERNIE-style generation and other tasks
* 3. Further statistical-learning functionality: summarization, fine-grained sentiment, new-word discovery, entity and relation extraction, domain classification, text generation

## Word-segmentation algorithms
* 1. jieba
    * 1.1 Builds a word graph (DAG) over a prefix dictionary, enumerating every possible word covering of the sentence
    * 1.2 Uses dynamic programming to find the maximum-probability path, i.e. the best split by word frequency (a minimal sketch of 1.1-1.2 follows this list)
    * 1.3 For out-of-vocabulary words, applies a character-based HMM decoded with the Viterbi algorithm
* 2. THULAC, pkuseg, Jiagu, fastNLP
    * 2.1 CRF (char, word, elmo, bert)
    * 2.2 features + CRF
* 3. ansj-seg
    * 3.1 n-gram + CRF + HMM
* 4. HanLP
    * 4.1 n-gram, CRF
* 5. sego
    * 5.1 Shortest path by word frequency plus dynamic programming
* 6. Ltp
    * 6.1 bilstm + crf
    * 6.2 Rules for special tokens such as English words and URIs; exploits natural annotation cues such as spaces, injects dictionary information into the statistical model, and uses character mutual information and context-richness statistics from large unlabeled corpora
* 7. PaddleNLP
    * 7.1 gru + crf
* 8. word (maximum matching, maximum probability, shortest path)
    * 8.1 Forward maximum matching, backward maximum matching, forward minimum matching, backward minimum matching
    * 8.2 Bidirectional maximum matching, bidirectional minimum matching, bidirectional max-min matching
    * 8.3 Full segmentation, fewest-words, maximum n-gram score, shortest path
    * 8.4 Semantic segmentation: augmented transition networks, knowledge-based semantic analysis, adjacency constraints, combined matching, suffix segmentation, feature lexicons, matrix constraints, grammatical analysis

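As a concrete illustration of items 1.1-1.2 (the dictionary-DAG, maximum-probability approach), here is a minimal sketch over a toy dictionary. The dictionary and its frequency counts are made up for the example; this is not the implementation used by any of the toolkits above.

```python3
import math

# Toy frequency dictionary (made-up counts, illustration only).
WORD_FREQ = {"研究": 20, "研究生": 10, "生命": 20, "科学": 20, "生命科学": 5}
TOTAL = sum(WORD_FREQ.values())


def build_dag(sentence):
    """For each start index, list every end index that forms a dictionary word (single chars as fallback)."""
    dag = {}
    for i in range(len(sentence)):
        ends = [i]  # a single character is always a valid (fallback) piece
        for j in range(i + 1, len(sentence)):
            if sentence[i:j + 1] in WORD_FREQ:
                ends.append(j)
        dag[i] = ends
    return dag


def cut_max_prob(sentence):
    """Dynamic programming over the DAG: pick the path with the highest total log-probability."""
    dag = build_dag(sentence)
    n = len(sentence)
    route = {n: (0.0, 0)}
    for i in range(n - 1, -1, -1):  # fill the table right to left
        route[i] = max(
            (math.log(WORD_FREQ.get(sentence[i:j + 1], 1) / TOTAL) + route[j + 1][0], j)
            for j in dag[i]
        )
    i, words = 0, []
    while i < n:  # walk the best path left to right
        j = route[i][1] + 1
        words.append(sentence[i:j])
        i = j
    return words


print(cut_max_prob("研究生命科学"))  # -> ['研究', '生命', '科学'] with this toy dictionary
```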
## Toolkit links
* jieba: [https://github.com/fxsjy/jieba](https://github.com/fxsjy/jieba)
* HanLP: [https://github.com/hankcs/HanLP](https://github.com/hankcs/HanLP)
* CoreNLP: [https://github.com/stanfordnlp/CoreNLP](https://github.com/stanfordnlp/CoreNLP)
* ansj-seg: [https://github.com/NLPchina/ansj_seg](https://github.com/NLPchina/ansj_seg)
* THULAC-Python: [https://github.com/thunlp/THULAC-Python](https://github.com/thunlp/THULAC-Python)
* pkuseg-python: [https://github.com/lancopku/pkuseg-python](https://github.com/lancopku/pkuseg-python)
* snownlp: [https://github.com/isnowfy/snownlp](https://github.com/isnowfy/snownlp)
* deepnlp: [https://github.com/rockingdingo/deepnlp](https://github.com/rockingdingo/deepnlp)
* fastNLP: [https://github.com/fastnlp/fastNLP](https://github.com/fastnlp/fastNLP)
* Jiagu: [https://github.com/ownthink/Jiagu](https://github.com/ownthink/Jiagu)
* xmnlp: [https://github.com/SeanLee97/xmnlp](https://github.com/SeanLee97/xmnlp)
* word: [https://github.com/ysc/word](https://github.com/ysc/word)
* jcseg: [https://github.com/lionsoul2014/jcseg](https://github.com/lionsoul2014/jcseg)
* paddleNLP: [https://github.com/PaddlePaddle/models](https://github.com/PaddlePaddle/models)
* sego: [https://github.com/huichen/sego](https://github.com/huichen/sego)
* ik-analyzer: [https://github.com/wks/ik-analyzer](https://github.com/wks/ik-analyzer)
* fnlp: [https://github.com/FudanNLP/fnlp](https://github.com/FudanNLP/fnlp)
* NLPIR: [https://github.com/NLPIR-team/NLPIR](https://github.com/NLPIR-team/NLPIR)
requirements.txt (Normal file, 10 lines)
@@ -0,0 +1,10 @@
# tensorflow-gpu==1.15.0, tensorflow==1.15.0
scikit-learn>=0.19.1
pandas>=0.23.4
passlib>=1.7.1
gensim>=3.7.1
numpy>=1.16.2
tqdm>=4.31.1
keras-bert>=0.80.0
keras-adaptive-softmax>=0.6.0
regex
setup.py (Normal file, 67 lines)
@@ -0,0 +1,67 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time     :2019/12/30 22:17
# @author   :Mo
# @function :setup of Macropodus
# @codes    :fixed and adapted from https://github.com/TianWenQAQ/Kashgari/blob/master/setup.py


from macropodus.version import __version__
from setuptools import setup
import codecs


# Package meta-data.
NAME = 'Macropodus'
DESCRIPTION = 'Macropodus: Toolkit of Chinese Natural Language Processing'
URL = 'https://github.com/yongzhuo/Macropodus'
EMAIL = '1903865025@qq.com'
AUTHOR = 'yongzhuo'
LICENSE = 'MIT'

with codecs.open('README.md', 'r', 'utf8') as reader:
    long_description = reader.read()
with codecs.open('requirements.txt', 'r', 'utf8') as reader:
    install_requires = list(map(lambda x: x.strip(), reader.readlines()))

setup(name=NAME,
      version=__version__,
      description=DESCRIPTION,
      long_description=long_description,
      long_description_content_type="text/markdown",
      author=AUTHOR,
      author_email=EMAIL,
      url=URL,
      packages=['macropodus'],
      package_dir={'macropodus': 'macropodus'},
      package_data={'macropodus': ['*.*', 'data/*', 'data/dict/*', 'data/cache/*',
                                   'data/embedding/*', 'data/embedding/word2vec/*']},
      install_requires=install_requires,
      license=LICENSE,
      classifiers=['License :: OSI Approved :: MIT License',
                   'Programming Language :: Python :: 3.5',
                   'Programming Language :: Python :: 3.6',
                   'Programming Language :: Python :: 3.7',
                   'Programming Language :: Python :: 3.8',
                   'Programming Language :: Python :: Implementation :: CPython',
                   'Programming Language :: Python :: Implementation :: PyPy'],
      )


if __name__ == "__main__":
    print("setup ok!")


# Note: requires tensorflow>=1.14.0 or tensorflow-gpu>=1.14.0
# The project root here is Macropodus; below it there is another macropodus package directory, i.e. the macropodus package sits at the same level as setup.py
# Every data package must contain an __init__.py, otherwise its files are not packaged; only .py files get copied
# Two ways to build and publish:

# Option 1
# open a terminal (cmd)
# cd into the project directory
# python setup.py build
# python setup.py install

# Option 2
# python setup.py bdist_wheel --universal
# twine upload dist/*
test/evaluate/__init__.py (Normal file, 5 lines)
@@ -0,0 +1,5 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2019/12/17 10:38
# @author : Mo
# @function:
test/evaluate/data/__init__.py (Normal file, 5 lines)
@@ -0,0 +1,5 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2019/12/17 10:41
# @author : Mo
# @function:
test/evaluate/data/ambiguity.txt (Normal file, 52 lines)
@@ -0,0 +1,52 @@
工信处 女 干事 每月 经过 下属 科室 都要 亲口 交代 24 口 交换机 等 技术性 器件 的 安装 工作
研究 生命科学 \t 研究 生命 科学
研究生 命令 本科生
我 从 马 上 下来
我 马上 下来
北京 大学生 喝 进口 红酒
在 北京大学 生活区 喝 进口 红酒
从小 学 电脑
从 小学 毕业
美军 中将 竟 公然 说
新建 地铁 中 将 禁止 商业 摊点
这块 地 面积 还真 不小
地面 积了 厚厚 的 雪
让 我们 以 爱心 和 平等 来 对待 动物
阿美 首脑 会议 将 讨论 巴以 和平 等 问题
锌 合金 把手 的 相关 求购 信息
别 把 手 伸进 别人 的 口袋 里
将 信息 技术 应用 于 教学 实践
信息 技术 应用 于 教学 中 的 哪个 方面
上级 解除 了 他 的 职务
方程 的 解 除了 零 以外 还有 …
我们 一起 去 故宫
一起 恶性 交通 事故
我 不想 吃 东西
你 就 不 想想
各 国有 企业 相继 倒闭
各国 有 各国 的 困难
老人家 身体 不错
老人 家中 很 干净
和服 务必 归还
技术 和 服务
他 站 起 身
他 起身 去 北京
问题 的 确定
这的 确定 不 下来
结合 成分
为 人民 工作
中国 产品 质量
原子 结合 成 分子 时
部分 居民 生活 水平
治理 解放 大道 路面 积水
这样 的 人 才能 经受 住 考验
他俩 儿 谈 恋爱 是 从 头年 元月 开始 的
在 这些 企业 中 国有 企业 有 十个
结婚 的 和 尚未 结婚 的
热海 景区
热海 景区 +
崔永元 炮轰 范冰冰
这 源自 萧红 写给 萧军 信中 的 一句话
阿里 大华 腾讯 百度
亲家公 亲家母
情侣 们 在 海南岛 上 海誓山盟
test/evaluate/tet_evaluate.py (Normal file, 54 lines)
@@ -0,0 +1,54 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2019/12/17 21:13
# @author : Mo
# @function: test evaluate


from macropodus.preprocess.tools_common import txt_write, txt_read
import macropodus
import time


def evulate_file(path_file):
    """
    Evaluate word-segmentation quality against a gold, space-separated file.
    :param path_file: str, like '/train.txt'
    :return: tuple(float, float, float), precision, recall, f1
    """
    # read the gold data
    sents = txt_read(path_file)
    # initialize counters
    count_macropodus = 0
    count_real = 0
    count_true = 0
    count = 0
    # segment each sentence and count matching words
    for sent in sents:
        sent_sp = sent.strip()
        res_real = sent_sp.split(' ')
        sentence = sent_sp.replace(' ', '')
        res_macropodus = macropodus.cut(sentence)
        print(res_macropodus)
        count += 1
        count_real += len(res_real)
        count_macropodus += len(res_macropodus)
        # membership-based matching (a rough approximation of exact boundary matching)
        for cm in res_macropodus:
            if cm in res_real:
                count_true += 1
                res_real.remove(cm)
    # precision, recall, f1
    precision = count_true / count_macropodus
    recall = count_true / count_real
    f1 = (precision * recall * 2) / (precision + recall)

    return precision, recall, f1


if __name__ == "__main__":
    path_file = 'data/ambiguity.txt'
    time_start = time.time()
    precision, recall, f1 = evulate_file(path_file)
    print('time: ' + str(time.time() - time_start))
    print('precision\t', 'recall\t', 'f1')
    print(precision, recall, f1)
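As a quick, hand-checked illustration of the counting scheme used above (the gold and predicted segmentations here are made up for the example and are not taken from the data file):

```python3
gold = ["研究", "生命", "科学"]      # gold segmentation: 3 words
pred = ["研究", "生命科学"]          # predicted segmentation: 2 words

true_hits = sum(1 for w in pred if w in gold)         # only "研究" matches -> 1
precision = true_hits / len(pred)                     # 1 / 2 = 0.5
recall = true_hits / len(gold)                        # 1 / 3 ≈ 0.333
f1 = 2 * precision * recall / (precision + recall)    # = 0.4
print(precision, recall, f1)
```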
test/evaluate/tet_macropodus.py (Normal file, 84 lines)
@@ -0,0 +1,84 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2019/12/17 21:13
# @author : Mo
# @function: test macropodus


import time
time_start = time.time()
import macropodus
print('macropodus initialization time: ' + str(time.time() - time_start) + 's')


# import sys
# import os
# print(os.name)
# print(sys.platform)

# user dictionary
# macropodus.load_user_dict(path_user="user.json", type_user="json")
macropodus.add_word(word="斗鱼属")
macropodus.add_word(word="斗鱼科")
macropodus.add_word(word="鲈形目")
macropodus.save_add_words(word_freqs={"喜斗": 32, "护卵": 64, "护幼": 132})
macropodus.add_word(word="坑爹的平衡性基金")
macropodus.save_add_words(word_freqs={"BBC": 132})

print(macropodus.cut("坑爹的平衡性基金啊,坑爹呀斗鱼属,Macropodus (Lacépède, 1801),鲈形目斗鱼科的一属鱼类。"
                     "本属鱼类通称斗鱼。因喜斗而得名。分布于亚洲东南部。中国有2种,即叉尾斗鱼,分布于长江及以南各省;"
                     "叉尾斗鱼,分布于辽河到珠江流域。其喜栖居于小溪、河沟、池塘、稻田等缓流或静水中。"
                     "雄鱼好斗,产卵期集草成巢,雄鱼口吐粘液泡沫,雌鱼产卵其中,卵浮性,受精卵在泡沫内孵化。雄鱼尚有护卵和护幼现象。"
                     ))

sen_calculate = "23 + 13 * (25+(-9-2-5-2*3-6/3-40*4/(2-3)/5+6*3))加根号144你算得几多"
sen_chi2num = "三千零七十八亿三千零十五万零三百一十二点一九九四"
sen_num2chi = 1994.1994
sent1 = "PageRank算法简介"
sent2 = "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。"
summary = "PageRank算法简介。" \
          "是上世纪90年代末提出的一种计算网页权重的算法! " \
          "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
          "业界急需一种相对比较准确的网页重要性计算方法。 " \
          "是人们能够从海量互联网世界中找出自己需要的信息。 " \
          "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
          "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
          "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
          "和投票目标的等级来决定新的等级。简单的说, " \
          "一个高等级的页面可以使其他低等级页面的等级提升。 " \
          "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
          "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
          "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
          "总的来说就是一句话,从全局角度考虑,获取重要的信。 "

# word segmentation
words = macropodus.cut(summary)
print(words)
new_words = macropodus.find(summary)
print(new_words)

# new-word discovery (findword, default interface)
sents = macropodus.find(text=summary, freq_min=2, len_max=7, entropy_min=1.2, aggregation_min=0.5, use_avg=True)
print(sents)
# text summarization
sum = macropodus.summarize(summary)
print(sum)
keyword = macropodus.keyword(summary)
print(keyword)
# similarity (printed further below, after sent1/sent2 are reassigned)
sim = macropodus.sim(sent1, sent2, type_sim="cosine")

sent1 = "叉尾斗鱼"
sent2 = "中国斗鱼生性好斗,适应性强,能在恶劣的环境中生存"

# text similarity
sents = macropodus.sim(sent1, sent2, type_sim="total", type_encode="avg")
print(sents)
sents = macropodus.sim(sent1, sent2, type_sim="cosine", type_encode="single")
print(sents)
print(sim)
# toolkit
score_calcul = macropodus.calculate(sen_calculate)
print(score_calcul)
res_chi2num = macropodus.chi2num(sen_chi2num)
print(res_chi2num)
res_num2chi = macropodus.num2chi(sen_num2chi)
print(res_num2chi)