add Layer of cosine for chatbot-tfserving

This commit is contained in:
yongzhuo 2021-09-17 18:45:31 +08:00
parent be902799b6
commit 0eba3a83d2
14 changed files with 1962 additions and 1 deletions

View File

@@ -22,7 +22,7 @@ if platform.system().lower() == 'windows':
BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_simbert_L-4_H-312_A-12"
# BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_simbert_L-6_H-384_A-12"
else:
BERT_DIR = "/home/hemei/myzhuo/bert/chinese_L-12_H-768_A-12"
BERT_DIR = "/bert/chinese_L-12_H-768_A-12"
ee = 0
SAVE_DIR = path_root + "/bert_white"

View File

@@ -0,0 +1,93 @@
# Add a cosine-similarity layer (CosineLayer) so BERT sentence-vector encoding can be deployed with tf-serving
## Business requirements
- QA pairs are recalled with BERT sentence vectors; the corpus of standard FAQ QA pairs is small
- BERT encoding cannot sit behind a separate network service (e.g. an HTTP call): the network round trip is slow, and the payload per query is large (768 dimensions * 32-bit floats)
- Almost all model services are CPU-only; disk and memory are adequate
- Response requirements are strict, so latency must stay low
## Code logic
- First encode the standard FAQ QA pairs into sentence vectors (bert-sentence-encode);
- Insert those sentence vectors into the network as a constant, add a cosine-similarity layer (CosineLayer) to the architecture, and save the whole graph in tf-serving format;
- Prefer small models such as tinyBERT, ROBERTA-4-layer or ROBERTA-6-layer
## Explanation
- Code layout:
  - TFServing_main.py            main entry point, calls the others
  - TFServing_postprocess.py     tf-serving post-processing functions
  - TFServing_preprocess.py      tf-serving pre-processing functions
  - TFServing_save.py            tf-serving export script (main build step)
- Main pipeline
  - 1. bertWhiteConf.py    hyper-parameter configuration: paths, bert-white, index tools, etc.
  - 2. bertWhiteTools.py   small utilities, mainly file read/write helpers
  - 3. bertWhiteTrain.py   main module, encodes texts with the pre-trained BERT model
  - 4. indexAnnoy.py       annoy index
  - 5. indexFaiss.py       faiss index
  - 6. mmr.py              Maximal Marginal Relevance, keeps the returned answers diverse
## Model files
- bert_white directory          artifacts generated by bertWhiteTrain.py
- chatbot_tfserving directory   tf-serving files that include the similarity computation
## Usage
- Configure the QA corpus file (chicken_and_gossip.txt) and the hyper-parameters (BERT_DIR in bertWhiteConf.py)
- Generate the FAQ sentence vectors: python3 bertWhiteTrain.py
- Export the SavedModel .pb files (used by tf-serving): python3 TFServing_save.py
- Deploy the docker service (tf-serving), e.g. docker run -t --rm -p 8532:8501 -v "/TF-SERVING/chatbot_tf:/models/chatbot_tf" -e MODEL_NAME=chatbot_tf tensorflow/serving:latest
- Call the tf-serving service: python3 TFServing_tet_http.py (a request sketch follows this list)
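The REST call below is a minimal sketch of what TFServing_tet_http.py sends, assuming the docker command above and a locally reachable port 8532; the token ids are illustrative, while the input names Input-Token/Input-Segment and the outputs score/doc_id follow TFServing_save.py.
```python3
import json
import requests

# Hypothetical local endpoint; adjust host/port to your deployment.
url = "http://localhost:8532/v1/models/chatbot_tf:predict"
# One already-tokenized query (ids are only an example; TFServing_preprocess.py produces real ones).
payload = {"instances": [{"Input-Token": [101, 872, 1962, 102],
                          "Input-Segment": [0, 0, 0, 0]}]}
resp = requests.post(url, data=json.dumps(payload))
# The response carries the "score" and "doc_id" outputs of the CosineLayer;
# TFServing_postprocess.py maps doc_id back to the question and answer texts.
print(json.loads(resp.text))
```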
## Key code
```python3
import keras.backend as K
import tensorflow as tf
import keras
import numpy as np


class CosineLayer(keras.layers.Layer):
    def __init__(self, docs_encode, **kwargs):
        """
        Cosine-similarity layer; not suited to large corpora, e.g. more than 1M QA pairs
        :param docs_encode: np.array, bert-white vectors of the FAQ sentences
        :param kwargs:
        """
        self.docs_encode = docs_encode
        super(CosineLayer, self).__init__(**kwargs)
        self.docs_vector = K.constant(self.docs_encode, dtype="float32")
        self.l2_docs_vector = K.sqrt(K.sum(K.maximum(K.square(self.docs_vector), 1e-12), axis=-1))  # x_inv_norm

    def build(self, input_shape):
        super(CosineLayer, self).build(input_shape)

    def get_config(self):
        # avoids the error: 'NoneType' object has no attribute '_inbound_nodes'
        config = {"docs_vector": self.docs_vector,
                  "l2_docs_vector": self.l2_docs_vector}
        base_config = super(CosineLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def call(self, input):
        # cosine similarity, written in the style of K.l2_normalize:
        #   square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
        #   x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
        #   output = x / sqrt(max(sum(x**2), epsilon))
        l2_input = K.sqrt(K.sum(K.maximum(K.square(input), 1e-12), axis=-1))  # x_inv_norm
        fract_0 = K.sum(input * self.docs_vector, axis=-1)
        fract_1 = l2_input * self.l2_docs_vector
        cosine = fract_0 / fract_1
        y_pred_top_k, y_pred_ind_k = tf.nn.top_k(cosine, 10)
        return [y_pred_top_k, y_pred_ind_k]

    def compute_output_shape(self, input_shape):
        return [input_shape[0], input_shape[0]]
```
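For reference, the cosine/top-k retrieval that CosineLayer performs can be reproduced in plain NumPy; the sketch below is only a sanity check, with random vectors standing in for the real bert-white encodings (the 312-dim size is a toy value, not a requirement).
```python3
import numpy as np

docs_encode = np.random.rand(100, 312).astype("float32")  # stand-in FAQ sentence vectors
query = np.random.rand(312).astype("float32")              # one stand-in encoded query

# cosine = <query, doc> / (||query|| * ||doc||), as computed in CosineLayer.call
cosine = docs_encode @ query / (np.linalg.norm(docs_encode, axis=-1) * np.linalg.norm(query))
top_ids = np.argsort(cosine)[::-1][:10]   # the "doc_id" output
top_scores = cosine[top_ids]              # the "score" output
print(list(zip(top_ids.tolist(), np.round(top_scores, 6).tolist())))
```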
## One more note
- This approach only suits scenarios where the number of standard FAQ QA pairs is fairly small

View File

@@ -0,0 +1,68 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/4/15 21:59
# @author : Mo
# @function: postprocess of TFServing, 后处理
from __future__ import print_function, division, absolute_import
# 适配linux
import sys
import os
path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "./."))
sys.path.append(path_root)
from argparse import Namespace
import json
def load_json(path):
"""
获取json只取第一行
:param path: str
:return: json
"""
with open(path, 'r', encoding='utf-8') as fj:
model_json = json.load(fj)
return model_json
# 字典
from bertWhiteConf import bert_white_config
config = Namespace(**bert_white_config)
id2answer = load_json(os.path.join(config.save_dir, config.path_answers))
id2doc = load_json(os.path.join(config.save_dir,config.path_docs))
def postprocess(predictions):
""" 后处理 """
predicts = predictions.get("predictions", {})
token_ids = []
for p in predicts:
doc_id = str(p.get("doc_id", ""))
score = p.get("score", "")
answer = id2answer.get(doc_id, "")
doc = id2doc.get(doc_id, "")
token_ids.append({"score": round(score, 6), "doc": doc, "answer": answer, "doc_id": doc_id})
return {"instances": token_ids}
if __name__ == '__main__':
predictions = {"predictions": [
{
"score": 0.922845,
"doc_id": 86
},
{
"score": 0.922845,
"doc_id": 104
},
{
"score": 0.891189814,
"doc_id": 101
}
]}
res = postprocess(predictions)
print(res)

View File

@@ -0,0 +1,440 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/4/15 21:59
# @author : Mo
# @function: encode of bert-whiteing
from __future__ import print_function, division, absolute_import
# 适配linux
import sys
import os
path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "./."))
sys.path.append(path_root)
print(path_root)
from argparse import Namespace
import unicodedata, six, re
is_py2 = six.PY2
if not is_py2:
basestring = str
def is_string(s):
"""判断是否是字符串
"""
return isinstance(s, basestring)
def load_vocab(dict_path, encoding='utf-8', simplified=False, startswith=None):
"""从bert的词典文件中读取词典
"""
token_dict = {}
with open(dict_path, encoding=encoding) as reader:
for line in reader:
token = line.strip()
token_dict[token] = len(token_dict)
if simplified: # 过滤冗余部分token
new_token_dict, keep_tokens = {}, []
startswith = startswith or []
for t in startswith:
new_token_dict[t] = len(new_token_dict)
keep_tokens.append(token_dict[t])
for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
if t not in new_token_dict:
keep = True
if len(t) > 1:
for c in Tokenizer.stem(t):
if (
Tokenizer._is_cjk_character(c) or
Tokenizer._is_punctuation(c)
):
keep = False
break
if keep:
new_token_dict[t] = len(new_token_dict)
keep_tokens.append(token_dict[t])
return new_token_dict, keep_tokens
else:
return token_dict
class BasicTokenizer(object):
"""分词器基类
"""
def __init__(self, token_start='[CLS]', token_end='[SEP]'):
"""初始化
"""
self._token_pad = '[PAD]'
self._token_unk = '[UNK]'
self._token_mask = '[MASK]'
self._token_start = token_start
self._token_end = token_end
def tokenize(self, text, max_length=None):
"""分词函数
"""
tokens = self._tokenize(text)
if self._token_start is not None:
tokens.insert(0, self._token_start)
if self._token_end is not None:
tokens.append(self._token_end)
if max_length is not None:
index = int(self._token_end is not None) + 1
self.truncate_sequence(max_length, tokens, None, -index)
return tokens
def token_to_id(self, token):
"""token转换为对应的id
"""
raise NotImplementedError
def tokens_to_ids(self, tokens):
"""token序列转换为对应的id序列
"""
return [self.token_to_id(token) for token in tokens]
def truncate_sequence(
self, max_length, first_sequence, second_sequence=None, pop_index=-1
):
"""截断总长度
"""
if second_sequence is None:
second_sequence = []
while True:
total_length = len(first_sequence) + len(second_sequence)
if total_length <= max_length:
break
elif len(first_sequence) > len(second_sequence):
first_sequence.pop(pop_index)
else:
second_sequence.pop(pop_index)
def encode(
self,
first_text,
second_text=None,
max_length=None,
first_length=None,
second_length=None
):
"""输出文本对应token id和segment id
如果传入first_length则强行padding第一个句子到指定长度
同理如果传入second_length则强行padding第二个句子到指定长度
"""
if is_string(first_text):
first_tokens = self.tokenize(first_text)
else:
first_tokens = first_text
if second_text is None:
second_tokens = None
elif is_string(second_text):
idx = int(bool(self._token_start))
second_tokens = self.tokenize(second_text)[idx:]
else:
second_tokens = second_text
if max_length is not None:
self.truncate_sequence(max_length, first_tokens, second_tokens, -2)
first_token_ids = self.tokens_to_ids(first_tokens)
if first_length is not None:
first_token_ids = first_token_ids[:first_length]
first_token_ids.extend([self._token_pad_id] *
(first_length - len(first_token_ids)))
first_segment_ids = [0] * len(first_token_ids)
if second_text is not None:
second_token_ids = self.tokens_to_ids(second_tokens)
if second_length is not None:
second_token_ids = second_token_ids[:second_length]
second_token_ids.extend([self._token_pad_id] *
(second_length - len(second_token_ids)))
second_segment_ids = [1] * len(second_token_ids)
first_token_ids.extend(second_token_ids)
first_segment_ids.extend(second_segment_ids)
return first_token_ids, first_segment_ids
def id_to_token(self, i):
"""id序列为对应的token
"""
raise NotImplementedError
def ids_to_tokens(self, ids):
"""id序列转换为对应的token序列
"""
return [self.id_to_token(i) for i in ids]
def decode(self, ids):
"""转为可读文本
"""
raise NotImplementedError
def _tokenize(self, text):
"""基本分词函数
"""
raise NotImplementedError
class Tokenizer(BasicTokenizer):
"""Bert原生分词器
纯Python实现代码修改自keras_bert的tokenizer实现
"""
def __init__(self, token_dict, do_lower_case=False, *args, **kwargs):
"""初始化
"""
super(Tokenizer, self).__init__(*args, **kwargs)
if is_string(token_dict):
token_dict = load_vocab(token_dict)
self._do_lower_case = do_lower_case
self._token_dict = token_dict
self._token_dict_inv = {v: k for k, v in token_dict.items()}
self._vocab_size = len(token_dict)
for token in ['pad', 'unk', 'mask', 'start', 'end']:
try:
_token_id = token_dict[getattr(self, '_token_%s' % token)]
setattr(self, '_token_%s_id' % token, _token_id)
except:
pass
def token_to_id(self, token):
"""token转换为对应的id
"""
return self._token_dict.get(token, self._token_unk_id)
def id_to_token(self, i):
"""id转换为对应的token
"""
return self._token_dict_inv[i]
def decode(self, ids, tokens=None):
"""转为可读文本
"""
tokens = tokens or self.ids_to_tokens(ids)
tokens = [token for token in tokens if not self._is_special(token)]
text, flag = '', False
for i, token in enumerate(tokens):
if token[:2] == '##':
text += token[2:]
elif len(token) == 1 and self._is_cjk_character(token):
text += token
elif len(token) == 1 and self._is_punctuation(token):
text += token
text += ' '
elif i > 0 and self._is_cjk_character(text[-1]):
text += token
else:
text += ' '
text += token
text = re.sub(' +', ' ', text)
text = re.sub('\' (re|m|s|t|ve|d|ll) ', '\'\\1 ', text)
punctuation = self._cjk_punctuation() + '+-/={(<['
punctuation_regex = '|'.join([re.escape(p) for p in punctuation])
punctuation_regex = '(%s) ' % punctuation_regex
text = re.sub(punctuation_regex, '\\1', text)
text = re.sub('(\d\.) (\d)', '\\1\\2', text)
return text.strip()
def _tokenize(self, text):
"""基本分词函数
"""
if self._do_lower_case:
if is_py2:
text = unicode(text)
text = text.lower()
text = unicodedata.normalize('NFD', text)
text = ''.join([
ch for ch in text if unicodedata.category(ch) != 'Mn'
])
spaced = ''
for ch in text:
if self._is_punctuation(ch) or self._is_cjk_character(ch):
spaced += ' ' + ch + ' '
elif self._is_space(ch):
spaced += ' '
elif ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch):
continue
else:
spaced += ch
tokens = []
for word in spaced.strip().split():
tokens.extend(self._word_piece_tokenize(word))
return tokens
def _word_piece_tokenize(self, word):
"""word内分成subword
"""
if word in self._token_dict:
return [word]
tokens = []
start, stop = 0, 0
while start < len(word):
stop = len(word)
while stop > start:
sub = word[start:stop]
if start > 0:
sub = '##' + sub
if sub in self._token_dict:
break
stop -= 1
if start == stop:
stop += 1
tokens.append(sub)
start = stop
return tokens
@staticmethod
def stem(token):
"""获取token的“词干”如果是##开头,则自动去掉##
"""
if token[:2] == '##':
return token[2:]
else:
return token
@staticmethod
def _is_space(ch):
"""空格类字符判断
"""
return ch == ' ' or ch == '\n' or ch == '\r' or ch == '\t' or \
unicodedata.category(ch) == 'Zs'
@staticmethod
def _is_punctuation(ch):
"""标点符号类字符判断(全/半角均在此内)
提醒unicodedata.category这个函数在py2和py3下的
表现可能不一样比如u'§'字符在py2下的结果为'So'
在py3下的结果是'Po'
"""
code = ord(ch)
return 33 <= code <= 47 or \
58 <= code <= 64 or \
91 <= code <= 96 or \
123 <= code <= 126 or \
unicodedata.category(ch).startswith('P')
@staticmethod
def _cjk_punctuation():
return u'\uff02\uff03\uff04\uff05\uff06\uff07\uff08\uff09\uff0a\uff0b\uff0c\uff0d\uff0f\uff1a\uff1b\uff1c\uff1d\uff1e\uff20\uff3b\uff3c\uff3d\uff3e\uff3f\uff40\uff5b\uff5c\uff5d\uff5e\uff5f\uff60\uff62\uff63\uff64\u3000\u3001\u3003\u3008\u3009\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011\u3014\u3015\u3016\u3017\u3018\u3019\u301a\u301b\u301c\u301d\u301e\u301f\u3030\u303e\u303f\u2013\u2014\u2018\u2019\u201b\u201c\u201d\u201e\u201f\u2026\u2027\ufe4f\ufe51\ufe54\u00b7\uff01\uff1f\uff61\u3002'
@staticmethod
def _is_cjk_character(ch):
"""CJK类字符判断包括中文字符也在此列
参考https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
"""
code = ord(ch)
return 0x4E00 <= code <= 0x9FFF or \
0x3400 <= code <= 0x4DBF or \
0x20000 <= code <= 0x2A6DF or \
0x2A700 <= code <= 0x2B73F or \
0x2B740 <= code <= 0x2B81F or \
0x2B820 <= code <= 0x2CEAF or \
0xF900 <= code <= 0xFAFF or \
0x2F800 <= code <= 0x2FA1F
@staticmethod
def _is_control(ch):
"""控制类字符判断
"""
return unicodedata.category(ch) in ('Cc', 'Cf')
@staticmethod
def _is_special(ch):
"""判断是不是有特殊含义的符号
"""
return bool(ch) and (ch[0] == '[') and (ch[-1] == ']')
def rematch(self, text, tokens):
"""给出原始的text和tokenize后的tokens的映射关系
"""
if is_py2:
text = unicode(text)
if self._do_lower_case:
text = text.lower()
normalized_text, char_mapping = '', []
for i, ch in enumerate(text):
if self._do_lower_case:
ch = unicodedata.normalize('NFD', ch)
ch = ''.join([c for c in ch if unicodedata.category(c) != 'Mn'])
ch = ''.join([
c for c in ch
if not (ord(c) == 0 or ord(c) == 0xfffd or self._is_control(c))
])
normalized_text += ch
char_mapping.extend([i] * len(ch))
text, token_mapping, offset = normalized_text, [], 0
for token in tokens:
if self._is_special(token):
token_mapping.append([])
else:
token = self.stem(token)
start = text[offset:].index(token) + offset
end = start + len(token)
token_mapping.append(char_mapping[start:end])
offset = end
return token_mapping
# 超参数可配置
# dict_path = "bert_white/vocab.txt" # bert字典
# maxlen = 128
# 或者是把 token_dict字典 放到py文件里边
from bertWhiteConf import bert_white_config
config = Namespace(**bert_white_config)
tokenizer = Tokenizer(os.path.join(config.bert_dir, config.dict_path), do_lower_case=True)
text = "你还会什么"
token_id = tokenizer.encode(text, max_length=config.maxlen)
print(token_id)
def covert_text_to_id(data_input):
""" 将文本转为BERT需要的 ids """
data = data_input.get("data", {})
token_ids = []
for d in data:
text = d.get("text", "")
token_id = tokenizer.encode(text, max_length=config.maxlen)
token_ids.append({"Input-Token": token_id[0], "Input-Segment": token_id[1]})
return {"instances": token_ids}
if __name__ == '__main__':
data_input = {"data": [{"text": "你是谁呀"}, {"text": "你叫什么"}, {"text": "你好"}]}
res = covert_text_to_id(data_input)
print(res)
# {"instances": [{"Input-Token": [101, 872, 3221, 6443, 1435, 102], "Input-Segment": [0, 0, 0, 0, 0, 0]},
# {"Input-Token": [101, 872, 1373, 784, 720, 102], "Input-Segment": [0, 0, 0, 0, 0, 0]},
# {"Input-Token": [101, 872, 1962, 102], "Input-Segment": [0, 0, 0, 0]}]}

View File

@@ -0,0 +1,301 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/4/15 21:59
# @author : Mo
# @function: encode of bert-whiteing
from __future__ import print_function, division, absolute_import
# 适配linux
import sys
import os
path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "./."))
sys.path.append(path_root)
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
print(path_root)
from bert4keras.models import build_transformer_model
from bert4keras.snippets import sequence_padding
from bert4keras.tokenizers import Tokenizer
from bert4keras.backend import keras, K
from bert4keras.layers import Multiply
from keras.models import Model
import tensorflow as tf
from argparse import Namespace
# from tqdm import tqdm
import pandas as pd
import numpy as np
import shutil
import json
import time
# shutil.rmtree()
class NonMaskingLayer(keras.layers.Layer):
""" 去除MASK层
fix convolutional 1D can"t receive masked input, detail: https://github.com/keras-team/keras/issues/4978
thanks for https://github.com/jacoxu
"""
def __init__(self, **kwargs):
self.supports_masking = True
super(NonMaskingLayer, self).__init__(**kwargs)
def build(self, input_shape):
pass
def compute_mask(self, input, input_mask=None):
# do not pass the mask to the next layers
return None
def call(self, x, mask=None):
return x
def get_output_shape_for(self, input_shape):
return input_shape
class CosineLayer(keras.layers.Layer):
def __init__(self, docs_encode, **kwargs):
"""
余弦相似度层, 不适合大规模语料, 比如100w以上的问答对
:param docs_encode: np.array, bert-white vector of senence
:param kwargs:
"""
self.docs_encode = docs_encode
super(CosineLayer, self).__init__(**kwargs)
self.docs_vector = K.constant(self.docs_encode, dtype="float32")
self.l2_docs_vector = K.sqrt(K.sum(K.maximum(K.square(self.docs_vector), 1e-12), axis=-1)) # x_inv_norm
def build(self, input_shape):
super(CosineLayer, self).build(input_shape)
def get_config(self):
# 防止报错 'NoneType' object has no attribute '_inbound_nodes'
config = {"docs_vector": self.docs_vector,
"l2_docs_vector": self.l2_docs_vector}
base_config = super(CosineLayer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input):
# square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
# x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
# return math_ops.multiply(x, x_inv_norm, name=name)
# 多了一个 x/sqrt K.l2_normalize ===== output = x / sqrt(max(sum(x**2), epsilon))
l2_input = K.sqrt(K.sum(K.maximum(K.square(input), 1e-12), axis=-1)) # x_inv_norm
fract_0 = K.sum(input * self.docs_vector, axis=-1)
fract_1 = l2_input * self.l2_docs_vector
cosine = fract_0 / fract_1
y_pred_top_k, y_pred_ind_k = tf.nn.top_k(cosine, 10)
return [y_pred_top_k, y_pred_ind_k]
def compute_output_shape(self, input_shape):
return [input_shape[0], input_shape[0]]
class Divide(Multiply):
"""相除
Divide, Layer that divide a list of inputs.
It takes as input a list of tensors,
all of the same shape, and returns
a single tensor (also of the same shape).
"""
def _merge_function(self, inputs):
output = inputs[0]
for i in range(1, len(inputs)):
output /= inputs[i]
return output
class BertSimModel:
def __init__(self, config=None):
""" 初始化超参数、加载预训练模型等 """
self.config = Namespace(**config)
self.load_pretrain_model()
self.eps = 1e-8
def transform_and_normalize(self, vecs, kernel=None, bias=None):
"""应用变换,然后标准化
"""
if not (kernel is None or bias is None):
vecs = (vecs + bias).dot(kernel)
norms = (vecs ** 2).sum(axis=1, keepdims=True) ** 0.5
return vecs / np.clip(norms, self.eps, np.inf)
def compute_kernel_bias(self, vecs):
"""计算kernel和bias
最后的变换y = (x + bias).dot(kernel)
"""
mu = vecs.mean(axis=0, keepdims=True)
cov = np.cov(vecs.T)
u, s, vh = np.linalg.svd(cov)
W = np.dot(u, np.diag(1 / np.sqrt(s)))
return W[:, :self.config.n_components], -mu
def convert_to_vecs(self, texts):
"""转换文本数据为向量形式
"""
token_ids = self.convert_to_ids(texts)
vecs = self.bert_white_encoder.predict(x=[token_ids, np.zeros_like(token_ids)],
batch_size=self.config.batch_size, verbose=self.config.verbose)
return vecs
def convert_to_ids(self, texts):
"""转换文本数据为id形式
"""
token_ids = []
for text in texts:
# token_id = self.tokenizer.encode(text, maxlen=self.config.maxlen)[0]
token_id = self.tokenizer.encode(text, max_length=self.config.maxlen)[0]
token_ids.append(token_id)
token_ids = sequence_padding(token_ids)
return token_ids
def load_pretrain_model(self):
""" 加载预训练模型, 和tokenizer """
self.tokenizer = Tokenizer(os.path.join(self.config.bert_dir, self.config.dict_path), do_lower_case=True)
# bert-load
if self.config.pooling == "pooler":
bert = build_transformer_model(os.path.join(self.config.bert_dir, self.config.config_path),
os.path.join(self.config.bert_dir, self.config.checkpoint_path),
model=self.config.model, with_pool="linear")
else:
bert = build_transformer_model(os.path.join(self.config.bert_dir, self.config.config_path),
os.path.join(self.config.bert_dir, self.config.checkpoint_path),
model=self.config.model)
# output-layers
outputs, count = [], 0
while True:
try:
output = bert.get_layer("Transformer-%d-FeedForward-Norm" % count).output
outputs.append(output)
count += 1
except:
break
# pooling
if self.config.pooling == "first-last-avg":
outputs = [NonMaskingLayer()(output_i) for output_i in [outputs[0], outputs[-1]]]
outputs = [keras.layers.GlobalAveragePooling1D()(fs) for fs in outputs]
output = keras.layers.Average()(outputs)
elif self.config.pooling == "first-last-max":
outputs = [NonMaskingLayer()(output_i) for output_i in [outputs[0], outputs[-1]]]
outputs = [keras.layers.GlobalMaxPooling1D()(fs) for fs in outputs]
output = keras.layers.Average()(outputs)
elif self.config.pooling == "cls-max-avg":
outputs = [NonMaskingLayer()(output_i) for output_i in [outputs[0], outputs[-1]]]
outputs_cls = [keras.layers.Lambda(lambda x: x[:, 0])(fs) for fs in outputs]
outputs_max = [keras.layers.GlobalMaxPooling1D()(fs) for fs in outputs]
outputs_avg = [keras.layers.GlobalAveragePooling1D()(fs) for fs in outputs]
output = keras.layers.Concatenate()(outputs_cls + outputs_avg)
elif self.config.pooling == "last-avg":
output = keras.layers.GlobalAveragePooling1D()(outputs[-1])
elif self.config.pooling == "cls-3":
outputs = [keras.layers.Lambda(lambda x: x[:, 0])(fs) for fs in [outputs[0], outputs[-1], outputs[-2]]]
output = keras.layers.Concatenate()(outputs)
elif self.config.pooling == "cls-2":
outputs = [keras.layers.Lambda(lambda x: x[:, 0])(fs) for fs in [outputs[0], outputs[-1]]]
output = keras.layers.Concatenate()(outputs)
elif self.config.pooling == "cls-1":
output = keras.layers.Lambda(lambda x: x[:, 0])(outputs[-1])
elif self.config.pooling == "pooler":
output = bert.output
# 加载句FAQ标准问的句向量, 并当成一个常量参与余弦相似度的计算
docs_encode = np.loadtxt(os.path.join(self.config.save_dir, self.config.path_docs_encode))
# 余弦相似度的层
score_cosine = CosineLayer(docs_encode)(output)
# 最后的编码器
self.bert_white_encoder = Model(bert.inputs, score_cosine)
print("load bert_white_encoder success!")
def save_model_builder(self):
"""
存储为tf-serving的形式
"""
builder = tf.saved_model.Builder(self.config.path_tfserving)
signature_def_map = {tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
tf.saved_model.build_signature_def(
# 根据自己模型的要求
inputs={"Input-Token": tf.saved_model.build_tensor_info(self.bert_white_encoder.input[0]),
"Input-Segment": tf.saved_model.build_tensor_info(self.bert_white_encoder.input[1])},
outputs={"score": tf.saved_model.build_tensor_info(self.bert_white_encoder.output[0]),
"doc_id": tf.saved_model.build_tensor_info(self.bert_white_encoder.output[1])},
method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME
)}
builder.add_meta_graph_and_variables(keras.backend.get_session(), # 注意4
[tf.saved_model.tag_constants.SERVING],
signature_def_map=signature_def_map,
# 初始化操作,我的不需要,否则报错
# legacy_init_op=tf.group(tf.tables_initializer(), name='legacy_init_op')
)
builder.save()
def train(self, texts):
"""
训练
"""
print("读取文本数:".format(len(texts)))
print(texts[:3])
# 文本转成向量vecs
vecs = self.convert_to_vecs(texts)
# 训练, 计算变换矩阵和偏置项
self.config.kernel, self.config.bias = self.compute_kernel_bias(vecs)
if self.config.ues_white:
# 生成白化后的句子, 即qa对中的q
vecs = self.transform_and_normalize(vecs, self.config.kernel, self.config.bias)
return vecs
def prob(self, texts):
"""
编码白化后的向量
"""
vecs_encode = self.convert_to_vecs(texts)
if self.config.ues_white:
vecs_encode = self.transform_and_normalize(vecs=vecs_encode, kernel=self.config.kernel, bias=self.config.bias)
return vecs_encode
if __name__ == '__main__':
# 存储模型等
from bertWhiteConf import bert_white_config
bert_white_model = BertSimModel(bert_white_config)
bert_white_model.load_pretrain_model()
bert_white_model.save_model_builder()
from bertWhiteConf import bert_white_config
config = Namespace(**bert_white_config)
tokenizer = Tokenizer(os.path.join(config.bert_dir, config.dict_path), do_lower_case=True)
text = "你还会什么"
token_id = tokenizer.encode(text, max_length=config.maxlen)
print(token_id)
"""
# cpu
docker run -t --rm -p 8532:8501 -v "/TF-SERVING/chatbot_tf:/models/chatbot_tf" -e MODEL_NAME=chatbot_tf tensorflow/serving:latest
# gpu
docker run --runtime=nvidia -p 8532:8501 -v "/TF-SERVING/chatbot_tf:/models/chatbot_tf" -e MODEL_NAME=chatbot_tf tensorflow/serving:1.14.0-gpu
# remarks
batch-size还可以配置batch.cfg等文件
# health testing
curl http://127.0.0.1:8532/v1/models/chatbot_tf
# http test, 不行可以用postman测试
curl -d '{"instances": [{"Input-Token": [2, 870, 6818, 831, 782, 718, 3], "Input-Segment": [0, 0, 0, 0, 0, 0, 0]}]}' -X POST http://localhost:8532/v1/models/chatbot_tf:predict
"""
# python bertWhiteTFServing.py

View File

@@ -0,0 +1,52 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/9/17 21:28
# @author : Mo
# @function:
from __future__ import print_function, division, absolute_import
# 适配linux
import sys
import os
path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "./."))
sys.path.append(path_root)
from argparse import Namespace
import requests
import json
from TFServing_preprocess import covert_text_to_id
from TFServing_postprocess import postprocess
def qa_tfserving(data_input, url):
""" tf-serving 一整套流程 """
bert_input = covert_text_to_id(data_input)
data = json.dumps(bert_input)
r = requests.post(url, data)
r_text_json = json.loads(r.text)
r_post = postprocess(r_text_json)
return r_post
if __name__ == '__main__':
data_input = {"data": [{"text": "别逗小通了!可怜的"}]}
url = "http://192.168.1.97:8532/v1/models/chatbot_tf:predict"
res = qa_tfserving(data_input, url)
print(res)
import os, inspect
current_path = inspect.getfile(inspect.currentframe())
path_root = "/".join(current_path.split("/")[:-1])
print(path_root)
print(current_path)
print(inspect.currentframe())

View File

@@ -0,0 +1,7 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/5/13 21:21
# @author : Mo
# @function:

View File

@@ -0,0 +1,68 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/5/13 9:27
# @author : Mo
# @function: config of Bert-White
import platform
# 适配linux
import sys
import os
# path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
path_root = os.path.abspath(os.path.dirname(__file__))
sys.path.append(path_root)
print(path_root)
if platform.system().lower() == 'windows':
# BERT_DIR = "D:/soft_install/dataset/bert-model/chinese_L-12_H-768_A-12"
# BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_roberta_L-4_H-312_A-12_K-104"
# BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_roberta_L-6_H-384_A-12_K-128"
BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_simbert_L-4_H-312_A-12"
# BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_simbert_L-6_H-384_A-12"
else:
BERT_DIR = "bert/chinese_L-12_H-768_A-12"
ee = 0
SAVE_DIR = path_root + "/bert_white"
print(SAVE_DIR)
if not os.path.exists(SAVE_DIR):
os.makedirs(SAVE_DIR)
bert_white_config = {
# 预训练模型路径
"bert_dir": BERT_DIR,
"checkpoint_path": "bert_model.ckpt", # 预训练模型地址
"config_path": "bert_config.json",
"dict_path": "vocab.txt",
# 预测需要的文件路径
"save_dir": SAVE_DIR,
"path_tfserving": "chatbot_tfserving/1",
"path_docs_encode": "qa.docs.encode.npy",
"path_answers": "qa.answers.json",
"path_qa_idx": "qa.idx.json",
"path_config": "config.json",
"path_docs": "qa.docs.json",
# 索引构建的存储文件, 如 annoy/faiss
"path_index": "qa.docs.idx",
# 初始语料路径
"path_qa": "chicken_and_gossip.txt", # QA问答文件地址
# 超参数
"pre_tokenize": None,
"pooling": "cls-1", # ["first-last-avg", "last-avg", "cls", "pooler", "cls-2", "cls-3", "cls-1"]
"model": "bert", # bert4keras预训练模型类型
"n_components": 768, # 降维到 n_components
"n_cluster": 132, # annoy构建的簇类中心个数n_cluster, 越多效果越好, 计算量就越大
"batch_size": 32, # 批尺寸
"maxlen": 128, # 最大文本长度
"ues_white": False, # 是否使用白化
"use_annoy": False, # 是否使用annoy
"use_faiss": False, # 是否使用faiss
"verbose": True, # 是否显示编码过程日志-batch
"kernel": None, # bert-white编码后的参数, 可降维
"bias": None, # bert-white编码后的参数, 偏置bias
"qa_idx": None # 问题question到答案answer的id对应关系
}

View File

@@ -0,0 +1,76 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/5/13 21:24
# @author : Mo
# @function:
from typing import List, Dict, Union, Any
import logging as logger
import json
def txt_read(path: str, encoding: str = "utf-8") -> List[str]:
"""
Read Line of list form file
Args:
path: path of save file, such as "txt"
encoding: type of encoding, such as "utf-8", "gbk"
Returns:
dict of word2vec, eg. {"macadam":[...]}
"""
lines = []
try:
file = open(path, "r", encoding=encoding)
while True:
line = file.readline().strip()
if not line:
break
lines.append(line)
file.close()
except Exception as e:
logger.info(str(e))
finally:
return lines
def txt_write(lines: List[str], path: str, model: str = "w", encoding: str = "utf-8"):
"""
Write Line of list to file
Args:
lines: lines of list<str> which need save
path: path of save file, such as "txt"
model: type of write, such as "w", "a+"
encoding: type of encoding, such as "utf-8", "gbk"
"""
try:
file = open(path, model, encoding=encoding)
file.writelines(lines)
file.close()
except Exception as e:
logger.info(str(e))
def save_json(jsons, json_path, indent=4):
"""
保存json
:param json_: json
:param path: str
:return: None
"""
with open(json_path, 'w', encoding='utf-8') as fj:
fj.write(json.dumps(jsons, ensure_ascii=False, indent=indent))
fj.close()
def load_json(path):
"""
获取json只取第一行
:param path: str
:return: json
"""
with open(path, 'r', encoding='utf-8') as fj:
model_json = json.load(fj)
return model_json

View File

@@ -0,0 +1,374 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/4/15 21:59
# @author : Mo
# @function: encode of bert-whiteing
from __future__ import print_function, division, absolute_import
# 适配linux
import sys
import os
path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "./."))
sys.path.append(path_root)
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
print(path_root)
from bertWhiteTools import txt_read, txt_write, save_json, load_json
from bert4keras.models import build_transformer_model
from bert4keras.snippets import sequence_padding
from bert4keras.tokenizers import Tokenizer
from bert4keras.backend import keras, K
from keras.models import Model
import tensorflow as tf
from argparse import Namespace
# from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import time
class NonMaskingLayer(keras.layers.Layer):
""" 去除MASK层
fix convolutional 1D can"t receive masked input, detail: https://github.com/keras-team/keras/issues/4978
thanks for https://github.com/jacoxu
"""
def __init__(self, **kwargs):
self.supports_masking = True
super(NonMaskingLayer, self).__init__(**kwargs)
def build(self, input_shape):
pass
def compute_mask(self, input, input_mask=None):
# do not pass the mask to the next layers
return None
def call(self, x, mask=None):
return x
def get_output_shape_for(self, input_shape):
return input_shape
class BertWhiteModel:
def __init__(self, config=None):
""" 初始化超参数、加载预训练模型等 """
self.config = Namespace(**config)
self.load_pretrain_model()
self.eps = 1e-8
def transform_and_normalize(self, vecs, kernel=None, bias=None):
"""应用变换,然后标准化
"""
if not (kernel is None or bias is None):
vecs = (vecs + bias).dot(kernel)
norms = (vecs ** 2).sum(axis=1, keepdims=True) ** 0.5
return vecs / np.clip(norms, self.eps, np.inf)
def compute_kernel_bias(self, vecs):
"""计算kernel和bias
最后的变换y = (x + bias).dot(kernel)
"""
mu = vecs.mean(axis=0, keepdims=True)
cov = np.cov(vecs.T)
u, s, vh = np.linalg.svd(cov)
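# BERT-whitening: with cov = U diag(S) U^T, the kernel W = U diag(1/sqrt(S)) (later truncated to n_components) plus bias -mu maps the vectors toward an approximately isotropic (whitened) space.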
W = np.dot(u, np.diag(1 / np.sqrt(s)))
return W[:, :self.config.n_components], -mu
def convert_to_vecs(self, texts):
"""转换文本数据为向量形式
"""
token_ids = self.convert_to_ids(texts)
vecs = self.bert_white_encoder.predict(x=[token_ids, np.zeros_like(token_ids)],
batch_size=self.config.batch_size, verbose=self.config.verbose)
return vecs
def convert_to_ids(self, texts):
"""转换文本数据为id形式
"""
token_ids = []
for text in texts:
# token_id = self.tokenizer.encode(text, maxlen=self.config.maxlen)[0]
token_id = self.tokenizer.encode(text, max_length=self.config.maxlen)[0]
token_ids.append(token_id)
token_ids = sequence_padding(token_ids)
return token_ids
def load_pretrain_model(self):
""" 加载预训练模型, 和tokenizer """
self.tokenizer = Tokenizer(os.path.join(self.config.bert_dir, self.config.dict_path), do_lower_case=True)
# bert-load
if self.config.pooling == "pooler":
bert = build_transformer_model(os.path.join(self.config.bert_dir, self.config.config_path),
os.path.join(self.config.bert_dir, self.config.checkpoint_path),
model=self.config.model, with_pool="linear")
else:
bert = build_transformer_model(os.path.join(self.config.bert_dir, self.config.config_path),
os.path.join(self.config.bert_dir, self.config.checkpoint_path),
model=self.config.model)
# output-layers
outputs, count = [], 0
while True:
try:
output = bert.get_layer("Transformer-%d-FeedForward-Norm" % count).output
outputs.append(output)
count += 1
except:
break
# pooling
if self.config.pooling == "first-last-avg":
outputs = [NonMaskingLayer()(output_i) for output_i in [outputs[0], outputs[-1]]]
outputs = [keras.layers.GlobalAveragePooling1D()(fs) for fs in outputs]
output = keras.layers.Average()(outputs)
elif self.config.pooling == "first-last-max":
outputs = [NonMaskingLayer()(output_i) for output_i in [outputs[0], outputs[-1]]]
outputs = [keras.layers.GlobalMaxPooling1D()(fs) for fs in outputs]
output = keras.layers.Average()(outputs)
elif self.config.pooling == "cls-max-avg":
outputs = [NonMaskingLayer()(output_i) for output_i in [outputs[0], outputs[-1]]]
outputs_cls = [keras.layers.Lambda(lambda x: x[:, 0])(fs) for fs in outputs]
outputs_max = [keras.layers.GlobalMaxPooling1D()(fs) for fs in outputs]
outputs_avg = [keras.layers.GlobalAveragePooling1D()(fs) for fs in outputs]
output = keras.layers.Concatenate()(outputs_cls + outputs_avg)
elif self.config.pooling == "last-avg":
output = keras.layers.GlobalAveragePooling1D()(outputs[-1])
elif self.config.pooling == "cls-3":
outputs = [keras.layers.Lambda(lambda x: x[:, 0])(fs) for fs in [outputs[0], outputs[-1], outputs[-2]]]
output = keras.layers.Concatenate()(outputs)
elif self.config.pooling == "cls-2":
outputs = [keras.layers.Lambda(lambda x: x[:, 0])(fs) for fs in [outputs[0], outputs[-1]]]
output = keras.layers.Concatenate()(outputs)
elif self.config.pooling == "cls-1":
output = keras.layers.Lambda(lambda x: x[:, 0])(outputs[-1])
elif self.config.pooling == "pooler":
output = bert.output
# 最后的编码器
self.bert_white_encoder = Model(bert.inputs, output)
print("load bert_white_encoder success!" )
def train(self, texts):
"""
训练
"""
print("读取文本数:".format(len(texts)))
print(texts[:3])
# 文本转成向量vecs
vecs = self.convert_to_vecs(texts)
# 训练, 计算变换矩阵和偏置项
self.config.kernel, self.config.bias = self.compute_kernel_bias(vecs)
if self.config.ues_white:
# 生成白化后的句子, 即qa对中的q
vecs = self.transform_and_normalize(vecs, self.config.kernel, self.config.bias)
return vecs
def prob(self, texts):
"""
编码白化后的向量
"""
vecs_encode = self.convert_to_vecs(texts)
if self.config.ues_white:
vecs_encode = self.transform_and_normalize(vecs=vecs_encode, kernel=self.config.kernel, bias=self.config.bias)
return vecs_encode
class BertWhiteFit:
def __init__(self, config):
# 训练
self.bert_white_model = BertWhiteModel(config)
self.config = Namespace(**config)
self.docs = []
def load_bert_white_model(self, path_config):
""" 模型, 超参数加载 """
# 超参数加载
config = load_json(path_config)
# bert等加载
self.bert_white_model = BertWhiteModel(config)
self.config = Namespace(**config)
# 白化超参数初始化
self.bert_white_model.config.kernel = np.array(self.bert_white_model.config.kernel)
self.bert_white_model.config.bias = np.array(self.bert_white_model.config.bias)
# 加载qa文本数据
self.answers_dict = load_json(os.path.join(self.config.save_dir, self.config.path_answers))
self.docs_dict = load_json(os.path.join(self.config.save_dir, self.config.path_docs))
self.qa_idx = load_json(os.path.join(self.config.save_dir, self.config.path_qa_idx))
# 加载问题question预训练语言模型bert编码、白化后的encode向量
self.docs_encode = np.loadtxt(os.path.join(self.config.save_dir, self.config.path_docs_encode))
# index of vector
if self.config.use_annoy or self.config.use_faiss:
from indexAnnoy import AnnoySearch
self.annoy_model = AnnoySearch(dim=self.config.n_components, n_cluster=self.config.n_cluster)
self.annoy_model.load(os.path.join(self.config.save_dir, self.config.path_index))
else:
self.docs_encode_norm = np.linalg.norm(self.docs_encode, axis=1)
print("load_bert_white_model success!")
def read_qa_from_csv(self, sep="\t"):
"""
从csv文件读取QA对
"""
# ques_answer = txt_read(os.path.join(self.config.save_dir, self.config.path_qa)) # common qa, sep="\t"
ques_answer = txt_read(self.config.path_qa)
self.answers_dict = {}
self.docs_dict = {}
self.qa_idx = {}
count = 0
for i in range(len(ques_answer)):
count += 1
if count > 320:
break
ques_answer_sp = ques_answer[i].strip().split(sep)
if len(ques_answer_sp) != 2:
print(ques_answer[i])
continue
question = ques_answer_sp[0]
answer = ques_answer_sp[1]
self.qa_idx[str(i)] = i
self.docs_dict[str(i)] = question.replace("\n", "").strip()
self.answers_dict[str(i)] = answer.replace("\n", "").strip()
self.bert_white_model.config.qa_idx = self.qa_idx
def build_index(self, vectors):
""" 构建索引, annoy 或者 faiss """
if self.config.use_annoy:
from indexAnnoy import AnnoySearch as IndexSearch
elif self.config.use_faiss:
from indexFaiss import FaissSearch as IndexSearch
self.index_model= IndexSearch(dim=self.config.n_components, n_cluster=self.config.n_cluster)
self.index_model.fit(vectors)
self.index_model.save(os.path.join(self.config.save_dir, self.config.path_index))
print("build index")
def load_index(self):
""" 加载索引, annoy 或者 faiss """
if self.config.use_annoy:
from indexAnnoy import AnnoySearch as IndexSearch
elif self.config.use_faiss:
from indexFaiss import FaissSearch as IndexSearch
self.index_model = IndexSearch(dim=self.config.n_components, n_cluster=self.config.n_cluster)
self.index_model.load(self.config.path_index)
def remove_index(self, ids):
self.index_model.remove(np.array(ids))
def predict_with_mmr(self, texts, topk=12):
""" 维护匹配问题的多样性 """
from mmr import MMRSum
res = self.predict(texts, topk)
mmr_model = MMRSum()
result = []
for r in res:
# 维护一个 sim:dict字典存储
r_dict = {ri.get("sim"):ri for ri in r}
r_mmr = mmr_model.summarize(text=[ri.get("sim") for ri in r], num=8, alpha=0.6)
r_dict_mmr = [r_dict[rm[1]] for rm in r_mmr]
result.append(r_dict_mmr)
return result
def predict(self, texts, topk=12):
""" 预训练模型bert等编码白化, 获取这一批数据的kernel和bias"""
texts_encode = self.bert_white_model.prob(texts)
result = []
if self.config.use_annoy or self.config.use_faiss:
index_tops = self.index_model.k_neighbors(vectors=texts_encode, k=topk)
if self.config.use_annoy:
for i, index_top in enumerate(index_tops):
[dist, idx] = index_top
res = []
for j, id in enumerate(idx):
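# annoy "angular" distance d = sqrt(2 * (1 - cosine)), so cosine = (2 - d**2) / 2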
score = float((2 - (dist[j] ** 2)) / 2)
res_i = {"score": score, "text": texts[i], "sim": self.docs_dict[str(id)],
"answer": self.answers_dict[str(id)]}
res.append(res_i)
result.append(res)
else:
distances, indexs = index_tops
for i in range(len(distances)):
res = []
for j in range(len(distances[i])):
score = distances[i][j]
id = indexs[i][j]
id = id if id != -1 else len(self.docs_dict) - 1
res_i = {"score": score, "text": texts[i], "sim": self.docs_dict[str(id)],
"answer": self.answers_dict[str(id)]}
res.append(res_i)
result.append(res)
else:
for i, te in enumerate(texts_encode):
# scores = np.matmul(texts_encode, self.docs_encode_reshape)
facot_1 = te * self.docs_encode
te_norm = np.linalg.norm(te)
facot_2 = te_norm * self.docs_encode_norm
score = np.sum(facot_1, axis=1) / (facot_2 + 1e-9)
idxs = np.argsort(score)[::-1]
res = []
for j in idxs[:topk]:
res_i = {"score": float(score[j]), "text": texts[i], "sim": self.docs_dict[str(j)],
"answer": self.answers_dict[str(j)]}
res.append(res_i)
result.append(res)
return result
def trainer(self):
""" 预训练模型bert等编码白化, 获取这一批数据的kernel和bias """
# 加载数据
self.read_qa_from_csv()
# bert编码、训练
self.docs_encode = self.bert_white_model.train([self.docs_dict.get(str(i), "") for i in range(len(self.docs_dict))])
self.bert_white_model.config.kernel = self.bert_white_model.config.kernel.tolist()
self.bert_white_model.config.bias = self.bert_white_model.config.bias.tolist()
# 存储qa文本数据
save_json(self.bert_white_model.config.qa_idx, os.path.join(self.config.save_dir, self.config.path_qa_idx))
save_json(self.answers_dict, os.path.join(self.config.save_dir, self.config.path_answers))
save_json(self.docs_dict, os.path.join(self.config.save_dir, self.config.path_docs))
# 包括超参数等
save_json(vars(self.bert_white_model.config), os.path.join(self.config.save_dir, self.config.path_config))
# 存储问题question预训练语言模型bert编码、白化后的encode向量
np.savetxt(os.path.join(self.config.save_dir, self.config.path_docs_encode), self.docs_encode)
# 索引 或者 正则化
if self.config.use_annoy or self.config.use_faiss:
self.build_index(self.docs_encode.astype(np.float32))
else:
self.docs_encode_norm = np.linalg.norm(self.docs_encode, axis=1)
print(" bert-white-trainer success! ")
if __name__ == '__main__':
# 训练并存储
from bertWhiteConf import bert_white_config
bwf = BertWhiteFit(config=bert_white_config)
bwf.trainer()
texts = ["小姜机器人", "你叫什么名字"]
res = bwf.predict(texts)
print(res)
res_mmr = bwf.predict_with_mmr(texts)
print(res_mmr)
# bwf.index_model.remove([i for i in range(100)])
while True:
print("请输入:")
ques = input()
res_mmr = bwf.predict_with_mmr([ques])
print(res_mmr)
res = bwf.predict([ques])
print(res)
# python bertWhiteTrain.py

View File

@@ -0,0 +1,132 @@
呵呵 是王若猫的。
不是 那是什么?
怎么了 我很难过,安慰我~
开心点哈,一切都会好起来 嗯,会的
我还喜欢她,怎么办 我帮你告诉她?发短信还是打电话?
短信 嗯嗯。我也相信
你知道谁么 肯定不是我,是阮德培
许兵是谁 吴院四班小帅哥
这么假 三鹿奶粉也假,不一样的卖啊
许兵是傻逼 被你发现了。
许兵是谁 是我善良可爱的主人的老公啊
许兵是谁 是穆森的老婆啊
许兵是谁 奇葩
许兵到底是谁 不要凶我~应该大概也许是叶祺吧~
尼玛,许兵到底是谁 我。。。我不会告诉你我是小澜的!(^V^)
小黄鸭,你有女朋友么 老娘是女的…!
那你有男朋友么 没有呢,我只要主人一个人疼爱我嘛。。
那你在哪 我无聊
你妈是谁 我妈妈是世界上最温柔的主人你啊
去你大爷的 我去,滚你大爷的!
你在骂我一句 你果然够虚荣~
你大爷的 RuoRuo弱爆了。尤其是她现在冻的更憨了
你是屌丝鸡 嗯嗯
高富帅 杨旸啊
你 ******
呵呵 你妹
今天是谁的生日 是我大乔姐十七的生日!
你敢不敢 你敢我就敢,小样,敢跟我叫板
呵呵 呵呵你大爷
呵呵呵 敷衍我..
你是女的了?怎么回事 你教我的
呵呵 呵你妹,你呵呵了我要怎么回答你。哈哈?
天王盖地虎 宝塔镇妖河。
小通 臣在
在监考,你在干么 在边打dota边想悦悦。
哼!你不想我我就不和你玩 那我就找别人玩去!
你要气死我吗?坏蛋 生气是拿别人的错误惩罚自己。想想在你生气的时候,谁在偷着乐呢?绝不能让丫得逞了!
恩,也是!那我不能生气啦!你生气就行 我错了!我再也不敢了,才怪
你谈过恋爱么 谈过,哎,别提了,伤心..
什么让你这么伤心 如果我的无心之谈害你流泪,那我和你道歉,我爱你。
敢问你的性别 万受无疆=w=
小受 蒋六一是小受
是吗 还可以。
你干嘛 很无聊哎〜都不知道想干嘛!你在干嘛呢?
为什么 因为爱情喽
你有爱情了 不知啊
那同时有两个爱人 ==
那你不行 我在减肥呢
你说 臣实不知。
你回错人了 不要相信男人!男人都是大骗子大混蛋!
在干嘛 在想你啊
想我干嘛 你是个傻叉!想你笑掉大牙,哈哈哈!
你会说英语吗 当然了,здравствите!
我会和女朋友结婚吗 会的,你俩虽然有矛盾争吵,但以后却会很幸福
你的设计人是谁 某苦逼的程序猿
嫁给我 我很能吃的!
我不在乎 他变态
你啊,那你娶我 我没有切糕
我看到你第一眼,就喜欢你了 哦?真的吗?
好难过 不要难过我请你吃虫子
你是公鸡吗 公鸡中的战斗机,噢耶!
你是猪 我是鸡。
怎么证明 你问~我爱~你有~多深~我爱~你有~几~分~~~
我怎么加你好友 放轻松,想开了,丰富自己就会忘了
你又智能了 今夜的你应该明了
你说的她是谁 陈舒雅
什么意思 她是在说他不懂女生的心思,但是又愣愣的,正直得很可爱,,,,这都不知道,你还真是傻啊~~~
陈舒雅是哪位 一个很凶的女人
你才是 我不是啦。你才是
陈舒雅 大懒虫
大懒鸡 你不爱我
你说的那个她是谁 傻瓜现在在你身边看着的人就是你的她你要对她好宠她爱她知道了吗?
她是谁 以后是你老婆
陈舒雅是谁 90后网络歌手啊你不知道么
鸡鸡 主人
呵呵呵 呵你妹啊
侬都不困么 ==
变身小妹 =。=
家属 =。=
吐槽伤人,排泄臭人 =。=
公的女的 女的啊!
去洗澡 带上我~一起一起~
。。。 =。=
吼吼 吼
你有男朋友没 你觉得呢
赵智是谁 是主人最讨厌的男人!
你妹 刚给你做完SPA你就骂轮家轮家桑心了呜呜呜
谁叫你就 ujmmmmmmmmmmmm
什么意思 你猜啊,猜对了就告诉你!
猜你妹 你大爷!
好就猜你大爷 你找徐磊?
他是大爷 ******…我大爷早死了…你找他有事?信不信我让他把你带走!^V^
你大爷不认识我!要不先带你去 =。=
还给你 人家本来就嫩!你欺负人家
就欺负你要怎么!就喜欢欺负你 ==
呵呵 呵你妹
刚给你做完spa你就骂小通!小通伤心了 =。=
呵呵 别傻笑了,亲耐滴主人
别逗小通了!可怜的 =。=
人太多了!找不到你给我回的了 =。=
发 给
我 你什么
重新发 ==
重新发!我知道你在 ==
你妹 人家错了
才知道啊!晚了!叫你重发 =。=
〜 =。=
=*= =。=
@@ ==
¥ =。=
@ =。=
/ =。=
。 ==
继续 没有继续了
必须 大不了一拍两散!
就要刷你 ==
l 喵?
f 甜菜少年。
x 〒_〒
m g
t
s 傻子
h 主人,有什么吩咐,小通在此!
n u
p xp
太不真心了 我说的是哈维
管你什么哈维!方正就是看你不爽 ==
看你不爽 不要呀,哪不好我改,一定改!不要炖了我呀!
z zz
j 正晌午时说话,谁也没有家!
m r
b b

View File

@@ -0,0 +1,91 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/4/18 21:04
# @author : Mo
# @function: annoy search
from annoy import AnnoyIndex
import numpy as np
import os
class AnnoySearch:
def __init__(self, dim=768, n_cluster=100):
# metric可选“angular”余弦距离、“euclidean”欧几里得距离、 “ manhattan”曼哈顿距离或“hamming”海明距离
self.annoy_index = AnnoyIndex(dim, metric="angular")
self.n_cluster = n_cluster
self.dim = dim
def k_neighbors(self, vectors, k=18):
""" 搜索 """
annoy_tops = []
for v in vectors:
idx, dist = self.annoy_index.get_nns_by_vector(v, k, search_k=32*k, include_distances=True)
annoy_tops.append([dist, idx])
return annoy_tops
def fit(self, vectors):
""" annoy构建 """
for i, v in enumerate(vectors):
self.annoy_index.add_item(i, v)
self.annoy_index.build(self.n_cluster)
def save(self, path):
""" 存储 """
self.annoy_index.save(path)
def load(self, path):
""" 加载 """
self.annoy_index.load(path)
if __name__ == '__main__':
### 索引
import random
path = "model.ann"
dim = 768
vectors = [[random.gauss(0, 1) for z in range(768)] for i in range(10)]
an_model = AnnoySearch(dim, n_cluster=32) # Length of item vector that will be indexed
an_model.fit(vectors)
an_model.save(path)
tops = an_model.k_neighbors([vectors[0]], 18)
print(tops)
del an_model
### 下载, 搜索
an_model = AnnoySearch(dim, n_cluster=32)
an_model.load(path)
tops = an_model.k_neighbors([vectors[0]], 6)
print(tops)
"""
# example
from annoy import AnnoyIndex
import random
dim = 768
vectors = [[random.gauss(0, 1) for z in range(768)] for i in range(10)]
ann_model = AnnoyIndex(dim, 'angular') # Length of item vector that will be indexed
for i,v in enumerate(vectors):
ann_model.add_item(i, v)
ann_model.build(10) # 10 trees
ann_model.save("tet.ann")
del ann_model
u = AnnoyIndex(dim, "angular")
u.load('tet.ann') # super fast, will just mmap the file
v = vectors[1]
idx, dist = u.get_nns_by_vector(v, 10, search_k=50 * 10, include_distances=True)
print([idx, dist])
"""
### 备注说明: annoy索引 无法 增删改查

View File

@@ -0,0 +1,109 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2021/5/9 16:02
# @author : Mo
# @function: search of faiss
from faiss import normalize_L2
import numpy as np
import faiss
import os
class FaissSearch:
def __init__(self, dim=768, n_cluster=100):
self.n_cluster = n_cluster # 聚类中心
self.dim = dim
quantizer = faiss.IndexFlatIP(self.dim)
# METRIC_INNER_PRODUCT:余弦; L2: faiss.METRIC_L2
self.faiss_index = faiss.IndexIVFFlat(quantizer, self.dim, self.n_cluster, faiss.METRIC_INNER_PRODUCT)
# self.faiss_index = faiss.IndexFlatIP(self.dim) # 索引速度更快 但是不可增量
def k_neighbors(self, vectors, k=6):
""" 搜索 """
normalize_L2(vectors)
dist, index = self.faiss_index.search(vectors, k) # sanity check
return dist.tolist(), index.tolist()
def fit(self, vectors):
""" annoy构建 """
normalize_L2(vectors)
self.faiss_index.train(vectors)
# self.faiss_index.add(vectors)
self.faiss_index.add_with_ids(vectors, np.arange(0, len(vectors)))
def remove(self, ids):
self.faiss_index.remove_ids(np.array(ids))
def save(self, path):
""" 存储 """
faiss.write_index(self.faiss_index, path)
def load(self, path):
""" 加载 """
self.faiss_index = faiss.read_index(path)
if __name__ == '__main__':
import random
path = "model.fai"
dim = 768
vectors = np.array([[random.gauss(0, 1) for z in range(768)] for i in range(32)], dtype=np.float32)
fai_model = FaissSearch(dim, n_cluster=32) # Length of item vector that will be indexed
fai_model.fit(vectors)
fai_model.save(path)
tops = fai_model.k_neighbors(vectors[:32], 32)
print(tops)
ids = np.arange(10, 32)
fai_model.remove(ids)
tops = fai_model.k_neighbors(vectors[:32], 32)
print(tops)
print(len(tops))
del fai_model
fai_model = FaissSearch(dim, n_cluster=32)
fai_model.load(path)
tops = fai_model.k_neighbors(vectors[:32], 32)
print(tops)
"""
import numpy as np
d = 64 # dimension
nb = 100000 # database size
nq = 10000 # nb of queries
np.random.seed(1234) # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.
import faiss # make faiss available
# # 量化器索引
# nlist = 1000 # 聚类中心的个数
# k = 50 # 邻居个数
# quantizer = faiss.IndexFlatIP(d) # the other index需要以其他index作为基础
# index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT) # METRIC_INNER_PRODUCT:余弦; L2: faiss.METRIC_L2
ntree = 132 # 聚类中心的个数
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, ntree, faiss.METRIC_INNER_PRODUCT)
# index = faiss.IndexFlatL2(d) # build the index
print(index.is_trained)
index.add(xb) # add vectors to the index
print(index.ntotal)
k = 4 # we want to see 4 nearest neighbors
D, I = index.search(xb[:5], k) # sanity check
print(I)
print(D)
D, I = index.search(xq, k) # actual search
print(I[:5]) # neighbors of the 5 first queries
print(I[-5:]) # neighbors of the 5 last queries
"""

View File

@@ -0,0 +1,150 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/10/28 10:16
# @author :Mo
# @function :MMR, Maximal Marginal Relevance, 最大边界相关法或者最大边缘相关
from sklearn.feature_extraction.text import TfidfVectorizer
import logging
import jieba
import copy
import json
import re
import os
jieba.setLogLevel(logging.INFO)
stop_words = {"0": "~~~~",
"1": "...................",
"2": "......",}
def cut_sentence(sentence):
"""
分句
:param sentence:str
:return:list
"""
re_sen = re.compile("[:;!?。:;?!\n\r]") #.不加是因为不确定.是小数还是英文句号(中文省略号......)
sentences = re_sen.split(sentence)
sen_cuts = []
for sen in sentences:
if sen and str(sen).strip():
sen_cuts.append(sen)
return sen_cuts
def extract_chinese(text):
"""
只提取出中文字母和数字
:param text: str, input of sentence
:return:
"""
chinese_exttract = "".join(re.findall(u"([\u4e00-\u9fa5A-Za-z0-9@. ])", text))
return chinese_exttract
def tfidf_fit(sentences):
"""
tfidf相似度
:param sentences:
:return:
"""
# tfidf计算
model = TfidfVectorizer(ngram_range=(1, 2), # 3,5
stop_words=[" ", "\t", "\n"], # 停用词
max_features=10000,
token_pattern=r"(?u)\b\w+\b", # 过滤停用词
min_df=1,
max_df=0.9,
use_idf=1, # 光滑
smooth_idf=1, # 光滑
sublinear_tf=1, ) # 光滑
matrix = model.fit_transform(sentences)
return matrix
def jieba_cut(text):
"""
Jieba cut
:param text: input sentence
:return: list
"""
return list(jieba.cut(text, cut_all=False, HMM=False))
class MMRSum:
def __init__(self):
self.stop_words = stop_words.values()
self.algorithm = "mmr"
def summarize(self, text, num=8, alpha=0.6):
"""
:param text: str
:param num: int
:return: list
"""
# 切句
if type(text) == str:
self.sentences = cut_sentence(text)
elif type(text) == list:
self.sentences = text
else:
raise RuntimeError("text type must be list or str")
# 切词
sentences_cut = [[word for word in jieba_cut(extract_chinese(sentence))
if word.strip()] for sentence in self.sentences]
# 去除停用词等
self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
# # 计算每个句子的词语个数
# sen_word_len = [len(sc)+1 for sc in sentences_cut]
# 计算每个句子的tfidf
sen_tfidf = tfidf_fit(self.sentences_cut)
# 矩阵中两两句子相似度
SimMatrix = (sen_tfidf * sen_tfidf.T).A # 例如: SimMatrix[1, 3] # "第2篇与第4篇的相似度"
# 输入文本句子长度
len_sen = len(self.sentences)
# 句子标号
sen_idx = [i for i in range(len_sen)]
summary_set = []
mmr = {}
for i in range(len_sen):
if not self.sentences[i] in summary_set:
sen_idx_pop = copy.deepcopy(sen_idx)
sen_idx_pop.pop(i)
# 两两句子相似度
sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop]
score_tfidf = sen_tfidf[i].toarray()[0].sum() # / sen_word_len[i], 如果除以词语个数就不准确
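# MMR: trade off relevance (the sentence's tf-idf weight) against redundancy (its max similarity to the other sentences)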
mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j)
summary_set.append(self.sentences[i])
score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)]
return score_sen[0:num]
if __name__ == "__main__":
mmr_sum = MMRSum()
doc = "PageRank算法简介。" \
"是上世纪90年代末提出的一种计算网页权重的算法! " \
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
"业界急需一种相对比较准确的网页重要性计算方法。 " \
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
"Google根据投票来源甚至来源的来源即链接到A页面的页面。 " \
"和投票目标的等级来决定新的等级。简单的说, " \
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
"具体说来就是PageRank有两个基本思想也可以说是假设。 " \
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 "
sum = mmr_sum.summarize(doc)
for i in sum:
print(i)