From 30536dfffeaf97a9ebed01de8afe899394d11d6c Mon Sep 17 00:00:00 2001
From: yongzhuo <2714618994@qq.com>
Date: Wed, 28 Aug 2019 01:51:17 +0800
Subject: [PATCH] xlnet embedding and similarity

---
 .../__init__.py                               |   8 +-
 FeatureProject/xlnet/__init__.py              |   5 +
 FeatureProject/xlnet/args.py                  |  34 +++++
 .../xlnet/extract_keras_xlnet_feature.py      | 139 ++++++++++++++++++
 FeatureProject/xlnet/layers_keras.py          |  38 +++++
 FeatureProject/xlnet/tet_xlnet_keras_sim.py   |  69 +++++++++
 readme.md                                     |   5 +
 7 files changed, 294 insertions(+), 4 deletions(-)
 rename Data/{corpus/ner => chinese_xlnet_mid_L-24_H-768_A-12}/__init__.py (68%)
 create mode 100644 FeatureProject/xlnet/__init__.py
 create mode 100644 FeatureProject/xlnet/args.py
 create mode 100644 FeatureProject/xlnet/extract_keras_xlnet_feature.py
 create mode 100644 FeatureProject/xlnet/layers_keras.py
 create mode 100644 FeatureProject/xlnet/tet_xlnet_keras_sim.py

diff --git a/Data/corpus/ner/__init__.py b/Data/chinese_xlnet_mid_L-24_H-768_A-12/__init__.py
similarity index 68%
rename from Data/corpus/ner/__init__.py
rename to Data/chinese_xlnet_mid_L-24_H-768_A-12/__init__.py
index 079ab0b..46efc70 100644
--- a/Data/corpus/ner/__init__.py
+++ b/Data/chinese_xlnet_mid_L-24_H-768_A-12/__init__.py
@@ -1,5 +1,5 @@
-# -*- coding: UTF-8 -*-
-# !/usr/bin/python
-# @time     :2019/5/21 15:23
-# @author   :Mo
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/8/28 1:49
+# @author   :Mo
 # @function :
\ No newline at end of file
diff --git a/FeatureProject/xlnet/__init__.py b/FeatureProject/xlnet/__init__.py
new file mode 100644
index 0000000..0447f5b
--- /dev/null
+++ b/FeatureProject/xlnet/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/8/27 22:26
+# @author   :Mo
+# @function :
\ No newline at end of file
diff --git a/FeatureProject/xlnet/args.py b/FeatureProject/xlnet/args.py
new file mode 100644
index 0000000..953c566
--- /dev/null
+++ b/FeatureProject/xlnet/args.py
@@ -0,0 +1,34 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/8/27 23:03
+# @author   :Mo
+# @function :paths and hyper-parameters of the XLNet sentence-vector extractor
+
+
+import pathlib
+import sys
+import os
+
+
+# base dir: three directory levels above this file
+projectdir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
+sys.path.append(projectdir)
+
+
+# paths of the pre-trained XLNet model files
+model_dir = os.path.join(projectdir, 'Data', 'chinese_xlnet_mid_L-24_H-768_A-12')
+config_name = os.path.join(model_dir, 'xlnet_config.json')
+ckpt_name = os.path.join(model_dir, 'xlnet_model.ckpt')
+spiece_model = os.path.join(model_dir, 'spiece.model')
+attention_type = 'bi'  # or 'uni'
+# batch size
+batch_size = 1
+# length of the memory (history) sequence
+memory_len = 0
+# length of the current target sequence
+target_len = 32
+# indexes of the layers whose outputs are summed into the sentence vector;
+# allowed values are 0, 1, 2, ..., 24, where 0 is the embedding layer
+layer_indexes = [0, 23]
+# fraction of GPU memory the TensorFlow session may use
+gpu_memory_fraction = 0.64
diff --git a/FeatureProject/xlnet/extract_keras_xlnet_feature.py b/FeatureProject/xlnet/extract_keras_xlnet_feature.py
new file mode 100644
index 0000000..c5c05ff
--- /dev/null
+++ b/FeatureProject/xlnet/extract_keras_xlnet_feature.py
@@ -0,0 +1,139 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/8/27 22:27
+# @author   :Mo
+# @function :extract sentence vectors from a pre-trained XLNet with keras-xlnet
+
+
+
+from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI
+from keras_xlnet import load_trained_model_from_checkpoint
+
+from FeatureProject.xlnet.layers_keras import NonMaskingLayer
+import keras.backend.tensorflow_backend as ktf_keras
+from keras.models import Model
+from keras.layers import Add
+import tensorflow as tf
+import numpy as np
+import codecs
+import os
+
+from FeatureProject.xlnet import args
+
+
+# module-level graph and model, so django, flask, tornado, ... handlers can share them
+graph = None
+model = None
+# GPU selection and memory-fraction setup
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+config = tf.ConfigProto()
+config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
+sess = tf.Session(config=config)
+ktf_keras.set_session(sess)
+
+
+class KerasXlnetVector():
+    """Load a pre-trained XLNet through keras-xlnet and encode texts into sentence vectors."""
+
+    def __init__(self):
+        """Load checkpoint and tokenizer from `args` and build the module-level extraction model."""
+        # args.attention_type is the whole string 'bi' or 'uni'; comparing only its first
+        # character with 'bi' could never match and silently selected the 'uni' branch
+        self.attention_type = ATTENTION_TYPE_BI if args.attention_type == 'bi' else ATTENTION_TYPE_UNI
+        self.memory_len, self.target_len, self.batch_size = args.memory_len, args.target_len, args.batch_size
+        self.checkpoint_path, self.config_path = args.ckpt_name, args.config_name
+        self.layer_indexes, self.in_train_phase = args.layer_indexes, False
+
+        print("load KerasXlnetEmbedding start! ")
+        # module-level state, so django, flask, tornado, ... handlers can call the model
+        global graph
+        graph = tf.get_default_graph()
+        global model
+        # load the pre-trained model
+        model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path,
+                                                   attention_type=self.attention_type,
+                                                   in_train_phase=self.in_train_phase,
+                                                   config_path=self.config_path,
+                                                   memory_len=self.memory_len,
+                                                   target_len=self.target_len,
+                                                   batch_size=self.batch_size,
+                                                   mask_index=0)
+        # load the tokenizer vocabulary (args.spiece_model)
+        self.tokenizer = Tokenizer(args.spiece_model)
+        # kept on the instance so the layers can be inspected while debugging
+        self.model_layers = model.layers
+        len_layers = len(self.model_layers)
+        print(len_layers)
+        len_couche = (len_layers - 6) // 10
+        # NOTE(review): per the original author's notes the model has 246 keras layers:
+        # 24 transformer layers of 10 keras layers each, preceded by the input and
+        # embedding layers -- confirm the index arithmetic below against model.summary()
+        # layer_dict[0] is the embedding layer, layer_dict[k] is transformer layer k
+        layer_dict = [5]
+        layer_0 = 6
+        for _ in range(len_couche):
+            layer_0 = layer_0 + 10
+            layer_dict.append(layer_0 - 2)
+        # layer_dict has len_couche + 1 entries, so the valid indexes are 0..len_couche;
+        # every other index falls back to the penultimate layer
+        valid_indexes = range(len_couche + 1)
+        all_layers = [model.get_layer(index=layer_dict[lay if lay in valid_indexes else -2]).output
+                      for lay in self.layer_indexes]
+        if not all_layers:
+            # no layer requested: use the output of the model itself
+            encoder_layer = model.output
+        elif len(all_layers) == 1:
+            encoder_layer = all_layers[0]
+        else:
+            # several layers requested: sum their outputs element-wise
+            print(self.layer_indexes)
+            print(all_layers)
+            encoder_layer = Add()(all_layers)
+        print(encoder_layer.shape)
+        output_layer = NonMaskingLayer()(encoder_layer)
+        model = Model(model.inputs, output_layer)
+        print("load KerasXlnetEmbedding end")
+        model.summary(132)
+
+    @staticmethod
+    def _masked_reduce_mean(x, m):
+        """Mask-weighted mean: sum of x * m over axis 1 divided by the sum of m over axis 1."""
+        # same pooling as https://github.com/terrifyzhao/bert-utils/blob/master/graph.py
+        masked = x * np.expand_dims(m, axis=-1)
+        return np.sum(masked, axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9)
+
+    def xlnet_encode(self, texts):
+        """
+        Encode texts into sentence vectors with masked mean pooling over the token outputs.
+        :param texts: iterable of str, sentences to encode
+        :return: list of list of float, one vector per text
+        """
+        predicts = []
+        for text in texts:
+            # pad with 0 or truncate so that exactly target_len token ids are fed to the model
+            tokens = self.tokenizer.encode(text)
+            tokens = tokens + [0]*(self.target_len-len(tokens)) if len(tokens) < self.target_len else tokens[0:self.target_len]
+            token_input = np.expand_dims(np.array(tokens), axis=0)
+            # 0 where the token id is 0 (the padding value used above), 1 elsewhere
+            mask_input = np.array([0 if ids == 0 else 1 for ids in tokens])
+            segment_input = np.zeros_like(token_input)
+            memory_length_input = np.zeros((1, 1))
+            # run inside the module-level graph, so django, flask, tornado, ... can call this
+            with graph.as_default():
+                predict = model.predict([token_input, segment_input, memory_length_input], batch_size=1)
+            prob = predict[0]
+            pooled = self._masked_reduce_mean(prob, [mask_input])
+            pooled = pooled.tolist()
+            predicts.append(pooled[0])
+        return predicts
+
+
+if __name__ == "__main__":
+    xlnet_vector = KerasXlnetVector()
+    pooled = xlnet_vector.xlnet_encode(['你是谁呀', '小老弟'])
+    print(pooled)
+    while True:
+        print("input:")
+        ques = input()
+        print(ques)
+        print(xlnet_vector.xlnet_encode([ques]))
diff --git a/FeatureProject/xlnet/layers_keras.py b/FeatureProject/xlnet/layers_keras.py
new file mode 100644
index 0000000..06bffb3
--- /dev/null
+++ b/FeatureProject/xlnet/layers_keras.py
@@ -0,0 +1,38 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/5/10 10:49
+# @author   :Mo
+# @function :pass-through keras layer that stops mask propagation
+
+from keras.engine import Layer
+
+
+class NonMaskingLayer(Layer):
+    """
+    fix convolutional 1D can't receive masked input, detail: https://github.com/keras-team/keras/issues/4978
+    thanks for https://github.com/jacoxu
+    """
+
+    def __init__(self, **kwargs):
+        super(NonMaskingLayer, self).__init__(**kwargs)
+        # assigned after the base constructor so the base class cannot overwrite the flag
+        self.supports_masking = True
+
+    def build(self, input_shape):
+        pass
+
+    def compute_mask(self, input, input_mask=None):
+        # do not pass the mask to the next layers
+        return None
+
+    def call(self, x, mask=None):
+        # identity: the values pass through unchanged, only the mask is dropped
+        return x
+
+    def compute_output_shape(self, input_shape):
+        # Keras 2 name of the shape hook; the output has the same shape as the input
+        return input_shape
+
+    def get_output_shape_for(self, input_shape):
+        # Keras 1 name of the same hook, kept for backward compatibility
+        return input_shape
diff --git a/FeatureProject/xlnet/tet_xlnet_keras_sim.py b/FeatureProject/xlnet/tet_xlnet_keras_sim.py
new file mode 100644
index 0000000..942bae2
--- /dev/null
+++ b/FeatureProject/xlnet/tet_xlnet_keras_sim.py
@@ -0,0 +1,69 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/5/7 20:27
+# @author   :Mo
+# @function :test sentence of xlnet encode and cosin sim of two question
+
+
+def calculate_count():
+    """
+    Measure the average time of encoding one sentence over 1000 calls.
+    :return: None, the last vector and the average seconds per call are printed
+    """
+    from FeatureProject.xlnet.extract_keras_xlnet_feature import KerasXlnetVector
+    import time
+
+    xlnet_vector = KerasXlnetVector()
+    print("xlnet start ok!")
+    time_start = time.time()
+    for i in range(1000):
+        vector = xlnet_vector.xlnet_encode(["yx,你知道吗,我很喜欢你呀,在一起在一起在一起,哈哈哈哈"])
+
+    time_end = time.time()
+    time_avg = (time_end-time_start)/1000
+    print(vector)
+    print(time_avg)
+    # 0.12605296468734742  win10 gpu avg
+    # 0.01629048466682434  linux cpu avg
+
+
+def sim_two_question():
+    """Read two questions from stdin in a loop and print the cosine similarity of their vectors."""
+    from FeatureProject.xlnet.extract_keras_xlnet_feature import KerasXlnetVector
+    from math import pi
+    import numpy as np
+    import math
+
+    def cosine_distance(v1, v2):  # cosine similarity
+        # xlnet_encode returns plain python lists, so convert before using numpy methods
+        v1, v2 = np.asarray(v1, dtype=float), np.asarray(v2, dtype=float)
+        norm = np.linalg.norm(v1) * np.linalg.norm(v2)
+        # a zero vector has no direction: report 0 instead of dividing by zero
+        if norm == 0:
+            return 0
+        return np.dot(v1, v2) / norm
+
+    def scale_zoom(rate):  # exponential rescaling, currently unused
+        zoom = (1 + np.exp(-float(rate))) / 2
+        return zoom
+
+    def scale_triangle(rate):  # sine rescaling, currently unused
+        triangle = math.sin(rate/1*pi/2 - pi/2)
+        return triangle
+
+    xlnet_vector = KerasXlnetVector()
+    print("xlnet start ok!")
+    while True:
+        print("input ques-1: ")
+        ques_1 = input()
+        print("input ques_2: ")
+        ques_2 = input()
+        vector_1 = xlnet_vector.xlnet_encode([ques_1])
+        vector_2 = xlnet_vector.xlnet_encode([ques_2])
+        sim = cosine_distance(vector_1[0], vector_2[0])
+        print(sim)
+
+
+if __name__=="__main__":
+    calculate_count()
+    sim_two_question()
diff --git a/readme.md b/readme.md
index 703235f..8bd0585 100644
--- a/readme.md
+++ b/readme.md
@@ -34,6 +34,9 @@
     - bert句向量、文本相似度
         - bert/extract_keras_bert_feature.py:提取bert句向量特征
         - bert/tet_bert_keras_sim.py:测试bert句向量cosin相似度
+    - xlnet句向量、文本相似度
+        - xlnet/extract_keras_xlnet_feature.py:提取xlnet句向量特征
+        - xlnet/tet_xlnet_keras_sim.py:测试xlnet句向量cosin相似度
     - normalization_util指的是数据归一化
         - 0-1归一化处理
         - 均值归一化
@@ -83,6 +86,8 @@
     - chinese_L-12_H-768_A-12(谷歌预训练好的模型)
        github项目中只是上传部分数据,需要的前往链接: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q 提取码: rket
        解压后就可以啦
+    - chinese_xlnet_mid_L-24_H-768_A-12(哈工大训练的中文xlnet, mid, 24层, wiki语料+通用语料)
+        - 下载地址[https://github.com/ymcui/Chinese-PreTrained-XLNet](https://github.com/ymcui/Chinese-PreTrained-XLNet)
     - chinese_vector
        github项目中只是上传部分数据,需要的前往链接: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q 提取码: rket
         - 截取的部分word2vec训练词向量(自己需要下载全效果才会好)