xlnet embedding and similarity
This commit is contained in:
parent
be89b4ab22
commit
30536dfffe
@ -1,5 +1,5 @@
|
|||||||
# -*- coding: UTF-8 -*-
|
# -*- coding: UTF-8 -*-
|
||||||
# !/usr/bin/python
|
# !/usr/bin/python
|
||||||
# @time :2019/5/21 15:23
|
# @time :2019/8/28 1:49
|
||||||
# @author :Mo
|
# @author :Mo
|
||||||
# @function :
|
# @function :
|
5
FeatureProject/xlnet/__init__.py
Normal file
5
FeatureProject/xlnet/__init__.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# !/usr/bin/python
|
||||||
|
# @time :2019/8/27 22:26
|
||||||
|
# @author :Mo
|
||||||
|
# @function :
|
33
FeatureProject/xlnet/args.py
Normal file
33
FeatureProject/xlnet/args.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# !/usr/bin/python
|
||||||
|
# @time :2019/8/27 23:03
|
||||||
|
# @author :Mo
|
||||||
|
# @function :
|
||||||
|
|
||||||
|
|
||||||
|
import pathlib
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
# Project base directory: three directory levels above this file. It is added
# to sys.path below (presumably so project packages can be imported when this
# file is run directly).
projectdir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
sys.path.append(projectdir)


# Paths of the pre-trained XLNet model files (config, checkpoint, sentencepiece model)
model_dir = os.path.join(projectdir, 'Data', 'chinese_xlnet_mid_L-24_H-768_A-12')
config_name = os.path.join(model_dir, 'xlnet_config.json')
ckpt_name = os.path.join(model_dir, 'xlnet_model.ckpt')
spiece_model = os.path.join(model_dir, 'spiece.model')
attention_type = 'bi'  # or 'uni'
# Batch size
batch_size = 1
# Length of the history (memory) sequence
memory_len=0
# Length of the current target sequence
target_len=32
# Original note: "by default the output of the second-to-last layer is used as
# the sentence vector".
# NOTE(review): the value below lists two layers (0 and 23), not a single
# second-to-last layer - confirm which one is intended.
layer_indexes = [0, 23] # allowed values: 0, 1, 2, 3, 4, 5, 6, 7..., 24, where 0 is the embedding layer
# Fraction of GPU memory the process may use
gpu_memory_fraction = 0.64
|
135
FeatureProject/xlnet/extract_keras_xlnet_feature.py
Normal file
135
FeatureProject/xlnet/extract_keras_xlnet_feature.py
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# !/usr/bin/python
|
||||||
|
# @time :2019/8/27 22:27
|
||||||
|
# @author :Mo
|
||||||
|
# @function :
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI
|
||||||
|
from keras_xlnet import load_trained_model_from_checkpoint
|
||||||
|
|
||||||
|
from FeatureProject.bert.layers_keras import NonMaskingLayer
|
||||||
|
import keras.backend.tensorflow_backend as ktf_keras
|
||||||
|
from keras.models import Model
|
||||||
|
from keras.layers import Add
|
||||||
|
import tensorflow as tf
|
||||||
|
import numpy as np
|
||||||
|
import codecs
|
||||||
|
import os
|
||||||
|
|
||||||
|
from FeatureProject.xlnet import args
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level globals, kept global so that web frameworks such as django,
# flask or tornado can call into this module.
# Both are populated by KerasXlnetVector.__init__.
graph = None
model = None
# GPU selection and per-process GPU memory fraction
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
sess = tf.Session(config=config)
# Register the configured session as the one used by the Keras backend.
ktf_keras.set_session(sess)
|
||||||
|
|
||||||
|
|
||||||
|
class KerasXlnetVector():
    """Sentence-vector extractor on top of a pre-trained XLNet (keras_xlnet).

    Creating an instance loads the checkpoint configured in ``args``, picks
    the layer outputs named by ``args.layer_indexes`` and stores the resulting
    Keras model and the TensorFlow graph in the module-level ``model`` and
    ``graph`` globals. ``xlnet_encode`` then maps texts to mean-pooled vectors.
    """

    # Token id used for padding; positions holding it are ignored when pooling.
    PAD_ID = 0

    def __init__(self):
        """Load the XLNet model and tokenizer and build the feature model."""
        # args.attention_type is the string 'bi' or 'uni'. The whole string
        # must be compared: its first character alone can never equal 'bi'.
        # A sequence such as ['bi'] is still accepted for compatibility.
        attention = args.attention_type
        if not isinstance(attention, str):
            attention = attention[0]
        self.attention_type = ATTENTION_TYPE_BI if attention == 'bi' else ATTENTION_TYPE_UNI
        self.memory_len = args.memory_len
        self.target_len = args.target_len
        self.batch_size = args.batch_size
        self.checkpoint_path = args.ckpt_name
        self.config_path = args.config_name
        self.layer_indexes = args.layer_indexes
        self.in_train_phase = False

        print("load KerasXlnetEmbedding start! ")
        # Kept global so that django, flask, tornado etc. can call the model.
        global graph
        global model
        graph = tf.get_default_graph()
        # Load the pre-trained model.
        base_model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path,
                                                        attention_type=self.attention_type,
                                                        in_train_phase=self.in_train_phase,
                                                        config_path=self.config_path,
                                                        memory_len=self.memory_len,
                                                        target_len=self.target_len,
                                                        batch_size=self.batch_size,
                                                        mask_index=self.PAD_ID)
        # Load the sentencepiece vocabulary.
        self.tokenizer = Tokenizer(args.spiece_model)
        # Kept on the instance so the layers can be inspected while debugging.
        self.model_layers = base_model.layers
        len_layers = len(self.model_layers)
        print(len_layers)
        # Per the original author's notes: 246 Keras layers in total, 10 Keras
        # layers per transformer block (24 blocks), preceded by the input and
        # embedding layers.
        len_couche = int((len_layers - 6) / 10)
        # layer_dict[k] is the Keras layer index whose output is taken for
        # block k: index 5 for entry 0, then 14, 24, 34, ... for the blocks.
        layer_dict = [5] + [14 + 10 * block for block in range(len_couche)]

        def layer_output(index):
            """Return the output tensor for block ``index`` (fallback: second-to-last)."""
            # The upper bound is len(layer_dict) - 1; accepting one more (as
            # the previous check did) raised IndexError.
            # NOTE(review): args.py documents 0 as the embedding layer, but
            # index 0 has always been rejected here and falls back to the
            # second-to-last entry - confirm before enabling it.
            if 1 <= index < len(layer_dict):
                return base_model.get_layer(index=layer_dict[index]).output
            return base_model.get_layer(index=layer_dict[-2]).output

        if len(self.layer_indexes) == 0:
            # No layer requested: use the model's own output.
            encoder_layer = base_model.output
        elif len(self.layer_indexes) == 1:
            # A single layer; an invalid index falls back to the second-to-last.
            encoder_layer = layer_output(self.layer_indexes[0])
        else:
            # Several layers: take each requested output and sum them.
            all_layers = [layer_output(lay) for lay in self.layer_indexes]
            print(self.layer_indexes)
            print(all_layers)
            encoder_layer = Add()(all_layers)
            print(encoder_layer.shape)
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(base_model.inputs, output_layer)
        print("load KerasXlnetEmbedding end")
        model.summary(132)

    @staticmethod
    def _pad_or_truncate(tokens, length, pad_id=0):
        """Return ``tokens`` cut or right-padded with ``pad_id`` to exactly ``length`` items."""
        tokens = list(tokens)[:length]
        return tokens + [pad_id] * (length - len(tokens))

    @staticmethod
    def _masked_mean(hidden, mask):
        """Average the rows of ``hidden`` whose ``mask`` entry is non-zero.

        ``hidden`` is indexed by position on axis 0 and ``mask`` holds one
        0/1 entry per position. A small epsilon keeps the division defined
        when every position is masked (the result is then all zeros).
        """
        mask = np.asarray(mask)
        weighted_sum = np.sum(hidden * np.expand_dims(mask, axis=-1), axis=0)
        return weighted_sum / (np.sum(mask) + 1e-9)

    def xlnet_encode(self, texts):
        """Encode each text into one pooled vector.

        Pooling follows the masked mean used in
        https://github.com/terrifyzhao/bert-utils/blob/master/graph.py

        :param texts: iterable of strings to encode.
        :return: list with one vector (a list of floats) per input text.
        """
        predicts = []
        for text in texts:
            # Tokenize, then force the sequence to exactly target_len ids.
            tokens = self._pad_or_truncate(self.tokenizer.encode(text), self.target_len, self.PAD_ID)
            token_input = np.expand_dims(np.array(tokens), axis=0)
            # 1 for real tokens, 0 for positions holding the padding id.
            mask_input = np.array([0 if ids == self.PAD_ID else 1 for ids in tokens])
            segment_input = np.zeros_like(token_input)
            memory_length_input = np.zeros((1, 1))
            # Use the graph captured at load time so that calls coming from
            # django, flask, tornado etc. hit the right graph.
            with graph.as_default():
                predict = model.predict([token_input, segment_input, memory_length_input], batch_size=1)
            # One text per call, so only the first batch entry is used.
            pooled = self._masked_mean(predict[0], mask_input)
            predicts.append(pooled.tolist())
        return predicts
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: encode two fixed sentences, then keep encoding
    # whatever is typed on stdin.
    encoder = KerasXlnetVector()
    print(encoder.xlnet_encode(['你是谁呀', '小老弟']))
    while True:
        print("input:")
        question = input()
        print(question)
        print(encoder.xlnet_encode([question]))
|
31
FeatureProject/xlnet/layers_keras.py
Normal file
31
FeatureProject/xlnet/layers_keras.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# !/usr/bin/python
|
||||||
|
# @time :2019/5/10 10:49
|
||||||
|
# @author :Mo
|
||||||
|
# @function :create model of keras-bert for get [-2] layers
|
||||||
|
|
||||||
|
from keras.engine import Layer
|
||||||
|
|
||||||
|
|
||||||
|
class NonMaskingLayer(Layer):
    """
    Identity layer that drops the incoming mask.

    Works around convolutional 1D layers not accepting masked input,
    detail: https://github.com/keras-team/keras/issues/4978
    thanks for https://github.com/jacoxu
    """

    def __init__(self, **kwargs):
        super(NonMaskingLayer, self).__init__(**kwargs)
        # Set after the base initializer so that it cannot overwrite the flag.
        self.supports_masking = True

    def build(self, input_shape):
        # The layer has no weights to create.
        pass

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # Pass the input through unchanged.
        return x

    def compute_output_shape(self, input_shape):
        # Keras 2 name of the shape hook; the output shape equals the input shape.
        return input_shape

    def get_output_shape_for(self, input_shape):
        # Older name of the shape hook, kept for backward compatibility.
        return self.compute_output_shape(input_shape)
|
77
FeatureProject/xlnet/tet_xlnet_keras_sim.py
Normal file
77
FeatureProject/xlnet/tet_xlnet_keras_sim.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# !/usr/bin/python
|
||||||
|
# @time :2019/5/7 20:27
|
||||||
|
# @author :Mo
|
||||||
|
# @function :test sentence of xlnet encode and cosin sim of two question
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_count(count=1000):
    """
    Measure the average time of one ``xlnet_encode`` call on a fixed sentence.

    :param count: number of encode calls to time (default 1000); must be positive.
    :return: None; the last vector and the average seconds per call are printed.
    :raises ValueError: if ``count`` is not positive.
    """
    # Validate first so a bad argument fails before the model is loaded.
    if count <= 0:
        raise ValueError("count must be a positive integer")

    from FeatureProject.xlnet.extract_keras_xlnet_feature import KerasXlnetVector
    import time

    xlnet_vector = KerasXlnetVector()
    print("xlnet start ok!")
    vector = None
    time_start = time.perf_counter()
    for _ in range(count):
        vector = xlnet_vector.xlnet_encode(["yx,你知道吗,我很喜欢你呀,在一起在一起在一起,哈哈哈哈"])

    time_end = time.perf_counter()
    time_avg = (time_end - time_start) / count
    print(vector)
    print(time_avg)
    # Averages recorded by the original author:
    # 0.12605296468734742 win10 gpu avg
    # 0.01629048466682434 linux cpu avg
|
||||||
|
|
||||||
|
|
||||||
|
def cosine_distance(v1, v2):
    """Return the cosine similarity of two vectors.

    Accepts lists or numpy arrays. Returns 0 when either vector has zero
    norm, because the similarity is undefined in that case.
    """
    import numpy as np

    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0
    return float(np.dot(v1, v2) / norm_product)


def scale_zoom(rate):
    """Optional rescaling of a score: ``(1 + exp(-rate)) / 2``."""
    import numpy as np

    return (1 + np.exp(-float(rate))) / 2


def scale_triangle(rate):
    """Optional rescaling of a score with a sine: ``sin(rate * pi / 2 - pi / 2)``."""
    import math
    from math import pi

    return math.sin(rate/1*pi/2 - pi/2)


def sim_two_question():
    """Interactively print the cosine similarity of two typed questions.

    Loads the XLNet encoder once, then repeatedly reads two questions from
    stdin, encodes both and prints their cosine similarity. The loop ends
    when stdin is closed.
    """
    from FeatureProject.xlnet.extract_keras_xlnet_feature import KerasXlnetVector

    xlnet_vector = KerasXlnetVector()
    print("xlnet start ok!")
    while True:
        try:
            print("input ques-1: ")
            ques_1 = input()
            print("input ques_2: ")
            ques_2 = input()
        except EOFError:
            # No more input: leave the loop instead of crashing.
            break
        vector_1 = xlnet_vector.xlnet_encode([ques_1])
        vector_2 = xlnet_vector.xlnet_encode([ques_2])
        # xlnet_encode returns one vector per text; compare the first of each.
        sim = cosine_distance(vector_1[0], vector_2[0])
        # The raw score can optionally be rescaled with scale_zoom(sim) or
        # scale_triangle(sim) before printing.
        print(sim)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the timing benchmark first, then the interactive similarity demo.
    for entry_point in (calculate_count, sim_two_question):
        entry_point()
|
@ -34,6 +34,9 @@
|
|||||||
- bert句向量、文本相似度
|
- bert句向量、文本相似度
|
||||||
- bert/extract_keras_bert_feature.py:提取bert句向量特征
|
- bert/extract_keras_bert_feature.py:提取bert句向量特征
|
||||||
- bert/tet_bert_keras_sim.py:测试bert句向量cosin相似度
|
- bert/tet_bert_keras_sim.py:测试bert句向量cosin相似度
|
||||||
|
- xlnet句向量、文本相似度
|
||||||
|
- xlnet/extract_keras_xlnet_feature.py:提取xlnet句向量特征
|
||||||
|
- xlnet/tet_xlnet_keras_sim.py:测试xlnet句向量cosin相似度
|
||||||
- normalization_util指的是数据归一化
|
- normalization_util指的是数据归一化
|
||||||
- 0-1归一化处理
|
- 0-1归一化处理
|
||||||
- 均值归一化
|
- 均值归一化
|
||||||
@ -83,6 +86,8 @@
|
|||||||
- chinese_L-12_H-768_A-12(谷歌预训练好的模型)
|
- chinese_L-12_H-768_A-12(谷歌预训练好的模型)
|
||||||
github项目中只是上传部分数据,需要的前往链接: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q 提取码: rket
|
github项目中只是上传部分数据,需要的前往链接: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q 提取码: rket
|
||||||
解压后就可以啦
|
解压后就可以啦
|
||||||
|
- chinese_xlnet_mid_L-24_H-768_A-12(哈工大训练的中文xlnet, mid, 24层, wiki语料+通用语料)
|
||||||
|
- 下载地址[https://github.com/ymcui/Chinese-PreTrained-XLNet](https://github.com/ymcui/Chinese-PreTrained-XLNet)
|
||||||
- chinese_vector
|
- chinese_vector
|
||||||
github项目中只是上传部分数据,需要的前往链接: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q 提取码: rket
|
github项目中只是上传部分数据,需要的前往链接: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q 提取码: rket
|
||||||
- 截取的部分word2vec训练词向量(自己需要下载全效果才会好)
|
- 截取的部分word2vec训练词向量(自己需要下载全效果才会好)
|
||||||
|
Loading…
Reference in New Issue
Block a user