From dbfbe71602aee024f95098fe0e3daecd26a36131 Mon Sep 17 00:00:00 2001 From: yongzhuo <2714618994@qq.com> Date: Sun, 13 Oct 2019 09:08:31 +0800 Subject: [PATCH] fix split of train,test --- README.md | 2 +- .../fast_text}/__init__.py | 2 +- .../data_preprocess/data_split.py | 46 +++++++++++++++++++ keras_textclassification/m00_Bert/predict.py | 2 +- keras_textclassification/m00_Bert/train.py | 2 +- keras_textclassification/m00_Xlnet/predict.py | 2 +- keras_textclassification/m00_Xlnet/train.py | 2 +- .../m01_FastText/predict.py | 2 +- .../m01_FastText/train.py | 2 +- .../m03_CharCNN/predict.py | 2 +- keras_textclassification/m03_CharCNN/train.py | 2 +- .../m03_CharCNN/train_zhang.py | 2 +- .../m04_TextRNN/predict.py | 2 +- keras_textclassification/m04_TextRNN/train.py | 2 +- .../m05_TextRCNN/predict.py | 2 +- .../m05_TextRCNN/train.py | 2 +- .../m06_TextDCNN/predict.py | 2 +- .../m06_TextDCNN/train.py | 2 +- .../m07_TextDPCNN/predict.py | 2 +- .../m07_TextDPCNN/train.py | 2 +- .../m08_TextVDCNN/predict.py | 2 +- .../m08_TextVDCNN/train.py | 2 +- .../m09_TextCRNN/predict.py | 2 +- .../m09_TextCRNN/train.py | 2 +- .../m10_DeepMoji/predict.py | 2 +- .../m10_DeepMoji/train.py | 2 +- .../m11_SelfAttention/predict.py | 2 +- .../m11_SelfAttention/train.py | 2 +- keras_textclassification/m12_HAN/predict.py | 2 +- keras_textclassification/m12_HAN/train.py | 2 +- .../m13_CapsuleNet/predict.py | 2 +- .../m13_CapsuleNet/train.py | 2 +- .../m14_Transformer/predict.py | 2 +- .../m14_Transformer/train.py | 2 +- test/multi_label_class/train_multi.py | 2 +- test/tet_char_bert_embedding.py | 2 +- test/tet_char_random_embedding.py | 2 +- test/tet_char_word2vec_embedding.py | 2 +- test/tet_char_xlnet_embedding.py | 2 +- test/tet_word_random_embedding.py | 2 +- test/tet_word_word2vec_embedding.py | 2 +- 41 files changed, 86 insertions(+), 40 deletions(-) rename keras_textclassification/data/{embeddings/chinese_L-12_H-768_A-12 => model/fast_text}/__init__.py (71%) create mode 100644 keras_textclassification/data_preprocess/data_split.py diff --git a/README.md b/README.md index 2b92fcf..c0a9ebe 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/data/embeddings/chinese_L-12_H-768_A-12/__init__.py b/keras_textclassification/data/model/fast_text/__init__.py similarity index 71% rename from keras_textclassification/data/embeddings/chinese_L-12_H-768_A-12/__init__.py rename to keras_textclassification/data/model/fast_text/__init__.py index 2eb3ecb..360022a 100644 --- a/keras_textclassification/data/embeddings/chinese_L-12_H-768_A-12/__init__.py +++ b/keras_textclassification/data/model/fast_text/__init__.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- # !/usr/bin/python -# @time :2019/8/28 14:13 +# @time :2019/10/13 9:00 # @author :Mo # @function : \ No newline at end of file diff --git a/keras_textclassification/data_preprocess/data_split.py b/keras_textclassification/data_preprocess/data_split.py new file mode 100644 index 0000000..6824ab7 --- /dev/null +++ b/keras_textclassification/data_preprocess/data_split.py @@ -0,0 +1,46 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/10/13 8:07 +# @author :Mo +# @function :数据切分为训练集,验证集 + + +from sklearn.model_selection import StratifiedKFold +import pandas as pd +import numpy as np + +from keras_textclassification.data_preprocess.text_preprocess import txt_write + + +def data_kfold(path_org_data, k_fold_split=10, path_save_dir=""): + """ + 切分训练-测试集 + :param path_org_data: str, 原始语料绝对路径地址,utf-8的csv格式 + :param k_fold_split: int, k折切分, 原始语料中每个类至少有k_fold_split条句子 + :param path_save_dir: str, 生成训练集-测试集文件的保存目录 + :return: + """ + label_ques = pd.read_csv(path_org_data, names=["label","ques"], usecols=["label","ques"]) + quess = label_ques["ques"].values.tolist()[1:] + labels = label_ques["label"].values.tolist()[1:] + + quess, labels = np.array(quess), np.array(labels) + kf_sp = StratifiedKFold(n_splits=k_fold_split) + + for train_index, dev_index in kf_sp.split(quess, labels): + train_x, train_y = quess[train_index], labels[train_index] + dev_x, dev_y = quess[dev_index], labels[dev_index] + lq_train = [train_y[i].replace(",",",").strip() + "," + train_x[i].replace(",",",").strip() + "\n" + for i in range(len(train_y))] + lq_valid = [dev_y[i].replace(",",",").strip() + "," + dev_x[i].replace(",",",").strip() + "\n" + for i in range(len(dev_y))] + txt_write(["label,ques\n"] + lq_train, path_save_dir + "lq_train.csv") + txt_write(["label,ques\n"] + lq_valid, path_save_dir + "lq_valid.csv") + break + +if __name__ == '__main__': + + from keras_textclassification.conf.path_config import path_root + filepath = path_root + "/data/baidu_qa_2019/baike_qa_train.csv" # 原始语料 + k_fold_split = 10 + data_kfold(path_org_data=filepath, k_fold_split=10, path_save_dir=path_root+ "/data/baidu_qa_2019/") \ No newline at end of file diff --git a/keras_textclassification/m00_Bert/predict.py b/keras_textclassification/m00_Bert/predict.py index 79e71da..e5e8317 100644 --- a/keras_textclassification/m00_Bert/predict.py +++ b/keras_textclassification/m00_Bert/predict.py @@ -129,7 +129,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m00_Bert/train.py b/keras_textclassification/m00_Bert/train.py index fb0dd73..e2ec4de 100644 --- a/keras_textclassification/m00_Bert/train.py +++ b/keras_textclassification/m00_Bert/train.py @@ -90,7 +90,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m00_Xlnet/predict.py b/keras_textclassification/m00_Xlnet/predict.py index 8e8ea35..7a5fcc5 100644 --- a/keras_textclassification/m00_Xlnet/predict.py +++ b/keras_textclassification/m00_Xlnet/predict.py @@ -129,7 +129,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m00_Xlnet/train.py b/keras_textclassification/m00_Xlnet/train.py index 82a95f0..73779a0 100644 --- a/keras_textclassification/m00_Xlnet/train.py +++ b/keras_textclassification/m00_Xlnet/train.py @@ -97,7 +97,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m01_FastText/predict.py b/keras_textclassification/m01_FastText/predict.py index 1c4bc98..0fcebcc 100644 --- a/keras_textclassification/m01_FastText/predict.py +++ b/keras_textclassification/m01_FastText/predict.py @@ -128,7 +128,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m01_FastText/train.py b/keras_textclassification/m01_FastText/train.py index 2b1ca50..7c618a4 100644 --- a/keras_textclassification/m01_FastText/train.py +++ b/keras_textclassification/m01_FastText/train.py @@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m03_CharCNN/predict.py b/keras_textclassification/m03_CharCNN/predict.py index 17f066e..b1798bc 100644 --- a/keras_textclassification/m03_CharCNN/predict.py +++ b/keras_textclassification/m03_CharCNN/predict.py @@ -119,7 +119,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - # pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + # pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m03_CharCNN/train.py b/keras_textclassification/m03_CharCNN/train.py index bcbffed..a145c50 100644 --- a/keras_textclassification/m03_CharCNN/train.py +++ b/keras_textclassification/m03_CharCNN/train.py @@ -98,7 +98,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m03_CharCNN/train_zhang.py b/keras_textclassification/m03_CharCNN/train_zhang.py index 22a9221..86c857d 100644 --- a/keras_textclassification/m03_CharCNN/train_zhang.py +++ b/keras_textclassification/m03_CharCNN/train_zhang.py @@ -97,4 +97,4 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 diff --git a/keras_textclassification/m04_TextRNN/predict.py b/keras_textclassification/m04_TextRNN/predict.py index 6a9a380..ed23057 100644 --- a/keras_textclassification/m04_TextRNN/predict.py +++ b/keras_textclassification/m04_TextRNN/predict.py @@ -117,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m04_TextRNN/train.py b/keras_textclassification/m04_TextRNN/train.py index 060f812..1a747fb 100644 --- a/keras_textclassification/m04_TextRNN/train.py +++ b/keras_textclassification/m04_TextRNN/train.py @@ -84,7 +84,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m05_TextRCNN/predict.py b/keras_textclassification/m05_TextRCNN/predict.py index 38a73d0..9f2c4cc 100644 --- a/keras_textclassification/m05_TextRCNN/predict.py +++ b/keras_textclassification/m05_TextRCNN/predict.py @@ -117,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m05_TextRCNN/train.py b/keras_textclassification/m05_TextRCNN/train.py index 8e2fcb3..521473b 100644 --- a/keras_textclassification/m05_TextRCNN/train.py +++ b/keras_textclassification/m05_TextRCNN/train.py @@ -93,7 +93,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m06_TextDCNN/predict.py b/keras_textclassification/m06_TextDCNN/predict.py index 339ca3c..7c46a9d 100644 --- a/keras_textclassification/m06_TextDCNN/predict.py +++ b/keras_textclassification/m06_TextDCNN/predict.py @@ -117,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m06_TextDCNN/train.py b/keras_textclassification/m06_TextDCNN/train.py index 9504100..f90abdf 100644 --- a/keras_textclassification/m06_TextDCNN/train.py +++ b/keras_textclassification/m06_TextDCNN/train.py @@ -86,7 +86,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m07_TextDPCNN/predict.py b/keras_textclassification/m07_TextDPCNN/predict.py index 9d6bdcf..5007eb7 100644 --- a/keras_textclassification/m07_TextDPCNN/predict.py +++ b/keras_textclassification/m07_TextDPCNN/predict.py @@ -117,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m07_TextDPCNN/train.py b/keras_textclassification/m07_TextDPCNN/train.py index 1bae2ac..9ff8664 100644 --- a/keras_textclassification/m07_TextDPCNN/train.py +++ b/keras_textclassification/m07_TextDPCNN/train.py @@ -90,7 +90,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m08_TextVDCNN/predict.py b/keras_textclassification/m08_TextVDCNN/predict.py index 4e6f6e8..cbec24f 100644 --- a/keras_textclassification/m08_TextVDCNN/predict.py +++ b/keras_textclassification/m08_TextVDCNN/predict.py @@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m08_TextVDCNN/train.py b/keras_textclassification/m08_TextVDCNN/train.py index 8b593db..a8e0447 100644 --- a/keras_textclassification/m08_TextVDCNN/train.py +++ b/keras_textclassification/m08_TextVDCNN/train.py @@ -98,7 +98,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m09_TextCRNN/predict.py b/keras_textclassification/m09_TextCRNN/predict.py index 18247a3..0bfd22a 100644 --- a/keras_textclassification/m09_TextCRNN/predict.py +++ b/keras_textclassification/m09_TextCRNN/predict.py @@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m09_TextCRNN/train.py b/keras_textclassification/m09_TextCRNN/train.py index 953e79b..3fd3d1f 100644 --- a/keras_textclassification/m09_TextCRNN/train.py +++ b/keras_textclassification/m09_TextCRNN/train.py @@ -93,7 +93,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.001) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m10_DeepMoji/predict.py b/keras_textclassification/m10_DeepMoji/predict.py index 0c1236c..2402984 100644 --- a/keras_textclassification/m10_DeepMoji/predict.py +++ b/keras_textclassification/m10_DeepMoji/predict.py @@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m10_DeepMoji/train.py b/keras_textclassification/m10_DeepMoji/train.py index c09e799..b1d968f 100644 --- a/keras_textclassification/m10_DeepMoji/train.py +++ b/keras_textclassification/m10_DeepMoji/train.py @@ -91,7 +91,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__ == "__main__": - train(rate=0.001) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m11_SelfAttention/predict.py b/keras_textclassification/m11_SelfAttention/predict.py index 12a65f5..60a1ee7 100644 --- a/keras_textclassification/m11_SelfAttention/predict.py +++ b/keras_textclassification/m11_SelfAttention/predict.py @@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m11_SelfAttention/train.py b/keras_textclassification/m11_SelfAttention/train.py index 535a2fd..f9d4f95 100644 --- a/keras_textclassification/m11_SelfAttention/train.py +++ b/keras_textclassification/m11_SelfAttention/train.py @@ -86,7 +86,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__ == "__main__": - train(rate=0.001) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m12_HAN/predict.py b/keras_textclassification/m12_HAN/predict.py index 1c4c218..97dae2a 100644 --- a/keras_textclassification/m12_HAN/predict.py +++ b/keras_textclassification/m12_HAN/predict.py @@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m12_HAN/train.py b/keras_textclassification/m12_HAN/train.py index 3bafad1..c6a5467 100644 --- a/keras_textclassification/m12_HAN/train.py +++ b/keras_textclassification/m12_HAN/train.py @@ -90,7 +90,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__ == "__main__": - train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m13_CapsuleNet/predict.py b/keras_textclassification/m13_CapsuleNet/predict.py index 51c2672..eed9a51 100644 --- a/keras_textclassification/m13_CapsuleNet/predict.py +++ b/keras_textclassification/m13_CapsuleNet/predict.py @@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m13_CapsuleNet/train.py b/keras_textclassification/m13_CapsuleNet/train.py index 2169575..d1039ee 100644 --- a/keras_textclassification/m13_CapsuleNet/train.py +++ b/keras_textclassification/m13_CapsuleNet/train.py @@ -90,7 +90,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__ == "__main__": - train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/keras_textclassification/m14_Transformer/predict.py b/keras_textclassification/m14_Transformer/predict.py index 1b43a58..2276e59 100644 --- a/keras_textclassification/m14_Transformer/predict.py +++ b/keras_textclassification/m14_Transformer/predict.py @@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters): if __name__=="__main__": # 测试集预测 - pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 # 可输入 input 预测 pred_input() diff --git a/keras_textclassification/m14_Transformer/train.py b/keras_textclassification/m14_Transformer/train.py index d5a8992..c754bb1 100644 --- a/keras_textclassification/m14_Transformer/train.py +++ b/keras_textclassification/m14_Transformer/train.py @@ -95,7 +95,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__ == "__main__": - train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少 + train(rate=1) # sample条件下设为1,否则训练语料可能会很少 # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/test/multi_label_class/train_multi.py b/test/multi_label_class/train_multi.py index 6b7770e..0f8583e 100644 --- a/test/multi_label_class/train_multi.py +++ b/test/multi_label_class/train_multi.py @@ -84,7 +84,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/test/tet_char_bert_embedding.py b/test/tet_char_bert_embedding.py index 9ca2faf..26e9f5d 100644 --- a/test/tet_char_bert_embedding.py +++ b/test/tet_char_bert_embedding.py @@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/test/tet_char_random_embedding.py b/test/tet_char_random_embedding.py index c1b5f25..d7f7173 100644 --- a/test/tet_char_random_embedding.py +++ b/test/tet_char_random_embedding.py @@ -81,7 +81,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/test/tet_char_word2vec_embedding.py b/test/tet_char_word2vec_embedding.py index 4b73e8d..8f46f0d 100644 --- a/test/tet_char_word2vec_embedding.py +++ b/test/tet_char_word2vec_embedding.py @@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/test/tet_char_xlnet_embedding.py b/test/tet_char_xlnet_embedding.py index ccdb8a8..c691f6b 100644 --- a/test/tet_char_xlnet_embedding.py +++ b/test/tet_char_xlnet_embedding.py @@ -85,4 +85,4 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) diff --git a/test/tet_word_random_embedding.py b/test/tet_word_random_embedding.py index 27d0d19..168ed59 100644 --- a/test/tet_word_random_embedding.py +++ b/test/tet_word_random_embedding.py @@ -81,7 +81,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory diff --git a/test/tet_word_word2vec_embedding.py b/test/tet_word_word2vec_embedding.py index c498e7f..f025837 100644 --- a/test/tet_word_word2vec_embedding.py +++ b/test/tet_word_word2vec_embedding.py @@ -81,7 +81,7 @@ def train(hyper_parameters=None, rate=1.0): if __name__=="__main__": - train(rate=0.01) + train(rate=1) # 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory