fix split of train,test

This commit is contained in:
yongzhuo 2019-10-13 09:08:31 +08:00
parent e01c006fa9
commit dbfbe71602
41 changed files with 86 additions and 40 deletions

View File

@ -199,7 +199,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-finetune了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:Failed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -1,5 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/8/28 14:13
# @time :2019/10/13 9:00
# @author :Mo
# @function :

View File

@ -0,0 +1,46 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/10/13 8:07
# @author :Mo
# @function :数据切分为训练集,验证集
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from keras_textclassification.data_preprocess.text_preprocess import txt_write
def data_kfold(path_org_data, k_fold_split=10, path_save_dir=""):
    """
    Split a labelled corpus into one training file and one validation file.

    The corpus is partitioned with a stratified k-fold and only the first
    fold is used: its held-out part becomes the validation set, the remaining
    folds become the training set. Two files, "lq_train.csv" and
    "lq_valid.csv", are written to path_save_dir, each starting with a
    "label,ques" header line.

    :param path_org_data: str, path of the original corpus, a utf-8 csv whose
                          first line is a header and whose columns are label, ques
    :param k_fold_split: int, number of folds; every class in the corpus needs
                         at least k_fold_split sentences
    :param path_save_dir: str, directory the two split files are written to;
                          "" writes them relative to the current directory
    :return: None
    """
    import os

    # dtype=str + keep_default_na=False keep every cell as text: questions or
    # labels such as "NA", "null" or an empty cell must not turn into float NaN,
    # which has no .replace() and would crash the line formatting below.
    label_ques = pd.read_csv(path_org_data, names=["label", "ques"], usecols=["label", "ques"],
                             dtype=str, keep_default_na=False)
    # names= makes pandas read the header line as data, so drop the first row.
    quess = np.array(label_ques["ques"].tolist()[1:])
    labels = np.array(label_ques["label"].tolist()[1:])

    def _to_lines(ys, xs):
        # ASCII commas are removed from both fields so each line keeps exactly
        # one separator between label and question.
        return [y.replace(",", "").strip() + "," + x.replace(",", "").strip() + "\n"
                for y, x in zip(ys, xs)]

    kf_sp = StratifiedKFold(n_splits=k_fold_split)
    # Only the first of the k folds is needed for a single train/valid split.
    train_index, dev_index = next(iter(kf_sp.split(quess, labels)))
    lq_train = _to_lines(labels[train_index], quess[train_index])
    lq_valid = _to_lines(labels[dev_index], quess[dev_index])

    head = ["label,ques\n"]
    # os.path.join works whether or not path_save_dir ends with a separator.
    txt_write(head + lq_train, os.path.join(path_save_dir, "lq_train.csv"))
    txt_write(head + lq_valid, os.path.join(path_save_dir, "lq_valid.csv"))
if __name__ == '__main__':
    from keras_textclassification.conf.path_config import path_root

    # Directory of the baidu_qa_2019 corpus; the split files are written back into it.
    data_dir = path_root + "/data/baidu_qa_2019/"
    filepath = data_dir + "baike_qa_train.csv"  # original corpus
    k_fold_split = 10
    # Pass the variable rather than repeating the literal, so changing it above takes effect.
    data_kfold(path_org_data=filepath, k_fold_split=k_fold_split, path_save_dir=data_dir)

View File

@ -129,7 +129,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -90,7 +90,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -129,7 +129,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -97,7 +97,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -128,7 +128,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -119,7 +119,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
# pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
# pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -98,7 +98,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -97,4 +97,4 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少

View File

@ -117,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -84,7 +84,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -117,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -93,7 +93,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -117,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -86,7 +86,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -117,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -90,7 +90,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -98,7 +98,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -93,7 +93,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.001) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -91,7 +91,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__ == "__main__":
train(rate=0.001) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -86,7 +86,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__ == "__main__":
train(rate=0.001) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -90,7 +90,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__ == "__main__":
train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -90,7 +90,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__ == "__main__":
train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -118,7 +118,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
# 测试集预测
pred_tet(path_test=path_baidu_qa_2019_valid, rate=0.01) # sample条件下设为1,否则训练语料可能会很少
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
# 可输入 input 预测
pred_input()

View File

@ -95,7 +95,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__ == "__main__":
train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -84,7 +84,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -81,7 +81,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -85,4 +85,4 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)

View File

@ -81,7 +81,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

View File

@ -81,7 +81,7 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=0.01)
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory