mirror of https://github.com/JoergFranke/ADNC.git
synced 2024-11-17 22:08:04 +08:00

add comments

parent 03464bc3e2
commit 572086adc3
@@ -15,6 +15,7 @@
 import os
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""     # gpu not required for inference
 
 import argparse
 import yaml
 import numpy as np
@@ -13,10 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 import os
-import argparse
+import tensorflow as tf
 import yaml
 from tqdm import tqdm
-import tensorflow as tf
 
 from adnc.data.loader import DataLoader
 from adnc.model.mann import MANN
@@ -21,9 +21,9 @@ import numpy as np
 import tensorflow as tf
 from tqdm import tqdm
 
-from adnc.model import MANN, Optimizer, Supporter
 from adnc.analysis import Analyser
 from adnc.data import DataLoader
+from adnc.model import MANN, Optimizer, Supporter
 from adnc.model.utils import EarlyStop
 
 tf.reset_default_graph()
@@ -41,77 +41,63 @@ restore_checkpoint = args.check  # allows to restore a specific check
 if not restore_checkpoint:
     restore_checkpoint = False
 
-data_set_name = 'babi_task'
-model_type = 'mann'
+dataset_name = 'babi_task'      # defines the dataset chosen from config
+model_type = 'mann'             # type of model, currently only 'mann'
 
-experiment_name = 'github_example'
+experiment_name = 'github_example'      # name of the experiment
 
-project_dir = 'experiments/'
-config_file = 'config.yml'
+project_dir = 'experiments/'    # folder to save experiments
+config_file = 'config.yml'      # name of config file
 
-early_stop = EarlyStop(10)
+early_stop = EarlyStop(10)      # initialize early stopping after 10 higher losses in a row
 
-analyse = True
-plot_process = True
+analyse = True          # allows a closer analysis of the training progress, like memory influence
+plot_process = True     # plots a function plot after each epoch
 
-sp = Supporter(project_dir, config_file, experiment_name, data_set_name, model_type, session_no)
-
-data_set_config = sp.config(data_set_name)
-
-dl = DataLoader(data_set_config)
-valid_loader = dl.get_data_loader('valid')
-train_loader = dl.get_data_loader('train')
+sp = Supporter(project_dir, config_file, experiment_name, dataset_name, model_type,
+               session_no)      # initializes supporter class for experiment handling
+
+dl = DataLoader(sp.config(dataset_name))        # initializes data loader class
+valid_loader = dl.get_data_loader('valid')      # gets a valid data iterator
+train_loader = dl.get_data_loader('train')      # gets a train data iterator
 
 if analyse:
-    ana = Analyser(data_set_name, sp.session_dir, save_fig=plot_process)
+    ana = Analyser(dataset_name, sp.session_dir, save_fig=plot_process,
+                   save_variables=True)     # initializes an analyser class
 
-sp.config(model_type)['input_size'] = dl.x_size
-sp.config(model_type)['output_size'] = dl.y_size
-model = MANN(sp.config('mann'), analyse)
+sp.config(model_type)['input_size'] = dl.x_size     # after the data loader is initialized, the input size
+sp.config(model_type)['output_size'] = dl.y_size    # and output size are known and used for the model
+model = MANN(sp.config('mann'), analyse)            # initializes the model class
 
-data, target, mask = model.feed
+data, target, mask = model.feed     # TF data, target and mask placeholders for training
 
-trainer = Optimizer(sp.config('training'), model.loss, model.trainable_variables)
-optimizer = trainer.optimizer
+trainer = Optimizer(sp.config('training'), model.loss,
+                    model.trainable_variables)      # initializes a trainer class with the optimizer
+optimizer = trainer.optimizer       # the optimizer for training, similar to TF
 
 init_op = tf.global_variables_initializer()
 saver = tf.train.Saver(max_to_keep=30)
 
 summary_train_loss = tf.summary.scalar("train_loss", model.loss)
 summary_valid_loss = tf.summary.scalar("valid_loss", model.loss)
 
 lstm_scale = tf.summary.scalar("lstm_scale", tf.reduce_mean(model.trainable_variables[2]))
 lstm_beta = tf.summary.scalar("lstm_beta", tf.reduce_mean(model.trainable_variables[3]))
 
-sp.pub("vocabulary size: {}".format(dl.vocabulary_size))
+sp.pub("vocabulary size: {}".format(dl.vocabulary_size))    # prints values and logs them to a log file
 sp.pub("train set length: {}".format(dl.sample_amount('train')))
 sp.pub("train batch amount: {}".format(dl.batch_amount('train')))
 sp.pub("valid set length: {}".format(dl.sample_amount('valid')))
 sp.pub("valid batch amount: {}".format(dl.batch_amount('valid')))
 sp.pub("model parameter amount: {}".format(model.parameter_amount))
 
-conf = tf.ConfigProto()
+conf = tf.ConfigProto()     # TF session config for optimal GPU usage
 conf.gpu_options.per_process_gpu_memory_fraction = 0.8
 conf.gpu_options.allocator_type = 'BFC'
 conf.gpu_options.allow_growth = True
 conf.allow_soft_placement = True
 
 with tf.Session(config=conf) as sess:
-    if sp.restore and restore_checkpoint:
+    if sp.restore and restore_checkpoint:       # restores model dumps after a crash or to continue training
         saver.restore(sess, os.path.join(sp.session_dir, "model_dump_{}.ckpt".format(restore_checkpoint)))
         epoch_start = restore_checkpoint + 1
         sp.pub("restart training with checkpoint {}".format(epoch_start - 1))
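Note on the early-stopping helper commented above: the commit only adds the comment, not the implementation of adnc.model.utils.EarlyStop. A minimal sketch of a callable helper with the behaviour the comment describes (stop after 10 validation losses that are each higher than the previous one); the real class may be implemented differently:

    # Minimal sketch for illustration; the actual adnc.model.utils.EarlyStop may differ.
    class EarlyStop:
        def __init__(self, patience):
            self.patience = patience        # number of consecutive increases tolerated
            self.last_loss = float('inf')
            self.worse_in_a_row = 0

        def __call__(self, valid_loss):
            if valid_loss > self.last_loss:
                self.worse_in_a_row += 1    # validation loss went up again
            else:
                self.worse_in_a_row = 0     # any improvement resets the streak
            self.last_loss = valid_loss
            return self.worse_in_a_row >= self.patience

    early_stop = EarlyStop(10)
    if early_stop(0.42):                    # True once 10 higher losses occurred in a row
        print("EARLYSTOP")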
@@ -131,7 +117,7 @@ with tf.Session(config=conf) as sess:
 
     writer = tf.summary.FileWriter(os.path.join(sp.session_dir, "summary"), sess.graph)
 
-    for e in range(epoch_start, sp.config('training')['epochs']):
+    for e in range(epoch_start, sp.config('training')['epochs']):      # loop over all training epochs
 
         train_cost = 0
         train_count = 0
@@ -140,11 +126,12 @@
         time_e = time.time()
         time_0 = time.time()
 
-        for step in tqdm(range(int(dl.batch_amount('train')))):
+        for step in tqdm(range(int(dl.batch_amount('train')))):    # loop over all training samples
 
-            sample = next(train_loader)
+            sample = next(train_loader)     # new training sample from train iterator
 
-            _, c, summary, lb, ls = sess.run([optimizer, model.loss, summary_train_loss, lstm_beta, lstm_scale], feed_dict={data: sample['x'], target: sample['y'], mask: sample['m']})
+            _, c, summary, lb, ls = sess.run([optimizer, model.loss, summary_train_loss, lstm_beta, lstm_scale],
+                                             feed_dict={data: sample['x'], target: sample['y'], mask: sample['m']})
             train_cost += c
             train_count += 1
             writer.add_summary(summary, e * dl.batch_amount('train') + step)
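The loop above runs one optimizer step per batch through the data, target and mask placeholders returned by model.feed. A self-contained TensorFlow 1.x sketch of that placeholder/feed_dict pattern, using a toy dense layer instead of the MANN model (names and shapes here are made up for illustration):

    import numpy as np
    import tensorflow as tf    # TensorFlow 1.x API, as used by this script

    tf.reset_default_graph()

    # placeholders play the role of model.feed in the script
    data = tf.placeholder(tf.float32, [None, 4], name='data')
    target = tf.placeholder(tf.float32, [None, 1], name='target')

    pred = tf.layers.dense(data, 1)                     # toy stand-in for the MANN model
    loss = tf.reduce_mean(tf.square(pred - target))
    optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch_x = np.random.rand(8, 4).astype(np.float32)
        batch_y = np.random.rand(8, 1).astype(np.float32)
        # one training step: run the optimizer and fetch the loss, like the loop above
        _, c = sess.run([optimizer, loss], feed_dict={data: batch_x, target: batch_y})
        print("train cost", c)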
@@ -154,13 +141,14 @@
         valid_cost = 0
         valid_count = 0
 
-        for v in range(int(dl.batch_amount('valid'))):
+        for v in range(int(dl.batch_amount('valid'))):      # loop over all validation samples
             vsample = next(valid_loader)
-            vcost, vpred, summary = sess.run([model.loss, model.prediction, summary_valid_loss], feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
+            vcost, vpred, summary = sess.run([model.loss, model.prediction, summary_valid_loss],
+                                             feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
             valid_cost += vcost
             valid_count += 1
             writer.add_summary(summary, e * dl.batch_amount('valid') + v)
-            tm = np.argmax(vsample['y'], axis=-1)
+            tm = np.argmax(vsample['y'], axis=-1)       # calculates the word error rate
             pm = np.argmax(vpred, axis=-1)
             corrects = np.equal(tm, pm)
             all_corrects += np.sum(corrects * vsample['m'])
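The comment added at the tm = np.argmax(...) line refers to the word-error-rate computation that continues in the next hunk (word_error_rate = 1 - all_corrects / all_overall). A small NumPy sketch of that masked calculation, assuming all_overall accumulates the mask sum (shapes below are invented for illustration):

    import numpy as np

    # toy batch: time x batch x vocab one-hot targets/predictions, plus a 0/1 mask
    targets = np.random.rand(5, 2, 10)       # stand-in for vsample['y']
    predictions = np.random.rand(5, 2, 10)   # stand-in for vpred
    mask = np.ones((5, 2))                   # stand-in for vsample['m'], 1 where a word counts

    tm = np.argmax(targets, axis=-1)         # target word indices
    pm = np.argmax(predictions, axis=-1)     # predicted word indices
    corrects = np.equal(tm, pm)

    all_corrects = np.sum(corrects * mask)   # only masked positions count
    all_overall = np.sum(mask)               # assumed accumulation of the mask sum
    word_error_rate = 1 - (all_corrects / all_overall)
    print(word_error_rate)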
@@ -170,59 +158,70 @@
         train_cost = train_cost / train_count
         word_error_rate = 1 - (all_corrects / all_overall)
 
-        if not np.isnan(valid_cost):
+        if not np.isnan(valid_cost):        # checks NAN
 
-            save_path = saver.save(sess, os.path.join(sp.session_dir, "model_dump_{}.ckpt".format(e)))
+            save_path = saver.save(sess,
+                                   os.path.join(sp.session_dir, "model_dump_{}.ckpt".format(e)))  # dumps model weights
 
-            if analyse:
+            if analyse:     # if analysis, it logs memory influence and plots functionality
 
                 controller_inf = []
                 memory_inf = []
                 all_corrects = 0
                 all_overall = 0
 
-                for vstep in range(10):
+                for vstep in range(10):     # makes ten valid inferences to get the gradients for memory influence calculation
                     vsample = next(valid_loader)
 
-                    analyse_values, prediction, gradients = sess.run([model.analyse, model.prediction, trainer.gradients], feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
-                    weights = {v.name: {'var': g[1], 'grad': g[0], 'shape': g[0].shape} for v, g in zip(model.trainable_variables, gradients)}
+                    analyse_values, prediction, gradients = sess.run(
+                        [model.analyse, model.prediction, trainer.gradients],
+                        feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
+                    weights = {v.name: {'var': g[1], 'grad': g[0], 'shape': g[0].shape} for v, g in
+                               zip(model.trainable_variables, gradients)}
                     if 'x_word' not in vsample.keys():
                         vsample['x_word'] = np.transpose(np.argmax(vsample['x'], axis=-1), (1, 0))
                     data_sample = [vsample['x'], vsample['y'], vsample['m'], vsample['x_word'], ]
 
                     decoded_targets, decoded_predictions = dl.decode_output(vsample, prediction)
 
                     save_list = [analyse_values, prediction, decoded_predictions, data_sample, weights]
 
-                    co_inf, mu_inf = ana.feed_variables_two(save_list, e, name="states_epoch", save_plot=vstep)
+                    co_inf, mu_inf = ana.feed_variables_two(save_list, e, name="states_epoch",
+                                                            save_plot=vstep)    # calculates the memory influence
                     controller_inf.append(co_inf)
                     memory_inf.append(mu_inf)
 
                 controller_inf = np.mean(controller_inf)
                 memory_inf = np.mean(memory_inf)
 
-                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='wer', simple_value=word_error_rate)]), e * dl.batch_amount('train') + step)
-                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='controller_inf', simple_value=controller_inf)]), e * dl.batch_amount('train') + step)
-                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='memory_inf', simple_value=memory_inf)]), e * dl.batch_amount('train') + step)
+                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='wer', simple_value=word_error_rate)]),
+                                   e * dl.batch_amount('train') + step)
+                writer.add_summary(
+                    tf.Summary(value=[tf.Summary.Value(tag='controller_inf', simple_value=controller_inf)]),
+                    e * dl.batch_amount('train') + step)
+                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='memory_inf', simple_value=memory_inf)]),
+                                   e * dl.batch_amount('train') + step)
 
-                sp.pub("epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, wer {:4.3f}, controller influence {:4.3f}, "
-                       "memory influence {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
-                    e, step, train_cost, valid_cost, word_error_rate, controller_inf, memory_inf, time.time() - time_0, sp.time_stamp(), save_path))
-                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration", "controller influence", "memory influence", "wer"],
-                           [e, step, train_cost, valid_cost, time.time() - time_0, controller_inf, memory_inf, word_error_rate])
+                sp.pub(
+                    "epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, wer {:4.3f}, controller influence {:4.3f}, "
+                    "memory influence {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
+                        e, step, train_cost, valid_cost, word_error_rate, controller_inf, memory_inf,
+                        time.time() - time_0, sp.time_stamp(), save_path))
+                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration", "controller influence",
+                            "memory influence", "wer"],
+                           [e, step, train_cost, valid_cost, time.time() - time_0, controller_inf, memory_inf,
+                            word_error_rate])   # saves the values in a numpy array for later analysis
             else:
-                sp.pub("epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
-                    e, step, train_cost, valid_cost, time.time() - time_0, sp.time_stamp(), save_path))
-                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration"], [e, step, train_cost, valid_cost, time.time() - time_0])
+                sp.pub(
+                    "epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
+                        e, step, train_cost, valid_cost, time.time() - time_0, sp.time_stamp(), save_path))
+                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration"],
+                           [e, step, train_cost, valid_cost, time.time() - time_0])
         else:
             sp.pub("ERROR: nan in training")
-            sys.exit("NAN")
+            sys.exit("NAN")     # end training in case of NAN
 
         if early_stop(valid_cost):
             sp.pub("EARLYSTOP: valid error increase")
-            sys.exit("EARLYSTOP")
+            sys.exit("EARLYSTOP")   # end training when valid loss increases, early stopping
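The writer.add_summary(tf.Summary(...)) calls above log scalars that are not part of the TensorFlow graph (wer, controller and memory influence). A minimal TensorFlow 1.x sketch of that raw-Summary logging pattern; the log directory is hypothetical:

    import tensorflow as tf    # TensorFlow 1.x API

    writer = tf.summary.FileWriter("/tmp/example_summary")   # hypothetical log dir

    # a value computed outside the graph can be logged as a raw Summary protobuf,
    # which is what the script does for 'wer', 'controller_inf' and 'memory_inf'
    step = 0
    wer = 0.25
    writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='wer', simple_value=wer)]), step)
    writer.flush()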