add start training script

Joerg Franke 2018-07-05 01:05:18 +02:00
parent 438e9bf0a0
commit 9aeb5a21f1
2 changed files with 334 additions and 0 deletions

scripts/config.yml Executable file

@@ -0,0 +1,106 @@
# Copyright 2018 Jörg Franke
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#######################################
### Global Configuration ###
#######################################
global:
    batch_size: &batch_size 32
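# '&batch_size' defines a YAML anchor; the '*batch_size' aliases below reuse
# this value, so the batch size is set once and shared by the model and data sets.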
#######################################
### Training Configuration ###
#######################################
training:
    epochs: 50                           # number of epochs to train
    learn_rate: 0.00005                  # learning rate for the optimizer
    optimizer: 'rmsprop'                 # optimizer [rmsprop, adam, momentum, adadelta, adagrad, sgd]
    optimizer_config: {'momentum': 0.9}  # extra arguments for the optimizer [momentum, nesterov]
    gradient_clipping: 10                # gradient clipping value
    weight_decay: False                  # weight decay, False or float
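# note: the keys in optimizer_config are presumably forwarded to the chosen
# optimizer; 'momentum' and 'nesterov' only apply to momentum-based optimizers.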
#######################################
### MANN Configuration ###
#######################################
mann:
    name: 'mann1'
    seed: 245
    input_size: 0                        # set from the data loader before the model is built
    output_size: 0                       # set from the data loader before the model is built
    batch_size: *batch_size
    input_embedding: False
    architecture: 'uni'                  # 'uni' or bidirectional
    controller_config: {"num_units": [128], "layer_norm": True, "activation": 'tanh', 'cell_type': 'clstm', 'connect': 'sparse'}
    memory_unit_config: {"cell_type": 'cbmu', "memory_length": 64, "memory_width": 32, "read_heads": 4, "write_heads": 2, "dnc_norm": True, "bypass_dropout": False, "wgate1": False}
    atop_rnn_config: False               # e.g. {"num_units": [32], "layer_norm": True, "activation": 'tanh', 'cell_type': 'clstm', 'connect': 'sparse', 'attention': False}
    output_function: "softmax"           # softmax, tanh5, linear
    output_mask: True
    loss_function: 'cross_entropy'       # cross_entropy, mse
    bw_input_fw: False
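# summary: a 'clstm' controller (128 units, presumably an LSTM variant with
# layer norm) drives a content-based memory unit ('cbmu') holding 64 memory
# slots of width 32, addressed by 4 read heads and 2 write heads.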
###################################################################
####### bAbI QA Task ######
###################################################################
babi_task:
    data_set: 'babi'
    # data_dir: 'data_babi/tasks_1-20_v1-2'
    # tmp_dir: 'data_dir'
    seed: 876
    valid_ratio: 0.1                     # validation split, as in the Nature paper
    batch_size: *batch_size
    max_len: 1000
    set_type: ['en-10k']                 # ['hn-10k', 'en-10k', 'shuffled-10k']
    task_selection: ['1', '2', '12']     # list of task numbers (1-20) or 'all'
    augment16: False                     # augmentation of task 16
    num_chached: 5                       # number of cached samples
    threads: 1                           # number of parallel threads
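    # note: set_type picks the bAbI variant: English ('en-10k'), Hindi
    # ('hn-10k') or a scrambled-word variant ('shuffled-10k'), each with
    # 10k training samples per task.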
##################################################################
###### Copy Task ######
##################################################################
copy_task:
    data_set: 'copy_task'
    seed: 125
    batch_size: *batch_size
    set_list:
        train:
            quantity: 6000               # size of the training set
            min_length: 20               # min length of a training sample
            max_length: 50               # max length of a training sample
        valid:
            quantity: 600                # size of the validation set
            min_length: 50
            max_length: 100
        # test:
        #     quantity: 100
        #     min_length: 10
        #     max_length: 10
    feature_width: 100                   # width of the feature vector
    num_chached: 10                      # number of cached samples
    threads: 1                           # number of parallel threads

scripts/start_training.py Executable file

@@ -0,0 +1,228 @@
# Copyright 2018 Jörg Franke
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import argparse
import os
import sys
import time

import numpy as np
import tensorflow as tf
from tqdm import tqdm

from adnc.analysis import Analyser
from adnc.data import DataLoader
from adnc.model import MANN, Optimizer, Supporter
from adnc.model.utils import EarlyStop

tf.reset_default_graph()
parser = argparse.ArgumentParser(description='Start or resume a MANN training run.')
parser.add_argument('--sess', type=int, default=False, help='session number')
parser.add_argument('--check', type=int, default=False, help='restore checkpoint')
args = parser.parse_args()

session_no = args.sess or False           # allows restoring a specific session
restore_checkpoint = args.check or False  # allows restoring a specific checkpoint
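# experiment selection: these names tell the Supporter which config section,
# data set and model type to load, and where to create the session directory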
data_set_name = 'babi_task'
model_type = 'mann'
experiment_name = 'github_example'
project_dir = 'experiments/'
config_file = 'config.yml'
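# EarlyStop(10) presumably aborts the run once the validation cost has not
# improved for 10 consecutive epochs (checked at the end of the epoch loop)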
early_stop = EarlyStop(10)
analyse = True
plot_process = True
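# the Supporter presumably manages the session directory, logging (sp.pub)
# and metric monitoring (sp.monitor) for this run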
sp = Supporter(project_dir, config_file, experiment_name, data_set_name, model_type, session_no)
data_set_config = sp.config(data_set_name)
dl = DataLoader(data_set_config)
valid_loader = dl.get_data_loader('valid')
train_loader = dl.get_data_loader('train')
if analyse:
    ana = Analyser(data_set_name, sp.session_dir, save_fig=plot_process)
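# the model's input and output sizes depend on the data set, so they are
# filled in from the data loader before the model is built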
sp.config(model_type)['input_size'] = dl.x_size
sp.config(model_type)['output_size'] = dl.y_size
model = MANN(sp.config('mann'), analyse)
data, target, mask = model.feed
trainer = Optimizer(sp.config('training'), model.loss, model.trainable_variables)
optimizer = trainer.optimizer
init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=30)
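# TensorBoard summaries; lstm_scale/lstm_beta track the mean layer-norm gain
# and shift, assuming trainable_variables[2] and [3] are those parameters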
summary_train_loss = tf.summary.scalar("train_loss", model.loss)
summary_valid_loss = tf.summary.scalar("valid_loss", model.loss)
lstm_scale = tf.summary.scalar("lstm_scale", tf.reduce_mean(model.trainable_variables[2]))
lstm_beta = tf.summary.scalar("lstm_beta", tf.reduce_mean(model.trainable_variables[3]))
sp.pub("vocabulary size: {}".format(dl.vocabulary_size))
sp.pub("train set length: {}".format(dl.sample_amount('train')))
sp.pub("train batch amount: {}".format(dl.batch_amount('train')))
sp.pub("valid set length: {}".format(dl.sample_amount('valid')))
sp.pub("valid batch amount: {}".format(dl.batch_amount('valid')))
sp.pub("model parameter amount: {}".format(model.parameter_amount))
conf = tf.ConfigProto()
conf.gpu_options.per_process_gpu_memory_fraction = 0.8
conf.gpu_options.allocator_type = 'BFC'
conf.gpu_options.allow_growth = True
conf.allow_soft_placement = True
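# GPU setup: use at most 80% of GPU memory, allocate it incrementally, and
# fall back to CPU placement when an op has no GPU kernel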
with tf.Session(config=conf) as sess:

    if sp.restore and restore_checkpoint:
        saver.restore(sess, os.path.join(sp.session_dir, "model_dump_{}.ckpt".format(restore_checkpoint)))
        epoch_start = restore_checkpoint + 1
        sp.pub("restart training with checkpoint {}".format(epoch_start - 1))
    elif sp.restore and not restore_checkpoint:
        latest_checkpoint = tf.train.latest_checkpoint(sp.session_dir)
        if latest_checkpoint is None:
            sess.run(init_op)
            epoch_start = 0
            sp.pub("start new training")
        else:
            saver.restore(sess, latest_checkpoint)
            epoch_start = int(latest_checkpoint.split('_')[-1].split('.')[0]) + 1
            sp.pub("restart training with checkpoint {}".format(epoch_start - 1))
    else:
        sess.run(init_op)
        epoch_start = 0
        sp.pub("start new training")

    writer = tf.summary.FileWriter(os.path.join(sp.session_dir, "summary"), sess.graph)
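
    # main loop: one pass over the training set per epoch, followed by a full
    # validation pass with a word error rate over the masked time steps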
    for e in range(epoch_start, sp.config('training')['epochs']):

        train_cost = 0
        train_count = 0
        all_corrects = 0
        all_overall = 0
        time_0 = time.time()

        for step in tqdm(range(int(dl.batch_amount('train')))):
            sample = next(train_loader)
            _, c, summary, lb, ls = sess.run([optimizer, model.loss, summary_train_loss, lstm_beta, lstm_scale],
                                             feed_dict={data: sample['x'], target: sample['y'], mask: sample['m']})
            train_cost += c
            train_count += 1
            writer.add_summary(summary, e * dl.batch_amount('train') + step)
            writer.add_summary(lb, e * dl.batch_amount('train') + step)
            writer.add_summary(ls, e * dl.batch_amount('train') + step)
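
        # validation: accumulate the loss and count correct argmax predictions,
        # weighting both by the target mask so padding does not count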
        valid_cost = 0
        valid_count = 0

        for v in range(int(dl.batch_amount('valid'))):
            vsample = next(valid_loader)
            vcost, vpred, summary = sess.run([model.loss, model.prediction, summary_valid_loss],
                                             feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
            valid_cost += vcost
            valid_count += 1
            writer.add_summary(summary, e * dl.batch_amount('valid') + v)

            tm = np.argmax(vsample['y'], axis=-1)
            pm = np.argmax(vpred, axis=-1)
            corrects = np.equal(tm, pm)
            all_corrects += np.sum(corrects * vsample['m'])
            all_overall += np.sum(vsample['m'])

        valid_cost = valid_cost / valid_count
        train_cost = train_cost / train_count
        word_error_rate = 1 - (all_corrects / all_overall)
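
        # checkpointing and analysis: skip saving when the validation loss is
        # NaN; ana.feed_variables_two presumably estimates the controller and
        # memory influence on the output from 10 validation batches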
        if not np.isnan(valid_cost):
            save_path = saver.save(sess, os.path.join(sp.session_dir, "model_dump_{}.ckpt".format(e)))

            if analyse:
                controller_inf = []
                memory_inf = []
                all_corrects = 0
                all_overall = 0

                for vstep in range(10):
                    vsample = next(valid_loader)
                    analyse_values, prediction, gradients = sess.run([model.analyse, model.prediction, trainer.gradients],
                                                                     feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
                    weights = {v.name: {'var': g[1], 'grad': g[0], 'shape': g[0].shape} for v, g in zip(model.trainable_variables, gradients)}
                    if 'x_word' not in vsample.keys():
                        vsample['x_word'] = np.transpose(np.argmax(vsample['x'], axis=-1), (1, 0))
                    data_sample = [vsample['x'], vsample['y'], vsample['m'], vsample['x_word']]
                    decoded_targets, decoded_predictions = dl.decode_output(vsample, prediction)
                    save_list = [analyse_values, prediction, decoded_predictions, data_sample, weights]
                    co_inf, mu_inf = ana.feed_variables_two(save_list, e, name="states_epoch", save_plot=vstep)
                    controller_inf.append(co_inf)
                    memory_inf.append(mu_inf)

                controller_inf = np.mean(controller_inf)
                memory_inf = np.mean(memory_inf)

                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='wer', simple_value=word_error_rate)]), e * dl.batch_amount('train') + step)
                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='controller_inf', simple_value=controller_inf)]), e * dl.batch_amount('train') + step)
                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='memory_inf', simple_value=memory_inf)]), e * dl.batch_amount('train') + step)

                sp.pub("epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, wer {:4.3f}, controller influence {:4.3f}, "
                       "memory influence {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
                       e, step, train_cost, valid_cost, word_error_rate, controller_inf, memory_inf, time.time() - time_0, sp.time_stamp(), save_path))
                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration", "controller influence", "memory influence", "wer"],
                           [e, step, train_cost, valid_cost, time.time() - time_0, controller_inf, memory_inf, word_error_rate])
            else:
                sp.pub("epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
                       e, step, train_cost, valid_cost, time.time() - time_0, sp.time_stamp(), save_path))
                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration"], [e, step, train_cost, valid_cost, time.time() - time_0])
        else:
            sp.pub("ERROR: nan in training")
            sys.exit("NAN")
        if early_stop(valid_cost):
            sp.pub("EARLYSTOP: valid error increase")
            sys.exit("EARLYSTOP")