From 2721d27d16003d1b04d137964b369259119627a8 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Tue, 19 Dec 2017 01:28:39 +0530 Subject: [PATCH 01/10] cleanup, print more info for repr --- dnc/dnc.py | 43 +++++++++++++++++++++++++++++++++++++++++++ dnc/sparse_memory.py | 2 +- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/dnc/dnc.py b/dnc/dnc.py index b482a6f..1c49749 100644 --- a/dnc/dnc.py +++ b/dnc/dnc.py @@ -271,3 +271,46 @@ class DNC(nn.Module): return outputs, (controller_hidden, mem_hidden, read_vectors), viz else: return outputs, (controller_hidden, mem_hidden, read_vectors) + + def __repr__(self): + s = "\n----------------------------------------\n" + s += '{name}({input_size}, {hidden_size}' + if self.rnn_type != 'lstm': + s += ', rnn_type={rnn_type}' + if self.num_layers != 1: + s += ', num_layers={num_layers}' + if self.num_hidden_layers != 2: + s += ', num_hidden_layers={num_hidden_layers}' + if self.bias != True: + s += ', bias={bias}' + if self.batch_first != True: + s += ', batch_first={batch_first}' + if self.dropout != 0: + s += ', dropout={dropout}' + if self.bidirectional != False: + s += ', bidirectional={bidirectional}' + if self.nr_cells != 5: + s += ', nr_cells={nr_cells}' + if self.read_heads != 2: + s += ', read_heads={read_heads}' + if self.cell_size != 10: + s += ', cell_size={cell_size}' + if self.nonlinearity != 'tanh': + s += ', nonlinearity={nonlinearity}' + if self.gpu_id != -1: + s += ', gpu_id={gpu_id}' + if self.independent_linears != False: + s += ', independent_linears={independent_linears}' + if self.share_memory != True: + s += ', share_memory={share_memory}' + if self.debug != False: + s += ', debug={debug}' + if self.clip != 20: + s += ', clip={clip}' + + s += ")\n" + super(DNC, self).__repr__() + \ + "\n----------------------------------------\n" + return s.format(name=self.__class__.__name__, **self.__dict__) + + + diff --git a/dnc/sparse_memory.py b/dnc/sparse_memory.py index 5370f4a..f8d5155 100644 --- a/dnc/sparse_memory.py +++ b/dnc/sparse_memory.py @@ -134,7 +134,7 @@ class SparseMemory(nn.Module): def write_into_sparse_memory(self, hidden): visible_memory = hidden['visible_memory'] - positions = hidden['read_positions'].squeeze() + positions = hidden['read_positions'] (b, m, w) = hidden['memory'].size() # update memory From 24d94fc4065646d4c7ceedffa56105b6cafe20d4 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Tue, 19 Dec 2017 01:29:12 +0530 Subject: [PATCH 02/10] Add adding tasks --- README.md | 27 +++- tasks/adding_task.py | 268 ++++++++++++++++++++++++++++++++++++++ tasks/adding_task_v2.py | 276 ++++++++++++++++++++++++++++++++++++++++ tasks/copy_task.py | 5 +- 4 files changed, 571 insertions(+), 5 deletions(-) create mode 100644 tasks/adding_task.py create mode 100644 tasks/adding_task_v2.py diff --git a/README.md b/README.md index 51f9e82..799d181 100644 --- a/README.md +++ b/README.md @@ -362,7 +362,9 @@ Memory vectors returned by forward pass (`np.ndarray`): | `debug_memory['usage']` | layer * time | nr_cells -## Example copy task +## Tasks + +### Copy task The copy task, as descibed in the original paper, is included in the repo. @@ -403,6 +405,29 @@ The visdom dashboard shows memory as a heatmap for batch 0 every `-summarize_fre ![Visdom dashboard](./docs/dnc-mem-debug.png) +### Generalizing Addition task + +The adding task is as described in [this github pull request](https://github.com/Mostafa-Samir/DNC-tensorflow/pull/4#issue-199369192). 
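As a quick illustration, the encoding can be sketched as follows. This is a hypothetical, self-contained rewrite of the `generate_data` helper added in this patch (the end-of-sequence marker convention is taken from that helper), not code from the repo itself:

```python
import numpy as np

def encode_addition_example(length, size):
    """Sketch of one adding-task example: one-hot inputs plus their sum."""
    content = np.random.randint(0, size - 1, length)  # numbers 0 .. size-2
    inputs = np.zeros((length + 1, size), dtype=np.float32)
    inputs[np.arange(length), content] = 1.0          # one-hot each number
    inputs[length, size - 1] = 1.0                    # end-of-sequence marker
    return inputs, float(content.sum())               # target is the plain sum
```

The bullet summary below restates the same pipeline.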
+This task +- creates one-hot vectors of size `input_size`, each representing a number +- feeds a sentence of them to a network +- the output of which is added to get the sum of the decoded outputs + +The task first trains the network for sentences of size ~100, and then tests if the network genetalizes for lengths ~1000. + +```bash +python3 -B ./tasks/adding_task.py -cuda 0 -lr 0.0001 -rnn_type lstm -memory_type sam -nlayer 1 -nhlayer 1 -nhid 100 -dropout 0 -mem_slot 1000 -mem_size 32 -read_heads 1 -sparse_reads 4 -batch_size 20 -optim rmsprop -input_size 3 -sequence_max_length 1000 +``` + +### Generalizing Addition task v2 + +The second adding task is similar to the first one, except that the network's output at the last time step is used for loss calculation, forcing the network to learn to add. + +```bash +python3 -B ./tasks/adding_task_v2.py -cuda 0 -lr 0.0001 -rnn_type lstm -memory_type sam -nlayer 1 -nhlayer 1 -nhid 100 -dropout 0 -mem_slot 1000 -mem_size 32 -read_heads 1 -sparse_reads 4 -batch_size 20 -optim rmsprop -input_size 3 -sequence_max_length 1000 +``` + + ## Code Structure 1. DNCs: diff --git a/tasks/adding_task.py b/tasks/adding_task.py new file mode 100644 index 0000000..0c2e0b0 --- /dev/null +++ b/tasks/adding_task.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import warnings +warnings.filterwarnings('ignore') + +import numpy as np +import getopt +import sys +import os +import math +import time +import argparse +from visdom import Visdom + +sys.path.insert(0, os.path.join('..', '..')) + +import torch as T +from torch.autograd import Variable as var +import torch.nn.functional as F +import torch.optim as optim + +from torch.nn.utils import clip_grad_norm + +from dnc.dnc import DNC +from dnc.sdnc import SDNC +from dnc.sam import SAM +from dnc.util import * + +parser = argparse.ArgumentParser(description='PyTorch Differentiable Neural Computer') +parser.add_argument('-input_size', type=int, default=6, help='dimension of input feature') +parser.add_argument('-rnn_type', type=str, default='lstm', help='type of recurrent cells to use for the controller') +parser.add_argument('-nhid', type=int, default=64, help='number of hidden units of the inner nn') +parser.add_argument('-dropout', type=float, default=0, help='controller dropout') +parser.add_argument('-memory_type', type=str, default='dnc', help='dense or sparse memory: dnc | sdnc | sam') + +parser.add_argument('-nlayer', type=int, default=1, help='number of layers') +parser.add_argument('-nhlayer', type=int, default=2, help='number of hidden layers') +parser.add_argument('-lr', type=float, default=1e-4, help='initial learning rate') +parser.add_argument('-optim', type=str, default='adam', help='learning rule, supports adam|rmsprop') +parser.add_argument('-clip', type=float, default=50, help='gradient clipping') + +parser.add_argument('-batch_size', type=int, default=100, metavar='N', help='batch size') +parser.add_argument('-mem_size', type=int, default=20, help='memory dimension') +parser.add_argument('-mem_slot', type=int, default=16, help='number of memory slots') +parser.add_argument('-read_heads', type=int, default=4, help='number of read heads') +parser.add_argument('-sparse_reads', type=int, default=10, help='number of sparse reads per read head') +parser.add_argument('-temporal_reads', type=int, default=2, help='number of temporal reads') + +parser.add_argument('-sequence_max_length', type=int, default=1000, metavar='N', help='sequence_max_length') +parser.add_argument('-cuda', type=int, 
default=-1, help='Cuda GPU ID, -1 for CPU') + +parser.add_argument('-iterations', type=int, default=1000, metavar='N', help='total number of iteration') +parser.add_argument('-summarize_freq', type=int, default=100, metavar='N', help='summarize frequency') +parser.add_argument('-check_freq', type=int, default=100, metavar='N', help='check point frequency') +parser.add_argument('-visdom', action='store_true', help='plot memory content on visdom per -summarize_freq steps') + +args = parser.parse_args() +print(args) + +viz = Visdom() +# assert viz.check_connection() + +if args.cuda != -1: + print('Using CUDA.') + T.manual_seed(1111) +else: + print('Using CPU.') + +def llprint(message): + sys.stdout.write(message) + sys.stdout.flush() + + +def onehot(x, n): + ret = np.zeros(n).astype(np.float32) + ret[x] = 1.0 + return ret + + +def generate_data(length, size): + + content = np.random.randint(0, size - 1, length) + + seqlen = length + 1 + x_seq_list = [float('nan')] * seqlen + sums = 0.0 + sums_text = "" + for i in range(seqlen): + if (i < length): + x_seq_list[i] = onehot(content[i], size) + sums += content[i] + sums_text += str(content[i]) + " + " + else: + x_seq_list[i] = onehot(size - 1, size) + + x_seq_list = np.array(x_seq_list) + x_seq_list = x_seq_list.reshape((1,) + x_seq_list.shape) + sums = np.array(sums) + sums = sums.reshape(1, 1, 1) + + return cudavec(x_seq_list, gpu_id=args.cuda).float(), cudavec(sums, gpu_id=args.cuda).float(), sums_text + + +def cross_entropy(prediction, target): + return (prediction - target) ** 2 + + +if __name__ == '__main__': + + dirname = os.path.dirname(__file__) + ckpts_dir = os.path.join(dirname, 'checkpoints') + + input_size = args.input_size + memory_type = args.memory_type + lr = args.lr + clip = args.clip + batch_size = args.batch_size + sequence_max_length = args.sequence_max_length + cuda = args.cuda + iterations = args.iterations + summarize_freq = args.summarize_freq + check_freq = args.check_freq + visdom = args.visdom + + from_checkpoint = None + + if args.memory_type == 'dnc': + rnn = DNC( + input_size=args.input_size, + hidden_size=args.nhid, + rnn_type=args.rnn_type, + num_layers=args.nlayer, + num_hidden_layers=args.nhlayer, + dropout=args.dropout, + nr_cells=args.mem_slot, + cell_size=args.mem_size, + read_heads=args.read_heads, + gpu_id=args.cuda, + debug=args.visdom, + batch_first=True, + independent_linears=True + ) + elif args.memory_type == 'sdnc': + rnn = SDNC( + input_size=args.input_size, + hidden_size=args.nhid, + rnn_type=args.rnn_type, + num_layers=args.nlayer, + num_hidden_layers=args.nhlayer, + dropout=args.dropout, + nr_cells=args.mem_slot, + cell_size=args.mem_size, + sparse_reads=args.sparse_reads, + temporal_reads=args.temporal_reads, + read_heads=args.read_heads, + gpu_id=args.cuda, + debug=args.visdom, + batch_first=True, + independent_linears=False + ) + elif args.memory_type == 'sam': + rnn = SAM( + input_size=args.input_size, + hidden_size=args.nhid, + rnn_type=args.rnn_type, + num_layers=args.nlayer, + num_hidden_layers=args.nhlayer, + dropout=args.dropout, + nr_cells=args.mem_slot, + cell_size=args.mem_size, + sparse_reads=args.sparse_reads, + read_heads=args.read_heads, + gpu_id=args.cuda, + debug=args.visdom, + batch_first=True, + independent_linears=False + ) + else: + raise Exception('Not recognized type of memory') + + if args.cuda != -1: + rnn = rnn.cuda(args.cuda) + + print(rnn) + + last_save_losses = [] + + if args.optim == 'adam': + optimizer = optim.Adam(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 
0.98]) # 0.0001 + elif args.optim == 'adamax': + optimizer = optim.Adamax(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 + elif args.optim == 'rmsprop': + optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, momentum=0.9, eps=1e-10) # 0.0001 + elif args.optim == 'sgd': + optimizer = optim.SGD(rnn.parameters(), lr=args.lr) # 0.01 + elif args.optim == 'adagrad': + optimizer = optim.Adagrad(rnn.parameters(), lr=args.lr) + elif args.optim == 'adadelta': + optimizer = optim.Adadelta(rnn.parameters(), lr=args.lr) + + last_100_losses = [] + + (chx, mhx, rv) = (None, None, None) + for epoch in range(iterations + 1): + llprint("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations)) + optimizer.zero_grad() + # We use for training just (sequence_max_length / 10) examples + random_length = np.random.randint(2, (sequence_max_length / 10) + 1) + input_data, target_output, sums_text = generate_data(random_length, input_size) + + if rnn.debug: + output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + else: + output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + + # print(.size(), target_output.size()) + output = output.sum(dim=2, keepdim=True).sum(dim=1, keepdim=True) + loss = cross_entropy(output, target_output) + + loss.backward() + + T.nn.utils.clip_grad_norm(rnn.parameters(), args.clip) + optimizer.step() + loss_value = loss.data[0] + + # detach memory from graph + mhx = { k : (v.detach() if isinstance(v, var) else v) for k, v in mhx.items() } + + summerize = (epoch % summarize_freq == 0) + take_checkpoint = (epoch != 0) and (epoch % iterations == 0) + + last_100_losses.append(loss_value) + + if summerize: + llprint("\rIteration %d/%d" % (epoch, iterations)) + llprint("\nAvg. Logistic Loss: %.4f\n" % (np.mean(last_100_losses))) + output = output.data.cpu().numpy() + print("Real value: ", ' = ' + str(int(target_output[0]))) + print("Predicted: ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]") + last_100_losses = [] + + if take_checkpoint: + llprint("\nSaving Checkpoint ... 
"), + check_ptr = os.path.join(ckpts_dir, 'step_{}.pth'.format(epoch)) + cur_weights = rnn.state_dict() + T.save(cur_weights, check_ptr) + llprint("Done!\n") + + llprint("\nTesting generalization...\n") + + rnn.eval() + + for i in range(int(iterations + 1 / 10)): + llprint("\nIteration %d/%d" % (i, iterations)) + # We test now the learned generalization using sequence_max_length examples + random_length = np.random.randint(2, int(sequence_max_length) + 1) + input_data, target_output, sums_text = generate_data(random_length, input_size) + + if rnn.debug: + output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + else: + output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + + output = output.sum(dim=2, keepdim=True).sum(dim=1, keepdim=True) + output = output.data.cpu().numpy() + print("\nReal value: ", ' = ' + str(int(target_output[0]))) + print("Predicted: ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]") diff --git a/tasks/adding_task_v2.py b/tasks/adding_task_v2.py new file mode 100644 index 0000000..fcba1f4 --- /dev/null +++ b/tasks/adding_task_v2.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import warnings +warnings.filterwarnings('ignore') + +import numpy as np +import getopt +import sys +import os +import math +import time +import argparse +from visdom import Visdom + +sys.path.insert(0, os.path.join('..', '..')) + +import torch as T +from torch.autograd import Variable as var +import torch.nn.functional as F +import torch.optim as optim + +from torch.nn.utils import clip_grad_norm + +from dnc.dnc import DNC +from dnc.sdnc import SDNC +from dnc.sam import SAM +from dnc.util import * + +parser = argparse.ArgumentParser(description='PyTorch Differentiable Neural Computer') +parser.add_argument('-input_size', type=int, default=6, help='dimension of input feature') +parser.add_argument('-rnn_type', type=str, default='lstm', help='type of recurrent cells to use for the controller') +parser.add_argument('-nhid', type=int, default=100, help='number of hidden units of the inner nn') +parser.add_argument('-dropout', type=float, default=0, help='controller dropout') +parser.add_argument('-memory_type', type=str, default='dnc', help='dense or sparse memory: dnc | sdnc | sam') + +parser.add_argument('-nlayer', type=int, default=1, help='number of layers') +parser.add_argument('-nhlayer', type=int, default=2, help='number of hidden layers') +parser.add_argument('-lr', type=float, default=1e-4, help='initial learning rate') +parser.add_argument('-optim', type=str, default='adam', help='learning rule, supports adam|rmsprop') +parser.add_argument('-clip', type=float, default=50, help='gradient clipping') + +parser.add_argument('-batch_size', type=int, default=100, metavar='N', help='batch size') +parser.add_argument('-mem_size', type=int, default=20, help='memory dimension') +parser.add_argument('-mem_slot', type=int, default=16, help='number of memory slots') +parser.add_argument('-read_heads', type=int, default=4, help='number of read heads') +parser.add_argument('-sparse_reads', type=int, default=10, help='number of sparse reads per read head') +parser.add_argument('-temporal_reads', type=int, default=2, help='number of temporal reads') + +parser.add_argument('-sequence_max_length', type=int, default=4, metavar='N', help='sequence_max_length') +parser.add_argument('-cuda', type=int, default=-1, help='Cuda GPU ID, -1 for CPU') + 
+parser.add_argument('-iterations', type=int, default=1000, metavar='N', help='total number of iteration') +parser.add_argument('-summarize_freq', type=int, default=100, metavar='N', help='summarize frequency') +parser.add_argument('-check_freq', type=int, default=100, metavar='N', help='check point frequency') +parser.add_argument('-visdom', action='store_true', help='plot memory content on visdom per -summarize_freq steps') + +args = parser.parse_args() +print(args) + +viz = Visdom() +# assert viz.check_connection() + +if args.cuda != -1: + print('Using CUDA.') + T.manual_seed(1111) +else: + print('Using CPU.') + +def llprint(message): + sys.stdout.write(message) + sys.stdout.flush() + + +def onehot(x, n): + ret = np.zeros(n).astype(np.float32) + ret[x] = 1.0 + return ret + + +def generate_data(length, size): + + content = np.random.randint(0, size - 1, length) + + seqlen = length + 1 + x_seq_list = [float('nan')] * seqlen + sums = 0.0 + sums_text = "" + for i in range(seqlen): + if (i < length): + x_seq_list[i] = onehot(content[i], size) + sums += content[i] + sums_text += str(content[i]) + " + " + else: + x_seq_list[i] = onehot(size - 1, size) + + x_seq_list = np.array(x_seq_list) + x_seq_list = x_seq_list.reshape((1,) + x_seq_list.shape) + x_seq_list = np.reshape(x_seq_list, (1, -1, size)) + + target_output = np.zeros((1, 1, seqlen), dtype=np.float32) + target_output[:, -1, -1] = sums + target_output = np.reshape(target_output, (1, -1, 1)) + + weights_vec = np.zeros((1, 1, seqlen), dtype=np.float32) + weights_vec[:, -1, -1] = 1.0 + weights_vec = np.reshape(weights_vec, (1, -1, 1)) + + return cudavec(x_seq_list, gpu_id=args.cuda).float(), \ + cudavec(target_output, gpu_id=args.cuda).float(), sums_text, \ + cudavec(weights_vec, gpu_id=args.cuda) + + +if __name__ == '__main__': + + dirname = os.path.dirname(__file__) + ckpts_dir = os.path.join(dirname, 'checkpoints') + + input_size = args.input_size + memory_type = args.memory_type + lr = args.lr + clip = args.clip + batch_size = args.batch_size + sequence_max_length = args.sequence_max_length + cuda = args.cuda + iterations = args.iterations + summarize_freq = args.summarize_freq + check_freq = args.check_freq + visdom = args.visdom + + from_checkpoint = None + + if args.memory_type == 'dnc': + rnn = DNC( + input_size=args.input_size, + hidden_size=args.nhid, + rnn_type=args.rnn_type, + num_layers=args.nlayer, + num_hidden_layers=args.nhlayer, + dropout=args.dropout, + nr_cells=args.mem_slot, + cell_size=args.mem_size, + read_heads=args.read_heads, + gpu_id=args.cuda, + debug=args.visdom, + batch_first=True, + independent_linears=True + ) + elif args.memory_type == 'sdnc': + rnn = SDNC( + input_size=args.input_size, + hidden_size=args.nhid, + rnn_type=args.rnn_type, + num_layers=args.nlayer, + num_hidden_layers=args.nhlayer, + dropout=args.dropout, + nr_cells=args.mem_slot, + cell_size=args.mem_size, + sparse_reads=args.sparse_reads, + temporal_reads=args.temporal_reads, + read_heads=args.read_heads, + gpu_id=args.cuda, + debug=args.visdom, + batch_first=True, + independent_linears=False + ) + elif args.memory_type == 'sam': + rnn = SAM( + input_size=args.input_size, + hidden_size=args.nhid, + rnn_type=args.rnn_type, + num_layers=args.nlayer, + num_hidden_layers=args.nhlayer, + dropout=args.dropout, + nr_cells=args.mem_slot, + cell_size=args.mem_size, + sparse_reads=args.sparse_reads, + read_heads=args.read_heads, + gpu_id=args.cuda, + debug=args.visdom, + batch_first=True, + independent_linears=False + ) + else: + raise Exception('Not 
recognized type of memory') + + if args.cuda != -1: + rnn = rnn.cuda(args.cuda) + + print(rnn) + + last_save_losses = [] + + if args.optim == 'adam': + optimizer = optim.Adam(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 + elif args.optim == 'adamax': + optimizer = optim.Adamax(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 + elif args.optim == 'rmsprop': + optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, momentum=0.9, eps=1e-10) # 0.0001 + elif args.optim == 'sgd': + optimizer = optim.SGD(rnn.parameters(), lr=args.lr) # 0.01 + elif args.optim == 'adagrad': + optimizer = optim.Adagrad(rnn.parameters(), lr=args.lr) + elif args.optim == 'adadelta': + optimizer = optim.Adadelta(rnn.parameters(), lr=args.lr) + + last_100_losses = [] + + (chx, mhx, rv) = (None, None, None) + for epoch in range(iterations + 1): + llprint("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations)) + optimizer.zero_grad() + + # We use for training just (sequence_max_length / 10) examples + random_length = np.random.randint(2, (sequence_max_length / 10) + 1) + input_data, target_output, sums_text, loss_weights = generate_data(random_length, input_size) + + if rnn.debug: + output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + else: + output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + + loss = T.mean((output[:, -1, :].sum() - target_output.sum()) ** 2, dim=-1) + + loss.backward() + + T.nn.utils.clip_grad_norm(rnn.parameters(), args.clip) + optimizer.step() + loss_value = loss.data[0] + + # detach memory from graph + mhx = { k : (v.detach() if isinstance(v, var) else v) for k, v in mhx.items() } + + summerize = (epoch % summarize_freq == 0) + take_checkpoint = (epoch != 0) and (epoch % iterations == 0) + + last_100_losses.append(loss_value) + + if summerize: + output = output[:, -1, :].sum().data.cpu().numpy()[0] + target_output = target_output.sum().data.cpu().numpy() + + llprint("\rIteration %d/%d" % (epoch, iterations)) + llprint("\nAvg. Logistic Loss: %.4f\n" % (np.mean(last_100_losses))) + print(target_output) + print("Real value: ", ' = ' + str(int(target_output[0]))) + print("Predicted: ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]") + last_100_losses = [] + + if take_checkpoint: + llprint("\nSaving Checkpoint ... 
"), + check_ptr = os.path.join(ckpts_dir, 'step_{}.pth'.format(epoch)) + cur_weights = rnn.state_dict() + T.save(cur_weights, check_ptr) + llprint("Done!\n") + + llprint("\nTesting generalization...\n") + + rnn.eval() + + for i in range(int(iterations + 1 / 10)): + llprint("\nIteration %d/%d" % (i, iterations)) + # We test now the learned generalization using sequence_max_length examples + random_length = np.random.randint(2, sequence_max_length + 1) + input_data, target_output, sums_text, loss_weights = generate_data(random_length, input_size) + + if rnn.debug: + output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + else: + output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + + output = output[:, -1, :].sum().data.cpu().numpy()[0] + target_output = target_output.sum().data.cpu().numpy() + + print("\nReal value: ", ' = ' + str(int(target_output[0]))) + print("Predicted: ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]") diff --git a/tasks/copy_task.py b/tasks/copy_task.py index e2dd87a..07a2ccb 100755 --- a/tasks/copy_task.py +++ b/tasks/copy_task.py @@ -51,7 +51,6 @@ parser.add_argument('-sequence_max_length', type=int, default=4, metavar='N', he parser.add_argument('-curriculum_increment', type=int, default=0, metavar='N', help='sequence_max_length incrementor per 1K iterations') parser.add_argument('-curriculum_freq', type=int, default=1000, metavar='N', help='sequence_max_length incrementor per 1K iterations') parser.add_argument('-cuda', type=int, default=-1, help='Cuda GPU ID, -1 for CPU') -parser.add_argument('-log-interval', type=int, default=200, metavar='N', help='report interval') parser.add_argument('-iterations', type=int, default=100000, metavar='N', help='total number of iteration') parser.add_argument('-summarize_freq', type=int, default=100, metavar='N', help='summarize frequency') @@ -183,9 +182,7 @@ if __name__ == '__main__': if args.optim == 'adam': optimizer = optim.Adam(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 - if args.optim == 'sparseadam': - optimizer = optim.SparseAdam(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 - if args.optim == 'adamax': + elif args.optim == 'adamax': optimizer = optim.Adamax(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 elif args.optim == 'rmsprop': optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, eps=1e-10) # 0.0001 From 6de3644d49edc68061d810ed31d496970d139b2f Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Tue, 19 Dec 2017 01:34:35 +0530 Subject: [PATCH 03/10] 2k iterations --- tasks/adding_task.py | 2 +- tasks/adding_task_v2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/adding_task.py b/tasks/adding_task.py index 0c2e0b0..360baba 100644 --- a/tasks/adding_task.py +++ b/tasks/adding_task.py @@ -50,7 +50,7 @@ parser.add_argument('-temporal_reads', type=int, default=2, help='number of temp parser.add_argument('-sequence_max_length', type=int, default=1000, metavar='N', help='sequence_max_length') parser.add_argument('-cuda', type=int, default=-1, help='Cuda GPU ID, -1 for CPU') -parser.add_argument('-iterations', type=int, default=1000, metavar='N', help='total number of iteration') +parser.add_argument('-iterations', type=int, default=2000, metavar='N', help='total number of iteration') parser.add_argument('-summarize_freq', type=int, default=100, metavar='N', help='summarize frequency') 
parser.add_argument('-check_freq', type=int, default=100, metavar='N', help='check point frequency') parser.add_argument('-visdom', action='store_true', help='plot memory content on visdom per -summarize_freq steps') diff --git a/tasks/adding_task_v2.py b/tasks/adding_task_v2.py index fcba1f4..4352771 100644 --- a/tasks/adding_task_v2.py +++ b/tasks/adding_task_v2.py @@ -50,7 +50,7 @@ parser.add_argument('-temporal_reads', type=int, default=2, help='number of temp parser.add_argument('-sequence_max_length', type=int, default=4, metavar='N', help='sequence_max_length') parser.add_argument('-cuda', type=int, default=-1, help='Cuda GPU ID, -1 for CPU') -parser.add_argument('-iterations', type=int, default=1000, metavar='N', help='total number of iteration') +parser.add_argument('-iterations', type=int, default=2000, metavar='N', help='total number of iteration') parser.add_argument('-summarize_freq', type=int, default=100, metavar='N', help='summarize frequency') parser.add_argument('-check_freq', type=int, default=100, metavar='N', help='check point frequency') parser.add_argument('-visdom', action='store_true', help='plot memory content on visdom per -summarize_freq steps') From 9164e5721d516602a3bc0142af35971a10ce3c23 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Tue, 19 Dec 2017 01:38:03 +0530 Subject: [PATCH 04/10] update toc --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 799d181..f9e4e0b 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,10 @@ Includes: - [SAM](#sam) - [Example usage](#example-usage-2) - [Debugging](#debugging-2) -- [Example copy task](#example-copy-task) +- [Tasks](#tasks) + - [Copy task](#copy-task) + - [Generalizing Addition task](#generalizing-addition-task) + - [Generalizing Addition task v2](#generalizing-addition-task-v2) - [Code Structure](#code-structure) - [General noteworthy stuff](#general-noteworthy-stuff) From 1d11fae06b2715df41390cde98a32ede99c4b25a Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Tue, 19 Dec 2017 11:17:59 +0530 Subject: [PATCH 05/10] Few bug fixes for unsqueezing batch size 1 --- dnc/sparse_temporal_memory.py | 8 ++++---- tasks/adding_task.py | 1 - tasks/adding_task_v2.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dnc/sparse_temporal_memory.py b/dnc/sparse_temporal_memory.py index bb066d7..1154a40 100644 --- a/dnc/sparse_temporal_memory.py +++ b/dnc/sparse_temporal_memory.py @@ -145,7 +145,7 @@ class SparseTemporalMemory(nn.Module): def write_into_sparse_memory(self, hidden): visible_memory = hidden['visible_memory'] - positions = hidden['read_positions'].squeeze() + positions = hidden['read_positions'] (b, m, w) = hidden['memory'].size() # update memory @@ -181,7 +181,7 @@ class SparseTemporalMemory(nn.Module): rev_link_matrix = (1 - temporal_write_weights_j) * rev_link_matrix + (temporal_write_weights_j * precedence_dense_i) - return link_matrix.squeeze() * I, rev_link_matrix.squeeze() * I + return link_matrix * I, rev_link_matrix * I def update_precedence(self, precedence, write_weights): return (1 - T.sum(write_weights, dim=-1, keepdim=True)) * precedence + write_weights @@ -255,8 +255,8 @@ class SparseTemporalMemory(nn.Module): return usage, I def directional_weightings(self, link_matrix, rev_link_matrix, temporal_read_weights): - f = T.bmm(link_matrix, temporal_read_weights.unsqueeze(2)).squeeze() - b = T.bmm(rev_link_matrix, temporal_read_weights.unsqueeze(2)).squeeze() + f = T.bmm(link_matrix, temporal_read_weights.unsqueeze(2)).squeeze(2) + b 
= T.bmm(rev_link_matrix, temporal_read_weights.unsqueeze(2)).squeeze(2) return f, b def read_from_sparse_memory(self, memory, indexes, keys, least_used_mem, usage, forward, backward, prev_read_positions): diff --git a/tasks/adding_task.py b/tasks/adding_task.py index 360baba..f5a88e8 100644 --- a/tasks/adding_task.py +++ b/tasks/adding_task.py @@ -214,7 +214,6 @@ if __name__ == '__main__': else: output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) - # print(.size(), target_output.size()) output = output.sum(dim=2, keepdim=True).sum(dim=1, keepdim=True) loss = cross_entropy(output, target_output) diff --git a/tasks/adding_task_v2.py b/tasks/adding_task_v2.py index 4352771..82ef224 100644 --- a/tasks/adding_task_v2.py +++ b/tasks/adding_task_v2.py @@ -220,7 +220,7 @@ if __name__ == '__main__': else: output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) - loss = T.mean((output[:, -1, :].sum() - target_output.sum()) ** 2, dim=-1) + loss = T.mean(((loss_weights * output).sum(-1, keepdim=True) - target_output) ** 2) loss.backward() From 7719698fff947bcb6cc152c728d3cd3e62370c64 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Tue, 19 Dec 2017 14:18:43 +0530 Subject: [PATCH 06/10] Add generalization of 10x --- tasks/copy_task.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tasks/copy_task.py b/tasks/copy_task.py index 07a2ccb..8f63bc0 100755 --- a/tasks/copy_task.py +++ b/tasks/copy_task.py @@ -185,7 +185,7 @@ if __name__ == '__main__': elif args.optim == 'adamax': optimizer = optim.Adamax(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 elif args.optim == 'rmsprop': - optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, eps=1e-10) # 0.0001 + optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, momentum=0.9, eps=1e-10) # 0.0001 elif args.optim == 'sgd': optimizer = optim.SGD(rnn.parameters(), lr=args.lr) # 0.01 elif args.optim == 'adagrad': @@ -358,3 +358,24 @@ if __name__ == '__main__': cur_weights = rnn.state_dict() T.save(cur_weights, check_ptr) llprint("Done!\n") + + for i in range(int((iterations + 1) / 10)): + llprint("\nIteration %d/%d" % (i, iterations)) + # We test now the learned generalization using sequence_max_length examples + random_length = np.random.randint(2, sequence_max_length * 10 + 1) + input_data, target_output, loss_weights = generate_data(random_length, input_size) + + if rnn.debug: + output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + else: + output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + + output = output[:, -1, :].sum().data.cpu().numpy()[0] + target_output = target_output.sum().data.cpu().numpy() + + try: + print("\nReal value: ", ' = ' + str(int(target_output[0]))) + print("Predicted: ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]") + except Exception as e: + pass + From c74defd78b723be2b0f6abe78eca0c21fb1b9ab6 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Tue, 19 Dec 2017 14:19:06 +0530 Subject: [PATCH 07/10] Modify how sequence max length is scaled for generalization --- README.md | 17 +-- tasks/adding_task.py | 10 +- tasks/adding_task_v2.py | 276 ---------------------------------------- 3 files changed, 14 insertions(+), 289 deletions(-) delete mode 100644 tasks/adding_task_v2.py diff --git a/README.md b/README.md index f9e4e0b..021bbb0 100644 --- 
a/README.md +++ b/README.md @@ -367,7 +367,7 @@ Memory vectors returned by forward pass (`np.ndarray`): ## Tasks -### Copy task +### Copy task (with curriculum and generalization) The copy task, as descibed in the original paper, is included in the repo. @@ -375,13 +375,13 @@ From the project root: ```bash python ./tasks/copy_task.py -cuda 0 -optim rmsprop -batch_size 32 -mem_slot 64 # (like original implementation) -python3 ./tasks/copy_task.py -cuda 0 -lr 0.001 -rnn_type lstm -nlayer 1 -nhlayer 2 -dropout 0 -mem_slot 32 -batch_size 1000 -optim adam -sequence_max_length 8 # (faster convergence) +python ./tasks/copy_task.py -cuda 0 -lr 0.001 -rnn_type lstm -nlayer 1 -nhlayer 2 -dropout 0 -mem_slot 32 -batch_size 1000 -optim adam -sequence_max_length 8 # (faster convergence) For SDNCs: -python3 -B ./tasks/copy_task.py -cuda 0 -lr 0.001 -rnn_type lstm -memory_type sdnc -nlayer 1 -nhlayer 2 -dropout 0 -mem_slot 100 -mem_size 10 -read_heads 1 -sparse_reads 10 -batch_size 20 -optim adam -sequence_max_length 10 +python ./tasks/copy_task.py -cuda 0 -lr 0.001 -rnn_type lstm -memory_type sdnc -nlayer 1 -nhlayer 2 -dropout 0 -mem_slot 100 -mem_size 10 -read_heads 1 -sparse_reads 10 -batch_size 20 -optim adam -sequence_max_length 10 and for curriculum learning for SDNCs: -python3 -B ./tasks/copy_task.py -cuda 0 -lr 0.001 -rnn_type lstm -memory_type sdnc -nlayer 1 -nhlayer 2 -dropout 0 -mem_slot 100 -mem_size 10 -read_heads 1 -sparse_reads 4 -temporal_reads 4 -batch_size 20 -optim adam -sequence_max_length 4 -curriculum_increment 2 -curriculum_freq 10000 +python ./tasks/copy_task.py -cuda 0 -lr 0.001 -rnn_type lstm -memory_type sdnc -nlayer 1 -nhlayer 2 -dropout 0 -mem_slot 100 -mem_size 10 -read_heads 1 -sparse_reads 4 -temporal_reads 4 -batch_size 20 -optim adam -sequence_max_length 4 -curriculum_increment 2 -curriculum_freq 10000 ``` For the full set of options, see: @@ -419,18 +419,19 @@ This task The task first trains the network for sentences of size ~100, and then tests if the network genetalizes for lengths ~1000. ```bash -python3 -B ./tasks/adding_task.py -cuda 0 -lr 0.0001 -rnn_type lstm -memory_type sam -nlayer 1 -nhlayer 1 -nhid 100 -dropout 0 -mem_slot 1000 -mem_size 32 -read_heads 1 -sparse_reads 4 -batch_size 20 -optim rmsprop -input_size 3 -sequence_max_length 1000 +python ./tasks/adding_task.py -cuda 0 -lr 0.0001 -rnn_type lstm -memory_type sam -nlayer 1 -nhlayer 1 -nhid 100 -dropout 0 -mem_slot 1000 -mem_size 32 -read_heads 1 -sparse_reads 4 -batch_size 20 -optim rmsprop -input_size 3 -sequence_max_length 100 ``` -### Generalizing Addition task v2 +### Generalizing Argmax task -The second adding task is similar to the first one, except that the network's output at the last time step is used for loss calculation, forcing the network to learn to add. +The second adding task is similar to the first one, except that the network's output at the last time step is expected to be the argmax of the input. ```bash -python3 -B ./tasks/adding_task_v2.py -cuda 0 -lr 0.0001 -rnn_type lstm -memory_type sam -nlayer 1 -nhlayer 1 -nhid 100 -dropout 0 -mem_slot 1000 -mem_size 32 -read_heads 1 -sparse_reads 4 -batch_size 20 -optim rmsprop -input_size 3 -sequence_max_length 1000 +python ./tasks/argmax_task.py -cuda 0 -lr 0.0001 -rnn_type lstm -memory_type dnc -nlayer 1 -nhlayer 1 -nhid 100 -dropout 0 -mem_slot 100 -mem_size 10 -read_heads 2 -batch_size 1 -optim rmsprop -sequence_max_length 15 -input_size 10 -iterations 10000 ``` + ## Code Structure 1. 
DNCs: diff --git a/tasks/adding_task.py b/tasks/adding_task.py index f5a88e8..2c7db8e 100644 --- a/tasks/adding_task.py +++ b/tasks/adding_task.py @@ -206,7 +206,7 @@ if __name__ == '__main__': llprint("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations)) optimizer.zero_grad() # We use for training just (sequence_max_length / 10) examples - random_length = np.random.randint(2, (sequence_max_length / 10) + 1) + random_length = np.random.randint(2, (sequence_max_length) + 1) input_data, target_output, sums_text = generate_data(random_length, input_size) if rnn.debug: @@ -226,12 +226,12 @@ if __name__ == '__main__': # detach memory from graph mhx = { k : (v.detach() if isinstance(v, var) else v) for k, v in mhx.items() } - summerize = (epoch % summarize_freq == 0) + summarize = (epoch % summarize_freq == 0) take_checkpoint = (epoch != 0) and (epoch % iterations == 0) last_100_losses.append(loss_value) - if summerize: + if summarize: llprint("\rIteration %d/%d" % (epoch, iterations)) llprint("\nAvg. Logistic Loss: %.4f\n" % (np.mean(last_100_losses))) output = output.data.cpu().numpy() @@ -250,10 +250,10 @@ if __name__ == '__main__': rnn.eval() - for i in range(int(iterations + 1 / 10)): + for i in range(int((iterations + 1) / 10)): llprint("\nIteration %d/%d" % (i, iterations)) # We test now the learned generalization using sequence_max_length examples - random_length = np.random.randint(2, int(sequence_max_length) + 1) + random_length = np.random.randint(2, int(sequence_max_length) * 10 + 1) input_data, target_output, sums_text = generate_data(random_length, input_size) if rnn.debug: diff --git a/tasks/adding_task_v2.py b/tasks/adding_task_v2.py deleted file mode 100644 index 82ef224..0000000 --- a/tasks/adding_task_v2.py +++ /dev/null @@ -1,276 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import warnings -warnings.filterwarnings('ignore') - -import numpy as np -import getopt -import sys -import os -import math -import time -import argparse -from visdom import Visdom - -sys.path.insert(0, os.path.join('..', '..')) - -import torch as T -from torch.autograd import Variable as var -import torch.nn.functional as F -import torch.optim as optim - -from torch.nn.utils import clip_grad_norm - -from dnc.dnc import DNC -from dnc.sdnc import SDNC -from dnc.sam import SAM -from dnc.util import * - -parser = argparse.ArgumentParser(description='PyTorch Differentiable Neural Computer') -parser.add_argument('-input_size', type=int, default=6, help='dimension of input feature') -parser.add_argument('-rnn_type', type=str, default='lstm', help='type of recurrent cells to use for the controller') -parser.add_argument('-nhid', type=int, default=100, help='number of hidden units of the inner nn') -parser.add_argument('-dropout', type=float, default=0, help='controller dropout') -parser.add_argument('-memory_type', type=str, default='dnc', help='dense or sparse memory: dnc | sdnc | sam') - -parser.add_argument('-nlayer', type=int, default=1, help='number of layers') -parser.add_argument('-nhlayer', type=int, default=2, help='number of hidden layers') -parser.add_argument('-lr', type=float, default=1e-4, help='initial learning rate') -parser.add_argument('-optim', type=str, default='adam', help='learning rule, supports adam|rmsprop') -parser.add_argument('-clip', type=float, default=50, help='gradient clipping') - -parser.add_argument('-batch_size', type=int, default=100, metavar='N', help='batch size') -parser.add_argument('-mem_size', type=int, default=20, help='memory dimension') 
-parser.add_argument('-mem_slot', type=int, default=16, help='number of memory slots') -parser.add_argument('-read_heads', type=int, default=4, help='number of read heads') -parser.add_argument('-sparse_reads', type=int, default=10, help='number of sparse reads per read head') -parser.add_argument('-temporal_reads', type=int, default=2, help='number of temporal reads') - -parser.add_argument('-sequence_max_length', type=int, default=4, metavar='N', help='sequence_max_length') -parser.add_argument('-cuda', type=int, default=-1, help='Cuda GPU ID, -1 for CPU') - -parser.add_argument('-iterations', type=int, default=2000, metavar='N', help='total number of iteration') -parser.add_argument('-summarize_freq', type=int, default=100, metavar='N', help='summarize frequency') -parser.add_argument('-check_freq', type=int, default=100, metavar='N', help='check point frequency') -parser.add_argument('-visdom', action='store_true', help='plot memory content on visdom per -summarize_freq steps') - -args = parser.parse_args() -print(args) - -viz = Visdom() -# assert viz.check_connection() - -if args.cuda != -1: - print('Using CUDA.') - T.manual_seed(1111) -else: - print('Using CPU.') - -def llprint(message): - sys.stdout.write(message) - sys.stdout.flush() - - -def onehot(x, n): - ret = np.zeros(n).astype(np.float32) - ret[x] = 1.0 - return ret - - -def generate_data(length, size): - - content = np.random.randint(0, size - 1, length) - - seqlen = length + 1 - x_seq_list = [float('nan')] * seqlen - sums = 0.0 - sums_text = "" - for i in range(seqlen): - if (i < length): - x_seq_list[i] = onehot(content[i], size) - sums += content[i] - sums_text += str(content[i]) + " + " - else: - x_seq_list[i] = onehot(size - 1, size) - - x_seq_list = np.array(x_seq_list) - x_seq_list = x_seq_list.reshape((1,) + x_seq_list.shape) - x_seq_list = np.reshape(x_seq_list, (1, -1, size)) - - target_output = np.zeros((1, 1, seqlen), dtype=np.float32) - target_output[:, -1, -1] = sums - target_output = np.reshape(target_output, (1, -1, 1)) - - weights_vec = np.zeros((1, 1, seqlen), dtype=np.float32) - weights_vec[:, -1, -1] = 1.0 - weights_vec = np.reshape(weights_vec, (1, -1, 1)) - - return cudavec(x_seq_list, gpu_id=args.cuda).float(), \ - cudavec(target_output, gpu_id=args.cuda).float(), sums_text, \ - cudavec(weights_vec, gpu_id=args.cuda) - - -if __name__ == '__main__': - - dirname = os.path.dirname(__file__) - ckpts_dir = os.path.join(dirname, 'checkpoints') - - input_size = args.input_size - memory_type = args.memory_type - lr = args.lr - clip = args.clip - batch_size = args.batch_size - sequence_max_length = args.sequence_max_length - cuda = args.cuda - iterations = args.iterations - summarize_freq = args.summarize_freq - check_freq = args.check_freq - visdom = args.visdom - - from_checkpoint = None - - if args.memory_type == 'dnc': - rnn = DNC( - input_size=args.input_size, - hidden_size=args.nhid, - rnn_type=args.rnn_type, - num_layers=args.nlayer, - num_hidden_layers=args.nhlayer, - dropout=args.dropout, - nr_cells=args.mem_slot, - cell_size=args.mem_size, - read_heads=args.read_heads, - gpu_id=args.cuda, - debug=args.visdom, - batch_first=True, - independent_linears=True - ) - elif args.memory_type == 'sdnc': - rnn = SDNC( - input_size=args.input_size, - hidden_size=args.nhid, - rnn_type=args.rnn_type, - num_layers=args.nlayer, - num_hidden_layers=args.nhlayer, - dropout=args.dropout, - nr_cells=args.mem_slot, - cell_size=args.mem_size, - sparse_reads=args.sparse_reads, - temporal_reads=args.temporal_reads, - 
read_heads=args.read_heads, - gpu_id=args.cuda, - debug=args.visdom, - batch_first=True, - independent_linears=False - ) - elif args.memory_type == 'sam': - rnn = SAM( - input_size=args.input_size, - hidden_size=args.nhid, - rnn_type=args.rnn_type, - num_layers=args.nlayer, - num_hidden_layers=args.nhlayer, - dropout=args.dropout, - nr_cells=args.mem_slot, - cell_size=args.mem_size, - sparse_reads=args.sparse_reads, - read_heads=args.read_heads, - gpu_id=args.cuda, - debug=args.visdom, - batch_first=True, - independent_linears=False - ) - else: - raise Exception('Not recognized type of memory') - - if args.cuda != -1: - rnn = rnn.cuda(args.cuda) - - print(rnn) - - last_save_losses = [] - - if args.optim == 'adam': - optimizer = optim.Adam(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 - elif args.optim == 'adamax': - optimizer = optim.Adamax(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 - elif args.optim == 'rmsprop': - optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, momentum=0.9, eps=1e-10) # 0.0001 - elif args.optim == 'sgd': - optimizer = optim.SGD(rnn.parameters(), lr=args.lr) # 0.01 - elif args.optim == 'adagrad': - optimizer = optim.Adagrad(rnn.parameters(), lr=args.lr) - elif args.optim == 'adadelta': - optimizer = optim.Adadelta(rnn.parameters(), lr=args.lr) - - last_100_losses = [] - - (chx, mhx, rv) = (None, None, None) - for epoch in range(iterations + 1): - llprint("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations)) - optimizer.zero_grad() - - # We use for training just (sequence_max_length / 10) examples - random_length = np.random.randint(2, (sequence_max_length / 10) + 1) - input_data, target_output, sums_text, loss_weights = generate_data(random_length, input_size) - - if rnn.debug: - output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) - else: - output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) - - loss = T.mean(((loss_weights * output).sum(-1, keepdim=True) - target_output) ** 2) - - loss.backward() - - T.nn.utils.clip_grad_norm(rnn.parameters(), args.clip) - optimizer.step() - loss_value = loss.data[0] - - # detach memory from graph - mhx = { k : (v.detach() if isinstance(v, var) else v) for k, v in mhx.items() } - - summerize = (epoch % summarize_freq == 0) - take_checkpoint = (epoch != 0) and (epoch % iterations == 0) - - last_100_losses.append(loss_value) - - if summerize: - output = output[:, -1, :].sum().data.cpu().numpy()[0] - target_output = target_output.sum().data.cpu().numpy() - - llprint("\rIteration %d/%d" % (epoch, iterations)) - llprint("\nAvg. Logistic Loss: %.4f\n" % (np.mean(last_100_losses))) - print(target_output) - print("Real value: ", ' = ' + str(int(target_output[0]))) - print("Predicted: ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]") - last_100_losses = [] - - if take_checkpoint: - llprint("\nSaving Checkpoint ... 
"), - check_ptr = os.path.join(ckpts_dir, 'step_{}.pth'.format(epoch)) - cur_weights = rnn.state_dict() - T.save(cur_weights, check_ptr) - llprint("Done!\n") - - llprint("\nTesting generalization...\n") - - rnn.eval() - - for i in range(int(iterations + 1 / 10)): - llprint("\nIteration %d/%d" % (i, iterations)) - # We test now the learned generalization using sequence_max_length examples - random_length = np.random.randint(2, sequence_max_length + 1) - input_data, target_output, sums_text, loss_weights = generate_data(random_length, input_size) - - if rnn.debug: - output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) - else: - output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) - - output = output[:, -1, :].sum().data.cpu().numpy()[0] - target_output = target_output.sum().data.cpu().numpy() - - print("\nReal value: ", ' = ' + str(int(target_output[0]))) - print("Predicted: ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]") From 8988490a5ba65a73e629e328a010f684dd919159 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Tue, 19 Dec 2017 14:19:48 +0530 Subject: [PATCH 08/10] update toc --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 021bbb0..42884de 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,9 @@ Includes: - [Example usage](#example-usage-2) - [Debugging](#debugging-2) - [Tasks](#tasks) - - [Copy task](#copy-task) + - [Copy task (with curriculum and generalization)](#copy-task-with-curriculum-and-generalization) - [Generalizing Addition task](#generalizing-addition-task) - - [Generalizing Addition task v2](#generalizing-addition-task-v2) + - [Generalizing Argmax task](#generalizing-argmax-task) - [Code Structure](#code-structure) - [General noteworthy stuff](#general-noteworthy-stuff) From 78ac06a332487a2781dc1c526d58ca50a1eb5135 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Wed, 20 Dec 2017 01:58:17 +0530 Subject: [PATCH 09/10] Add argmax task --- tasks/argmax_task.py | 283 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 tasks/argmax_task.py diff --git a/tasks/argmax_task.py b/tasks/argmax_task.py new file mode 100644 index 0000000..79f8311 --- /dev/null +++ b/tasks/argmax_task.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import warnings +warnings.filterwarnings('ignore') + +import numpy as np +import getopt +import sys +import os +import math +import time +import argparse +from visdom import Visdom + +sys.path.insert(0, os.path.join('..', '..')) + +import torch as T +from torch.autograd import Variable as var +import torch.nn.functional as F +import torch.optim as optim + +from torch.nn.utils import clip_grad_norm + +from dnc.dnc import DNC +from dnc.sdnc import SDNC +from dnc.sam import SAM +from dnc.util import * + +parser = argparse.ArgumentParser(description='PyTorch Differentiable Neural Computer') +parser.add_argument('-input_size', type=int, default=6, help='dimension of input feature') +parser.add_argument('-rnn_type', type=str, default='lstm', help='type of recurrent cells to use for the controller') +parser.add_argument('-nhid', type=int, default=100, help='number of hidden units of the inner nn') +parser.add_argument('-dropout', type=float, default=0, help='controller dropout') +parser.add_argument('-memory_type', type=str, default='dnc', help='dense or sparse memory: dnc | sdnc | sam') + 
+parser.add_argument('-nlayer', type=int, default=1, help='number of layers') +parser.add_argument('-nhlayer', type=int, default=2, help='number of hidden layers') +parser.add_argument('-lr', type=float, default=1e-4, help='initial learning rate') +parser.add_argument('-optim', type=str, default='adam', help='learning rule, supports adam|rmsprop') +parser.add_argument('-clip', type=float, default=50, help='gradient clipping') + +parser.add_argument('-batch_size', type=int, default=100, metavar='N', help='batch size') +parser.add_argument('-mem_size', type=int, default=20, help='memory dimension') +parser.add_argument('-mem_slot', type=int, default=16, help='number of memory slots') +parser.add_argument('-read_heads', type=int, default=4, help='number of read heads') +parser.add_argument('-sparse_reads', type=int, default=10, help='number of sparse reads per read head') +parser.add_argument('-temporal_reads', type=int, default=2, help='number of temporal reads') + +parser.add_argument('-sequence_max_length', type=int, default=4, metavar='N', help='sequence_max_length') +parser.add_argument('-cuda', type=int, default=-1, help='Cuda GPU ID, -1 for CPU') + +parser.add_argument('-iterations', type=int, default=2000, metavar='N', help='total number of iteration') +parser.add_argument('-summarize_freq', type=int, default=100, metavar='N', help='summarize frequency') +parser.add_argument('-check_freq', type=int, default=100, metavar='N', help='check point frequency') +parser.add_argument('-visdom', action='store_true', help='plot memory content on visdom per -summarize_freq steps') + +args = parser.parse_args() +print(args) + +viz = Visdom() +# assert viz.check_connection() + +if args.cuda != -1: + print('Using CUDA.') + T.manual_seed(1111) +else: + print('Using CPU.') + +def llprint(message): + sys.stdout.write(message) + sys.stdout.flush() + + +def onehot(x, n): + ret = np.zeros(n).astype(np.float32) + ret[x] = 1.0 + return ret + + +def generate_data(length, size): + + content = np.random.randint(0, size - 1, length) + + seqlen = length + 1 + x_seq_list = [float('nan')] * seqlen + max_value = 0 + max_ind = 0 + for i in range(seqlen): + if (i < length): + x_seq_list[i] = onehot(content[i], size) + if (max_value <= content[i]): + max_value = content[i] + max_ind = i + else: + x_seq_list[i] = onehot(size - 1, size) + + x_seq_list = np.array(x_seq_list) + x_seq_list = x_seq_list.reshape((1,) + x_seq_list.shape) + x_seq_list = np.reshape(x_seq_list, (1, -1, size)) + + target_output = np.zeros((1, 1, seqlen), dtype=np.float32) + target_output[:, -1, -1] = max_ind + target_output = np.reshape(target_output, (1, -1, 1)) + + weights_vec = np.zeros((1, 1, seqlen), dtype=np.float32) + weights_vec[:, -1, -1] = 1.0 + weights_vec = np.reshape(weights_vec, (1, -1, 1)) + + return cudavec(x_seq_list, gpu_id=args.cuda).float(), \ + cudavec(target_output, gpu_id=args.cuda).float(), \ + cudavec(weights_vec, gpu_id=args.cuda) + + +if __name__ == '__main__': + + dirname = os.path.dirname(__file__) + ckpts_dir = os.path.join(dirname, 'checkpoints') + + input_size = args.input_size + memory_type = args.memory_type + lr = args.lr + clip = args.clip + batch_size = args.batch_size + sequence_max_length = args.sequence_max_length + cuda = args.cuda + iterations = args.iterations + summarize_freq = args.summarize_freq + check_freq = args.check_freq + visdom = args.visdom + + from_checkpoint = None + + if args.memory_type == 'dnc': + rnn = DNC( + input_size=args.input_size, + hidden_size=args.nhid, + rnn_type=args.rnn_type, 
+ num_layers=args.nlayer, + num_hidden_layers=args.nhlayer, + dropout=args.dropout, + nr_cells=args.mem_slot, + cell_size=args.mem_size, + read_heads=args.read_heads, + gpu_id=args.cuda, + debug=args.visdom, + batch_first=True, + independent_linears=False + ) + elif args.memory_type == 'sdnc': + rnn = SDNC( + input_size=args.input_size, + hidden_size=args.nhid, + rnn_type=args.rnn_type, + num_layers=args.nlayer, + num_hidden_layers=args.nhlayer, + dropout=args.dropout, + nr_cells=args.mem_slot, + cell_size=args.mem_size, + sparse_reads=args.sparse_reads, + temporal_reads=args.temporal_reads, + read_heads=args.read_heads, + gpu_id=args.cuda, + debug=args.visdom, + batch_first=True, + independent_linears=False + ) + elif args.memory_type == 'sam': + rnn = SAM( + input_size=args.input_size, + hidden_size=args.nhid, + rnn_type=args.rnn_type, + num_layers=args.nlayer, + num_hidden_layers=args.nhlayer, + dropout=args.dropout, + nr_cells=args.mem_slot, + cell_size=args.mem_size, + sparse_reads=args.sparse_reads, + read_heads=args.read_heads, + gpu_id=args.cuda, + debug=args.visdom, + batch_first=True, + independent_linears=False + ) + else: + raise Exception('Not recognized type of memory') + + if args.cuda != -1: + rnn = rnn.cuda(args.cuda) + + print(rnn) + + last_save_losses = [] + + if args.optim == 'adam': + optimizer = optim.Adam(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 + elif args.optim == 'adamax': + optimizer = optim.Adamax(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 + elif args.optim == 'rmsprop': + optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, momentum=0.9, eps=1e-10) # 0.0001 + elif args.optim == 'sgd': + optimizer = optim.SGD(rnn.parameters(), lr=args.lr) # 0.01 + elif args.optim == 'adagrad': + optimizer = optim.Adagrad(rnn.parameters(), lr=args.lr) + elif args.optim == 'adadelta': + optimizer = optim.Adadelta(rnn.parameters(), lr=args.lr) + + last_100_losses = [] + + (chx, mhx, rv) = (None, None, None) + for epoch in range(iterations + 1): + llprint("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations)) + optimizer.zero_grad() + + # We use for training just (sequence_max_length / 10) examples + random_length = np.random.randint(2, (sequence_max_length) + 1) + input_data, target_output, loss_weights = generate_data(random_length, input_size) + + if rnn.debug: + output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + else: + output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + + loss = T.mean(((loss_weights * output).sum(-1, keepdim=True) - target_output) ** 2) + + loss.backward() + + T.nn.utils.clip_grad_norm(rnn.parameters(), args.clip) + optimizer.step() + loss_value = loss.data[0] + + # detach memory from graph + mhx = { k : (v.detach() if isinstance(v, var) else v) for k, v in mhx.items() } + + summarize = (epoch % summarize_freq == 0) + take_checkpoint = (epoch != 0) and (epoch % iterations == 0) + + last_100_losses.append(loss_value) + + try: + if summarize: + output = (loss_weights * output).sum().data.cpu().numpy()[0] + target_output = target_output.sum().data.cpu().numpy() + + llprint("\rIteration %d/%d" % (epoch, iterations)) + llprint("\nAvg. 
Logistic Loss: %.4f\n" % (np.mean(last_100_losses))) + print(target_output) + print("Real value: ", ' = ' + str(int(target_output[0]))) + print("Predicted: ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]") + last_100_losses = [] + + if take_checkpoint: + llprint("\nSaving Checkpoint ... "), + check_ptr = os.path.join(ckpts_dir, 'step_{}.pth'.format(epoch)) + cur_weights = rnn.state_dict() + T.save(cur_weights, check_ptr) + llprint("Done!\n") + except Exception as e: + pass + + llprint("\nTesting generalization...\n") + + rnn.eval() + + for i in range(int((iterations + 1) / 10)): + llprint("\nIteration %d/%d" % (i, iterations)) + # We test now the learned generalization using sequence_max_length examples + random_length = np.random.randint(2, sequence_max_length * 2 + 1) + input_data, target_output, loss_weights = generate_data(random_length, input_size) + + if rnn.debug: + output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + else: + output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) + + output = output[:, -1, :].sum().data.cpu().numpy()[0] + target_output = target_output.sum().data.cpu().numpy() + + try: + print("\nReal value: ", ' = ' + str(int(target_output[0]))) + print("Predicted: ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]") + except Exception as e: + pass From 2c359e9a86df4eb2e68ed474c1d8639941a05c1d Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Wed, 20 Dec 2017 02:08:34 +0530 Subject: [PATCH 10/10] Make FAISS work properly, fall back to flann when not available, fixes #23 --- README.md | 15 +++++++ dnc/faiss_index.py | 24 ++++++------ dnc/sparse_memory.py | 56 +++++++++++++++----------- dnc/sparse_temporal_memory.py | 74 +++++++++++++++++++++-------------- dnc/util.py | 2 +- 5 files changed, 106 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 42884de..a6bd058 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,12 @@ pip install -r ./requirements.txt pip install -e . ``` +For using fully GPU based SDNCs or SAMs, install FAISS: + +```bash +conda install faiss-gpu -c pytorch +``` + `pytest` is required to run the test ## Architecure @@ -465,6 +471,15 @@ make -j 4 sudo make install ``` +FAISS can be installed using: + +```bash +conda install faiss-gpu -c pytorch +``` + +FAISS is much faster, has a GPU implementation and is interoperable with pytorch tensors. +We try to use FAISS by default, in absence of which we fall back to FLANN. + 2. An alternative to FLANN is [FAISS](https://github.com/facebookresearch/faiss), which is much faster and interoperable with torch cuda tensors (but is difficult to distribute, see [dnc/faiss_index.py](dnc/faiss_index.py)). 3. 
+
 2. An alternative to FLANN is [FAISS](https://github.com/facebookresearch/faiss), which is much faster and interoperable with torch cuda tensors (but is difficult to distribute, see [dnc/faiss_index.py](dnc/faiss_index.py)).
 
 3. `nan`s in the gradients are common; try different batch sizes.

diff --git a/dnc/faiss_index.py b/dnc/faiss_index.py
index 00e5b38..a7dc516 100644
--- a/dnc/faiss_index.py
+++ b/dnc/faiss_index.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-from faiss import faiss
+import faiss
 
-from faiss.faiss import cast_integer_to_float_ptr as cast_float
-from faiss.faiss import cast_integer_to_int_ptr as cast_int
-from faiss.faiss import cast_integer_to_long_ptr as cast_long
+from faiss import cast_integer_to_float_ptr as cast_float
+from faiss import cast_integer_to_int_ptr as cast_int
+from faiss import cast_integer_to_long_ptr as cast_long
 
 from .util import *
 
@@ -21,16 +21,16 @@ class FAISSIndex(object):
     self.num_lists = num_lists
     self.gpu_id = gpu_id
 
-    res = res if res else faiss.StandardGpuResources()
-    res.setTempMemoryFraction(0.01)
+    # BEWARE: if this variable gets deallocated, FAISS crashes
+    self.res = res if res else faiss.StandardGpuResources()
+    self.res.setTempMemoryFraction(0.01)
     if self.gpu_id != -1:
-      res.initializeForDevice(self.gpu_id)
+      self.res.initializeForDevice(self.gpu_id)
 
     nr_samples = self.nr_cells * 100 * self.cell_size
-    train = train if train is not None else T.randn(self.nr_cells * 100, self.cell_size) * 10
-    # train = T.randn(self.nr_cells * 100, self.cell_size)
+    train = train if train is not None else T.randn(self.nr_cells * 100, self.cell_size)
 
-    self.index = faiss.GpuIndexIVFFlat(res, self.cell_size, self.num_lists, faiss.METRIC_INNER_PRODUCT)
+    self.index = faiss.GpuIndexIVFFlat(self.res, self.cell_size, self.num_lists, faiss.METRIC_L2)
     self.index.setNumProbes(self.probes)
     self.train(train)
 
@@ -48,7 +48,7 @@ class FAISSIndex(object):
     self.index.reset()
     T.cuda.synchronize()
 
-  def add(self, other, positions=None, last=-1):
+  def add(self, other, positions=None, last=None):
     other = ensure_gpu(other, self.gpu_id)
     T.cuda.synchronize()
 
@@ -57,7 +57,7 @@ class FAISSIndex(object):
       assert positions.size(0) == other.size(0), "Mismatch in number of positions and vectors"
       self.index.add_with_ids_c(other.size(0), cast_float(ptr(other)), cast_long(ptr(positions + 1)))
     else:
-      other = other[:last, :]
+      other = other[:last, :] if last is not None else other
       self.index.add_c(other.size(0), cast_float(ptr(other)))
 
     T.cuda.synchronize()
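For orientation, this is roughly how the patched index is driven (constructor keywords and `add()` semantics as in the diff above; the sizes are assumed, and a CUDA device is required):

```python
import torch as T
from dnc.faiss_index import FAISSIndex

index = FAISSIndex(cell_size=32, nr_cells=100, K=4, num_lists=32,
                   probes=32, gpu_id=0)  # FAISS indexes live on a GPU
memory = T.randn(100, 32)                # one batch member's (nr_cells x cell_size) memory
index.reset()
index.add(memory, last=10)               # index only the first 10 (written) cells
```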
diff --git a/dnc/sparse_memory.py b/dnc/sparse_memory.py
index f8d5155..d06e975 100644
--- a/dnc/sparse_memory.py
+++ b/dnc/sparse_memory.py
@@ -8,7 +8,6 @@ import torch.nn.functional as F
 import numpy as np
 import math
 
-from .flann_index import FLANNIndex
 from .util import *
 
 import time
@@ -44,11 +43,12 @@ class SparseMemory(nn.Module):
     m = self.mem_size
     w = self.cell_size
     r = self.read_heads
-    # The visible memory size: (K * R read heads, forward and backward temporal reads of size KL and least used memory cell)
+    # The visible memory size: K entries per read head (K * R) plus the
+    # least used memory cell
     self.c = (r * self.K) + 1
 
     if self.independent_linears:
-      self.read_query_transform = nn.Linear(self.input_size, w*r)
+      self.read_query_transform = nn.Linear(self.input_size, w * r)
       self.write_vector_transform = nn.Linear(self.input_size, w)
       self.interpolation_gate_transform = nn.Linear(self.input_size, self.c)
       self.write_gate_transform = nn.Linear(self.input_size, 1)
@@ -72,11 +72,20 @@ class SparseMemory(nn.Module):
       if 'indexes' in hidden:
         [x.reset() for x in hidden['indexes']]
       else:
-        # create new indexes
-        hidden['indexes'] = \
-            [FLANNIndex(cell_size=self.cell_size,
-                        nr_cells=self.mem_size, K=self.K, num_kdtrees=self.num_lists,
-                        probes=self.index_checks, gpu_id=self.mem_gpu_id) for x in range(b)]
+        # create new indexes, try to use FAISS, fall back to FLANN
+        try:
+          from .faiss_index import FAISSIndex
+          hidden['indexes'] = \
+              [FAISSIndex(cell_size=self.cell_size,
+                          nr_cells=self.mem_size, K=self.K, num_lists=self.num_lists,
+                          probes=self.index_checks, gpu_id=self.mem_gpu_id) for x in range(b)]
+        except Exception as e:
+          print("\nFalling back to FLANN (CPU). \nFor faster, GPU-based indexes, install FAISS: `conda install faiss-gpu -c pytorch`")
+          from .flann_index import FLANNIndex
+          hidden['indexes'] = \
+              [FLANNIndex(cell_size=self.cell_size,
+                          nr_cells=self.mem_size, K=self.K, num_kdtrees=self.num_lists,
+                          probes=self.index_checks, gpu_id=self.mem_gpu_id) for x in range(b)]
 
       # add existing memory into indexes
       pos = hidden['read_positions'].squeeze().data.cpu().numpy()
@@ -104,7 +113,7 @@ class SparseMemory(nn.Module):
           'read_weights': cuda(T.zeros(b, m).fill_(δ), gpu_id=self.gpu_id),
           'write_weights': cuda(T.zeros(b, m).fill_(δ), gpu_id=self.gpu_id),
           'read_vectors': cuda(T.zeros(b, r, w).fill_(δ), gpu_id=self.gpu_id),
-          'least_used_mem': cuda(T.zeros(b, 1).fill_(c+1), gpu_id=self.gpu_id).long(),
+          'least_used_mem': cuda(T.zeros(b, 1).fill_(c + 1), gpu_id=self.gpu_id).long(),
           'usage': cuda(T.zeros(b, m).fill_(δ), gpu_id=self.gpu_id),
           'read_positions': cuda(T.arange(0, c).expand(b, c), gpu_id=self.gpu_id).long()
       }
@@ -126,9 +135,10 @@ class SparseMemory(nn.Module):
       hidden['read_weights'].data.fill_(δ)
       hidden['write_weights'].data.fill_(δ)
       hidden['read_vectors'].data.fill_(δ)
-      hidden['least_used_mem'].data.fill_(c+1+self.timestep)
+      hidden['least_used_mem'].data.fill_(c + 1 + self.timestep)
       hidden['usage'].data.fill_(δ)
-      hidden['read_positions'] = cuda(T.arange(self.timestep, c+self.timestep).expand(b, c), gpu_id=self.gpu_id).long()
+      hidden['read_positions'] = cuda(
+          T.arange(self.timestep, c + self.timestep).expand(b, c), gpu_id=self.gpu_id).long()
 
     return hidden
 
@@ -147,8 +157,9 @@ class SparseMemory(nn.Module):
         hidden['indexes'][batch].reset()
         hidden['indexes'][batch].add(hidden['memory'][batch], last=pos[batch][-1])
 
-    mem_limit_reached = hidden['least_used_mem'][0].data.cpu().numpy()[0] >= self.mem_size-1
-    hidden['least_used_mem'] = (hidden['least_used_mem'] * 0 + self.c + 1) if mem_limit_reached else hidden['least_used_mem'] + 1
+    mem_limit_reached = hidden['least_used_mem'][0].data.cpu().numpy()[0] >= self.mem_size - 1
+    hidden['least_used_mem'] = (hidden['least_used_mem'] * 0 + self.c +
+                                1) if mem_limit_reached else hidden['least_used_mem'] + 1
 
     return hidden
 
@@ -177,7 +188,8 @@ class SparseMemory(nn.Module):
     erase_matrix = I.unsqueeze(2).expand(hidden['visible_memory'].size())
 
     # write into memory
-    hidden['visible_memory'] = hidden['visible_memory'] * (1 - erase_matrix) + T.bmm(write_weights.unsqueeze(2), write_vector)
+    hidden['visible_memory'] = hidden['visible_memory'] * \
+        (1 - erase_matrix) + T.bmm(write_weights.unsqueeze(2), write_vector)
     hidden = self.write_into_sparse_memory(hidden)
 
     return hidden
@@ -240,11 +252,11 @@ class SparseMemory(nn.Module):
     # sparse read
     read_vectors, positions, read_weights, visible_memory = \
         self.read_from_sparse_memory(
-        hidden['memory'],
-        hidden['indexes'],
-        read_query,
-        hidden['least_used_mem'],
-        hidden['usage']
+            hidden['memory'],
+            hidden['indexes'],
+            read_query,
+            hidden['least_used_mem'],
+            hidden['usage']
     )
 
     hidden['read_positions'] = positions
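The write in the hunk above is the usual gated erase-plus-add memory update, restricted to the visible (sparsely addressed) cells; a minimal shape sketch with assumed toy dimensions:

```python
import torch as T

b, c, w = 2, 9, 16                       # batch, visible cells, cell width
visible_memory = T.zeros(b, c, w)
write_weights = T.rand(b, c)             # one write weight per visible cell
write_vector = T.rand(b, 1, w)           # content to be written
I = T.rand(b, c)                         # erase gates, one per visible cell

erase_matrix = I.unsqueeze(2).expand(visible_memory.size())
visible_memory = visible_memory * (1 - erase_matrix) + \
    T.bmm(write_weights.unsqueeze(2), write_vector)
print(visible_memory.size())             # (b, c, w)
```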
@@ -276,11 +288,11 @@ class SparseMemory(nn.Module):
     else:
       ξ = self.interface_weights(ξ)
       # r read keys (b * r * w)
-      read_query = ξ[:, :r*w].contiguous().view(b, r, w)
+      read_query = ξ[:, :r * w].contiguous().view(b, r, w)
       # write vector (b * 1 * w)
-      write_vector = ξ[:, r*w: r*w + w].contiguous().view(b, 1, w)
+      write_vector = ξ[:, r * w: r * w + w].contiguous().view(b, 1, w)
       # interpolation gate (b * c)
-      interpolation_gate = F.sigmoid(ξ[:, r*w + w: r*w + w + c]).contiguous().view(b, c)
+      interpolation_gate = F.sigmoid(ξ[:, r * w + w: r * w + w + c]).contiguous().view(b, c)
       # write gate (b * 1)
       write_gate = F.sigmoid(ξ[:, -1].contiguous()).unsqueeze(1).view(b, 1)
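As a standalone illustration of the slicing above (toy sizes assumed; this mirrors the hunk rather than importing from the repo):

```python
import torch as T
import torch.nn.functional as F

b, r, w, c = 2, 4, 16, 9       # batch, read heads, cell width, visible cells
ξ = T.randn(b, r * w + w + c + 1)

read_query = ξ[:, :r * w].contiguous().view(b, r, w)              # r read keys
write_vector = ξ[:, r * w: r * w + w].contiguous().view(b, 1, w)  # write vector
interpolation_gate = F.sigmoid(ξ[:, r * w + w: r * w + w + c]).contiguous().view(b, c)
write_gate = F.sigmoid(ξ[:, -1].contiguous()).unsqueeze(1)        # (b, 1)
```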
diff --git a/dnc/sparse_temporal_memory.py b/dnc/sparse_temporal_memory.py
index 1154a40..2ddac8d 100644
--- a/dnc/sparse_temporal_memory.py
+++ b/dnc/sparse_temporal_memory.py
@@ -46,11 +46,12 @@ class SparseTemporalMemory(nn.Module):
     m = self.mem_size
     w = self.cell_size
     r = self.read_heads
-    # The visible memory size: (K * R read heads, forward and backward temporal reads of size KL and least used memory cell)
+    # The visible memory size: (K * R read heads, forward and backward
+    # temporal reads of size KL and least used memory cell)
     self.c = (r * self.K) + (self.KL * 2) + 1
 
     if self.independent_linears:
-      self.read_query_transform = nn.Linear(self.input_size, w*r)
+      self.read_query_transform = nn.Linear(self.input_size, w * r)
       self.write_vector_transform = nn.Linear(self.input_size, w)
       self.interpolation_gate_transform = nn.Linear(self.input_size, self.c)
       self.write_gate_transform = nn.Linear(self.input_size, 1)
@@ -75,10 +76,19 @@ class SparseTemporalMemory(nn.Module):
         [x.reset() for x in hidden['indexes']]
       else:
-        # create new indexes
-        hidden['indexes'] = \
-            [FLANNIndex(cell_size=self.cell_size,
-                        nr_cells=self.mem_size, K=self.K, num_kdtrees=self.num_lists,
-                        probes=self.index_checks, gpu_id=self.mem_gpu_id) for x in range(b)]
+        # create new indexes, try to use FAISS, fall back to FLANN
+        try:
+          from .faiss_index import FAISSIndex
+          hidden['indexes'] = \
+              [FAISSIndex(cell_size=self.cell_size,
+                          nr_cells=self.mem_size, K=self.K, num_lists=self.num_lists,
+                          probes=self.index_checks, gpu_id=self.mem_gpu_id) for x in range(b)]
+        except Exception as e:
+          print("\nFalling back to FLANN (CPU). \nFor faster, GPU-based indexes, install FAISS: `conda install faiss-gpu -c pytorch`")
+          from .flann_index import FLANNIndex
+          hidden['indexes'] = \
+              [FLANNIndex(cell_size=self.cell_size,
+                          nr_cells=self.mem_size, K=self.K, num_kdtrees=self.num_lists,
+                          probes=self.index_checks, gpu_id=self.mem_gpu_id) for x in range(b)]
 
       # add existing memory into indexes
       pos = hidden['read_positions'].squeeze().data.cpu().numpy()
@@ -103,13 +113,13 @@ class SparseTemporalMemory(nn.Module):
           # warning can be a huge chunk of contiguous memory
           'memory': cuda(T.zeros(b, m, w).fill_(δ), gpu_id=self.mem_gpu_id),
           'visible_memory': cuda(T.zeros(b, c, w).fill_(δ), gpu_id=self.mem_gpu_id),
-          'link_matrix': cuda(T.zeros(b, m, self.KL*2), gpu_id=self.gpu_id),
-          'rev_link_matrix': cuda(T.zeros(b, m, self.KL*2), gpu_id=self.gpu_id),
-          'precedence': cuda(T.zeros(b, self.KL*2).fill_(δ), gpu_id=self.gpu_id),
+          'link_matrix': cuda(T.zeros(b, m, self.KL * 2), gpu_id=self.gpu_id),
+          'rev_link_matrix': cuda(T.zeros(b, m, self.KL * 2), gpu_id=self.gpu_id),
+          'precedence': cuda(T.zeros(b, self.KL * 2).fill_(δ), gpu_id=self.gpu_id),
           'read_weights': cuda(T.zeros(b, m).fill_(δ), gpu_id=self.gpu_id),
           'write_weights': cuda(T.zeros(b, m).fill_(δ), gpu_id=self.gpu_id),
           'read_vectors': cuda(T.zeros(b, r, w).fill_(δ), gpu_id=self.gpu_id),
-          'least_used_mem': cuda(T.zeros(b, 1).fill_(c+1), gpu_id=self.gpu_id).long(),
+          'least_used_mem': cuda(T.zeros(b, 1).fill_(c + 1), gpu_id=self.gpu_id).long(),
           'usage': cuda(T.zeros(b, m).fill_(δ), gpu_id=self.gpu_id),
           'read_positions': cuda(T.arange(0, c).expand(b, c), gpu_id=self.gpu_id).long()
       }
@@ -137,9 +147,10 @@ class SparseTemporalMemory(nn.Module):
       hidden['read_weights'].data.fill_(δ)
       hidden['write_weights'].data.fill_(δ)
      hidden['read_vectors'].data.fill_(δ)
-      hidden['least_used_mem'].data.fill_(c+1+self.timestep)
+      hidden['least_used_mem'].data.fill_(c + 1 + self.timestep)
       hidden['usage'].data.fill_(δ)
-      hidden['read_positions'] = cuda(T.arange(self.timestep, c+self.timestep).expand(b, c), gpu_id=self.gpu_id).long()
+      hidden['read_positions'] = cuda(
+          T.arange(self.timestep, c + self.timestep).expand(b, c), gpu_id=self.gpu_id).long()
 
     return hidden
 
@@ -158,8 +169,9 @@ class SparseTemporalMemory(nn.Module):
         hidden['indexes'][batch].reset()
         hidden['indexes'][batch].add(hidden['memory'][batch], last=pos[batch][-1])
 
-    mem_limit_reached = hidden['least_used_mem'][0].data.cpu().numpy()[0] >= self.mem_size-1
-    hidden['least_used_mem'] = (hidden['least_used_mem'] * 0 + self.c + 1) if mem_limit_reached else hidden['least_used_mem'] + 1
+    mem_limit_reached = hidden['least_used_mem'][0].data.cpu().numpy()[0] >= self.mem_size - 1
+    hidden['least_used_mem'] = (hidden['least_used_mem'] * 0 + self.c +
+                                1) if mem_limit_reached else hidden['least_used_mem'] + 1
 
     return hidden
 
@@ -179,7 +191,8 @@ class SparseTemporalMemory(nn.Module):
 
     link_matrix = (1 - write_weights_i) * link_matrix + write_weights_i * precedence_j
 
-    rev_link_matrix = (1 - temporal_write_weights_j) * rev_link_matrix + (temporal_write_weights_j * precedence_dense_i)
+    rev_link_matrix = (1 - temporal_write_weights_j) * rev_link_matrix + \
+        (temporal_write_weights_j * precedence_dense_i)
 
     return link_matrix * I, rev_link_matrix * I
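In equation form, `update_link_matrices` above maintains, for each tracked slot pair (i, j), roughly the following recurrences (a sketch inferred from the variable names: w for write weights, p for precedence, tildes for the temporal/dense variants; both results are finally scaled elementwise by the mask `I`):

```latex
L_t[i,j]       = (1 - w_t[i]) \, L_{t-1}[i,j] + w_t[i] \, p_{t-1}[j]
\hat{L}_t[i,j] = (1 - \tilde{w}_t[j]) \, \hat{L}_{t-1}[i,j] + \tilde{w}_t[j] \, \tilde{p}_{t-1}[i]
```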
@@ -211,22 +224,23 @@ class SparseTemporalMemory(nn.Module):
     erase_matrix = I.unsqueeze(2).expand(hidden['visible_memory'].size())
 
     # write into memory
-    hidden['visible_memory'] = hidden['visible_memory'] * (1 - erase_matrix) + T.bmm(write_weights.unsqueeze(2), write_vector)
+    hidden['visible_memory'] = hidden['visible_memory'] * \
+        (1 - erase_matrix) + T.bmm(write_weights.unsqueeze(2), write_vector)
     hidden = self.write_into_sparse_memory(hidden)
 
     # update link_matrix and precedence
     (b, c) = write_weights.size()
 
     # update link matrix
-    temporal_read_positions = hidden['read_positions'][:, self.read_heads*self.K+1:]
+    temporal_read_positions = hidden['read_positions'][:, self.read_heads * self.K + 1:]
     hidden['link_matrix'], hidden['rev_link_matrix'] = \
-      self.update_link_matrices(
+        self.update_link_matrices(
             hidden['link_matrix'],
             hidden['rev_link_matrix'],
             hidden['write_weights'],
             hidden['precedence'],
             temporal_read_positions
-      )
+        )
 
     # update precedence vector
     read_weights = hidden['read_weights'].gather(1, temporal_read_positions)
@@ -299,20 +313,20 @@ class SparseTemporalMemory(nn.Module):
 
   def read(self, read_query, hidden):
     # get forward and backward weights
-    temporal_read_positions = hidden['read_positions'][:, self.read_heads*self.K+1:]
+    temporal_read_positions = hidden['read_positions'][:, self.read_heads * self.K + 1:]
     read_weights = hidden['read_weights'].gather(1, temporal_read_positions)
     forward, backward = self.directional_weightings(hidden['link_matrix'], hidden['rev_link_matrix'], read_weights)
 
     # sparse read
     read_vectors, positions, read_weights, visible_memory = \
         self.read_from_sparse_memory(
-        hidden['memory'],
-        hidden['indexes'],
-        read_query,
-        hidden['least_used_mem'],
-        hidden['usage'],
-        forward, backward,
-        hidden['read_positions']
+            hidden['memory'],
+            hidden['indexes'],
+            read_query,
+            hidden['least_used_mem'],
+            hidden['usage'],
+            forward, backward,
+            hidden['read_positions']
     )
 
     hidden['read_positions'] = positions
@@ -344,11 +358,11 @@ class SparseTemporalMemory(nn.Module):
     else:
       ξ = self.interface_weights(ξ)
       # r read keys (b * r * w)
-      read_query = ξ[:, :r*w].contiguous().view(b, r, w)
+      read_query = ξ[:, :r * w].contiguous().view(b, r, w)
       # write vector (b * 1 * w)
-      write_vector = ξ[:, r*w: r*w + w].contiguous().view(b, 1, w)
+      write_vector = ξ[:, r * w: r * w + w].contiguous().view(b, 1, w)
       # interpolation gate (b * c)
-      interpolation_gate = F.sigmoid(ξ[:, r*w + w: r*w + w + c]).contiguous().view(b, c)
+      interpolation_gate = F.sigmoid(ξ[:, r * w + w: r * w + w + c]).contiguous().view(b, c)
       # write gate (b * 1)
       write_gate = F.sigmoid(ξ[:, -1].contiguous()).unsqueeze(1).view(b, 1)

diff --git a/dnc/util.py b/dnc/util.py
index feaeead..5602ceb 100644
--- a/dnc/util.py
+++ b/dnc/util.py
@@ -138,7 +138,7 @@ def ptr(tensor):
   if T.is_tensor(tensor):
     return tensor.storage().data_ptr()
   elif hasattr(tensor, 'data'):
-    return tensor.data.storage().data_ptr()
+    return tensor.clone().data.storage().data_ptr()
   else:
     return tensor
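With the fallback in place, the same model code runs with or without FAISS installed. A minimal smoke test of the patched stack, assuming the repo's requirements are installed and using arbitrary small sizes:

```python
import torch as T
from torch.autograd import Variable as var
from dnc.sam import SAM

rnn = SAM(input_size=6, hidden_size=64, rnn_type='lstm', num_layers=1,
          nr_cells=100, cell_size=16, read_heads=4, sparse_reads=4,
          gpu_id=-1, batch_first=True)   # gpu_id=-1: CPU, so FLANN unless FAISS is present

x = var(T.randn(5, 12, 6))               # (batch, time, input_size)
y, (chx, mhx, rv) = rnn(x, (None, None, None), reset_experience=True)
print(y.size())                          # same (batch, time, ...) layout as the input
```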