diff --git a/dnc/dnc.py b/dnc/dnc.py
index 8c22c2f..4cb0666 100644
--- a/dnc/dnc.py
+++ b/dnc/dnc.py
@@ -13,7 +13,7 @@ from torch.nn.utils.rnn import PackedSequence
 
 from .util import *
 from .memory import *
-from torch.nn.init import orthogonal, xavier_uniform
+from torch.nn.init import orthogonal_, xavier_uniform_
 
 
 class DNC(nn.Module):
@@ -115,7 +115,7 @@ class DNC(nn.Module):
 
     # final output layer
     self.output = nn.Linear(self.nn_output_size, self.input_size)
-    orthogonal(self.output.weight)
+    orthogonal_(self.output.weight)
 
     if self.gpu_id != -1:
       [x.cuda(self.gpu_id) for x in self.rnns]
@@ -130,7 +130,7 @@ class DNC(nn.Module):
     # initialize hidden state of the controller RNN
     if chx is None:
       h = cuda(T.zeros(self.num_hidden_layers, batch_size, self.output_size), gpu_id=self.gpu_id)
-      xavier_uniform(h)
+      xavier_uniform_(h)
 
       chx = [ (h, h) if self.rnn_type.lower() == 'lstm' else h for x in range(self.num_layers)]
diff --git a/dnc/memory.py b/dnc/memory.py
index 46917d8..455a7c2 100644
--- a/dnc/memory.py
+++ b/dnc/memory.py
@@ -214,45 +214,45 @@ class Memory(nn.Module):
 
     if self.independent_linears:
       # r read keys (b * r * w)
-      read_keys = F.tanh(self.read_keys_transform(ξ).view(b, r, w))
+      read_keys = T.tanh(self.read_keys_transform(ξ).view(b, r, w))
       # r read strengths (b * r)
       read_strengths = F.softplus(self.read_strengths_transform(ξ).view(b, r))
       # write key (b * 1 * w)
-      write_key = F.tanh(self.write_key_transform(ξ).view(b, 1, w))
+      write_key = T.tanh(self.write_key_transform(ξ).view(b, 1, w))
       # write strength (b * 1)
       write_strength = F.softplus(self.write_strength_transform(ξ).view(b, 1))
       # erase vector (b * 1 * w)
-      erase_vector = F.sigmoid(self.erase_vector_transform(ξ).view(b, 1, w))
+      erase_vector = T.sigmoid(self.erase_vector_transform(ξ).view(b, 1, w))
       # write vector (b * 1 * w)
-      write_vector = F.tanh(self.write_vector_transform(ξ).view(b, 1, w))
+      write_vector = T.tanh(self.write_vector_transform(ξ).view(b, 1, w))
       # r free gates (b * r)
-      free_gates = F.sigmoid(self.free_gates_transform(ξ).view(b, r))
+      free_gates = T.sigmoid(self.free_gates_transform(ξ).view(b, r))
       # allocation gate (b * 1)
-      allocation_gate = F.sigmoid(self.allocation_gate_transform(ξ).view(b, 1))
+      allocation_gate = T.sigmoid(self.allocation_gate_transform(ξ).view(b, 1))
       # write gate (b * 1)
-      write_gate = F.sigmoid(self.write_gate_transform(ξ).view(b, 1))
+      write_gate = T.sigmoid(self.write_gate_transform(ξ).view(b, 1))
       # read modes (b * r * 3)
       read_modes = σ(self.read_modes_transform(ξ).view(b, r, 3), 1)
     else:
       ξ = self.interface_weights(ξ)
       # r read keys (b * w * r)
-      read_keys = F.tanh(ξ[:, :r * w].contiguous().view(b, r, w))
+      read_keys = T.tanh(ξ[:, :r * w].contiguous().view(b, r, w))
       # r read strengths (b * r)
       read_strengths = F.softplus(ξ[:, r * w:r * w + r].contiguous().view(b, r))
       # write key (b * w * 1)
-      write_key = F.tanh(ξ[:, r * w + r:r * w + r + w].contiguous().view(b, 1, w))
+      write_key = T.tanh(ξ[:, r * w + r:r * w + r + w].contiguous().view(b, 1, w))
       # write strength (b * 1)
       write_strength = F.softplus(ξ[:, r * w + r + w].contiguous().view(b, 1))
       # erase vector (b * w)
-      erase_vector = F.sigmoid(ξ[:, r * w + r + w + 1: r * w + r + 2 * w + 1].contiguous().view(b, 1, w))
+      erase_vector = T.sigmoid(ξ[:, r * w + r + w + 1: r * w + r + 2 * w + 1].contiguous().view(b, 1, w))
       # write vector (b * w)
-      write_vector = F.tanh(ξ[:, r * w + r + 2 * w + 1: r * w + r + 3 * w + 1].contiguous().view(b, 1, w))
+      write_vector = T.tanh(ξ[:, r * w + r + 2 * w + 1: r * w + r + 3 * w + 1].contiguous().view(b, 1, w))
       # r free gates (b * r)
-      free_gates = F.sigmoid(ξ[:, r * w + r + 3 * w + 1: r * w + 2 * r + 3 * w + 1].contiguous().view(b, r))
+      free_gates = T.sigmoid(ξ[:, r * w + r + 3 * w + 1: r * w + 2 * r + 3 * w + 1].contiguous().view(b, r))
       # allocation gate (b * 1)
-      allocation_gate = F.sigmoid(ξ[:, r * w + 2 * r + 3 * w + 1].contiguous().unsqueeze(1).view(b, 1))
+      allocation_gate = T.sigmoid(ξ[:, r * w + 2 * r + 3 * w + 1].contiguous().unsqueeze(1).view(b, 1))
       # write gate (b * 1)
-      write_gate = F.sigmoid(ξ[:, r * w + 2 * r + 3 * w + 2].contiguous()).unsqueeze(1).view(b, 1)
+      write_gate = T.sigmoid(ξ[:, r * w + 2 * r + 3 * w + 2].contiguous()).unsqueeze(1).view(b, 1)
       # read modes (b * 3*r)
       read_modes = σ(ξ[:, r * w + 2 * r + 3 * w + 3: r * w + 5 * r + 3 * w + 3].contiguous().view(b, r, 3), 1)
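
Note: newer PyTorch releases deprecate F.sigmoid and F.tanh in favour of the tensor-level torch.sigmoid / torch.tanh, which is why the interface-parsing code above switches from the F.* to the T.* spellings. A minimal, self-contained sketch of the substitution (illustrative only, not part of the patch):

import torch as T
import torch.nn.functional as F

x = T.randn(2, 3)              # stand-in for a transformed interface vector
read_keys_old = F.tanh(x)      # deprecated spelling, emits a warning on newer PyTorch
read_keys_new = T.tanh(x)      # supported spelling, identical values
write_gate_new = T.sigmoid(x)  # likewise for the gates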
diff --git a/dnc/sam.py b/dnc/sam.py
index b8786db..226bd62 100644
--- a/dnc/sam.py
+++ b/dnc/sam.py
@@ -9,7 +9,7 @@ import numpy as np
 from torch.nn.utils.rnn import pad_packed_sequence as pad
 from torch.nn.utils.rnn import pack_padded_sequence as pack
 from torch.nn.utils.rnn import PackedSequence
-from torch.nn.init import orthogonal, xavier_uniform
+from torch.nn.init import orthogonal_, xavier_uniform_
 
 from .util import *
 from .sparse_memory import SparseMemory
diff --git a/dnc/sdnc.py b/dnc/sdnc.py
index f2e1385..4a3e417 100644
--- a/dnc/sdnc.py
+++ b/dnc/sdnc.py
@@ -9,7 +9,7 @@ import numpy as np
 from torch.nn.utils.rnn import pad_packed_sequence as pad
 from torch.nn.utils.rnn import pack_padded_sequence as pack
 from torch.nn.utils.rnn import PackedSequence
-from torch.nn.init import orthogonal, xavier_uniform
+from torch.nn.init import orthogonal_, xavier_uniform_
 
 from .util import *
 from .sparse_temporal_memory import SparseTemporalMemory
diff --git a/dnc/sparse_memory.py b/dnc/sparse_memory.py
index 9f48c8f..75cc6fc 100644
--- a/dnc/sparse_memory.py
+++ b/dnc/sparse_memory.py
@@ -52,14 +52,14 @@ class SparseMemory(nn.Module):
       self.write_vector_transform = nn.Linear(self.input_size, w)
       self.interpolation_gate_transform = nn.Linear(self.input_size, self.c)
       self.write_gate_transform = nn.Linear(self.input_size, 1)
-      T.nn.init.orthogonal(self.read_query_transform.weight)
-      T.nn.init.orthogonal(self.write_vector_transform.weight)
-      T.nn.init.orthogonal(self.interpolation_gate_transform.weight)
-      T.nn.init.orthogonal(self.write_gate_transform.weight)
+      T.nn.init.orthogonal_(self.read_query_transform.weight)
+      T.nn.init.orthogonal_(self.write_vector_transform.weight)
+      T.nn.init.orthogonal_(self.interpolation_gate_transform.weight)
+      T.nn.init.orthogonal_(self.write_gate_transform.weight)
     else:
       self.interface_size = (r * w) + w + self.c + 1
       self.interface_weights = nn.Linear(self.input_size, self.interface_size)
-      T.nn.init.orthogonal(self.interface_weights.weight)
+      T.nn.init.orthogonal_(self.interface_weights.weight)
 
     self.I = cuda(1 - T.eye(self.c).unsqueeze(0), gpu_id=self.gpu_id)  # (1 * n * n)
     self.δ = 0.005  # minimum usage
@@ -288,9 +288,9 @@ class SparseMemory(nn.Module):
       # write key (b * 1 * w)
       write_vector = self.write_vector_transform(ξ).view(b, 1, w)
       # write vector (b * 1 * r)
-      interpolation_gate = F.sigmoid(self.interpolation_gate_transform(ξ)).view(b, c)
+      interpolation_gate = T.sigmoid(self.interpolation_gate_transform(ξ)).view(b, c)
       # write gate (b * 1)
-      write_gate = F.sigmoid(self.write_gate_transform(ξ).view(b, 1))
+      write_gate = T.sigmoid(self.write_gate_transform(ξ).view(b, 1))
     else:
       ξ = self.interface_weights(ξ)
       # r read keys (b * r * w)
@@ -298,9 +298,9 @@ class SparseMemory(nn.Module):
       # write key (b * 1 * w)
       write_vector = ξ[:, r * w: r * w + w].contiguous().view(b, 1, w)
       # write vector (b * 1 * r)
-      interpolation_gate = F.sigmoid(ξ[:, r * w + w: r * w + w + c]).contiguous().view(b, c)
+      interpolation_gate = T.sigmoid(ξ[:, r * w + w: r * w + w + c]).contiguous().view(b, c)
       # write gate (b * 1)
-      write_gate = F.sigmoid(ξ[:, -1].contiguous()).unsqueeze(1).view(b, 1)
+      write_gate = T.sigmoid(ξ[:, -1].contiguous()).unsqueeze(1).view(b, 1)
 
     self.timestep += 1
     hidden = self.write(interpolation_gate, write_vector, write_gate, hidden)
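
Note: the torch.nn.init routines mutate their argument, so newer PyTorch names them with a trailing underscore (orthogonal_, xavier_uniform_) and keeps the old names only as deprecated aliases; that is the rename applied to the initialiser calls above and in dnc.py. A small standalone sketch with a throwaway layer (illustrative only):

import torch as T
import torch.nn as nn

layer = nn.Linear(64, 32)
T.nn.init.orthogonal_(layer.weight)      # fills layer.weight in place with an orthogonal matrix
T.nn.init.xavier_uniform_(layer.weight)  # alternative initialiser, also in place (overwrites the above)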
diff --git a/dnc/sparse_temporal_memory.py b/dnc/sparse_temporal_memory.py
index 63eb02f..95383dd 100644
--- a/dnc/sparse_temporal_memory.py
+++ b/dnc/sparse_temporal_memory.py
@@ -55,14 +55,14 @@ class SparseTemporalMemory(nn.Module):
       self.write_vector_transform = nn.Linear(self.input_size, w)
       self.interpolation_gate_transform = nn.Linear(self.input_size, self.c)
       self.write_gate_transform = nn.Linear(self.input_size, 1)
-      T.nn.init.orthogonal(self.read_query_transform.weight)
-      T.nn.init.orthogonal(self.write_vector_transform.weight)
-      T.nn.init.orthogonal(self.interpolation_gate_transform.weight)
-      T.nn.init.orthogonal(self.write_gate_transform.weight)
+      T.nn.init.orthogonal_(self.read_query_transform.weight)
+      T.nn.init.orthogonal_(self.write_vector_transform.weight)
+      T.nn.init.orthogonal_(self.interpolation_gate_transform.weight)
+      T.nn.init.orthogonal_(self.write_gate_transform.weight)
     else:
       self.interface_size = (r * w) + w + self.c + 1
       self.interface_weights = nn.Linear(self.input_size, self.interface_size)
-      T.nn.init.orthogonal(self.interface_weights.weight)
+      T.nn.init.orthogonal_(self.interface_weights.weight)
 
     self.I = cuda(1 - T.eye(self.c).unsqueeze(0), gpu_id=self.gpu_id)  # (1 * n * n)
     self.δ = 0.005  # minimum usage
@@ -358,9 +358,9 @@ class SparseTemporalMemory(nn.Module):
       # write key (b * 1 * w)
       write_vector = self.write_vector_transform(ξ).view(b, 1, w)
       # write vector (b * 1 * r)
-      interpolation_gate = F.sigmoid(self.interpolation_gate_transform(ξ)).view(b, c)
+      interpolation_gate = T.sigmoid(self.interpolation_gate_transform(ξ)).view(b, c)
       # write gate (b * 1)
-      write_gate = F.sigmoid(self.write_gate_transform(ξ).view(b, 1))
+      write_gate = T.sigmoid(self.write_gate_transform(ξ).view(b, 1))
     else:
       ξ = self.interface_weights(ξ)
       # r read keys (b * r * w)
@@ -368,9 +368,9 @@ class SparseTemporalMemory(nn.Module):
       # write key (b * 1 * w)
       write_vector = ξ[:, r * w: r * w + w].contiguous().view(b, 1, w)
       # write vector (b * 1 * r)
-      interpolation_gate = F.sigmoid(ξ[:, r * w + w: r * w + w + c]).contiguous().view(b, c)
+      interpolation_gate = T.sigmoid(ξ[:, r * w + w: r * w + w + c]).contiguous().view(b, c)
       # write gate (b * 1)
-      write_gate = F.sigmoid(ξ[:, -1].contiguous()).unsqueeze(1).view(b, 1)
+      write_gate = T.sigmoid(ξ[:, -1].contiguous()).unsqueeze(1).view(b, 1)
 
     self.timestep += 1
     hidden = self.write(interpolation_gate, write_vector, write_gate, hidden)
diff --git a/dnc/util.py b/dnc/util.py
index 5602ceb..87ae5d6 100644
--- a/dnc/util.py
+++ b/dnc/util.py
@@ -4,7 +4,6 @@
 import torch.nn as nn
 import torch as T
 import torch.nn.functional as F
-from torch.autograd import Variable as var
 import numpy as np
 import torch
 from torch.autograd import Variable
@@ -24,24 +23,37 @@ def recursiveTrace(obj):
 
 
 def cuda(x, grad=False, gpu_id=-1):
+  x = x.float() if T.is_tensor(x) else x
   if gpu_id == -1:
-    return var(x, requires_grad=grad)
+    t = T.FloatTensor(x)
+    t.requires_grad=grad
+    return t
   else:
-    return var(x.pin_memory(), requires_grad=grad).cuda(gpu_id, async=True)
+    t = T.FloatTensor(x.pin_memory()).cuda(gpu_id, async=True)
+    t.requires_grad=grad
+    return t
 
 
 def cudavec(x, grad=False, gpu_id=-1):
   if gpu_id == -1:
-    return var(T.from_numpy(x), requires_grad=grad)
+    t = T.Tensor(T.from_numpy(x))
+    t.requires_grad = grad
+    return t
   else:
-    return var(T.from_numpy(x).pin_memory(), requires_grad=grad).cuda(gpu_id, async=True)
+    t = T.Tensor(T.from_numpy(x).pin_memory()).cuda(gpu_id, async=True)
+    t.requires_grad = grad
+    return t
 
 
 def cudalong(x, grad=False, gpu_id=-1):
   if gpu_id == -1:
-    return var(T.from_numpy(x.astype(np.long)), requires_grad=grad)
+    t = T.LongTensor(T.from_numpy(x.astype(np.long)))
+    t.requires_grad = grad
+    return t
   else:
-    return var(T.from_numpy(x.astype(np.long)).pin_memory(), requires_grad=grad).cuda(gpu_id, async=True)
+    t = T.LongTensor(T.from_numpy(x.astype(np.long)).pin_memory()).cuda(gpu_id, async=True)
+    t.requires_grad = grad
+    return t
 
 
 def θ(a, b, dimA=2, dimB=2, normBy=2):
@@ -89,10 +101,7 @@ def σ(input, axis=1):
   trans_size = trans_input.size()
   input_2d = trans_input.contiguous().view(-1, trans_size[-1])
 
-  if '0.3' in T.__version__:
-    soft_max_2d = F.softmax(input_2d, -1)
-  else:
-    soft_max_2d = F.softmax(input_2d)
+  soft_max_2d = F.softmax(input_2d, -1)
   soft_max_nd = soft_max_2d.view(*trans_size)
 
   return soft_max_nd.transpose(axis, len(input_size) - 1)
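
Note: the rewritten util.py helpers follow the PyTorch 0.4 Tensor/Variable merge: values are no longer wrapped in torch.autograd.Variable, and requires_grad is set directly on the tensor. A minimal sketch of the same idea for a NumPy input (hypothetical arr, not the repo's helper):

import numpy as np
import torch as T

arr = np.ones((3, 4), dtype=np.float32)
t = T.from_numpy(arr)    # shares memory with arr
t.requires_grad_(True)   # replaces Variable(t, requires_grad=True)
# On GPU, note that the old `async=True` flag still used in the patch above was
# renamed to `non_blocking=True` in PyTorch 0.4 (and `async` is a keyword in Python 3.7+):
# t = t.pin_memory().cuda(0, non_blocking=True)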
diff --git a/setup.py b/setup.py
index 71ce06b..aa19949 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
 
 setup(
     name='dnc',
-    version='0.0.9',
+    version='0.1.0',
 
     description='Differentiable Neural Computer, for Pytorch',
     long_description=long_description,
diff --git a/tasks/adding_task.py b/tasks/adding_task.py
index 2c7db8e..ca0dbd2 100644
--- a/tasks/adding_task.py
+++ b/tasks/adding_task.py
@@ -20,7 +20,7 @@ from torch.autograd import Variable as var
 import torch.nn.functional as F
 import torch.optim as optim
 
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 
 from dnc.dnc import DNC
 from dnc.sdnc import SDNC
@@ -219,7 +219,7 @@ if __name__ == '__main__':
 
       loss.backward()
 
-      T.nn.utils.clip_grad_norm(rnn.parameters(), args.clip)
+      T.nn.utils.clip_grad_norm_(rnn.parameters(), args.clip)
       optimizer.step()
       loss_value = loss.data[0]
diff --git a/tasks/argmax_task.py b/tasks/argmax_task.py
index 79f8311..711a017 100644
--- a/tasks/argmax_task.py
+++ b/tasks/argmax_task.py
@@ -20,7 +20,7 @@ from torch.autograd import Variable as var
 import torch.nn.functional as F
 import torch.optim as optim
 
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 
 from dnc.dnc import DNC
 from dnc.sdnc import SDNC
@@ -225,7 +225,7 @@ if __name__ == '__main__':
 
       loss.backward()
 
-      T.nn.utils.clip_grad_norm(rnn.parameters(), args.clip)
+      T.nn.utils.clip_grad_norm_(rnn.parameters(), args.clip)
       optimizer.step()
       loss_value = loss.data[0]
diff --git a/tasks/copy_task.py b/tasks/copy_task.py
index 8f63bc0..8be5984 100755
--- a/tasks/copy_task.py
+++ b/tasks/copy_task.py
@@ -20,7 +20,7 @@ from torch.autograd import Variable as var
 import torch.nn.functional as F
 import torch.optim as optim
 
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 
 from dnc.dnc import DNC
 from dnc.sdnc import SDNC
@@ -212,7 +212,7 @@ if __name__ == '__main__':
 
      loss.backward()
 
-      T.nn.utils.clip_grad_norm(rnn.parameters(), args.clip)
+      T.nn.utils.clip_grad_norm_(rnn.parameters(), args.clip)
       optimizer.step()
       loss_value = loss.data[0]
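
Note: clip_grad_norm was renamed to clip_grad_norm_ under the same in-place naming convention; the old name is kept only as a deprecated alias. The task scripts above still read the loss via loss.data[0], which newer PyTorch replaces with loss.item(). A compressed, self-contained training-step sketch (toy model, optimiser and data are assumptions of this example, not the repo's task code):

import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

model = nn.Linear(8, 1)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
x, y = T.randn(16, 8), T.randn(16, 1)

optimizer.zero_grad()
loss = F.mse_loss(model(x), y)
loss.backward()
T.nn.utils.clip_grad_norm_(model.parameters(), 10.0)  # clips gradients in place; note the underscore
optimizer.step()
loss_value = loss.item()  # scalar extraction; replaces loss.data[0] on PyTorch >= 0.4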
diff --git a/test/test_gru.py b/test/test_gru.py
index 59f2041..ec97609 100644
--- a/test/test_gru.py
+++ b/test/test_gru.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
@@ -71,7 +71,7 @@ def test_rnn_1():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([21, 10, 100])
@@ -127,7 +127,7 @@ def test_rnn_n():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
@@ -188,7 +188,7 @@ def test_rnn_no_memory_pass():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
diff --git a/test/test_indexes.py b/test/test_indexes.py
index 298cebc..b08f180 100644
--- a/test/test_indexes.py
+++ b/test/test_indexes.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
diff --git a/test/test_lstm.py b/test/test_lstm.py
index fd5b0c3..b4f918f 100644
--- a/test/test_lstm.py
+++ b/test/test_lstm.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
@@ -70,7 +70,7 @@ def test_rnn_1():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([21, 10, 100])
@@ -126,7 +126,7 @@ def test_rnn_n():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
@@ -187,7 +187,7 @@ def test_rnn_no_memory_pass():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
diff --git a/test/test_rnn.py b/test/test_rnn.py
index 57c600e..053db49 100644
--- a/test/test_rnn.py
+++ b/test/test_rnn.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
@@ -71,7 +71,7 @@ def test_rnn_1():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([21, 10, 100])
@@ -127,7 +127,7 @@ def test_rnn_n():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
@@ -188,7 +188,7 @@ def test_rnn_no_memory_pass():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
diff --git a/test/test_sam_gru.py b/test/test_sam_gru.py
index c6d086b..55654e2 100644
--- a/test/test_sam_gru.py
+++ b/test/test_sam_gru.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
@@ -72,7 +72,7 @@ def test_rnn_1():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([21, 10, 100])
@@ -130,7 +130,7 @@ def test_rnn_n():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
@@ -191,7 +191,7 @@ def test_rnn_no_memory_pass():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
diff --git a/test/test_sam_lstm.py b/test/test_sam_lstm.py
index 1bac97b..d891f40 100644
--- a/test/test_sam_lstm.py
+++ b/test/test_sam_lstm.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
@@ -72,7 +72,7 @@ def test_rnn_1():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([21, 10, 100])
@@ -130,7 +130,7 @@ def test_rnn_n():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
@@ -191,7 +191,7 @@ def test_rnn_no_memory_pass():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
diff --git a/test/test_sam_rnn.py b/test/test_sam_rnn.py
index e1ad9a1..4c04ac1 100644
--- a/test/test_sam_rnn.py
+++ b/test/test_sam_rnn.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
@@ -72,7 +72,7 @@ def test_rnn_1():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([21, 10, 100])
@@ -130,7 +130,7 @@ def test_rnn_n():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
@@ -191,7 +191,7 @@ def test_rnn_no_memory_pass():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
diff --git a/test/test_sdnc_gru.py b/test/test_sdnc_gru.py
index 0d7dbd8..4d71ecd 100644
--- a/test/test_sdnc_gru.py
+++ b/test/test_sdnc_gru.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
@@ -74,7 +74,7 @@ def test_rnn_1():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([21, 10, 100])
@@ -134,7 +134,7 @@ def test_rnn_n():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
@@ -197,7 +197,7 @@ def test_rnn_no_memory_pass():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
diff --git a/test/test_sdnc_lstm.py b/test/test_sdnc_lstm.py
index f6e4c69..342a0ac 100644
--- a/test/test_sdnc_lstm.py
+++ b/test/test_sdnc_lstm.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
@@ -74,7 +74,7 @@ def test_rnn_1():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([21, 10, 100])
@@ -134,7 +134,7 @@ def test_rnn_n():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
@@ -197,7 +197,7 @@ def test_rnn_no_memory_pass():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
diff --git a/test/test_sdnc_rnn.py b/test/test_sdnc_rnn.py
index b0f200d..9870691 100644
--- a/test/test_sdnc_rnn.py
+++ b/test/test_sdnc_rnn.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch as T
 from torch.autograd import Variable as var
 import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 import torch.optim as optim
 
 import numpy as np
@@ -74,7 +74,7 @@ def test_rnn_1():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([21, 10, 100])
@@ -134,7 +134,7 @@ def test_rnn_n():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
@@ -197,7 +197,7 @@ def test_rnn_no_memory_pass():
     loss = criterion((output), target_output)
     loss.backward()
 
-    T.nn.utils.clip_grad_norm(rnn.parameters(), clip)
+    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
     optimizer.step()
 
     assert target_output.size() == T.Size([27, 10, 100])
diff --git a/test/test_utils.py b/test/test_utils.py
index 5fd6862..929f329 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 
 import torch.nn as nn
 import torch as T
@@ -28,6 +28,6 @@ def generate_data(batch_size, length, size, cuda=-1):
 
 
 def criterion(predictions, targets):
   return T.mean(
-      -1 * F.logsigmoid(predictions) * (targets) - T.log(1 - F.sigmoid(predictions) + 1e-9) * (1 - targets)
+      -1 * F.logsigmoid(predictions) * (targets) - T.log(1 - T.sigmoid(predictions) + 1e-9) * (1 - targets)
   )
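
Note: the test criterion only swaps F.sigmoid for T.sigmoid inside a hand-rolled binary cross-entropy. For reference, PyTorch's numerically stable built-in computes the same quantity up to the 1e-9 stabiliser (an equivalent formulation, not what the tests use):

import torch as T
import torch.nn.functional as F

predictions, targets = T.randn(4, 5), T.rand(4, 5)
manual = T.mean(
    -1 * F.logsigmoid(predictions) * targets
    - T.log(1 - T.sigmoid(predictions) + 1e-9) * (1 - targets)
)
builtin = F.binary_cross_entropy_with_logits(predictions, targets)
# manual and builtin agree to within the 1e-9 stabiliser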