first commit
commit 7f5d07930f

24	README.md	Normal file
@@ -0,0 +1,24 @@
DNC: Differentiable Neural Computer
=======

Implements DeepMind's third Nature paper, [Hybrid computing using a neural network with dynamic external memory](http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html) by Graves et al.

![DNC schema](copy/static/dnc_schema.png?raw=true)

Based on the paper's appendix, I sketched the [computational graph](https://docs.google.com/drawings/d/1Fc9eOH1wPw0PbBHWkEH39jik7h7HT9BWAE8ZhSr4hJc/edit?usp=sharing).

_This is a work in progress_

--------

I have a general framework and a couple of Jupyter notebooks for debugging. This is not a finished project; it is still very much in the development stage. I still need to:

1. write unit tests
2. improve documentation/comments
3. run it on more difficult tasks
4. add some nice visualizations

Dependencies
--------

* All code is written in Python 2.7. You will need:
  * NumPy
  * Matplotlib
  * [TensorFlow r1.0](https://www.tensorflow.org/api_docs/python/)
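For orientation, here is a minimal graph-construction sketch condensed from `copy/copy.ipynb` in this commit. The flag names and values are the notebook's; treat this as an illustration of how the pieces fit together rather than a stable API.

```python
# run from inside copy/ so that dnc/ and nn_controller.py are importable
import tensorflow as tf
from dnc.dnc import DNC
from nn_controller import NNController

# flag values mirror those defined in copy/copy.ipynb
tf.app.flags.DEFINE_integer("xlen", 6, "Input dimension")
tf.app.flags.DEFINE_integer("ylen", 6, "Output dimension")
tf.app.flags.DEFINE_integer("batch_size", 2, "Size of batch in minibatch gradient descent")
tf.app.flags.DEFINE_integer("R", 1, "Number of DNC read heads")
tf.app.flags.DEFINE_integer("W", 10, "Word length for DNC memory")
tf.app.flags.DEFINE_integer("N", 15, "Number of words the DNC memory can store")
FLAGS = tf.app.flags.FLAGS

graph = tf.Graph()
with graph.as_default(), tf.Session(graph=graph) as session:
    dnc = DNC(NNController, FLAGS)       # build the DNC graph around a feedforward controller
    y_hat, history = dnc.get_outputs()   # predictions plus per-step DNC state (for visualization)
    session.run(tf.global_variables_initializer())
    # see copy/copy.ipynb for the loss, optimizer, and training loop
```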
217	copy/.ipynb_checkpoints/copy-checkpoint.ipynb	Normal file
@@ -0,0 +1,217 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Differentiable Neural Computer\n",
"**Sam Greydanus $\\cdot$ February 2017 $\\cdot$ MIT License.**\n",
"\n",
"Represents the state of the art in differentiable memory. Inspired by this [Nature paper](http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html). Some ideas taken from [this GitHub repo](https://github.com/Mostafa-Samir/DNC-tensorflow)\n",
"\n",
"<a href=\"http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html\"><img src=\"/static/dnc_schema.png\" alt=\"DNC schema\" style=\"width: 400px;\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import numpy as np\n",
"import sys\n",
"\n",
"from dnc.dnc import DNC\n",
"from nn_controller import NNController"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"length = 10\n",
"xydim = 6\n",
"tf.app.flags.DEFINE_integer(\"xlen\", xydim, \"Input dimension\")\n",
"tf.app.flags.DEFINE_integer(\"ylen\", xydim, \"output dimension\")\n",
"tf.app.flags.DEFINE_integer(\"max_sequence_length\", 2*length+1, \"Maximum sequence length\")\n",
"tf.app.flags.DEFINE_integer(\"length\", length, \"Maximum sequence length\")\n",
"tf.app.flags.DEFINE_integer(\"batch_size\", 2, \"Size of batch in minibatch gradient descent\")\n",
"\n",
"tf.app.flags.DEFINE_integer(\"R\", 1, \"Number of DNC read heads\")\n",
"tf.app.flags.DEFINE_integer(\"W\", 10, \"Word length for DNC memory\")\n",
"tf.app.flags.DEFINE_integer(\"N\", 15, \"Number of words the DNC memory can store\")\n",
"\n",
"tf.app.flags.DEFINE_integer(\"print_every\", 100, \"Print training info after this number of train steps\")\n",
"tf.app.flags.DEFINE_integer(\"iterations\", 100000, \"Number of training iterations\")\n",
"tf.app.flags.DEFINE_float(\"lr\", 1e-4, \"Learning rate (alpha) for the model\")\n",
"tf.app.flags.DEFINE_float(\"momentum\", .9, \"Momentum for RMSProp\")\n",
"tf.app.flags.DEFINE_integer(\"save_every\", 0, \"Save model after this number of train steps\")\n",
"tf.app.flags.DEFINE_string(\"save_dir\", \"models\", \"Directory in which to save checkpoints\")\n",
"tf.app.flags.DEFINE_string(\"log_dir\", \"logs\", \"Directory in which to save logs\")\n",
"FLAGS = tf.app.flags.FLAGS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def generate_data(batch_size, length, dim):\n",
"    X, y = np.zeros((batch_size, 2 * length + 1, dim)), np.zeros((batch_size, 2 * length + 1, dim))\n",
"    sequence = np.random.binomial(1, 0.5, (batch_size, length, dim - 1))\n",
"\n",
"    X[:, :length, :dim - 1] = sequence\n",
"    X[:, length, -1] = 1 # end symbol\n",
"    y[:, length + 1:, :dim - 1] = sequence\n",
"\n",
"    return X, y\n",
"\n",
"def binary_cross_entropy(y_hat, y):\n",
"    return tf.reduce_mean(-y*tf.log(y_hat) - (1-y)*tf.log(1-y_hat))\n",
"\n",
"def llprint(message):\n",
"    sys.stdout.write(message)\n",
"    sys.stdout.flush()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"building graph...\n",
"computing gradients...\n",
"init variables... \n",
"starting to train...\n",
"\n",
"Iteration 0/100000\n",
"\tloss: 0.6899\n",
"Iteration 100/100000\n",
"\tloss: 0.6738\n",
"Iteration 200/100000\n",
"\tloss: 0.4000\n",
"Iteration 300/100000\n",
"\tloss: 0.2642\n",
"Iteration 400/100000\n",
"\tloss: 0.2544\n",
"Iteration 500/100000\n",
"\tloss: 0.2533\n",
"Iteration 600/100000\n",
"\tloss: 0.2539\n",
"Iteration 700/100000\n",
"\tloss: 0.2570\n",
"Iteration 800/100000\n",
"\tloss: 0.2507\n",
"Iteration 900/100000\n",
"\tloss: 0.2462\n",
"Iteration 1000/100000\n",
"\tloss: 0.2464\n",
"Iteration 1100/100000\n",
"\tloss: 0.2491\n",
"Iteration 1200/100000\n",
"\tloss: 0.2412\n",
"Iteration 1300/100000\n",
"\tloss: 0.2340\n",
"Iteration 1400/100000\n",
"\tloss: 0.2343\n",
"Iteration 1500/100000\n",
"\tloss: 0.2303\n",
"Iteration 1600/100000\n",
"\tloss: 0.2196\n",
"Iteration 1700/100000\n",
"\tloss: 0.2305\n",
"Iteration 1800/100000\n",
"\tloss: 0.2237\n",
"Iteration 1854/100000"
]
}
],
"source": [
"graph = tf.Graph()\n",
"with graph.as_default():\n",
"    with tf.Session(graph=graph) as session:\n",
"        llprint(\"building graph...\\n\")\n",
"        optimizer = tf.train.RMSPropOptimizer(FLAGS.lr, momentum=FLAGS.momentum)\n",
"        dnc = DNC(NNController, FLAGS)\n",
"\n",
"        # define loss\n",
"        y_hat, _ = dnc.get_outputs()\n",
"        y_hat = tf.clip_by_value(tf.sigmoid(y_hat), 1e-6, 1. - 1e-6)\n",
"        loss = binary_cross_entropy(y_hat, dnc.y)\n",
"\n",
"        llprint(\"computing gradients...\\n\")\n",
"        gradients = optimizer.compute_gradients(loss)\n",
"        grad_op = optimizer.apply_gradients(gradients)\n",
"\n",
"        llprint(\"init variables... \\n\")\n",
"        session.run(tf.global_variables_initializer())\n",
"        llprint(\"starting to train...\\n\\n\")\n",
"\n",
"        loss_history = []\n",
"\n",
"        for i in xrange(FLAGS.iterations + 1):\n",
"            llprint(\"\\rIteration {}/{}\".format(i, FLAGS.iterations))\n",
"\n",
"            random_length = np.random.randint(1, FLAGS.length + 1)\n",
"            X, y = generate_data(FLAGS.batch_size, random_length, FLAGS.xlen)\n",
"\n",
"            fetch = [loss, grad_op]\n",
"            feed = {dnc.X: X, dnc.y: y, dnc.tsteps: 2 * random_length + 1}\n",
"\n",
"            step_loss, _ = session.run(fetch, feed_dict=feed)\n",
"\n",
"            loss_history.append(step_loss)\n",
"\n",
"            if i % 100 == 0:\n",
"                llprint(\"\\n\\tloss: {:03.4f}\\n\".format(np.mean(loss_history)))\n",
"                loss_history = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
6	copy/.ipynb_checkpoints/debug-checkpoint.ipynb	Normal file
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 1
}
610	copy/.ipynb_checkpoints/visualization-checkpoint.ipynb	Executable file
File diff suppressed because one or more lines are too long
254	copy/copy.ipynb	Normal file
@@ -0,0 +1,254 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Differentiable Neural Computer\n",
"\n",
"<a href=\"http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html\"><img src=\"./static/dnc_schema.png\" alt=\"DNC schema\" style=\"width: 900px;\"/></a>\n",
"\n",
"**Sam Greydanus $\\cdot$ February 2017 $\\cdot$ MIT License.**\n",
"\n",
"Represents the state of the art in differentiable memory. Inspired by this [Nature paper](http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html). Some ideas taken from [this GitHub repo](https://github.com/Mostafa-Samir/DNC-tensorflow)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import numpy as np\n",
"import sys\n",
"\n",
"from dnc.dnc import DNC\n",
"from nn_controller import NNController"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"length = 10\n",
"xydim = 6\n",
"tf.app.flags.DEFINE_integer(\"xlen\", xydim, \"Input dimension\")\n",
"tf.app.flags.DEFINE_integer(\"ylen\", xydim, \"output dimension\")\n",
"tf.app.flags.DEFINE_integer(\"max_sequence_length\", 2*length+1, \"Maximum sequence length\")\n",
"tf.app.flags.DEFINE_integer(\"length\", length, \"Maximum sequence length\")\n",
"tf.app.flags.DEFINE_integer(\"batch_size\", 2, \"Size of batch in minibatch gradient descent\")\n",
"\n",
"tf.app.flags.DEFINE_integer(\"R\", 1, \"Number of DNC read heads\")\n",
"tf.app.flags.DEFINE_integer(\"W\", 10, \"Word length for DNC memory\")\n",
"tf.app.flags.DEFINE_integer(\"N\", 15, \"Number of words the DNC memory can store\")\n",
"\n",
"tf.app.flags.DEFINE_integer(\"print_every\", 100, \"Print training info after this number of train steps\")\n",
"tf.app.flags.DEFINE_integer(\"iterations\", 100000, \"Number of training iterations\")\n",
"tf.app.flags.DEFINE_float(\"lr\", 1e-4, \"Learning rate (alpha) for the model\")\n",
"tf.app.flags.DEFINE_float(\"momentum\", .9, \"Momentum for RMSProp\")\n",
"tf.app.flags.DEFINE_integer(\"save_every\", 0, \"Save model after this number of train steps\")\n",
"tf.app.flags.DEFINE_string(\"save_dir\", \"models\", \"Directory in which to save checkpoints\")\n",
"tf.app.flags.DEFINE_string(\"log_dir\", \"logs\", \"Directory in which to save logs\")\n",
"FLAGS = tf.app.flags.FLAGS"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def generate_data(batch_size, length, dim):\n",
"    X, y = np.zeros((batch_size, 2 * length + 1, dim)), np.zeros((batch_size, 2 * length + 1, dim))\n",
"    sequence = np.random.binomial(1, 0.5, (batch_size, length, dim - 1))\n",
"\n",
"    X[:, :length, :dim - 1] = sequence\n",
"    X[:, length, -1] = 1 # end symbol\n",
"    y[:, length + 1:, :dim - 1] = sequence\n",
"\n",
"    return X, y\n",
"\n",
"def binary_cross_entropy(y_hat, y):\n",
"    return tf.reduce_mean(-y*tf.log(y_hat) - (1-y)*tf.log(1-y_hat))\n",
"\n",
"def llprint(message):\n",
"    sys.stdout.write(message)\n",
"    sys.stdout.flush()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"building graph...\n",
"computing gradients...\n",
"init variables... \n",
"starting to train...\n",
"\n",
"Iteration 0/100000\n",
"\tloss: 0.6899\n",
"Iteration 100/100000\n",
"\tloss: 0.6738\n",
"Iteration 200/100000\n",
"\tloss: 0.4000\n",
"Iteration 300/100000\n",
"\tloss: 0.2642\n",
"Iteration 400/100000\n",
"\tloss: 0.2544\n",
"Iteration 500/100000\n",
"\tloss: 0.2533\n",
"Iteration 600/100000\n",
"\tloss: 0.2539\n",
"Iteration 700/100000\n",
"\tloss: 0.2570\n",
"Iteration 800/100000\n",
"\tloss: 0.2507\n",
"Iteration 900/100000\n",
"\tloss: 0.2462\n",
"Iteration 1000/100000\n",
"\tloss: 0.2464\n",
"Iteration 1100/100000\n",
"\tloss: 0.2491\n",
"Iteration 1200/100000\n",
"\tloss: 0.2412\n",
"Iteration 1300/100000\n",
"\tloss: 0.2340\n",
"Iteration 1400/100000\n",
"\tloss: 0.2343\n",
"Iteration 1500/100000\n",
"\tloss: 0.2303\n",
"Iteration 1600/100000\n",
"\tloss: 0.2196\n",
"Iteration 1700/100000\n",
"\tloss: 0.2305\n",
"Iteration 1800/100000\n",
"\tloss: 0.2237\n",
"Iteration 1900/100000\n",
"\tloss: 0.2082\n",
"Iteration 2000/100000\n",
"\tloss: 0.2180\n",
"Iteration 2100/100000\n",
"\tloss: 0.2105\n",
"Iteration 2200/100000\n",
"\tloss: 0.1964\n",
"Iteration 2300/100000\n",
"\tloss: 0.1891\n",
"Iteration 2400/100000\n",
"\tloss: 0.1780\n",
"Iteration 2500/100000\n",
"\tloss: 0.0984\n",
"Iteration 2600/100000\n",
"\tloss: 0.0283\n",
"Iteration 2700/100000\n",
"\tloss: 0.0027\n",
"Iteration 2800/100000\n",
"\tloss: 0.0000\n",
"Iteration 2822/100000"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-c008f6894765>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mfeed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mdnc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdnc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdnc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtsteps\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;36m2\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mrandom_length\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0mstep_loss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfetch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0mloss_history\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstep_loss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python2.7/site-packages/tensorflow/python/client/session.pyc\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 765\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 766\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[0;32m--> 767\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 768\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 769\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python2.7/site-packages/tensorflow/python/client/session.pyc\u001b[0m in \u001b[0;36m_run\u001b[0;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 963\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 964\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[0;32m--> 965\u001b[0;31m feed_dict_string, options, run_metadata)\n\u001b[0m\u001b[1;32m 966\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python2.7/site-packages/tensorflow/python/client/session.pyc\u001b[0m in \u001b[0;36m_do_run\u001b[0;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1013\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1014\u001b[0m return self._do_call(_run_fn, self._session, feed_dict, fetch_list,\n\u001b[0;32m-> 1015\u001b[0;31m target_list, options, run_metadata)\n\u001b[0m\u001b[1;32m 1016\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1017\u001b[0m return self._do_call(_prun_fn, self._session, handle, feed_dict,\n",
"\u001b[0;32m/usr/local/lib/python2.7/site-packages/tensorflow/python/client/session.pyc\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1020\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1021\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1022\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1023\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1024\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python2.7/site-packages/tensorflow/python/client/session.pyc\u001b[0m in \u001b[0;36m_run_fn\u001b[0;34m(session, feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[1;32m 1002\u001b[0m return tf_session.TF_Run(session, options,\n\u001b[1;32m 1003\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_list\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1004\u001b[0;31m status, run_metadata)\n\u001b[0m\u001b[1;32m 1005\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1006\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"graph = tf.Graph()\n",
"with graph.as_default():\n",
"    with tf.Session(graph=graph) as session:\n",
"        llprint(\"building graph...\\n\")\n",
"        optimizer = tf.train.RMSPropOptimizer(FLAGS.lr, momentum=FLAGS.momentum)\n",
"        dnc = DNC(NNController, FLAGS)\n",
"\n",
"        # define loss\n",
"        y_hat, _ = dnc.get_outputs()\n",
"        y_hat = tf.clip_by_value(tf.sigmoid(y_hat), 1e-6, 1. - 1e-6)\n",
"        loss = binary_cross_entropy(y_hat, dnc.y)\n",
"\n",
"        llprint(\"computing gradients...\\n\")\n",
"        gradients = optimizer.compute_gradients(loss)\n",
"        grad_op = optimizer.apply_gradients(gradients)\n",
"\n",
"        llprint(\"init variables... \\n\")\n",
"        session.run(tf.global_variables_initializer())\n",
"        llprint(\"starting to train...\\n\\n\")\n",
"\n",
"        loss_history = []\n",
"\n",
"        for i in xrange(FLAGS.iterations + 1):\n",
"            llprint(\"\\rIteration {}/{}\".format(i, FLAGS.iterations))\n",
"\n",
"            random_length = np.random.randint(1, FLAGS.length + 1)\n",
"            X, y = generate_data(FLAGS.batch_size, random_length, FLAGS.xlen)\n",
"\n",
"            fetch = [loss, grad_op]\n",
"            feed = {dnc.X: X, dnc.y: y, dnc.tsteps: 2 * random_length + 1}\n",
"\n",
"            step_loss, _ = session.run(fetch, feed_dict=feed)\n",
"\n",
"            loss_history.append(step_loss)\n",
"\n",
"            if i % 100 == 0:\n",
"                llprint(\"\\n\\tloss: {:03.4f}\\n\".format(np.mean(loss_history)))\n",
"                loss_history = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
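For reference, a plain-NumPy restatement (not part of the repo) of the notebook's `generate_data`, which makes the copy-task layout explicit: the first `length` steps carry a random binary sequence, step `length` raises an end-of-sequence marker in the last channel, and the target asks the network to echo the sequence back after the marker.

```python
import numpy as np

def generate_data(batch_size, length, dim):
    # same layout as copy/copy.ipynb: present the sequence, mark the end, then echo it
    X = np.zeros((batch_size, 2 * length + 1, dim))
    y = np.zeros((batch_size, 2 * length + 1, dim))
    sequence = np.random.binomial(1, 0.5, (batch_size, length, dim - 1))
    X[:, :length, :dim - 1] = sequence      # steps 0..length-1: random bits
    X[:, length, -1] = 1                    # step length: end-of-sequence marker channel
    y[:, length + 1:, :dim - 1] = sequence  # target: repeat the bits after the marker
    return X, y

X, y = generate_data(batch_size=1, length=3, dim=4)
print(X[0])   # rows 0-2: random bits, row 3: marker, rows 4-6: zeros
print(y[0])   # rows 0-3: zeros, rows 4-6: the same random bits
```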
0	copy/dnc/__init__.py	Executable file
BIN	copy/dnc/__init__.pyc	Normal file
Binary file not shown.
161	copy/dnc/controller.py	Executable file
@@ -0,0 +1,161 @@
# Differentiable Neural Computer
# inspired by (http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html)
# some ideas taken from https://github.com/Mostafa-Samir/DNC-tensorflow
# Sam Greydanus. February 2017. MIT License.

import tensorflow as tf
import numpy as np

class Controller():
    def __init__(self, FLAGS):
        '''
        An interface that defines how the neural network "controller" interacts with the DNC

        Parameters:
        ----------
        FLAGS: a set of TensorFlow FlagValues which must include
            FLAGS.xlen: the length of the input vector of the controller
            FLAGS.ylen: the length of the output vector of the controller
            FLAGS.batch_size: the number of batches
            FLAGS.R: the number of DNC read heads
            FLAGS.W: the DNC "word length" (length of each DNC memory vector)
        '''

        self.xlen = FLAGS.xlen
        self.ylen = FLAGS.ylen
        self.batch_size = FLAGS.batch_size
        self.R = R = FLAGS.R
        self.W = W = FLAGS.W

        self.chi_dim = self.xlen + self.W * self.R
        self.zeta_dim = W*R + 3*W + 5*R + 3
        self.r_dim = W*R

        # define network vars
        self.params = {}
        with tf.name_scope("controller"):
            self.init_controller_params()
            self.controller_dim = self.get_controller_dim()

            init = tf.truncated_normal_initializer(stddev=0.075, dtype=tf.float32)
            self.params['W_z'] = tf.get_variable("W_z", [self.controller_dim, self.zeta_dim], initializer=init)
            self.params['W_v'] = tf.get_variable("W_v", [self.controller_dim, self.ylen], initializer=init)
            self.params['W_r'] = tf.get_variable("W_r", [self.r_dim, self.ylen], initializer=init)

    def init_controller_params(self):
        '''
        Initializes all the parameters of the neural network controller
        '''
        raise NotImplementedError("init_controller_params does not exist")

    def nn_step(self, chi, state):
        '''
        Performs the feedforward step of the controller in order to get the DNC interface vector, zeta

        Parameters:
        ----------
        chi: Tensor (batch_size, chi_dim)
            the input concatenated with the previous output of the DNC
        state: LSTMStateTensor or another type of state tensor

        Returns: Tuple
            zeta_hat: Tensor (batch_size, controller_dim)
            next_state: LSTMStateTensor or another type of state tensor
        '''
        raise NotImplementedError("nn_step does not exist")

    def zero_state(self):
        '''
        Returns the initial state of the controller. If the controller is not recurrent, it still needs to return a dummy value

        Returns: LSTMStateTensor or another type of state tensor
            nn_state: LSTMStateTensor or another type of state tensor
        '''
        raise NotImplementedError("zero_state does not exist")

    def get_controller_dim(self):
        '''
        Feeds zeros through the controller and obtains an output in order to find the controller's output dimension

        Returns: int
            controller_dim: the output dimension of the controller
        '''
        test_chi = tf.zeros([self.batch_size, self.chi_dim])
        nn_output, nn_state = self.nn_step(test_chi, state=None)
        return nn_output.get_shape().as_list()[-1]

    def prepare_interface(self, zeta_hat):
        '''
        Packages the interface vector, zeta, as a dictionary of variables as described in the DNC Nature paper

        Parameters:
        ----------
        zeta_hat: Tensor (batch_size, zeta_dim)
            the interface vector before processing, zeta_hat

        Returns: dict
            zeta: variable names (string) mapping to tensors (Tensor)
        '''
        zeta = {}
        R, W = self.R, self.W
        splits = np.cumsum([0,W*R,R,W,1,W,W,R,1,1,3*R])
        vs = [zeta_hat[:, splits[i]:splits[i+1]] for i in range(len(splits)-1)]

        kappa_r = tf.reshape(vs[0], (-1, W, R))
        beta_r = tf.reshape(vs[1], (-1, R))
        kappa_w = tf.reshape(vs[2], (-1, W, 1))
        beta_w = tf.reshape(vs[3], (-1, 1))
        e = tf.reshape(vs[4], (-1, W))
        v = tf.reshape(vs[5], (-1, W))
        f = tf.reshape(vs[6], (-1, R))
        g_a = vs[7]
        g_w = vs[8]
        pi = tf.reshape(vs[9], (-1, 3, R))

        zeta['kappa_r'] = kappa_r
        zeta['beta_r'] = 1 + tf.nn.softplus(beta_r)
        zeta['kappa_w'] = kappa_w
        zeta['beta_w'] = 1 + tf.nn.softplus(beta_w)
        zeta['e'] = tf.nn.sigmoid(e)
        zeta['v'] = v
        zeta['f'] = tf.nn.sigmoid(f)
        zeta['g_a'] = tf.nn.sigmoid(g_a)
        zeta['g_w'] = tf.nn.sigmoid(g_w)
        zeta['pi'] = tf.nn.softmax(pi, 1)

        return zeta

    def step(self, X, r_prev, state):
        '''
        The sum of operations executed by the Controller at a given time step before interfacing with the DNC

        Parameters:
        ----------
        X: Tensor (batch_size, xlen)
            the input for this time step
        r_prev: previous output of the DNC
        state: LSTMStateTensor or another type of state tensor

        Returns: Tuple
            v: Tensor (batch_size, ylen)
                The controller's output (eventually added elementwise to the DNC output)
            zeta: The processed interface vector which the network will use to interact with the DNC
            nn_state: LSTMStateTensor or another type of state tensor
        '''
        r_prev = tf.reshape(r_prev, (-1, self.r_dim)) # flatten
        chi = tf.concat([X, r_prev], 1)
        nn_output, nn_state = self.nn_step(chi, state)

        v = tf.matmul(nn_output, self.params['W_v'])
        zeta_hat = tf.matmul(nn_output, self.params['W_z'])
        zeta = self.prepare_interface(zeta_hat)
        return v, zeta, nn_state

    def next_y_hat(self, v, r):
        '''
        The sum of operations executed by the Controller at a given time step after interacting with the DNC

        Parameters:
        ----------
        v: Tensor (batch_size, ylen)
            The controller's output (added elementwise to the DNC output)
        r: the current output of the DNC

        Returns: Tensor (batch_size, ylen)
            y_hat: the DNC's output
        '''
        r = tf.reshape(r, (-1, self.W * self.R)) # flatten
        y_hat = v + tf.matmul(r, self.params['W_r'])
        return y_hat
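A quick sanity check (not part of the repo) that the per-component widths used in `prepare_interface` account for every element of the interface vector: kappa_r (W·R), beta_r (R), kappa_w (W), beta_w (1), e (W), v (W), f (R), g_a (1), g_w (1), and pi (3R) sum to W·R + 3W + 5R + 3, which matches `self.zeta_dim` in `Controller.__init__`.

```python
import numpy as np

R, W = 1, 10                                     # values used in the copy-task notebooks
sizes = [W * R, R, W, 1, W, W, R, 1, 1, 3 * R]   # per-component widths from prepare_interface
splits = np.cumsum([0] + sizes)

zeta_dim = W * R + 3 * W + 5 * R + 3             # as computed in Controller.__init__
assert splits[-1] == zeta_dim                    # every column of zeta_hat is consumed exactly once
print(splits)   # [ 0 10 11 21 22 32 42 43 44 45 48]
```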
BIN	copy/dnc/controller.pyc	Normal file
Binary file not shown.
150	copy/dnc/dnc.py	Executable file
@@ -0,0 +1,150 @@
# Differentiable Neural Computer
# inspired by (http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html)
# some ideas taken from https://github.com/Mostafa-Samir/DNC-tensorflow
# Sam Greydanus. February 2017. MIT License.

import tensorflow as tf
import numpy as np
from memory import Memory

import os

class DNC:
    def __init__(self, make_controller, FLAGS):
        '''
        Builds a TensorFlow graph for the Differentiable Neural Computer. Uses TensorArrays and a while loop for efficiency

        Parameters:
        ----------
        make_controller: Controller
            An object class which inherits from the Controller class. We build the object in this function
        FLAGS: a set of TensorFlow FlagValues which must include
            FLAGS.xlen: the length of the input vector of the controller
            FLAGS.ylen: the length of the output vector of the controller
            FLAGS.batch_size: the number of batches
            FLAGS.R: the number of DNC read heads
            FLAGS.W: the DNC "word length" (length of each DNC memory vector)
            FLAGS.N: the number of DNC word vectors (corresponds to memory size)
        '''
        self.xlen = xlen = FLAGS.xlen
        self.ylen = ylen = FLAGS.ylen
        self.batch_size = batch_size = FLAGS.batch_size
        self.R = R = FLAGS.R
        self.W = W = FLAGS.W
        self.N = N = FLAGS.N

        # create 1) the DNC's memory and 2) the DNC's controller
        self.memory = Memory(R, W, N, batch_size)
        self.controller = make_controller(FLAGS)

        # input data placeholders
        self.X = tf.placeholder(tf.float32, [batch_size, None, xlen], name='X')
        self.y = tf.placeholder(tf.float32, [batch_size, None, ylen], name='y')
        self.tsteps = tf.placeholder(tf.int32, name='tsteps')

        self.X_tensor_array = self.unstack_time_dim(self.X)

        # initialize states
        nn_state = self.controller.zero_state()
        dnc_state = self.memory.zero_state()

        # values for which we want a history
        self.hist_keys = ['y_hat', 'f', 'g_a', 'g_w', 'w_r', 'w_w', 'u']
        dnc_hist = [tf.TensorArray(tf.float32, self.tsteps) for _ in range(len(self.hist_keys))]

        # loop through time
        with tf.variable_scope("while_loop") as scope:
            time = tf.constant(0, dtype=tf.int32)

            output = tf.while_loop(
                cond=lambda time, *_: time < self.tsteps,
                body=self.step,
                loop_vars=(time, nn_state, dnc_state, dnc_hist),
                )
            (_, next_nn_state, next_dnc_state, dnc_hist) = output

        # write down the history
        with tf.control_dependencies(next_dnc_state):
            self.dnc_hist = {self.hist_keys[i]: self.stack_time_dim(v) for i, v in enumerate(dnc_hist)} # convert to dict

    def step(self, time, nn_state, dnc_state, dnc_hist):
        '''
        Performs the feedforward step of the DNC in order to get the DNC output

        Parameters:
        ----------
        time: Constant 1-D Tensor
            the current time step of the model
        nn_state: LSTMStateTensor or another type of state tensor
            for the controller network
        dnc_state: Tuple
            set of 7 Tensors which define the current state of the DNC (M, u, p, L, w_w, w_r, r) ...see paper
        dnc_hist: Tuple
            set of 7 TensorArrays which track the historical states of the DNC (y_hat, f, g_a, g_w, w_r, w_w, u). Good for visualization

        Returns: Tuple
            same as input parameters, but updated for the current time step
        '''

        # map from tuple to dict for readability
        dnc_state = {self.memory.state_keys[i]: v for i, v in enumerate(dnc_state)}
        dnc_hist = {self.hist_keys[i]: v for i, v in enumerate(dnc_hist)}

        # one full pass!
        X_t = self.X_tensor_array.read(time)
        v, zeta, next_nn_state = self.controller.step(X_t, dnc_state['r'], nn_state)
        next_dnc_state = self.memory.step(zeta, dnc_state)
        y_hat = self.controller.next_y_hat(v, next_dnc_state['r'])

        dnc_hist['y_hat'] = dnc_hist['y_hat'].write(time, y_hat)
        dnc_hist['f'] = dnc_hist['f'].write(time, zeta['f'])
        dnc_hist['g_a'] = dnc_hist['g_a'].write(time, zeta['g_a'])
        dnc_hist['g_w'] = dnc_hist['g_w'].write(time, zeta['g_w'])
        dnc_hist['w_r'] = dnc_hist['w_r'].write(time, next_dnc_state['w_r'])
        dnc_hist['w_w'] = dnc_hist['w_w'].write(time, next_dnc_state['w_w'])
        dnc_hist['u'] = dnc_hist['u'].write(time, next_dnc_state['u'])

        # map from dict to tuple for tf.while_loop :/
        next_dnc_state = [next_dnc_state[k] for k in self.memory.state_keys]
        dnc_hist = [dnc_hist[k] for k in self.hist_keys]

        time += 1
        return time, next_nn_state, next_dnc_state, dnc_hist

    def get_outputs(self):
        '''
        Allows the user to access the output of the DNC after all time steps have been executed

        Returns: Tuple
            y_hat: Tensor (batch_size, tsteps, ylen)
                The DNC's output
            dnc_hist: dict
                Set of Tensors which contain values of (y_hat, f, g_a, g_w, w_r, w_w, u) respectively for all time steps
        '''
        return self.dnc_hist['y_hat'], self.dnc_hist

    def stack_time_dim(self, v):
        '''
        Stacks a TensorArray along its time dimension, then transposes so that the time dimension is at index [1]

        Parameters:
        ----------
        v: TensorArray [(batch_size, ...), ...]
            An array of n-dimensional tensors where, for each, the first dimension is the batch dimension

        Returns: Tensor (batch_size, tsteps, ...)
            The stacked tensor with index [1] as the time dimension
        '''
        stacked = v.stack()
        return tf.transpose(stacked, [1,0] + range(2, len(stacked.get_shape())) )

    def unstack_time_dim(self, v):
        '''
        Unstacks a Tensor along its time dimension into a TensorArray

        Parameters:
        ----------
        v: Tensor (batch_size, tsteps, ...)
            An n-dimensional tensor where dim[0] is the batch dimension and dim[1] is the time dimension

        Returns: TensorArray [(batch_size, ...), ...]
            An array of n-dimensional tensors where, for each, the first dimension is the batch dimension
        '''
        array = tf.TensorArray(dtype=v.dtype, size=self.tsteps)
        make_time_dim_first = [1, 0] + range(2, len(v.get_shape()))
        v_T = tf.transpose(v, make_time_dim_first)
        return array.unstack(v_T)
BIN	copy/dnc/dnc.pyc	Normal file
Binary file not shown.
250	copy/dnc/memory.py	Executable file
@@ -0,0 +1,250 @@
# Differentiable Neural Computer
# inspired by (http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html)
# some ideas taken from https://github.com/Mostafa-Samir/DNC-tensorflow
# Sam Greydanus. February 2017. MIT License.

import tensorflow as tf
import numpy as np

class Memory():
    def __init__(self, R, W, N, batch_size):
        '''
        Defines how the interface vector zeta interacts with the memory state of the DNC

        Parameters:
        ----------
        R: the number of DNC read heads
        W: the DNC "word length" (length of each DNC memory vector)
        N: the number of DNC word vectors (corresponds to memory size)
        batch_size: the number of batches
        '''

        self.R = R
        self.W = W
        self.N = N
        self.batch_size = batch_size

        # when we go from 2D indexes to a flat 1D vector, we need to reindex using these shifts
        ix_flat_shifts = tf.constant(np.cumsum([0] + [N] * (batch_size - 1)), dtype=tf.int32)
        self.ix_flat_shifts = tf.expand_dims(ix_flat_shifts, [1])

        # N x N identity matrix
        self.I = tf.eye(N)
        self.eps = 1e-6
        self.state_keys = ['M', 'u', 'p', 'L', 'w_w', 'w_r', 'r']

    def zero_state(self):
        '''
        Supplies the initial state of the DNC's memory

        Returns: Tuple(7)
            dnc_state: contains initial values for (M, u, p, L, w_w, w_r, r) respectively. According to the DNC paper:
                M: (batch_size, N, W) the memory matrix
                u: (batch_size, N) the usage vector
                p: (batch_size, N) the precedence weighting (helps update L)
                L: (batch_size, N, N) the temporal linkage matrix (helps the DNC remember what order things were written)
                w_w: (batch_size, N) the write weighting - says where the DNC wrote a word last time step
                w_r: (batch_size, N, R) the read weightings - say which word vectors the DNC accessed last time step
                r: (batch_size, W, R) the read vectors - the words the DNC read last time step
        '''
        return [
            tf.fill([self.batch_size, self.N, self.W], self.eps),   # M
            tf.zeros([self.batch_size, self.N, ]),                  # u
            tf.zeros([self.batch_size, self.N, ]),                  # p
            tf.zeros([self.batch_size, self.N, self.N]),            # L
            tf.fill([self.batch_size, self.N, ], self.eps),         # w_w
            tf.fill([self.batch_size, self.N, self.R], self.eps),   # w_r
            tf.fill([self.batch_size, self.W, self.R], self.eps),   # r
            ]

    def content_addressing(self, M, kappa, beta):
        '''
        Computes the probabilities that each word vector in memory was the target of a given key (see paper)
        '''
        norm_M = tf.nn.l2_normalize(M, 2)
        norm_kappa = tf.nn.l2_normalize(kappa, 1)
        similarity = tf.matmul(norm_M, norm_kappa)

        return tf.nn.softmax(similarity * tf.expand_dims(beta, 1), 1)

    def update_u(self, u, w_r, w_w, f):
        '''
        Computes the new usage vector. This tells the DNC which memory slots are being used and which are free (see paper)
        '''
        f = tf.expand_dims(f, 1) # need to match w_r dimensions
        psi = tf.reduce_prod(1 - w_r * f, 2) # psi tells us what usage to reserve
        next_u = (u + w_w - u * w_w) * psi # update u based on what we wrote last time
        return next_u

    def get_allocation(self, next_u):
        '''
        Computes the allocation vector. This tells the DNC where it COULD write its next memory (see paper)
        '''
        u_sorted, u_ix = tf.nn.top_k(-1 * next_u, self.N) # sort by ascending usage (freest slots first)
        u_sorted = -1 * u_sorted
        a_sorted = (1 - u_sorted) * tf.cumprod(u_sorted, axis=1, exclusive=True) # classic DNC cumprod

        # indexing wizardry to account for multiple batches
        ix_flat = u_ix + self.ix_flat_shifts
        ix_flat = tf.reshape(ix_flat, (-1,))
        flat_array = tf.TensorArray(tf.float32, self.batch_size * self.N)

        a_scattered = flat_array.scatter(ix_flat, tf.reshape(a_sorted, (-1,))) # undo the sort
        a = a_scattered.stack() # put back into a Tensor
        return tf.reshape(a, (self.batch_size, self.N))

    def update_w_w(self, c_w, a, g_w, g_a):
        '''
        Computes the new write weighting. This tells the DNC where (and if) it will write its next memory (see paper)
        '''
        c_w = tf.squeeze(c_w) # want c_w as a (batched) vector
        next_w_w = g_w * (g_a * a + (1 - g_a) * c_w) # apply the allocation and write gates
        return next_w_w

    def update_M(self, M, w_w, v, e):
        '''
        Computes the new memory matrix. This is where the DNC actually stores memories (see paper)
        '''
        # expand data to force matmul to behave as an outer product
        w_w = tf.expand_dims(w_w, 2)
        v = tf.expand_dims(v, 1)
        e = tf.expand_dims(e, 1)

        # think of the memory update as a bunch of elementwise interpolations
        M_erase = M * (1 - tf.matmul(w_w, e))
        M_write = tf.matmul(w_w, v)
        next_M = M_erase + M_write
        return next_M

    def update_p(self, p, w_w):
        '''
        Updates the precedence vector. This tells the DNC how much each location was the last one written to (see paper)
        '''
        interpolate = 1 - tf.reduce_sum(w_w, 1, keep_dims=True)
        next_p = interpolate * p + w_w
        return next_p

    def update_L(self, p, L, w_w):
        '''
        Updates the temporal linkage matrix. This tells the DNC what order it has written things to memory (see paper)
        '''
        w_w = tf.expand_dims(w_w, 2)
        p = tf.expand_dims(p, 1)

        # compute "outer sum" of w_w
        c_w_w = tf.reshape(w_w, (-1, self.N, 1))
        U = tf.tile(c_w_w, [1, 1, self.N])
        w_w_outer_sum = U + tf.transpose(U, [0, 2, 1])

        next_L = (1 - w_w_outer_sum) * L + tf.matmul(w_w, p) # update L
        return (1 - self.I) * next_L # get rid of links to self

    def get_bf_w(self, w_r, L):
        '''
        Gets the write locations immediately before and after a given write location. This lets the DNC traverse memories in order (see paper)
        '''
        f_w = tf.matmul(L, w_r)
        b_w = tf.matmul(L, w_r, adjoint_a=True) # transpose the first argument
        return f_w, b_w

    def update_w_r(self, c_w, f_w, b_w, pi):
        '''
        Updates the read weighting. This tells the DNC's read heads which memories to extract (see paper)
        '''
        backward = tf.expand_dims(pi[:, 0, :], 1) * b_w
        content = tf.expand_dims(pi[:, 1, :], 1) * c_w
        forward = tf.expand_dims(pi[:, 2, :], 1) * f_w
        next_w_r = backward + content + forward
        return next_w_r

    def update_r(self, M, w_r):
        '''
        Gets the DNC's output. This vector contains the outputs of the DNC's read heads (see paper)
        '''
        return tf.matmul(M, w_r, adjoint_a=True) # transpose the first argument

    def write(self, zeta, state):
        '''
        Performs a write action on the DNC's memory

        Parameters:
        ----------
        zeta: dict
            variable names (string) mapping to tensors (Tensor), includes:
                'kappa_r': (batch_size, W, R) read key (there are R of them)
                'beta_r': (batch_size, R) read strength
                'kappa_w': (batch_size, W, 1) write key
                'beta_w': (batch_size, 1) write strength
                'e': (batch_size, W) erase vector
                'v': (batch_size, W) write vector
                'f': (batch_size, R) free gates (R of them)
                'g_a': (batch_size, 1) allocation gate
                'g_w': (batch_size, 1) write gate
                'pi': (batch_size, 3, R) read modes (backward, content, forward)
                ... see paper for more info
        state: dict
            contains initial values for (M, u, p, L, w_w, w_r, r) respectively. According to the DNC paper:
                M: (batch_size, N, W) the memory matrix
                u: (batch_size, N) the usage vector
                p: (batch_size, N) the precedence weighting (helps update L)
                L: (batch_size, N, N) the temporal linkage matrix (helps the DNC remember what order things were written)
                w_w: (batch_size, N) the write weighting - says where the DNC wrote a word last time step
                w_r: (batch_size, N, R) the read weightings - say which word vectors the DNC accessed last time step

        Returns: Tuple(5)
            next_u: Tensor
            next_w_w: Tensor
            next_M: Tensor
            next_L: Tensor
            next_p: Tensor
        '''
        c_w = self.content_addressing(state['M'], zeta['kappa_w'], zeta['beta_w'])
        next_u = self.update_u(state['u'], state['w_r'], state['w_w'], zeta['f'])

        a = self.get_allocation(next_u)
        next_w_w = self.update_w_w(c_w, a, zeta['g_w'], zeta['g_a'])
        next_M = self.update_M(state['M'], next_w_w, zeta['v'], zeta['e'])
        next_L = self.update_L(state['p'], state['L'], next_w_w)
        next_p = self.update_p(state['p'], next_w_w)

        return next_u, next_w_w, next_M, next_L, next_p

    def read(self, zeta, state):
        '''
        Performs a read action on the DNC's memory

        Parameters:
        ----------
        zeta: dict
            variable names (string) mapping to tensors (Tensor), includes:
                'kappa_r': (batch_size, W, R) read key (there are R of them)
                'beta_r': (batch_size, R) read strength
                'kappa_w': (batch_size, W, 1) write key
                'beta_w': (batch_size, 1) write strength
                'e': (batch_size, W) erase vector
                'v': (batch_size, W) write vector
                'f': (batch_size, R) free gates (R of them)
                'g_a': (batch_size, 1) allocation gate
                'g_w': (batch_size, 1) write gate
                'pi': (batch_size, 3, R) read modes (backward, content, forward)
                ... see paper for more info
        state: dict
            contains initial values for (M, u, p, L, w_w, w_r, r) respectively. According to the DNC paper:
                M: (batch_size, N, W) the memory matrix
                u: (batch_size, N) the usage vector
                p: (batch_size, N) the precedence weighting (helps update L)
                L: (batch_size, N, N) the temporal linkage matrix (helps the DNC remember what order things were written)
                w_w: (batch_size, N) the write weighting - says where the DNC wrote a word last time step
                w_r: (batch_size, N, R) the read weightings - say which word vectors the DNC accessed last time step

        Returns: Tuple(2)
            next_w_r: Tensor
            next_r: Tensor
        '''
        c_w = self.content_addressing(state['M'], zeta['kappa_r'], zeta['beta_r'])
        f_w, b_w = self.get_bf_w(state['w_r'], state['L'])
        next_w_r = self.update_w_r(c_w, f_w, b_w, zeta['pi'])
        next_r = self.update_r(state['M'], next_w_r)
        return next_w_r, next_r

    def step(self, zeta, state):
        '''
        Combines the read and write operations into a single memory update step.
        '''
        state['u'], state['w_w'], state['M'], state['L'], state['p'] = self.write(zeta, state)
        state['w_r'], state['r'] = self.read(zeta, state)
        return state
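A NumPy sketch (not part of the repo; single batch element, hypothetical usage values) of the allocation rule that `get_allocation` implements: sort slots by ascending usage, give each slot (1 - u) times the product of the usages of all freer slots, then undo the sort.

```python
import numpy as np

u = np.array([0.9, 0.1, 0.5, 0.05])          # hypothetical usage for N = 4 memory slots

order = np.argsort(u)                         # ascending usage (freest slot first)
u_sorted = u[order]
# exclusive cumulative product, as in tf.cumprod(..., exclusive=True)
cumprod_excl = np.concatenate(([1.0], np.cumprod(u_sorted)[:-1]))
a_sorted = (1 - u_sorted) * cumprod_excl

a = np.empty_like(u)
a[order] = a_sorted                           # undo the sort, like the TensorArray scatter above
print(a)   # the freest slot receives nearly all of the allocation weighting
```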
BIN	copy/dnc/memory.pyc	Normal file
Binary file not shown.
30	copy/nn_controller.py	Executable file
@@ -0,0 +1,30 @@
import numpy as np
import tensorflow as tf
from dnc.controller import Controller
from tensorflow.contrib.rnn.python.ops.core_rnn_cell import LSTMStateTuple


"""
A 2-layer feedforward neural network with 128 and 256 nodes respectively
"""

class NNController(Controller):

    def init_controller_params(self):
        init = tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32)

        self.params['W1'] = tf.get_variable("W1", [self.chi_dim, 128], initializer=init)
        self.params['b1'] = tf.get_variable("b1", [128], initializer=init)
        self.params['W2'] = tf.get_variable("W2", [128, 256], initializer=init)
        self.params['b2'] = tf.get_variable("b2", [256], initializer=init)

    def nn_step(self, X, state):
        z1 = tf.matmul(X, self.params['W1']) + self.params['b1']
        h1 = tf.nn.relu(z1)
        z2 = tf.matmul(h1, self.params['W2']) + self.params['b2']
        h2 = tf.nn.relu(z2)
        return h2, state

    def zero_state(self):
        return LSTMStateTuple(tf.zeros(1), tf.zeros(1))
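For the copy-task settings used in the notebooks (xlen = 6, R = 1, W = 10), a quick shape walk-through (not part of the repo) of how this controller's dimensions line up with the base `Controller` class:

```python
xlen, R, W = 6, 1, 10                 # copy-task flag values from the notebooks

chi_dim = xlen + W * R                # input + flattened read vectors = 16
hidden = [128, 256]                   # layer widths chosen in NNController
controller_dim = hidden[-1]           # nn_step output width, discovered by get_controller_dim
zeta_dim = W * R + 3 * W + 5 * R + 3  # interface vector width = 48

# per-step matmuls in Controller.step / next_y_hat:
# W1: (16, 128), W2: (128, 256), W_z: (256, 48), W_v: (256, 6), W_r: (10, 6)
print(chi_dim, controller_dim, zeta_dim)   # 16 256 48
```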
BIN	copy/nn_controller.pyc	Normal file
Binary file not shown.
BIN	copy/static/dnc_schema.png	Normal file
Binary file not shown. Size: 117 KiB
0	dnc/__init__.py	Executable file
BIN	dnc/__init__.pyc	Normal file
Binary file not shown.
161	dnc/controller.py	Executable file
|
||||
# Differentiable Neural Computer
|
||||
# inspired by (http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html)
|
||||
# some ideas taken from https://github.com/Mostafa-Samir/DNC-tensorflow
|
||||
# Sam Greydanus. February 2017. MIT License.
|
||||
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
class Controller():
|
||||
def __init__(self, FLAGS):
|
||||
'''
|
||||
An interface that defines how the neural network "controller" interacts with the DNC
|
||||
Parameters:
|
||||
----------
|
||||
FLAGS: a set of TensorFlow FlagValues which must include
|
||||
FLAGS.xlen: the length of the input vector of the controller
|
||||
FLAGS.ylen: the length of the output vector of the controller
|
||||
FLAGS.batch_size: the number of batches
|
||||
FLAGS.R: the number of DNC read heads
|
||||
FLAGS.W: the DNC "word length" (length of each DNC memory vector)
|
||||
|
||||
Returns: Tensor (batch_size, nn_output_size)
|
||||
'''
|
||||
|
||||
self.xlen = FLAGS.xlen
|
||||
self.ylen = FLAGS.ylen
|
||||
self.batch_size = FLAGS.batch_size
|
||||
self.R = R = FLAGS.R
|
||||
self.W = W = FLAGS.W
|
||||
|
||||
self.chi_dim = self.xlen + self.W * self.R
|
||||
self.zeta_dim = W*R + 3*W + 5*R + 3
|
||||
self.r_dim = W*R
|
||||
|
||||
# define network vars
|
||||
self.params = {}
|
||||
with tf.name_scope("controller"):
|
||||
self.init_controller_params()
|
||||
self.controller_dim = self.get_controller_dim()
|
||||
|
||||
init = tf.truncated_normal_initializer(stddev=0.075, dtype=tf.float32)
|
||||
self.params['W_z'] = tf.get_variable("W_z", [self.controller_dim, self.zeta_dim], initializer=init)
|
||||
self.params['W_v'] = tf.get_variable("W_v", [self.controller_dim, self.ylen], initializer=init)
|
||||
self.params['W_r'] = tf.get_variable("W_r", [self.r_dim, self.ylen], initializer=init)
|
||||
|
||||
def init_controller_params(self):
|
||||
'''
|
||||
Initializes all the parameters of the neural network controller
|
||||
'''
|
||||
raise NotImplementedError("init_controller_params does not exist")
|
||||
|
||||
def nn_step(self, chi, state):
|
||||
'''
|
||||
Performs the feedforward step of the controller in order to get the DNC interface vector, zeta
|
||||
Parameters:
|
||||
----------
|
||||
chi: Tensor (batch_size, chi_dim)
|
||||
the input concatenated with the previous output of the DNC
|
||||
state: LSTMStateTensor or another type of state tensor
|
||||
Returns: Tuple
|
||||
zeta_hat: Tensor (batch_size, controller_dim)
|
||||
next_state: LSTMStateTensor or another type of state tensor
|
||||
'''
|
||||
raise NotImplementedError("nn_step does not exist")
|
||||
|
||||
def zero_state(self):
|
||||
'''
|
||||
Returns the initial state of the controller. If the controller is not recurrent, it still needs to return a dummy value
|
||||
Returns: LSTMStateTensor or another type of state tensor
|
||||
nn_state: LSTMStateTensor or another type of state tensor
|
||||
'''
|
||||
raise NotImplementedError("get_state does not exist")
|
||||
|
||||
def get_controller_dim(self):
|
||||
'''
|
||||
Feeds zeros through the controller and obtains an output in order to find the controller's output dimension
|
||||
Returns: int
|
||||
controller_dim: the output dimension of the controller
|
||||
'''
|
||||
test_chi = tf.zeros([self.batch_size, self.chi_dim])
|
||||
nn_output, nn_state = self.nn_step(test_chi, state=None)
|
||||
return nn_output.get_shape().as_list()[-1]
|
||||
|
||||
def prepare_interface(self, zeta_hat):
|
||||
'''
|
||||
Packages the interface vector, zeta, as a dictionary of variables as described in the DNC Nature paper
|
||||
Parameters:
|
||||
----------
|
||||
zeta_hat: Tensor (batch_size, zeta_dim)
|
||||
the interface vector before processing, zeta_hat
|
||||
Returns: dict
|
||||
zeta: variable names (string) mapping to tensors (Tensor)
|
||||
'''
|
||||
zeta = {}
|
||||
R, W = self.R, self.W
|
||||
splits = np.cumsum([0,W*R,R,W,1,W,W,R,1,1,3*R])
|
||||
vs = [zeta_hat[:, splits[i]:splits[i+1]] for i in range(len(splits)-1)]
|
||||
|
||||
kappa_r = tf.reshape(vs[0], (-1, W, R))
|
||||
beta_r = tf.reshape(vs[1], (-1, R))
|
||||
kappa_w = tf.reshape(vs[2], (-1, W, 1))
|
||||
beta_w = tf.reshape(vs[3], (-1, 1))
|
||||
e = tf.reshape(vs[4], (-1, W))
|
||||
v = tf.reshape(vs[5], (-1, W))
|
||||
f = tf.reshape(vs[6], (-1, R))
|
||||
g_a = vs[7]
|
||||
g_w = vs[8]
|
||||
pi = tf.reshape(vs[9], (-1, 3, R))
|
||||
|
||||
zeta['kappa_r'] = kappa_r
|
||||
zeta['beta_r'] = 1 + tf.nn.softplus(beta_r)
|
||||
zeta['kappa_w'] = kappa_w
|
||||
zeta['beta_w'] = 1 + tf.nn.softplus(beta_w)
|
||||
zeta['e'] = tf.nn.sigmoid(e)
|
||||
zeta['v'] = v
|
||||
zeta['f'] = tf.nn.sigmoid(f)
|
||||
zeta['g_a'] = tf.nn.sigmoid(g_a)
|
||||
zeta['g_w'] = tf.nn.sigmoid(g_w)
|
||||
zeta['pi'] = tf.nn.softmax(pi, 1)
|
||||
|
||||
return zeta
|
||||
|
||||
def step(self, X, r_prev, state):
|
||||
'''
|
||||
The sum of operations executed by the Controller at a given time step before interfacing with the DNC
|
||||
Parameters:
|
||||
----------
|
||||
X: Tensor (batch_size, chi_dim)
|
||||
the input for this time step
|
||||
r_prev: previous output of the DNC
|
||||
state: LSTMStateTensor or another type of state tensor
|
||||
Returns: Tuple
|
||||
v: Tensor (batch_size, ylen)
|
||||
The controller's output (eventually added elementwise to the DNC output)
|
||||
zeta: The processed interface vector which the network will use to interact with the DNC
|
||||
nn_state: LSTMStateTensor or another type of state tensor
|
||||
'''
|
||||
r_prev = tf.reshape(r_prev, (-1, self.r_dim)) # flatten
|
||||
chi = tf.concat([X, r_prev], 1)
|
||||
nn_output, nn_state = self.nn_step(chi, state)
|
||||
|
||||
v = tf.matmul(nn_output, self.params['W_v'])
|
||||
zeta_hat = tf.matmul(nn_output, self.params['W_z'])
|
||||
zeta = self.prepare_interface(zeta_hat)
|
||||
return v, zeta, nn_state
|
||||
|
||||
    def next_y_hat(self, v, r):
        '''
        The set of operations executed by the Controller at a given time step, after interacting with the DNC
        Parameters:
        ----------
        v: Tensor (batch_size, ylen)
            The controller's output (added elementwise to the DNC output)
        r: Tensor (batch_size, W, R)
            the DNC read vectors from the current time step
        Returns: Tensor (batch_size, ylen)
            y_hat: Tensor (batch_size, ylen)
                The DNC's output
        '''
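        # y_hat = v + W_r * [r_1; ...; r_R]: the controller output plus a learned
        # projection of the concatenated read vectors (the paper's output equation)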
        r = tf.reshape(r, (-1, self.W * self.R))  # flatten
        y_hat = v + tf.matmul(r, self.params['W_r'])
        return y_hat
BIN
dnc/controller.pyc
Normal file
Binary file not shown.
150
dnc/dnc.py
Executable file
@ -0,0 +1,150 @@
# Differentiable Neural Computer
# inspired by (http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html)
# some ideas taken from https://github.com/Mostafa-Samir/DNC-tensorflow
# Sam Greydanus. February 2017. MIT License.

import tensorflow as tf
import numpy as np
from memory import Memory

import os


class DNC:
    def __init__(self, make_controller, FLAGS):
        '''
        Builds a TensorFlow graph for the Differentiable Neural Computer. Uses TensorArrays and a while loop for efficiency
        Parameters:
        ----------
        make_controller: Controller
            A class which inherits from the Controller class. We build the controller object inside this function
        FLAGS: a set of TensorFlow FlagValues which must include
            FLAGS.xlen: the length of the input vector of the controller
            FLAGS.ylen: the length of the output vector of the controller
            FLAGS.batch_size: the number of sequences in each training minibatch
            FLAGS.R: the number of DNC read heads
            FLAGS.W: the DNC "word length" (length of each DNC memory vector)
            FLAGS.N: the number of DNC word vectors (corresponds to memory size)
        '''
        self.xlen = xlen = FLAGS.xlen
        self.ylen = ylen = FLAGS.ylen
        self.batch_size = batch_size = FLAGS.batch_size
        self.R = R = FLAGS.R
        self.W = W = FLAGS.W
        self.N = N = FLAGS.N

        # create 1) the DNC's memory and 2) the DNC's controller
        self.memory = Memory(R, W, N, batch_size)
        self.controller = make_controller(FLAGS)

        # input data placeholders
        self.X = tf.placeholder(tf.float32, [batch_size, None, xlen], name='X')
        self.y = tf.placeholder(tf.float32, [batch_size, None, ylen], name='y')
        self.tsteps = tf.placeholder(tf.int32, name='tsteps')

        self.X_tensor_array = self.unstack_time_dim(self.X)

        # initialize states
        nn_state = self.controller.zero_state()
        dnc_state = self.memory.zero_state()

        # values for which we want a history
        self.hist_keys = ['y_hat', 'f', 'g_a', 'g_w', 'w_r', 'w_w', 'u']
        dnc_hist = [tf.TensorArray(tf.float32, self.tsteps) for _ in range(len(self.hist_keys))]

        # loop through time
        with tf.variable_scope("while_loop") as scope:
            time = tf.constant(0, dtype=tf.int32)

            output = tf.while_loop(
                cond=lambda time, *_: time < self.tsteps,
                body=self.step,
                loop_vars=(time, nn_state, dnc_state, dnc_hist),
                )
            (_, next_nn_state, next_dnc_state, dnc_hist) = output

        # write down the history
        with tf.control_dependencies(next_dnc_state):
            self.dnc_hist = {self.hist_keys[i]: self.stack_time_dim(v) for i, v in enumerate(dnc_hist)}  # convert to dict

    def step(self, time, nn_state, dnc_state, dnc_hist):
        '''
        Performs the feedforward step of the DNC in order to get the DNC output
        Parameters:
        ----------
        time: scalar int32 Tensor
            the current time step of the model
        nn_state: LSTMStateTensor or another type of state tensor
            for the controller network
        dnc_state: Tuple
            set of 7 Tensors which define the current state of the DNC (M, u, p, L, w_w, w_r, r) ...see paper
        dnc_hist: Tuple
            set of 7 TensorArrays which track the historical states of the DNC (y_hat, f, g_a, g_w, w_r, w_w, u). Good for visualization
        Returns: Tuple
            same as input parameters, but updated for the current time step
        '''

        # map from tuple to dict for readability
        dnc_state = {self.memory.state_keys[i]: v for i, v in enumerate(dnc_state)}
        dnc_hist = {self.hist_keys[i]: v for i, v in enumerate(dnc_hist)}

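        # per-step data flow: the controller consumes [x_t; r_{t-1}] and emits (v_t, zeta_t);
        # the memory applies zeta_t to produce its next state, including new read vectors r_t;
        # the step's output is y_hat_t = v_t + W_r * r_t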
        # one full pass!
        X_t = self.X_tensor_array.read(time)
        v, zeta, next_nn_state = self.controller.step(X_t, dnc_state['r'], nn_state)
        next_dnc_state = self.memory.step(zeta, dnc_state)
        y_hat = self.controller.next_y_hat(v, next_dnc_state['r'])

        dnc_hist['y_hat'] = dnc_hist['y_hat'].write(time, y_hat)
        dnc_hist['f'] = dnc_hist['f'].write(time, zeta['f'])
        dnc_hist['g_a'] = dnc_hist['g_a'].write(time, zeta['g_a'])
        dnc_hist['g_w'] = dnc_hist['g_w'].write(time, zeta['g_w'])
        dnc_hist['w_r'] = dnc_hist['w_r'].write(time, next_dnc_state['w_r'])
        dnc_hist['w_w'] = dnc_hist['w_w'].write(time, next_dnc_state['w_w'])
        dnc_hist['u'] = dnc_hist['u'].write(time, next_dnc_state['u'])

        # map from dict to tuple for tf.while_loop :/
        next_dnc_state = [next_dnc_state[k] for k in self.memory.state_keys]
        dnc_hist = [dnc_hist[k] for k in self.hist_keys]

        time += 1
        return time, next_nn_state, next_dnc_state, dnc_hist

    def get_outputs(self):
        '''
        Allows the user to access the output of the DNC after all time steps have been executed
        Returns: tuple
            y_hat: Tensor (batch_size, tsteps, ylen)
                The DNC's output
            dnc_hist: dict
                Maps each of the keys (y_hat, f, g_a, g_w, w_r, w_w, u) to a Tensor of its values for all time steps
        '''
        return self.dnc_hist['y_hat'], self.dnc_hist

    def stack_time_dim(self, v):
        '''
        Stacks a TensorArray along its time dimension, then transposes so that the time dimension is at index [1]
        Parameters:
        ----------
        v: TensorArray [(batch_size, ...), ...]
            An array of n-dimensional tensors where, for each, the first dimension is the batch dimension
        Returns: Tensor (batch_size, tsteps, ...)
            u: Tensor (batch_size, tsteps, ...)
                The stacked tensor with index [1] as the time dimension
        '''
        stacked = v.stack()
        return tf.transpose(stacked, [1, 0] + range(2, len(stacked.get_shape())))

    def unstack_time_dim(self, v):
        '''
        Unstacks a Tensor along its time dimension into a TensorArray
        Parameters:
        ----------
        v: Tensor (batch_size, tsteps, ...)
            An n-dimensional tensor where dim[0] is the batch dimension and dim[1] is the time dimension
        Returns: TensorArray [(batch_size, ...), ...]
            u: TensorArray [(batch_size, ...), ...]
                An array of n-dimensional tensors where, for each, the first dimension is the batch dimension
        '''
        array = tf.TensorArray(dtype=v.dtype, size=self.tsteps)
        make_time_dim_first = [1, 0] + range(2, len(v.get_shape()))
        v_T = tf.transpose(v, make_time_dim_first)
        return array.unstack(v_T)
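
# Minimal usage sketch (assumes a hypothetical Controller subclass `MyController`,
# FLAGS configured as described in __init__, and a sigmoid cross-entropy loss):
#   dnc = DNC(MyController, FLAGS)
#   y_hat, dnc_hist = dnc.get_outputs()
#   loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=dnc.y))
#   sess.run(loss, feed_dict={dnc.X: X_batch, dnc.y: y_batch, dnc.tsteps: num_steps})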
BIN
dnc/dnc.pyc
Normal file
Binary file not shown.
250
dnc/memory.py
Executable file
@ -0,0 +1,250 @@
# Differentiable Neural Computer
# inspired by (http://www.nature.com/nature/journal/v538/n7626/full/nature20101.html)
# some ideas taken from https://github.com/Mostafa-Samir/DNC-tensorflow
# Sam Greydanus. February 2017. MIT License.

import tensorflow as tf
import numpy as np


class Memory():
    def __init__(self, R, W, N, batch_size):
        '''
        Defines how the interface vector zeta interacts with the memory state of the DNC
        Parameters:
        ----------
        R: the number of DNC read heads
        W: the DNC "word length" (length of each DNC memory vector)
        N: the number of DNC word vectors (corresponds to memory size)
        batch_size: the number of sequences in each training minibatch
        '''

        self.R = R
        self.W = W
        self.N = N
        self.batch_size = batch_size

        # when we go from 2D indexes to a flat 1D vector, we need to reindex using these shifts
        ix_flat_shifts = tf.constant(np.cumsum([0] + [N] * (batch_size - 1)), dtype=tf.int32)
        self.ix_flat_shifts = tf.expand_dims(ix_flat_shifts, [1])

        # N x N identity matrix
        self.I = tf.eye(N)
        self.eps = 1e-6
        self.state_keys = ['M', 'u', 'p', 'L', 'w_w', 'w_r', 'r']

    def zero_state(self):
        '''
        Supplies the initial state of the DNC's memory

        Returns: Tuple(7)
        dnc_state: contains initial values for (M, u, p, L, w_w, w_r, r) respectively. According to the DNC paper:
            M: (batch_size, N, W) the memory matrix
            u: (batch_size, N) the usage vector
            p: (batch_size, N) the precedence weighting (helps update L)
            L: (batch_size, N, N) the temporal linkage matrix (helps DNC remember what order things were written)
            w_w: (batch_size, N) the write weighting - says where the DNC wrote a word last time step
            w_r: (batch_size, N, R) the read weightings - say which word vectors the DNC accessed last time step
            r: (batch_size, W, R) the read vectors - the words returned by the DNC's read heads last time step
        '''
        return [
            tf.fill([self.batch_size, self.N, self.W], self.eps),  # M
            tf.zeros([self.batch_size, self.N, ]),  # u
            tf.zeros([self.batch_size, self.N, ]),  # p
            tf.zeros([self.batch_size, self.N, self.N]),  # L
            tf.fill([self.batch_size, self.N, ], self.eps),  # w_w
            tf.fill([self.batch_size, self.N, self.R], self.eps),  # w_r
            tf.fill([self.batch_size, self.W, self.R], self.eps),  # r
            ]

    def content_addressing(self, M, kappa, beta):
        '''
        Computes the probabilities that each word vector in memory was the target of a given key (see paper)
        '''
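        # content weighting from the paper: C(M, kappa, beta)[i] = softmax_i( beta * cosine(M[i], kappa) )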
        norm_M = tf.nn.l2_normalize(M, 2)
        norm_kappa = tf.nn.l2_normalize(kappa, 1)
        similarity = tf.matmul(norm_M, norm_kappa)

        return tf.nn.softmax(similarity * tf.expand_dims(beta, 1), 1)

    def update_u(self, u, w_r, w_w, f):
        '''
        Computes the new usage vector. This tells the DNC which memory slots are being used and which are free (see paper)
        '''
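        # retention and usage from the paper: psi = prod_i (1 - f_i * w_r_i)
        # and u_t = (u_{t-1} + w_w - u_{t-1} * w_w) * psi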
        f = tf.expand_dims(f, 1)  # need to match w_r dimensions
        psi = tf.reduce_prod(1 - w_r * f, 2)  # psi tells us what usage to reserve
        next_u = (u + w_w - u * w_w) * psi  # update u based on what we wrote last time
        return next_u

    def get_allocation(self, next_u):
        '''
        Computes the allocation vector. This tells the DNC where it COULD write its next memory (see paper)
        '''
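        # allocation from the paper: sort locations by increasing usage, then
        # a_sorted[j] = (1 - u_sorted[j]) * prod_{i<j} u_sorted[i], and un-sort the result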
        u_sorted, u_ix = tf.nn.top_k(-1 * next_u, self.N)  # sort by ascending usage (least used first)
        u_sorted = -1 * u_sorted
        a_sorted = (1 - u_sorted) * tf.cumprod(u_sorted, axis=1, exclusive=True)  # classic DNC cumprod

        # indexing wizardry to account for multiple batches
        ix_flat = u_ix + self.ix_flat_shifts
        ix_flat = tf.reshape(ix_flat, (-1,))
        flat_array = tf.TensorArray(tf.float32, self.batch_size * self.N)

        a_scattered = flat_array.scatter(ix_flat, tf.reshape(a_sorted, (-1,)))  # undo the sort
        a = a_scattered.stack()  # put back into a Tensor
        return tf.reshape(a, (self.batch_size, self.N))

    def update_w_w(self, c_w, a, g_w, g_a):
        '''
        Computes the new write weighting. This tells the DNC where (and if) it will write its next memory (see paper)
        '''
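        # write weighting from the paper: w_w = g_w * ( g_a * a + (1 - g_a) * c_w )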
        c_w = tf.squeeze(c_w)  # want c_w as a (batched) vector
        next_w_w = g_w * (g_a * a + (1 - g_a) * c_w)  # apply the allocation and write gates
        return next_w_w

    def update_M(self, M, w_w, v, e):
        '''
        Computes the new memory matrix. This is where the DNC actually stores memories (see paper)
        '''
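        # memory update from the paper: M_t = M_{t-1} * (1 - w_w e^T) + w_w v^T
        # (erase with the erase vector, then write the new word; both terms are outer products)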
        # expand data to force matmul to behave as an outer product
        w_w = tf.expand_dims(w_w, 2)
        v = tf.expand_dims(v, 1)
        e = tf.expand_dims(e, 1)

        # think of the memory update as a bunch of elementwise interpolations
        M_erase = M * (1 - tf.matmul(w_w, e))
        M_write = tf.matmul(w_w, v)
        next_M = M_erase + M_write
        return next_M

    def update_p(self, p, w_w):
        '''
        Updates the precedence vector. This tells the DNC the degree to which each location was the most recently written one (see paper)
        '''
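        # precedence update from the paper: p_t = (1 - sum_i w_w[i]) * p_{t-1} + w_w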
        interpolate = 1 - tf.reduce_sum(w_w, 1, keep_dims=True)
        next_p = interpolate * p + w_w
        return next_p

    def update_L(self, p, L, w_w):
        '''
        Updates the temporal linkage matrix. This tells the DNC what order it has written things to memory (see paper)
        '''
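        # linkage update from the paper: L_t[i,j] = (1 - w_w[i] - w_w[j]) * L_{t-1}[i,j] + w_w[i] * p_{t-1}[j],
        # with the diagonal (links from a location to itself) zeroed out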
        w_w = tf.expand_dims(w_w, 2)
        p = tf.expand_dims(p, 1)

        # compute "outer sum" of w_w
        c_w_w = tf.reshape(w_w, (-1, self.N, 1))
        U = tf.tile(c_w_w, [1, 1, self.N])
        w_w_outer_sum = U + tf.transpose(U, [0, 2, 1])

        next_L = (1 - w_w_outer_sum) * L + tf.matmul(w_w, p)  # update L
        return (1 - self.I) * next_L  # get rid of links to self

    def get_bf_w(self, w_r, L):
        '''
        Gets the forward and backward weightings: the locations written immediately after and before the ones each read head last visited. This lets the DNC traverse memories in order (see paper)
        '''
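        # from the paper: f = L w_r (one step forward in write order), b = L^T w_r (one step backward)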
        f_w = tf.matmul(L, w_r)
        b_w = tf.matmul(L, w_r, adjoint_a=True)  # transpose the first argument
        return f_w, b_w

    def update_w_r(self, c_w, f_w, b_w, pi):
        '''
        Updates the read weighting. This tells the DNC's read heads which memories to extract (see paper)
        '''
        backward = tf.expand_dims(pi[:, 0, :], 1) * b_w
        content = tf.expand_dims(pi[:, 1, :], 1) * c_w
        forward = tf.expand_dims(pi[:, 2, :], 1) * f_w
        next_w_r = backward + content + forward
        return next_w_r

    def update_r(self, M, w_r):
        '''
        Gets the DNC's output. This vector contains the outputs of the DNC's read heads (see paper)
        '''
        return tf.matmul(M, w_r, adjoint_a=True)  # transpose the first argument

    def write(self, zeta, state):
        '''
        Performs a write action on the DNC's memory
        Parameters:
        ----------
        zeta: dict
            variable names (string) mapping to tensors (Tensor) includes:
            'kappa_r': (batch_size, W, R) read keys (there are R of them)
            'beta_r': (batch_size, R) read strengths
            'kappa_w': (batch_size, W, 1) write key
            'beta_w': (batch_size, 1) write strength
            'e': (batch_size, W) erase vector
            'v': (batch_size, W) write vector
            'f': (batch_size, R) free gates (R of them)
            'g_a': (batch_size, 1) allocation gate
            'g_w': (batch_size, 1) write gate
            'pi': (batch_size, 3, R) read modes (backward, content, forward)
            ... see paper for more info
        state: dict
            contains current values for (M, u, p, L, w_w, w_r, r) respectively. According to the DNC paper:
            M: (batch_size, N, W) the memory matrix
            u: (batch_size, N) the usage vector
            p: (batch_size, N) the precedence weighting (helps update L)
            L: (batch_size, N, N) the temporal linkage matrix (helps DNC remember what order things were written)
            w_w: (batch_size, N) the write weighting - says where the DNC wrote a word last time step
            w_r: (batch_size, N, R) the read weightings - say which word vectors the DNC accessed last time step
        Returns: Tuple(5)
            next_u: Tensor
            next_w_w: Tensor
            next_M: Tensor
            next_L: Tensor
            next_p: Tensor
        '''
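        # write path: content lookup with the write key -> usage update -> allocation
        # -> write weighting -> erase/write on M -> temporal linkage -> precedence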
        c_w = self.content_addressing(state['M'], zeta['kappa_w'], zeta['beta_w'])
        next_u = self.update_u(state['u'], state['w_r'], state['w_w'], zeta['f'])

        a = self.get_allocation(next_u)
        next_w_w = self.update_w_w(c_w, a, zeta['g_w'], zeta['g_a'])
        next_M = self.update_M(state['M'], next_w_w, zeta['v'], zeta['e'])
        next_L = self.update_L(state['p'], state['L'], next_w_w)
        next_p = self.update_p(state['p'], next_w_w)

        return next_u, next_w_w, next_M, next_L, next_p

    def read(self, zeta, state):
        '''
        Performs a read action on the DNC's memory
        Parameters:
        ----------
        zeta: dict
            variable names (string) mapping to tensors (Tensor) includes:
            'kappa_r': (batch_size, W, R) read keys (there are R of them)
            'beta_r': (batch_size, R) read strengths
            'kappa_w': (batch_size, W, 1) write key
            'beta_w': (batch_size, 1) write strength
            'e': (batch_size, W) erase vector
            'v': (batch_size, W) write vector
            'f': (batch_size, R) free gates (R of them)
            'g_a': (batch_size, 1) allocation gate
            'g_w': (batch_size, 1) write gate
            'pi': (batch_size, 3, R) read modes (backward, content, forward)
            ... see paper for more info
        state: dict
            contains current values for (M, u, p, L, w_w, w_r, r) respectively. According to the DNC paper:
            M: (batch_size, N, W) the memory matrix
            u: (batch_size, N) the usage vector
            p: (batch_size, N) the precedence weighting (helps update L)
            L: (batch_size, N, N) the temporal linkage matrix (helps DNC remember what order things were written)
            w_w: (batch_size, N) the write weighting - says where the DNC wrote a word last time step
            w_r: (batch_size, N, R) the read weightings - say which word vectors the DNC accessed last time step
        Returns: Tuple(2)
            next_w_r: Tensor
            next_r: Tensor
        '''
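        # read path: content lookup with the read keys -> forward/backward weightings from L
        # -> blend the three modes with pi -> read the new vectors out of M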
        c_w = self.content_addressing(state['M'], zeta['kappa_r'], zeta['beta_r'])
        f_w, b_w = self.get_bf_w(state['w_r'], state['L'])
        next_w_r = self.update_w_r(c_w, f_w, b_w, zeta['pi'])
        next_r = self.update_r(state['M'], next_w_r)
        return next_w_r, next_r

    def step(self, zeta, state):
        '''
        Combines the read and write operations into a single memory update step.
        '''
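        # the write is applied first so that the read below sees the updated memory and linkage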
        state['u'], state['w_w'], state['M'], state['L'], state['p'] = self.write(zeta, state)
        state['w_r'], state['r'] = self.read(zeta, state)
        return state
BIN
dnc/memory.pyc
Normal file
Binary file not shown.