From faaa6a1adccf5c7f3e59c82e908f7c590c1945d1 Mon Sep 17 00:00:00 2001 From: bentrevett Date: Tue, 2 Apr 2019 13:36:14 +0100 Subject: [PATCH] added appendix c - handling embeddings --- ...ding, Saving and Freezing Embeddings.ipynb | 969 ++++++++++++++++++ custom_embeddings/embeddings.txt | 7 + 2 files changed, 976 insertions(+) create mode 100644 C - Loading, Saving and Freezing Embeddings.ipynb create mode 100644 custom_embeddings/embeddings.txt diff --git a/C - Loading, Saving and Freezing Embeddings.ipynb b/C - Loading, Saving and Freezing Embeddings.ipynb new file mode 100644 index 0000000..ce6da35 --- /dev/null +++ b/C - Loading, Saving and Freezing Embeddings.ipynb @@ -0,0 +1,969 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# C - Loading, Saving and Freezing Embeddings\n", + "\n", + "This notebook will cover: how to load custom word embeddings in TorchText, how to save all the embeddings we learn during training, and how to freeze/unfreeze embeddings during training. \n", + "\n", + "## Loading Custom Embeddings\n", + "\n", + "First, let's look at loading a custom set of embeddings.\n", + "\n", + "Your embeddings need to be formatted so each line starts with the word followed by the values of the embedding vector, all space-separated. All vectors need to have the same number of elements.\n", + "\n", + "Let's look at the custom embeddings provided by these tutorials. These are 20-dimensional embeddings for 7 words." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "good 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n", + "great 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n", + "awesome 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n", + "bad -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0\n", + "terrible -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0\n", + "awful -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0\n", + "kwyjibo 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5\n", + "\n" + ] + } + ], + "source": [ + "with open('custom_embeddings/embeddings.txt', 'r') as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's set up the fields." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torchtext import data\n", + "\n", + "SEED = 1234\n", + "\n", + "torch.manual_seed(SEED)\n", + "torch.cuda.manual_seed(SEED)\n", + "torch.backends.cudnn.deterministic = True\n", + "\n", + "TEXT = data.Field(tokenize = 'spacy')\n", + "LABEL = data.LabelField(dtype = torch.float)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, we'll load our dataset and create the validation set."
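Before loading the dataset, one practical aside on the embedding file format described above: if you build such a file yourself, it is worth confirming that every row contains the same number of values before handing it to TorchText. A minimal sketch of that check, using the same `custom_embeddings/embeddings.txt` path as above:

```python
# each line should be "word v1 v2 ... vN", with the same N on every line
with open('custom_embeddings/embeddings.txt', 'r') as f:
    rows = [line.split() for line in f if line.strip()]

# number of values per word, collected across all rows
dims = {len(row) - 1 for row in rows}

assert len(dims) == 1, f'inconsistent vector sizes found: {dims}'
print(f'{len(rows)} words, {dims.pop()} dimensions each')
```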
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from torchtext import datasets\n", + "import random\n", + "\n", + "train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n", + "\n", + "train_data, valid_data = train_data.split(random_state = random.seed(SEED))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#for testing\n", + "import os\n", + "if os.path.exists('custom_embeddings/embeddings.txt.pt'):\n", + " os.remove('custom_embeddings/embeddings.txt.pt')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can only load our custom embeddings after they have been turned into a `Vectors` object.\n", + "\n", + "We create a `Vector` object by passing it the location of the embeddings (`name`), a location for the cached embeddings (`cache`) and a function to initialize tokens in our dataset that aren't within our embeddings (`unk_init`). As have done in previous notebooks, we have initialized these to $\\mathcal{N}(0,1)$." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/7 [00:00= FREEZE_FOR:\n", + " #unfreeze embeddings\n", + " model.embedding.weight.requires_grad = unfrozen = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another option would be to unfreeze the embeddings whenever the validation loss stops increasing using the following code snippet instead of the `FREEZE_FOR` condition:\n", + " \n", + "```python\n", + "if valid_loss < best_valid_loss:\n", + " best_valid_loss = valid_loss\n", + " torch.save(model.state_dict(), 'tutC-model.pt')\n", + "else:\n", + " #unfreeze embeddings\n", + " model.embedding.weight.requires_grad = unfrozen = True\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Loss: 0.391 | Test Acc: 82.76%\n" + ] + } + ], + "source": [ + "model.load_state_dict(torch.load('tutC-model.pt'))\n", + "\n", + "test_loss, test_acc = evaluate(model, test_iterator, criterion)\n", + "\n", + "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving Embeddings\n", + "\n", + "We might want to re-use the embeddings we have trained here with another model. To do this, we'll write a function that will loop through our vocabulary, getting the word and embedding for each word, writing them to a text file in the same format as our custom embeddings so they can be used with TorchText again.\n", + "\n", + "Currently, TorchText Vectors seem to have issues with loading certain unicode words, so we skip these by only writing words without unicode symbols. If you know a better solution to this then let me know!" 
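The check used in the function below is `len(word) != len(word.encode())`: `str.encode()` (UTF-8 by default) produces exactly one byte per character for plain ASCII text, while any non-ASCII character encodes to more than one byte, so the two lengths differ exactly when the word contains non-ASCII symbols. A quick illustration, with made-up example words:

```python
# words containing non-ASCII characters encode to more bytes than characters
for word in ['film', 'café', 'naïve']:
    skipped = len(word) != len(word.encode())
    print(f'{word}: {len(word)} chars, {len(word.encode())} bytes, skipped = {skipped}')

# film: 4 chars, 4 bytes, skipped = False
# café: 4 chars, 5 bytes, skipped = True
# naïve: 5 chars, 6 bytes, skipped = True
```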
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "\n", + "def write_embeddings(path, embeddings, vocab):\n", + " \n", + " with open(path, 'w') as f:\n", + " for i, embedding in enumerate(tqdm(embeddings)):\n", + " word = vocab.itos[i]\n", + " #skip words with unicode symbols\n", + " if len(word) != len(word.encode()):\n", + " continue\n", + " vector = ' '.join([str(i) for i in embedding.tolist()])\n", + " f.write(f'{word} {vector}\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll write our embeddings to `trained_embeddings.txt`." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 25002/25002 [00:00<00:00, 36248.19it/s]\n" + ] + } + ], + "source": [ + "write_embeddings('custom_embeddings/trained_embeddings.txt', \n", + " model.embedding.weight.data, \n", + " TEXT.vocab)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To double check they've written correctly, we can load them as `Vectors`." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "trained_embeddings = vocab.Vectors(name = 'custom_embeddings/trained_embeddings.txt',\n", + " cache = 'custom_embeddings',\n", + " unk_init = torch.Tensor.normal_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's print out the first 5 rows of our loaded vectors and the same from our model's embeddings weights, checking they are the same values." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[-0.1781, -0.1351, 0.3118, -0.0431, 0.0838, -0.1283, -0.3501, -0.0963,\n", + " -0.0776, 0.2584, 0.1862, -0.1307, 0.1398, -0.1115, -0.1089, -0.0303,\n", + " -0.2470, 0.0454, -0.0914, -0.0321],\n", + " [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n", + " 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n", + " 0.0000, 0.0000, 0.0000, 0.0000],\n", + " [-0.3360, -0.4653, 0.7737, -0.4801, -0.2842, 0.0076, -1.6847, -0.2146,\n", + " 0.5205, 1.5765, -0.2281, -0.3088, -0.4192, -0.6686, 0.5931, 0.5176,\n", + " -0.8591, 0.0511, -1.4832, 1.1515],\n", + " [-0.5345, -0.0592, 0.0321, -0.7182, -0.2356, -0.4067, -1.6396, 0.9629,\n", + " 1.0316, 0.3874, -0.1957, -1.4063, -1.9366, 0.6403, -0.5497, -0.5975,\n", + " 0.4083, -0.3779, 0.8068, 1.0360],\n", + " [-0.6012, 0.0445, 0.4258, -1.6681, 0.1686, 0.2056, -0.0124, 0.7813,\n", + " -0.0130, 0.4047, 0.3978, -0.5975, -0.0997, -0.1973, -0.7313, -0.0873,\n", + " 1.4011, 1.5026, 0.7237, 0.1266]])\n" + ] + } + ], + "source": [ + "print(trained_embeddings.vectors[:5])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[-0.1781, -0.1351, 0.3118, -0.0431, 0.0838, -0.1283, -0.3501, -0.0963,\n", + " -0.0776, 0.2584, 0.1862, -0.1307, 0.1398, -0.1115, -0.1089, -0.0303,\n", + " -0.2470, 0.0454, -0.0914, -0.0321],\n", + " [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n", + " 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n", + " 0.0000, 0.0000, 0.0000, 0.0000],\n", + " [-0.3360, -0.4653, 0.7737, -0.4801, -0.2842, 0.0076, -1.6847, -0.2146,\n", + " 0.5205, 1.5765, 
-0.2281, -0.3088, -0.4192, -0.6686, 0.5931, 0.5176,\n", + " -0.8591, 0.0511, -1.4832, 1.1515],\n", + " [-0.5345, -0.0592, 0.0321, -0.7182, -0.2356, -0.4067, -1.6396, 0.9629,\n", + " 1.0316, 0.3874, -0.1957, -1.4063, -1.9366, 0.6403, -0.5497, -0.5975,\n", + " 0.4083, -0.3779, 0.8068, 1.0360],\n", + " [-0.6012, 0.0445, 0.4258, -1.6681, 0.1686, 0.2056, -0.0124, 0.7813,\n", + " -0.0130, 0.4047, 0.3978, -0.5975, -0.0997, -0.1973, -0.7313, -0.0873,\n", + " 1.4011, 1.5026, 0.7237, 0.1266]], device='cuda:0')\n" + ] + } + ], + "source": [ + "print(model.embedding.weight.data[:5])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All looks good! The only difference between the two is the removal of the ~50 words in the vocabulary that contain unicode symbols." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/custom_embeddings/embeddings.txt b/custom_embeddings/embeddings.txt new file mode 100644 index 0000000..a99eb1b --- /dev/null +++ b/custom_embeddings/embeddings.txt @@ -0,0 +1,7 @@ +good 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 +great 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 +awesome 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 +bad -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 +terrible -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 +awful -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 +kwyjibo 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5
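A closing note on re-use: `trained_embeddings.txt` is written in the same word-then-values format as the `embeddings.txt` file above, so it can be loaded for any later model exactly the way the custom embeddings were loaded at the start of this appendix. A minimal sketch, assuming the file has already been written by `write_embeddings` (the stand-alone `nn.Embedding` here is just illustrative, not a model from the notebook):

```python
import torch
import torch.nn as nn
from torchtext import vocab

# load the saved embeddings back into a Vectors object
trained = vocab.Vectors(name = 'custom_embeddings/trained_embeddings.txt',
                        cache = 'custom_embeddings',
                        unk_init = torch.Tensor.normal_)

# copy them into a fresh embedding layer for a new model
num_words, emb_dim = trained.vectors.shape
new_embedding = nn.Embedding(num_words, emb_dim)
new_embedding.weight.data.copy_(trained.vectors)
```

When the new model is also built with TorchText, the more usual route is to pass these vectors to the new field's `build_vocab` call and then copy the resulting `vocab.vectors` into the model's embedding layer, mirroring what was done with the custom embeddings earlier in this appendix.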