From f61390d33120b02d838a04b67eb6157974b4aef3 Mon Sep 17 00:00:00 2001 From: bentrevett Date: Fri, 22 Jun 2018 15:12:04 +0100 Subject: [PATCH] + code complete for appendix B --- B - A Closer Look at Word Embeddings.ipynb | 652 +++++++++++++++++++++ 1 file changed, 652 insertions(+) create mode 100644 B - A Closer Look at Word Embeddings.ipynb diff --git a/B - A Closer Look at Word Embeddings.ipynb b/B - A Closer Look at Word Embeddings.ipynb new file mode 100644 index 0000000..06346f4 --- /dev/null +++ b/B - A Closer Look at Word Embeddings.ipynb @@ -0,0 +1,652 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 400000 words in the vocabulary\n" + ] + } + ], + "source": [ + "import torchtext.vocab\n", + "\n", + "glove = torchtext.vocab.GloVe(name='6B', dim=100)\n", + "\n", + "print(f'There are {len(glove.itos)} words in the vocabulary')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([400000, 100])" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glove.vectors.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '\"', \"'s\"]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glove.itos[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glove.stoi['the']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glove.vectors[glove.stoi['the']].shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "words not in vocab throw an error." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get a vector from a word:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def get_vector(embeddings, word):\n", + " assert word in embeddings.stoi, f'{word} not in vocab!'\n", + " return embeddings.vectors[embeddings.stoi[word]]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_vector(glove, 'the').shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "def closest_words(embeddings, vector, n=10):\n", + " distances = [(w, torch.dist(vector, get_vector(embeddings, w)).item()) for w in embeddings.itos]\n", + " return sorted(distances, key = lambda w: w[1])[:n]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('japan', 0.0),\n", + " ('japanese', 4.091249465942383),\n", + " ('korea', 4.551243782043457),\n", + " ('tokyo', 4.565995216369629),\n", + " ('china', 4.857661247253418),\n", + " ('thailand', 5.292530536651611),\n", + " ('indonesia', 5.313706874847412),\n", + " ('philippines', 5.3697509765625),\n", + " ('asia', 5.389328479766846),\n", + " ('vietnam', 5.42373514175415)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "closest_words(glove, get_vector(glove, 
'japan'))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def print_tuples(tuples):\n", + " for w, d in tuples:\n", + " print(f'({d:02.04f}) {w}') " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0.0000) japan\n", + "(4.0912) japanese\n", + "(4.5512) korea\n", + "(4.5660) tokyo\n", + "(4.8577) china\n", + "(5.2925) thailand\n", + "(5.3137) indonesia\n", + "(5.3698) philippines\n", + "(5.3893) asia\n", + "(5.4237) vietnam\n" + ] + } + ], + "source": [ + "print_tuples(closest_words(glove, get_vector(glove, 'japan')))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def analogy(embeddings, word1, word2, word3, n=5):\n", + " \n", + " candidate_words = closest_words(embeddings, get_vector(embeddings, word2) - get_vector(embeddings, word1) + get_vector(embeddings, word3), n+3)\n", + " \n", + " candidate_words = [x for x in candidate_words if x[0] not in [word1, word2, word3]][:n]\n", + " \n", + " print(f'\\n{word1} is to {word2} as {word3} is to...')\n", + " \n", + " return candidate_words" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "man is to king as woman is to...\n", + "(4.0811) queen\n", + "(4.6429) monarch\n", + "(4.9055) throne\n", + "(4.9216) elizabeth\n", + "(4.9811) prince\n" + ] + } + ], + "source": [ + "print_tuples(analogy(glove, 'man', 'king', 'woman'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can think of vector('King') - vector('Man') as a \"royalty vector\", thus when you add this \"royalty vector\" to woman, you get queen. If you add it to \"boy\" you should get \"prince\" and if you add it to \"girl\" you should get \"princess\". 
Let's test:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "man is to king as boy is to...\n", + "(5.3084) queen\n", + "(5.4616) prince\n", + "(5.5430) uncle\n", + "(5.6069) brother\n", + "(5.6418) son\n", + "\n", + "man is to king as girl is to...\n", + "(4.6916) queen\n", + "(5.3437) princess\n", + "(5.4683) prince\n", + "(5.5591) daughter\n", + "(5.5735) sister\n" + ] + } + ], + "source": [ + "print_tuples(analogy(glove, 'man', 'king', 'boy'))\n", + "print_tuples(analogy(glove, 'man', 'king', 'girl'))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "man is to actor as woman is to...\n", + "(2.8133) actress\n", + "(5.0039) comedian\n", + "(5.1399) actresses\n", + "(5.2773) starred\n", + "(5.3085) screenwriter\n", + "\n", + "cat is to kitten as dog is to...\n", + "(3.8146) puppy\n", + "(4.2944) rottweiler\n", + "(4.5888) puppies\n", + "(4.6086) pooch\n", + "(4.6520) pug\n", + "\n", + "dog is to puppy as cat is to...\n", + "(3.8146) kitten\n", + "(4.0255) puppies\n", + "(4.1575) kittens\n", + "(4.1882) pterodactyl\n", + "(4.1945) scaredy\n", + "\n", + "russia is to moscow as france is to...\n", + "(3.2697) paris\n", + "(4.6857) french\n", + "(4.7085) lyon\n", + "(4.9087) strasbourg\n", + "(5.0362) marseille\n", + "\n", + "obama is to president as trump is to...\n", + "(6.4302) executive\n", + "(6.5149) founder\n", + "(6.6997) ceo\n", + "(6.7524) hilton\n", + "(6.7729) walt\n", + "\n", + "rich is to mansion as poor is to...\n", + "(5.8262) residence\n", + "(5.9444) riverside\n", + "(6.0283) hillside\n", + "(6.0328) abandoned\n", + "(6.0681) bungalow\n", + "\n", + "elvis is to rock as eminem is to...\n", + "(5.6597) rap\n", + "(6.2057) rappers\n", + "(6.2161) rapper\n", + "(6.2444) punk\n", + "(6.2690) hop\n", + "\n", + "paper is 
to newspaper as screen is to...\n", + "(4.7810) tv\n", + "(5.1049) television\n", + "(5.3818) cinema\n", + "(5.5524) feature\n", + "(5.5646) shows\n", + "\n", + "monet is to paint as michelangelo is to...\n", + "(6.0782) plaster\n", + "(6.3768) mold\n", + "(6.3922) tile\n", + "(6.5819) marble\n", + "(6.6524) image\n", + "\n", + "beer is to barley as wine is to...\n", + "(5.6021) grape\n", + "(5.6760) beans\n", + "(5.8174) grapes\n", + "(5.9035) lentils\n", + "(5.9454) figs\n", + "\n", + "earth is to moon as sun is to...\n", + "(6.2294) lee\n", + "(6.4125) kang\n", + "(6.4644) tan\n", + "(6.4757) yang\n", + "(6.4853) lin\n", + "\n", + "house is to roof as castle is to...\n", + "(6.2919) stonework\n", + "(6.3779) masonry\n", + "(6.4773) canopy\n", + "(6.4954) fortress\n", + "(6.5259) battlements\n", + "\n", + "building is to architect as software is to...\n", + "(5.8369) programmer\n", + "(6.8881) entrepreneur\n", + "(6.9240) inventor\n", + "(6.9730) developer\n", + "(6.9949) innovator\n", + "\n", + "boston is to bruins as phoenix is to...\n", + "(3.8546) suns\n", + "(4.1968) mavericks\n", + "(4.6126) coyotes\n", + "(4.6894) mavs\n", + "(4.6971) knicks\n", + "\n", + "good is to heaven as bad is to...\n", + "(4.3959) hell\n", + "(5.2864) ghosts\n", + "(5.2898) hades\n", + "(5.3414) madness\n", + "(5.3520) purgatory\n", + "\n", + "jordan is to basketball as woods is to...\n", + "(5.8607) golf\n", + "(6.4110) golfers\n", + "(6.4418) tournament\n", + "(6.4592) tennis\n", + "(6.6560) collegiate\n" + ] + } + ], + "source": [ + "print_tuples(analogy(glove, 'man', 'actor', 'woman'))\n", + "print_tuples(analogy(glove, 'cat', 'kitten', 'dog'))\n", + "print_tuples(analogy(glove, 'dog', 'puppy', 'cat'))\n", + "print_tuples(analogy(glove, 'russia', 'moscow', 'france'))\n", + "print_tuples(analogy(glove, 'obama', 'president', 'trump'))\n", + "print_tuples(analogy(glove, 'rich', 'mansion', 'poor'))\n", + "print_tuples(analogy(glove, 'elvis', 'rock', 'eminem'))\n", + 
"print_tuples(analogy(glove, 'paper', 'newspaper', 'screen'))\n", + "print_tuples(analogy(glove, 'monet', 'paint', 'michelangelo'))\n", + "print_tuples(analogy(glove, 'beer', 'barley', 'wine'))\n", + "print_tuples(analogy(glove, 'earth', 'moon', 'sun'))\n", + "print_tuples(analogy(glove, 'house', 'roof', 'castle'))\n", + "print_tuples(analogy(glove, 'building', 'architect', 'software'))\n", + "print_tuples(analogy(glove, 'boston', 'bruins', 'phoenix'))\n", + "print_tuples(analogy(glove, 'good', 'heaven', 'bad'))\n", + "print_tuples(analogy(glove, 'jordan', 'basketball', 'woods'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "http://forums.fast.ai/t/nlp-any-libraries-dictionaries-out-there-for-fixing-common-spelling-errors/16411" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "glove = torchtext.vocab.GloVe(name='840B', dim=300)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0.0000) relieable\n", + "(5.0366) relyable\n", + "(5.2610) realible\n", + "(5.4719) realiable\n", + "(5.5402) relable\n", + "(5.5917) relaible\n", + "(5.6412) reliabe\n", + "(5.8802) relaiable\n", + "(5.9593) stabel\n", + "(5.9981) consitant\n" + ] + } + ], + "source": [ + "print_tuples(closest_words(glove, get_vector(glove, 'relieable')))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "reliable_vector = get_vector(glove, 'reliable')\n", + "\n", + "reliable_misspellings = ['relieable', 'relyable', 'realible', 'realiable', 'relable', 'relaible', 'reliabe', 'relaiable']\n", + "\n", + "diff_reliable = [(reliable_vector - get_vector(glove, s)).unsqueeze(0) for s in reliable_misspellings]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "misspelling_vector = 
torch.cat(diff_reliable, dim=0).mean(dim=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(6.1090) because\n", + "(6.4250) even\n", + "(6.4358) fact\n", + "(6.4914) sure\n", + "(6.5094) though\n", + "(6.5601) obviously\n", + "(6.5682) reason\n", + "(6.5856) if\n", + "(6.6099) but\n", + "(6.6415) why\n" + ] + } + ], + "source": [ + "#misspelling of because\n", + "\n", + "print_tuples(closest_words(glove, get_vector(glove, 'becuase') + misspelling_vector))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(5.4070) definitely\n", + "(5.5643) certainly\n", + "(5.7192) sure\n", + "(5.8152) well\n", + "(5.8588) always\n", + "(5.8812) also\n", + "(5.9557) simply\n", + "(5.9667) consider\n", + "(5.9821) probably\n", + "(5.9948) definately\n" + ] + } + ], + "source": [ + "#misspelling of definitely\n", + "\n", + "print_tuples(closest_words(glove, get_vector(glove, 'defintiely') + misspelling_vector))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(5.9641) consistent\n", + "(6.3674) reliable\n", + "(7.0195) consistant\n", + "(7.0299) consistently\n", + "(7.1605) accurate\n", + "(7.2737) fairly\n", + "(7.3037) good\n", + "(7.3520) reasonable\n", + "(7.3801) dependable\n", + "(7.4027) ensure\n" + ] + } + ], + "source": [ + "#misspelling of consistent\n", + "\n", + "print_tuples(closest_words(glove, get_vector(glove, 'consistant') + misspelling_vector))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(6.6117) package\n", + "(6.9315) packages\n", + "(7.0195) pakage\n", + "(7.0911) comes\n", + "(7.1241) provide\n", + "(7.1469) 
offer\n", + "(7.1861) reliable\n", + "(7.2431) well\n", + "(7.2434) choice\n", + "(7.2453) offering\n" + ] + } + ], + "source": [ + "#misspelling of package\n", + "\n", + "print_tuples(closest_words(glove, get_vector(glove, 'pakage') + misspelling_vector))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}