{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lIYdn1woOS1n", "outputId": "cece5524-0d94-4cc4-b260-23e2f0ecc744" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "Y2upWg_Qvax1" }, "outputs": [], "source": [ "import functools\n", "\n", "import datasets\n", "\n", "import torchtext\n", "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ZIVeVqVUvdcK", "outputId": "db0dbf36-4a75-4d30-bcef-8cb52b7a5b30" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b5304f46c35d4fe6985cf45389babfda", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/1.92k [00:00', '']\n", "\n", "vocab = torchtext.vocab.build_vocab_from_iterator(train_data['tokens'],\n", " min_freq=min_freq,\n", " specials=special_tokens)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rbroBAClxXGB", "outputId": "91c5da92-7f97-4ad8-a946-6de1899e64a2" }, "outputs": [ { "data": { "text/plain": [ "35341" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(vocab)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3bKHqCxPyQSb", "outputId": "35b3c437-f0f8-43fb-8f10-b968c597d5b4" }, "outputs": [ { "data": { "text/plain": [ "['', '', 'the', '.', ',', 'and', 'a', 'of', 'to', \"'\"]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab.get_itos()[:10]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uStvd2szyUGR", "outputId": "d4c9d2a4-86a9-413f-9200-5f1a27ece925" }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unk_index = vocab['']\n", "\n", "unk_index" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gd5R8NCJyws4", "outputId": "46666a6d-56ff-42e3-ebe2-6f87930270c7" }, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pad_index = vocab['']\n", "\n", "pad_index" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "_syj_YR8yp7B" }, "outputs": [], "source": [ "vocab.set_default_index(unk_index)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "ENlE1eAM0lHe" }, "outputs": [], "source": [ "def numericalize_data(example, vocab):\n", " ids = {'ids': [vocab[token] for token in example['tokens']]}\n", " return ids" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 164, "referenced_widgets": [ "ab96a5a035b140aa9958ada5eeaa8edc", "e393253753984998b533c18502dd4477", "9c6b3d3c5c5a4c0b940c0aec3d833f58", "a24b5ac5e18b48d48467217c8ea67463", "b699f48ad80442d38481d4f960cf23a3", "707426cd59454f53b955a2ed9e46b90e", "70a2943fbcd3495aac9fae93886789dc", "38e9f642f0714b95ba51da7cafbd1ff2", "852f269e78204293afba0e452e43eba2", "c87dea49f30b471d9128b2b2082adf89", "a26e53975a1142828b8425895847b47b", "6c44fd275fa34a2ab02b16519e48aca7", "fdc8037ddba74cb7993f6f986e491894", "398dcca3803c46beb2854bd52d66e322", "f17e77bdd542441caf8d1d427c360601", "0cf052dd17f54df19b0e5de1891ec1a4", "32aa00f0d71b4655880a19ad08b9f02d", "3c20326b78dc4e458bacb85742002a9a", "96e6499f54c348628bc33ff48fd1ee6a", "d52ef97893144dffa64d1da2b8ea52e6", "d0832872ea074689b8c7e3b8b647e68d", "b7fc360206ca42f5b51eacb4f1de763d", "eeac1f30dad64508ba7f1a9473f390bd", "958e2685a018455b9ea02803ca96b464" ] }, "id": "ux_YLzDA069-", "outputId": "11d67399-ace5-49ed-8327-0f85de3386a6" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "416782d88fee4bed84a1a08c59f4e549", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/18750 [00:00