69fc4b0c69
- ignore trying to keep things as generators
1524 lines
44 KiB
Plaintext
1524 lines
44 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 228
|
|
},
|
|
"colab_type": "code",
|
|
"id": "-V90fMxJdFl7",
|
|
"outputId": "2bbc3f28-84e3-47bd-97a2-ea0c2f0cf395"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import torch\n",
|
|
"import torch.nn as nn\n",
|
|
"import torch.optim as optim\n",
|
|
"\n",
|
|
"import torchtext\n",
|
|
"import torchtext.experimental\n",
|
|
"import torchtext.experimental.vectors\n",
|
|
"from torchtext.experimental.datasets.raw.text_classification import RawTextIterableDataset\n",
|
|
"from torchtext.experimental.datasets.text_classification import TextClassificationDataset\n",
|
|
"from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor\n",
|
|
"\n",
|
|
"import collections\n",
|
|
"import random\n",
|
|
"import time"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "tOO7b-Z1dFmA"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"seed = 1234\n",
|
|
"\n",
|
|
"torch.manual_seed(seed)\n",
|
|
"random.seed(seed)\n",
|
|
"torch.backends.cudnn.deterministic = True\n",
|
|
"torch.backends.cudnn.benchmark = False"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "FhBXG95YdFmD"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"raw_train_data, raw_test_data = torchtext.experimental.datasets.raw.IMDB()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"raw_train_data = list(raw_train_data)\n",
|
|
"raw_test_data = list(raw_test_data)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"('neg',\n",
|
|
" 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered \"controversial\" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it\\'s not shot like some cheaply made porno. While my countrymen mind find it shocking, in reality sex and nudity are a major staple in Swedish cinema. Even Ingmar Bergman, arguably their answer to good old boy John Ford, had sex scenes in his films.<br /><br />I do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in America. I AM CURIOUS-YELLOW is a good film for anyone wanting to study the meat and potatoes (no pun intended) of Swedish cinema. But really, this film doesn\\'t have much of a plot.')"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"raw_train_data[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"('neg',\n",
|
|
" 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\\'t match the background, and painfully one-dimensional characters cannot be overcome with a \\'sci-fi\\' setting. (I\\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\\'s not. It\\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\\'s rubbish as they have to always say \"Gene Roddenberry\\'s Earth...\" otherwise people would not continue watching. Roddenberry\\'s ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.')"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"raw_test_data[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Number of training examples: 25,000\n",
|
|
"Number of testing examples: 25,000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(f'Number of training examples: {len(raw_train_data):,}')\n",
|
|
"print(f'Number of testing examples: {len(raw_test_data):,}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "rOTczrIEdFmY"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_train_valid_split(raw_train_data, split_ratio = 0.7):\n",
|
|
" \n",
|
|
" random.shuffle(raw_train_data)\n",
|
|
" \n",
|
|
" n_train_examples = int(len(raw_train_data) * split_ratio)\n",
|
|
" \n",
|
|
" train_data = raw_train_data[:n_train_examples]\n",
|
|
" valid_data = raw_train_data[n_train_examples:]\n",
|
|
" \n",
|
|
" return train_data, valid_data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "C6Tp4CyQdFma"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"raw_train_data, raw_valid_data = get_train_valid_split(raw_train_data)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Number of training examples: 17,500\n",
|
|
"Number of validation examples: 7,500\n",
|
|
"Number of testing examples: 25,000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(f'Number of training examples: {len(raw_train_data):,}')\n",
|
|
"print(f'Number of validation examples: {len(raw_valid_data):,}')\n",
|
|
"print(f'Number of testing examples: {len(raw_test_data):,}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "LTJjCocRdFmh"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"class Tokenizer:\n",
|
|
" def __init__(self, tokenize_fn = 'basic_english', lower = True, max_length = None):\n",
|
|
" \n",
|
|
" self.tokenize_fn = torchtext.data.utils.get_tokenizer(tokenize_fn)\n",
|
|
" self.lower = lower\n",
|
|
" self.max_length = max_length\n",
|
|
" \n",
|
|
" def tokenize(self, s):\n",
|
|
" \n",
|
|
" tokens = self.tokenize_fn(s)\n",
|
|
" \n",
|
|
" if self.lower:\n",
|
|
" tokens = [token.lower() for token in tokens]\n",
|
|
" \n",
|
|
" if self.max_length is not None:\n",
|
|
" tokens = tokens[:self.max_length]\n",
|
|
" \n",
|
|
" return tokens"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "5P2KumuDdFmj"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"max_length = 250\n",
|
|
"\n",
|
|
"tokenizer = Tokenizer(max_length = max_length)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "V1albCvadFmm",
|
|
"outputId": "5c7c30f2-c6b7-4098-990d-7bfcdc2446f1"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['this', 'film', 'is', 'terrible', '.', 'i', 'hate', 'it', 'and', 'it', \"'\", 's', 'bad', '!']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"s = \"this film is terrible. i hate it and it's bad!\"\n",
|
|
"\n",
|
|
"print(tokenizer.tokenize(s))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "anC7_ViodFmp"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def build_vocab_from_data(raw_data, tokenizer, **vocab_kwargs):\n",
|
|
" \n",
|
|
" token_freqs = collections.Counter()\n",
|
|
" \n",
|
|
" for label, text in raw_data:\n",
|
|
" tokens = tokenizer.tokenize(text)\n",
|
|
" token_freqs.update(tokens)\n",
|
|
" \n",
|
|
" vocab = torchtext.vocab.Vocab(token_freqs, **vocab_kwargs)\n",
|
|
" \n",
|
|
" return vocab"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "rgHPS1xzdFmt"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"max_size = 25_000\n",
|
|
"\n",
|
|
"vocab = build_vocab_from_data(raw_train_data, tokenizer, max_size = max_size)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Unique tokens in vocab: 25,002\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(f'Unique tokens in vocab: {len(vocab):,}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 364
|
|
},
|
|
"colab_type": "code",
|
|
"id": "PsRQLrlddFmw",
|
|
"outputId": "5357c17c-b0ba-429d-b675-aa3fd9c39b72"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[('the', 165322),\n",
|
|
" ('.', 164239),\n",
|
|
" (',', 133647),\n",
|
|
" ('a', 81952),\n",
|
|
" ('and', 80334),\n",
|
|
" ('of', 71820),\n",
|
|
" ('to', 65662),\n",
|
|
" (\"'\", 64249),\n",
|
|
" ('is', 53598),\n",
|
|
" ('it', 49589),\n",
|
|
" ('i', 48810),\n",
|
|
" ('in', 45611),\n",
|
|
" ('this', 40868),\n",
|
|
" ('that', 35609),\n",
|
|
" ('s', 29273),\n",
|
|
" ('was', 26159),\n",
|
|
" ('movie', 24543),\n",
|
|
" ('as', 22276),\n",
|
|
" ('with', 21494),\n",
|
|
" ('for', 21332)]"
|
|
]
|
|
},
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"vocab.freqs.most_common(20)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "FGS5tZKmdFmy",
|
|
"outputId": "5304c151-6696-4d2e-bd4e-ac9cfb2e3f23"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['<unk>', '<pad>', 'the', '.', ',', 'a', 'and', 'of', 'to', \"'\"]"
|
|
]
|
|
},
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"vocab.itos[:10]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "izsoXluedFm3",
|
|
"outputId": "1ab77cea-612b-4d86-cca3-5273f0964fbe"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"2"
|
|
]
|
|
},
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"vocab.stoi['the']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "xiW0GItTdFm6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def raw_data_to_dataset(raw_data, tokenizer, vocab):\n",
|
|
" \n",
|
|
" text_transform = sequential_transforms(tokenizer.tokenize,\n",
|
|
" vocab_func(vocab),\n",
|
|
" totensor(dtype=torch.long))\n",
|
|
" \n",
|
|
" label_transform = sequential_transforms(lambda x: 1 if x == 'pos' else 0, \n",
|
|
" totensor(dtype=torch.long))\n",
|
|
"\n",
|
|
" transforms = (label_transform, text_transform)\n",
|
|
"\n",
|
|
" dataset = TextClassificationDataset(raw_data,\n",
|
|
" vocab,\n",
|
|
" transforms)\n",
|
|
" \n",
|
|
" return dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "LCslagnudFm8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_data = raw_data_to_dataset(raw_train_data, tokenizer, vocab)\n",
|
|
"valid_data = raw_data_to_dataset(raw_valid_data, tokenizer, vocab)\n",
|
|
"test_data = raw_data_to_dataset(raw_test_data, tokenizer, vocab)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Number of training examples: 17,500\n",
|
|
"Number of validation examples: 7,500\n",
|
|
"Number of testing examples: 25,000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(f'Number of training examples: {len(train_data):,}')\n",
|
|
"print(f'Number of validation examples: {len(valid_data):,}')\n",
|
|
"print(f'Number of testing examples: {len(test_data):,}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 416
|
|
},
|
|
"colab_type": "code",
|
|
"id": "FDsGUUeydFm_",
|
|
"outputId": "848655ba-b5b2-4307-ca5b-a827200fdef2"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tensor([ 12, 121, 1013, 6, 219, 1855, 8, 276, 70, 20,\n",
|
|
" 5, 177, 3, 1013, 0, 30, 541, 0, 4, 15259,\n",
|
|
" 6, 7022, 3, 12, 751, 8, 45, 14, 4, 12,\n",
|
|
" 69, 123, 4, 22, 11, 10, 8, 56, 241, 1013,\n",
|
|
" 19, 12534, 563, 10, 8, 338, 1803, 25, 2, 196,\n",
|
|
" 24, 3, 717, 0, 4, 745, 3428, 686, 4, 4315,\n",
|
|
" 3437, 4, 4258, 15, 170, 9, 28, 1209, 2, 951,\n",
|
|
" 4, 6, 2005, 5083, 113, 544, 35, 2957, 20, 5,\n",
|
|
" 9, 1013, 9, 925, 3, 25, 12, 9, 145, 255,\n",
|
|
" 46, 30, 160, 7, 26, 54, 46, 42, 107, 12534,\n",
|
|
" 563, 10, 56, 1013, 241, 3, 11, 9, 16, 29,\n",
|
|
" 3, 11, 9, 16, 2966, 6, 8018, 3, 24, 143,\n",
|
|
" 199, 773, 249, 45, 1364, 6, 120, 893, 4, 1013,\n",
|
|
" 10, 5, 516, 15, 135, 29, 205, 437, 599, 25,\n",
|
|
" 24229, 3, 338, 1803, 24, 3, 11, 222, 1655, 734,\n",
|
|
" 1296, 4, 265, 29, 19, 5, 618, 4793, 3, 11,\n",
|
|
" 9, 16, 69, 866, 8, 474, 47, 2, 113, 138,\n",
|
|
" 19, 39, 30, 29, 343, 6136, 4, 48, 984, 5,\n",
|
|
" 5212, 7, 122, 3, 77, 1894, 6, 3550, 30, 1650,\n",
|
|
" 6, 634, 4, 403, 1266, 8, 110, 3, 2, 1332,\n",
|
|
" 7, 649, 130, 11, 9, 16, 1834, 19, 39, 31,\n",
|
|
" 8, 215, 134, 1965, 13961, 9, 16, 649, 3, 3,\n",
|
|
" 3, 910, 81, 68, 29, 1677, 142, 3, 13961, 9,\n",
|
|
" 16, 13264, 208, 35, 1685, 13, 77, 13826, 19, 14,\n",
|
|
" 696, 4, 745, 4, 793, 2192, 25, 142, 11, 211])\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"label, indexes = test_data[0]\n",
|
|
"\n",
|
|
"print(indexes)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 55
|
|
},
|
|
"colab_type": "code",
|
|
"id": "nXOay2JUdFnB",
|
|
"outputId": "148242f9-c657-46be-e71d-c7503f662fc9"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['i', 'love', 'sci-fi', 'and', 'am', 'willing', 'to', 'put', 'up', 'with', 'a', 'lot', '.', 'sci-fi', '<unk>', 'are', 'usually', '<unk>', ',', 'under-appreciated', 'and', 'misunderstood', '.', 'i', 'tried', 'to', 'like', 'this', ',', 'i', 'really', 'did', ',', 'but', 'it', 'is', 'to', 'good', 'tv', 'sci-fi', 'as', 'babylon', '5', 'is', 'to', 'star', 'trek', '(', 'the', 'original', ')', '.', 'silly', '<unk>', ',', 'cheap', 'cardboard', 'sets', ',', 'stilted', 'dialogues', ',', 'cg', 'that', 'doesn', \"'\", 't', 'match', 'the', 'background', ',', 'and', 'painfully', 'one-dimensional', 'characters', 'cannot', 'be', 'overcome', 'with', 'a', \"'\", 'sci-fi', \"'\", 'setting', '.', '(', 'i', \"'\", 'm', 'sure', 'there', 'are', 'those', 'of', 'you', 'out', 'there', 'who', 'think', 'babylon', '5', 'is', 'good', 'sci-fi', 'tv', '.', 'it', \"'\", 's', 'not', '.', 'it', \"'\", 's', 'clichéd', 'and', 'uninspiring', '.', ')', 'while', 'us', 'viewers', 'might', 'like', 'emotion', 'and', 'character', 'development', ',', 'sci-fi', 'is', 'a', 'genre', 'that', 'does', 'not', 'take', 'itself', 'seriously', '(', 'cf', '.', 'star', 'trek', ')', '.', 'it', 'may', 'treat', 'important', 'issues', ',', 'yet', 'not', 'as', 'a', 'serious', 'philosophy', '.', 'it', \"'\", 's', 'really', 'difficult', 'to', 'care', 'about', 'the', 'characters', 'here', 'as', 'they', 'are', 'not', 'simply', 'foolish', ',', 'just', 'missing', 'a', 'spark', 'of', 'life', '.', 'their', 'actions', 'and', 'reactions', 'are', 'wooden', 'and', 'predictable', ',', 'often', 'painful', 'to', 'watch', '.', 'the', 'makers', 'of', 'earth', 'know', 'it', \"'\", 's', 'rubbish', 'as', 'they', 'have', 'to', 'always', 'say', 'gene', 'roddenberry', \"'\", 's', 'earth', '.', '.', '.', 'otherwise', 'people', 'would', 'not', 'continue', 'watching', '.', 'roddenberry', \"'\", 's', 'ashes', 'must', 'be', 'turning', 'in', 'their', 'orbit', 'as', 'this', 'dull', ',', 'cheap', ',', 'poorly', 'edited', '(', 'watching', 'it', 'without']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print([vocab.itos[i] for i in indexes])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "egzlLweTdFnH"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"class Collator:\n",
|
|
" def __init__(self, pad_idx):\n",
|
|
" \n",
|
|
" self.pad_idx = pad_idx\n",
|
|
" \n",
|
|
" def collate(self, batch):\n",
|
|
" \n",
|
|
" labels, text = zip(*batch)\n",
|
|
" \n",
|
|
" labels = torch.LongTensor(labels)\n",
|
|
" \n",
|
|
" text = nn.utils.rnn.pad_sequence(text, padding_value = self.pad_idx)\n",
|
|
" \n",
|
|
" return labels, text"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "TYLvjhoSdFnM"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"pad_token = '<pad>'\n",
|
|
"pad_idx = vocab[pad_token]\n",
|
|
"\n",
|
|
"collator = Collator(pad_idx)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "7Ly4l1I8dFnR"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"batch_size = 256\n",
|
|
"\n",
|
|
"train_iterator = torch.utils.data.DataLoader(train_data, \n",
|
|
" batch_size, \n",
|
|
" shuffle = True, \n",
|
|
" collate_fn = collator.collate)\n",
|
|
"\n",
|
|
"valid_iterator = torch.utils.data.DataLoader(valid_data, \n",
|
|
" batch_size, \n",
|
|
" shuffle = False, \n",
|
|
" collate_fn = collator.collate)\n",
|
|
"\n",
|
|
"test_iterator = torch.utils.data.DataLoader(test_data, \n",
|
|
" batch_size, \n",
|
|
" shuffle = False, \n",
|
|
" collate_fn = collator.collate)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "dbh38jHEdFnV"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"class NBOW(nn.Module):\n",
|
|
" def __init__(self, input_dim, emb_dim, output_dim, pad_idx):\n",
|
|
" super().__init__()\n",
|
|
" \n",
|
|
" self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)\n",
|
|
" self.fc = nn.Linear(emb_dim, output_dim)\n",
|
|
" \n",
|
|
" def forward(self, text):\n",
|
|
" \n",
|
|
" # text = [seq len, batch size]\n",
|
|
" \n",
|
|
" embedded = self.embedding(text)\n",
|
|
" \n",
|
|
" # embedded = [seq len, batch size, emb dim]\n",
|
|
" \n",
|
|
" pooled = embedded.mean(0)\n",
|
|
" \n",
|
|
" # pooled = [batch size, emb dim]\n",
|
|
" \n",
|
|
" prediction = self.fc(pooled)\n",
|
|
" \n",
|
|
" # prediction = [batch size, output dim]\n",
|
|
" \n",
|
|
" return prediction"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Ga1nXhindFnZ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"input_dim = len(vocab)\n",
|
|
"emb_dim = 100\n",
|
|
"output_dim = 2\n",
|
|
"\n",
|
|
"model = NBOW(input_dim, emb_dim, output_dim, pad_idx)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "UyIJC0tYdFnc"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def count_parameters(model):\n",
|
|
" return sum(p.numel() for p in model.parameters() if p.requires_grad)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "1sJRLyewdFng",
|
|
"outputId": "e7e357e1-1cc7-4aa4-ff40-4d749209759d"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"The model has 2,500,402 trainable parameters\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(f'The model has {count_parameters(model):,} trainable parameters')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "BPsihrZudFnl"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"glove = torchtext.experimental.vectors.GloVe(name = '6B',\n",
|
|
" dim = emb_dim)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 243
|
|
},
|
|
"colab_type": "code",
|
|
"id": "hUIoXGkpdFno",
|
|
"outputId": "b58af33d-b40f-4783-b997-8e85a0edc583"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"tensor([-0.0382, -0.2449, 0.7281, -0.3996, 0.0832, 0.0440, -0.3914, 0.3344,\n",
|
|
" -0.5755, 0.0875, 0.2879, -0.0673, 0.3091, -0.2638, -0.1323, -0.2076,\n",
|
|
" 0.3340, -0.3385, -0.3174, -0.4834, 0.1464, -0.3730, 0.3458, 0.0520,\n",
|
|
" 0.4495, -0.4697, 0.0263, -0.5415, -0.1552, -0.1411, -0.0397, 0.2828,\n",
|
|
" 0.1439, 0.2346, -0.3102, 0.0862, 0.2040, 0.5262, 0.1716, -0.0824,\n",
|
|
" -0.7179, -0.4153, 0.2033, -0.1276, 0.4137, 0.5519, 0.5791, -0.3348,\n",
|
|
" -0.3656, -0.5486, -0.0629, 0.2658, 0.3020, 0.9977, -0.8048, -3.0243,\n",
|
|
" 0.0125, -0.3694, 2.2167, 0.7220, -0.2498, 0.9214, 0.0345, 0.4674,\n",
|
|
" 1.1079, -0.1936, -0.0746, 0.2335, -0.0521, -0.2204, 0.0572, -0.1581,\n",
|
|
" -0.3080, -0.4162, 0.3797, 0.1501, -0.5321, -0.2055, -1.2526, 0.0716,\n",
|
|
" 0.7056, 0.4974, -0.4206, 0.2615, -1.5380, -0.3022, -0.0734, -0.2831,\n",
|
|
" 0.3710, -0.2522, 0.0162, -0.0171, -0.3898, 0.8742, -0.7257, -0.5106,\n",
|
|
" -0.5203, -0.1459, 0.8278, 0.2706])"
|
|
]
|
|
},
|
|
"execution_count": 33,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"glove['the']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 104
|
|
},
|
|
"colab_type": "code",
|
|
"id": "vz_X14INdFnq",
|
|
"outputId": "b41c1997-b970-4042-fab9-2d72f07540b0"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
|
" 0., 0., 0., 0.])"
|
|
]
|
|
},
|
|
"execution_count": 34,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"glove['shoggoth']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 104
|
|
},
|
|
"colab_type": "code",
|
|
"id": "iBKvWWCwdFnu",
|
|
"outputId": "821572aa-2743-4b1e-a03d-afeb5387bd9f"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
|
" 0., 0., 0., 0.])"
|
|
]
|
|
},
|
|
"execution_count": 35,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"glove['The']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"glove_vocab = glove.vectors.get_stoi()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"'the' in glove_vocab"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"False"
|
|
]
|
|
},
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"'The' in glove_vocab"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 39,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "4BFftRDMdFnx"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_pretrained_embedding(initial_embedding, pretrained_vectors, vocab, unk_token):\n",
|
|
" \n",
|
|
" pretrained_embedding = torch.FloatTensor(initial_embedding.weight.clone()).detach() \n",
|
|
" pretrained_vocab = pretrained_vectors.vectors.get_stoi()\n",
|
|
" \n",
|
|
" unk_tokens = []\n",
|
|
" \n",
|
|
" for idx, token in enumerate(vocab.itos):\n",
|
|
" if token in pretrained_vocab:\n",
|
|
" pretrained_vector = pretrained_vectors[token]\n",
|
|
" pretrained_embedding[idx] = pretrained_vector\n",
|
|
" else:\n",
|
|
" unk_tokens.append(token)\n",
|
|
" \n",
|
|
" return pretrained_embedding, unk_tokens"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 40,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "QRToW07JdFnz"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"unk_token = '<unk>'\n",
|
|
"\n",
|
|
"pretrained_embedding, unk_tokens = get_pretrained_embedding(model.embedding, glove, vocab, unk_token)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 41,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"tensor([[-0.1117, -0.4966, 0.1631, ..., 1.5903, -0.1947, -0.2415],\n",
|
|
" [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n",
|
|
" [ 0.7289, -0.7336, 1.5624, ..., -0.5592, -0.4480, -0.6476],\n",
|
|
" ...,\n",
|
|
" [ 0.0914, 1.5196, 0.4670, ..., 0.6393, -0.0332, 0.0185],\n",
|
|
" [-0.6290, 0.4650, -0.7165, ..., -1.3171, 2.0381, -2.0497],\n",
|
|
" [-1.1222, -0.0240, -1.0878, ..., -0.4948, -0.3874, 0.0339]])"
|
|
]
|
|
},
|
|
"execution_count": 41,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"model.embedding.weight.data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 42,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"tensor([[-0.1117, -0.4966, 0.1631, ..., 1.5903, -0.1947, -0.2415],\n",
|
|
" [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n",
|
|
" [-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],\n",
|
|
" ...,\n",
|
|
" [ 0.4029, 0.1353, 0.6673, ..., -0.3300, 0.7533, -0.1666],\n",
|
|
" [ 0.1226, 0.0419, 0.0746, ..., -0.0024, -0.2733, -1.0033],\n",
|
|
" [-0.1009, -0.1484, 0.3141, ..., -0.3414, -0.3768, 0.5605]])"
|
|
]
|
|
},
|
|
"execution_count": 42,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"pretrained_embedding"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 43,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "j36jzQpPdFn3",
|
|
"outputId": "7ebe041d-b092-498e-ea16-0fce8c20ed33"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"734"
|
|
]
|
|
},
|
|
"execution_count": 43,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"len(unk_tokens)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 44,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "yzvhgf8tdFn5",
|
|
"outputId": "8c30dc4a-9a2b-4c11-8c7b-1d2cb3ba0aee"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['<unk>', '<pad>', '\\x96', '****', 'hadn', 'camera-work', '*1/2', '100%', '*****', '$1']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(unk_tokens[:10])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 45,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 139
|
|
},
|
|
"colab_type": "code",
|
|
"id": "AnE6D4MAdFn_",
|
|
"outputId": "8b3fea1a-9bcb-4fd9-ba78-72baee94f96a"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"tensor([[-0.1117, -0.4966, 0.1631, ..., 1.5903, -0.1947, -0.2415],\n",
|
|
" [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n",
|
|
" [-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],\n",
|
|
" ...,\n",
|
|
" [ 0.4029, 0.1353, 0.6673, ..., -0.3300, 0.7533, -0.1666],\n",
|
|
" [ 0.1226, 0.0419, 0.0746, ..., -0.0024, -0.2733, -1.0033],\n",
|
|
" [-0.1009, -0.1484, 0.3141, ..., -0.3414, -0.3768, 0.5605]])"
|
|
]
|
|
},
|
|
"execution_count": 45,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"model.embedding.weight.data.copy_(pretrained_embedding)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 46,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "DJloauERdFoF"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"optimizer = optim.Adam(model.parameters())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 47,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "fPPZ0cs_dFoH"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"criterion = nn.CrossEntropyLoss()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 48,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "HGUcFIupdFoK",
|
|
"outputId": "e5d9b842-689b-49ca-a4f4-08574f0524ee"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Using: cuda\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
|
"\n",
|
|
"print(f'Using: {device}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 49,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Ynf7j6kQdFoM"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"model = model.to(device)\n",
|
|
"criterion = criterion.to(device)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 50,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "977iykeOdFoP"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def calculate_accuracy(predictions, labels):\n",
|
|
"    \"\"\"Return the share of rows in `predictions` whose arg-max matches `labels`.\n",
|
|
"\n",
|
|
"    The arg-max runs over dimension 1, `labels` is reshaped to line up with it,\n",
|
|
"    and the number of matches is divided by `labels.shape[0]`.\n",
|
|
"    \"\"\"\n",
|
|
"    predicted_classes = predictions.argmax(dim = 1)\n",
|
|
"    n_correct = (predicted_classes == labels.view_as(predicted_classes)).sum()\n",
|
|
"    return n_correct.float() / labels.shape[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 51,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "HPNI8DJudFoS"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def train(model, iterator, optimizer, criterion, device):\n",
|
|
"    \"\"\"Run one optimisation pass over `iterator`; return (mean loss, mean accuracy).\n",
|
|
"\n",
|
|
"    Each batch is unpacked as (labels, text) and moved to `device`. Both means\n",
|
|
"    are sums of per-batch values divided by `len(iterator)`.\n",
|
|
"    \"\"\"\n",
|
|
"    model.train()\n",
|
|
"\n",
|
|
"    loss_total, acc_total = 0, 0\n",
|
|
"\n",
|
|
"    for batch_labels, batch_text in iterator:\n",
|
|
"        batch_labels, batch_text = batch_labels.to(device), batch_text.to(device)\n",
|
|
"\n",
|
|
"        # Clear gradients left over from the previous batch before the new forward pass.\n",
|
|
"        optimizer.zero_grad()\n",
|
|
"\n",
|
|
"        logits = model(batch_text)\n",
|
|
"        batch_loss = criterion(logits, batch_labels)\n",
|
|
"        batch_acc = calculate_accuracy(logits, batch_labels)\n",
|
|
"\n",
|
|
"        batch_loss.backward()\n",
|
|
"        optimizer.step()\n",
|
|
"\n",
|
|
"        loss_total += batch_loss.item()\n",
|
|
"        acc_total += batch_acc.item()\n",
|
|
"\n",
|
|
"    n_batches = len(iterator)\n",
|
|
"    return loss_total / n_batches, acc_total / n_batches"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 52,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "kp6pV5xKdFoV"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def evaluate(model, iterator, criterion, device):\n",
|
|
"    \"\"\"Score `model` on every batch of `iterator` with gradients disabled.\n",
|
|
"\n",
|
|
"    Each batch is unpacked as (labels, text) and moved to `device`. Returns\n",
|
|
"    (mean loss, mean accuracy), both being sums of per-batch values divided\n",
|
|
"    by `len(iterator)`.\n",
|
|
"    \"\"\"\n",
|
|
"    model.eval()\n",
|
|
"\n",
|
|
"    loss_total, acc_total = 0, 0\n",
|
|
"\n",
|
|
"    with torch.no_grad():\n",
|
|
"        for batch_labels, batch_text in iterator:\n",
|
|
"            batch_labels, batch_text = batch_labels.to(device), batch_text.to(device)\n",
|
|
"\n",
|
|
"            logits = model(batch_text)\n",
|
|
"\n",
|
|
"            loss_total += criterion(logits, batch_labels).item()\n",
|
|
"            acc_total += calculate_accuracy(logits, batch_labels).item()\n",
|
|
"\n",
|
|
"    n_batches = len(iterator)\n",
|
|
"    return loss_total / n_batches, acc_total / n_batches"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 53,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "8YzL45gYdFoX"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def epoch_time(start_time, end_time):\n",
|
|
"    \"\"\"Split the span between two timestamps (in seconds) into whole (minutes, seconds).\"\"\"\n",
|
|
"    elapsed = end_time - start_time\n",
|
|
"    whole_mins = int(elapsed / 60)\n",
|
|
"    # Whatever is left after removing the whole minutes, truncated to an integer.\n",
|
|
"    leftover_secs = int(elapsed - 60 * whole_mins)\n",
|
|
"    return whole_mins, leftover_secs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 54,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 537
|
|
},
|
|
"colab_type": "code",
|
|
"id": "0A8wv7-xdFoa",
|
|
"outputId": "238f01bf-5438-482a-80ac-75c70cb20ed1"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch: 01 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.683 | Train Acc: 60.00%\n",
|
|
"\t Val. Loss: 0.669 | Val. Acc: 67.02%\n",
|
|
"Epoch: 02 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.651 | Train Acc: 68.09%\n",
|
|
"\t Val. Loss: 0.632 | Val. Acc: 71.31%\n",
|
|
"Epoch: 03 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.603 | Train Acc: 74.06%\n",
|
|
"\t Val. Loss: 0.582 | Val. Acc: 74.86%\n",
|
|
"Epoch: 04 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.545 | Train Acc: 78.13%\n",
|
|
"\t Val. Loss: 0.528 | Val. Acc: 78.88%\n",
|
|
"Epoch: 05 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.485 | Train Acc: 82.10%\n",
|
|
"\t Val. Loss: 0.477 | Val. Acc: 81.64%\n",
|
|
"Epoch: 06 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.430 | Train Acc: 85.15%\n",
|
|
"\t Val. Loss: 0.437 | Val. Acc: 83.25%\n",
|
|
"Epoch: 07 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.386 | Train Acc: 86.92%\n",
|
|
"\t Val. Loss: 0.404 | Val. Acc: 84.59%\n",
|
|
"Epoch: 08 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.350 | Train Acc: 88.21%\n",
|
|
"\t Val. Loss: 0.383 | Val. Acc: 85.19%\n",
|
|
"Epoch: 09 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.319 | Train Acc: 89.36%\n",
|
|
"\t Val. Loss: 0.363 | Val. Acc: 85.86%\n",
|
|
"Epoch: 10 | Epoch Time: 0m 4s\n",
|
|
"\tTrain Loss: 0.295 | Train Acc: 90.17%\n",
|
|
"\t Val. Loss: 0.349 | Val. Acc: 86.27%\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"n_epochs = 10\n",
|
|
"\n",
|
|
"# Lowest validation loss seen so far; starting at infinity means the first epoch always checkpoints.\n",
|
|
"best_valid_loss = float('inf')\n",
|
|
"\n",
|
|
"for epoch in range(n_epochs):\n",
|
|
"\n",
|
|
"    # The timed span covers both the training pass and the validation pass.\n",
|
|
"    start_time = time.monotonic()\n",
|
|
"    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)\n",
|
|
"    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)\n",
|
|
"    end_time = time.monotonic()\n",
|
|
"\n",
|
|
"    epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
|
|
"\n",
|
|
"    # Overwrite the checkpoint only when validation loss improves.\n",
|
|
"    if valid_loss < best_valid_loss:\n",
|
|
"        best_valid_loss = valid_loss\n",
|
|
"        torch.save(model.state_dict(), 'nbow-model.pt')\n",
|
|
"\n",
|
|
"    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',\n",
|
|
"          f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',\n",
|
|
"          f'\\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%',\n",
|
|
"          sep = '\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 55,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "oMHAuMFNdFoc",
|
|
"outputId": "58b32f9a-8c39-4818-b526-1a80e435f3ae"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Test Loss: 0.374 | Test Acc: 84.75%\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Restore the weights saved at the lowest validation loss. `map_location` lets a\n",
|
|
"# checkpoint written on one device (e.g. the GPU) load onto whichever `device`\n",
|
|
"# this session selected, instead of failing when that hardware is absent.\n",
|
|
"model.load_state_dict(torch.load('nbow-model.pt', map_location = device))\n",
|
|
"\n",
|
|
"test_loss, test_acc = evaluate(model, test_iterator, criterion, device)\n",
|
|
"\n",
|
|
"print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 56,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "sEDiSM3fdFog"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def predict_sentiment(tokenizer, vocab, model, device, sentence):\n",
|
|
"    \"\"\"Return the softmax probability the model gives the last output class for `sentence`.\n",
|
|
"\n",
|
|
"    The sentence is split with `tokenizer.tokenize`, every token is looked up in\n",
|
|
"    `vocab.stoi`, and the indices are passed to `model` as a single column\n",
|
|
"    (one row per token). The result is a plain Python float.\n",
|
|
"\n",
|
|
"    NOTE(review): callers read the result as the probability of positive\n",
|
|
"    sentiment, which presumes the last class index is the positive label --\n",
|
|
"    confirm against the label encoding used when building the dataset.\n",
|
|
"    \"\"\"\n",
|
|
"    model.eval()\n",
|
|
"    tokens = tokenizer.tokenize(sentence)\n",
|
|
"    indexes = [vocab.stoi[token] for token in tokens]\n",
|
|
"    # unsqueeze(-1) appends a trailing dimension of size one to the index vector.\n",
|
|
"    tensor = torch.LongTensor(indexes).unsqueeze(-1).to(device)\n",
|
|
"    # Inference only: skip autograd bookkeeping, as `evaluate` does.\n",
|
|
"    with torch.no_grad():\n",
|
|
"        prediction = model(tensor)\n",
|
|
"        probabilities = nn.functional.softmax(prediction, dim = -1)\n",
|
|
"    pos_probability = probabilities.squeeze()[-1].item()\n",
|
|
"    return pos_probability"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 57,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "ycEAWhYIdFoi",
|
|
"outputId": "8a675641-fd79-46a6-b4e6-0b2006f866cc"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"2.818893153744284e-05"
|
|
]
|
|
},
|
|
"execution_count": 57,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"sentence = 'the absolute worst movie of all time.'\n",
|
|
"\n",
|
|
"predict_sentiment(tokenizer, vocab, model, device, sentence)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 58,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "cuMFqIoJdFok",
|
|
"outputId": "12c964fc-6788-459c-ad5e-ca0af366b1d4"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.9997795224189758"
|
|
]
|
|
},
|
|
"execution_count": 58,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"sentence = 'one of the greatest films i have ever seen in my life.'\n",
|
|
"\n",
|
|
"predict_sentiment(tokenizer, vocab, model, device, sentence)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 59,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "zausUPENdFoo",
|
|
"outputId": "2bdd06df-dab7-47ea-8952-8bd82d39bac2"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.6041761040687561"
|
|
]
|
|
},
|
|
"execution_count": 59,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"sentence = \"i thought it was going to be one of the greatest films i have ever seen in my life, \\\n",
|
|
"but it was actually the absolute worst movie of all time.\"\n",
|
|
"\n",
|
|
"predict_sentiment(tokenizer, vocab, model, device, sentence)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 60,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "e15vpNJYdFor",
|
|
"outputId": "eed3ae38-d01a-4476-a235-8fd3582240f3"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.6041760444641113"
|
|
]
|
|
},
|
|
"execution_count": 60,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"sentence = \"i thought it was going to be the absolute worst movie of all time, \\\n",
|
|
"but it was actually one of the greatest films i have ever seen in my life.\"\n",
|
|
"\n",
|
|
"predict_sentiment(tokenizer, vocab, model, device, sentence)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"accelerator": "GPU",
|
|
"colab": {
|
|
"machine_shape": "hm",
|
|
"name": "1_nbow.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|