In [1]:
import functools

import datasets

import torchtext
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Load the IMDb 'train' and 'test' splits; passing a list to `split` yields one Dataset per entry.
train_data, test_data = datasets.load_dataset('imdb', split=['train', 'test'])

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


In [3]:
# Display both splits to inspect their features and row counts.
train_data, test_data

(Dataset({
     features: ['label', 'text'],
     num_rows: 25000
 }),
 Dataset({
     features: ['label', 'text'],
     num_rows: 25000
 }))

In [4]:
# torchtext's 'basic_english' tokenizer; applied to each review's text by tokenize_data.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [5]:
def tokenize_data(example, tokenizer):
    """Tokenize the 'text' field of a single example.

    Args:
        example: mapping that has a 'text' entry.
        tokenizer: callable applied to ``example['text']``.

    Returns:
        A dict with one key, 'tokens', holding the tokenizer's output.
    """
    return {'tokens': tokenizer(example['text'])}

In [6]:
# Add a 'tokens' column to each split; fn_kwargs forwards the tokenizer to tokenize_data.
train_data = train_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer})
test_data = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer})

  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

In [7]:
# Display the splits again to confirm the new 'tokens' feature.
train_data, test_data

(Dataset({
     features: ['label', 'text', 'tokens'],
     num_rows: 25000
 }),
 Dataset({
     features: ['label', 'text', 'tokens'],
     num_rows: 25000
 }))

In [8]:
# Hold out 25% of the training examples for validation. The split is random,
# so a fixed seed keeps the split (and the vocabulary later built from the
# training portion) reproducible across runs.
# NOTE: this cell rebinds `train_data`, so re-running it splits the already
# reduced training set again; re-run from the loading cell instead.
train_valid_data = train_data.train_test_split(test_size=0.25, seed=1234)
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

In [9]:
# Number of examples in the train / validation / test splits.
len(train_data), len(valid_data), len(test_data)

(18750, 6250, 25000)

In [10]:
# min_freq is forwarded to build_vocab_from_iterator as the minimum token frequency.
min_freq = 3
# Special tokens for unknown words and for padding.
special_tokens = ['<unk>', '<pad>']

# The vocabulary is built from the training tokens only; validation and test
# tokens are not passed in.
vocab = torchtext.vocab.build_vocab_from_iterator(train_data['tokens'],
                                                  min_freq=min_freq,
                                                  specials=special_tokens)

In [11]:
# Vocabulary size, including the special tokens.
len(vocab)

35341

In [12]:
# First ten entries of the index-to-token list.
vocab.get_itos()[:10]

['<unk>', '<pad>', 'the', '.', ',', 'and', 'a', 'of', 'to', "'"]

In [13]:
# Index assigned to the '<unk>' special token.
unk_index = vocab['<unk>']

unk_index

0

In [14]:
# Index assigned to the '<pad>' special token; reused for the embedding's
# padding_idx and as the padding value in collate.
pad_index = vocab['<pad>']

pad_index

1

In [15]:
# Use the '<unk>' index as the vocabulary's default index
# (presumably what lookups of tokens missing from the vocab return — confirm against torchtext docs).
vocab.set_default_index(unk_index)

In [16]:
def numericalize_data(example, vocab):
    """Convert the 'tokens' of a single example into vocabulary indices.

    Args:
        example: mapping that has a 'tokens' entry (an iterable of tokens).
        vocab: object indexable by token (``vocab[token]``).

    Returns:
        A dict with one key, 'ids', holding the list of looked-up indices
        in the same order as the tokens.
    """
    token_ids = []
    for token in example['tokens']:
        token_ids.append(vocab[token])
    return {'ids': token_ids}

In [17]:
# Add an 'ids' column to every split, using the vocabulary built from the training split.
train_data = train_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
valid_data = valid_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
test_data = test_data.map(numericalize_data, fn_kwargs={'vocab': vocab})

  0%|          | 0/18750 [00:00<?, ?ex/s]

  0%|          | 0/6250 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

In [18]:
# Request 'torch' formatting for the 'ids' and 'label' columns of every split;
# these are the two fields the collate function reads.
for split_data in (train_data, valid_data, test_data):
    split_data.set_format(type='torch', columns=['ids', 'label'])

In [19]:
class NBoW(nn.Module):
    """Neural bag-of-words classifier.

    Embeds every token, averages the embeddings of the non-padding tokens of
    each sequence, and maps the pooled vector to class scores with one linear
    layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        output_dim: number of output scores per example.
        pad_index: index of the padding token (passed as ``padding_idx`` to
            ``nn.Embedding``); may be ``None`` when inputs are never padded.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Return class scores of shape [batch size, output dim] for token ids `text`."""
        # text = [batch size, seq len]
        embedded = self.embedding(text)
        # embedded = [batch size, seq len, embedding dim]
        pad_index = self.embedding.padding_idx
        if pad_index is None:
            # No padding token configured: every position is a real token.
            pooled = embedded.mean(dim=1)
        else:
            # Average over real tokens only. A plain mean divides by the padded
            # length, so the same review would be pooled differently depending
            # on how much padding its batch adds.
            keep = (text != pad_index).unsqueeze(-1)
            # keep = [batch size, seq len, 1]
            # clamp avoids a division by zero for a row made only of padding
            token_counts = keep.sum(dim=1).clamp(min=1)
            # token_counts = [batch size, 1]
            pooled = (embedded * keep).sum(dim=1) / token_counts
        # pooled = [batch size, embedding dim]
        prediction = self.fc(pooled)
        # prediction = [batch size, output dim]
        return prediction

In [20]:
# One embedding row per vocabulary entry.
vocab_size = len(vocab)
embedding_dim = 256
# Two output scores, one per sentiment class.
output_dim = 2

model = NBoW(vocab_size, embedding_dim, output_dim, pad_index)

In [None]:
# NOTE(review): `vectors` is reassigned by the GloVe cell that follows before it
# is read, and the progress output of this cell shows it fetching a 6.60G file
# (wiki.en.vec). Consider dropping this cell if FastText vectors are not needed.
vectors = torchtext.vocab.FastText()

.vector_cache/wiki.en.vec:   7%|▋         | 465M/6.60G [08:18<3:21:18, 508kB/s]    

In [None]:
# Load GloVe vectors with torchtext's default arguments.
# NOTE(review): no cell shown in this notebook reads `vectors` afterwards, so the
# model's embedding layer keeps its initial weights — confirm whether the
# pretrained vectors were meant to be copied into `model.embedding`
# (the model uses embedding_dim = 256; check it matches the vectors' dimension).
vectors = torchtext.vocab.GloVe()

In [None]:
# Adam over all model parameters; no optimizer hyperparameters are overridden here.
optimizer = optim.Adam(model.parameters())

In [None]:
# Cross-entropy between the model's class scores and the integer labels.
criterion = nn.CrossEntropyLoss()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

In [None]:
# Move the model and the loss module to the selected device; batches are moved
# to the same device inside train/evaluate.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def collate(batch, pad_index):
    """Assemble a list of examples into one padded batch.

    Args:
        batch: list of examples, each with an 'ids' tensor and a 'label' tensor.
        pad_index: value used by ``pad_sequence`` to fill shorter 'ids' sequences.

    Returns:
        A dict with 'ids' (the padded sequences, batch dimension first) and
        'labels' (the stacked label tensors).
    """
    sequences = []
    targets = []
    for example in batch:
        sequences.append(example['ids'])
        targets.append(example['label'])

    padded_ids = nn.utils.rnn.pad_sequence(sequences,
                                           padding_value=pad_index,
                                           batch_first=True)
    return {'ids': padded_ids, 'labels': torch.stack(targets)}

In [None]:
batch_size = 512

# Bind pad_index so the DataLoader can call collate with just the list of examples.
collate = functools.partial(collate, pad_index=pad_index)

# Reshuffle the training examples at every epoch so batches differ between
# epochs; the validation and test loaders keep a fixed order.
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, collate_fn=collate, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, collate_fn=collate)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, collate_fn=collate)

In [None]:
def train(dataloader, model, criterion, optimizer, device):
    """Run one training epoch.

    Puts the model in training mode and, for every batch, computes the loss
    and accuracy, back-propagates, and takes one optimizer step.

    Args:
        dataloader: iterable of batches, each a dict with 'ids' and 'labels' tensors.
        model: module called on the 'ids' tensor to produce predictions.
        criterion: loss called as ``criterion(predictions, labels)``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        (loss, accuracy), each the per-batch value averaged over
        ``len(dataloader)`` batches.
    """
    model.train()

    total_loss = 0
    total_accuracy = 0

    for batch in dataloader:
        ids = batch['ids'].to(device)
        targets = batch['labels'].to(device)

        # forward pass and batch metrics
        scores = model(ids)
        batch_loss = criterion(scores, targets)
        batch_accuracy = get_accuracy(scores, targets)

        # clear old gradients, back-propagate, update the parameters
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_accuracy += batch_accuracy.item()

    n_batches = len(dataloader)
    return total_loss / n_batches, total_accuracy / n_batches

In [None]:
def evaluate(dataloader, model, criterion, device):
    """Compute loss and accuracy over a dataloader without updating the model.

    Puts the model in evaluation mode and runs every batch with gradient
    tracking disabled.

    Args:
        dataloader: iterable of batches, each a dict with 'ids' and 'labels' tensors.
        model: module called on the 'ids' tensor to produce predictions.
        criterion: loss called as ``criterion(predictions, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        (loss, accuracy), each the per-batch value averaged over
        ``len(dataloader)`` batches.
    """
    model.eval()

    total_loss = 0
    total_accuracy = 0

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['ids'].to(device)
            targets = batch['labels'].to(device)

            scores = model(ids)
            batch_loss = criterion(scores, targets)
            batch_accuracy = get_accuracy(scores, targets)

            total_loss += batch_loss.item()
            total_accuracy += batch_accuracy.item()

    n_batches = len(dataloader)
    return total_loss / n_batches, total_accuracy / n_batches

In [None]:
def get_accuracy(predictions, labels):
    """Fraction of examples whose highest-scoring class equals the label.

    Args:
        predictions: tensor of class scores, [batch size, n classes].
        labels: tensor of target class indices, one per example.

    Returns:
        A scalar float tensor: number of correct predictions divided by the
        batch size (``predictions.shape[0]``).
    """
    n_examples = predictions.shape[0]
    # index of the best class per example, kept as a column: [batch size, 1]
    top_class = predictions.argmax(dim=1, keepdim=True)
    n_correct = (top_class == labels.view_as(top_class)).sum()
    return n_correct.float() / n_examples

In [None]:
# Number of full passes over the training dataloader.
n_epochs = 10

for epoch in range(n_epochs):

    # One optimisation pass over the training batches, then a gradient-free
    # pass over the validation batches with the updated model.
    train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)

    # `epoch` is zero-based, so report it one-based.
    print(f'epoch: {epoch+1}')
    print(f'train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}')
    print(f'valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}')