added TAN repository

2019-06-18 21:04:57 +05:30 · 2019-06-18 21:04:57 +05:30 · 180aedfe5a
commit 180aedfe5a
parent 3004ad2a7e
9 changed files with 41848 additions and 0 deletions
--- a/TAN/README.md
+++ b/TAN/README.md
@ -0,0 +1,10 @@
+# Here is the code for TAN, LSTM and LSTM+SVM
+### Different classes for different models
+*networks.py* file contains the codes for different networks - <br>
+* TAN : LSTM_TAN class (called with version = 0 for basic TAN)<br>
+* LSTM : LSTM_TAN class (called with version = 2)<br>
+* LSTM-SVM - LSTM_TAN_svm_features class (called with version = 2)<br><br>
+
+### Running the code
+To run the code you have to run *get_training_plots.py* or *early_stopping_training.py*<br>
+To change models, call the classes with specific versions as mentioned above<br>
--- a/TAN/pycache/networks.cpython-35.pyc
+++ b/TAN/pycache/networks.cpython-35.pyc
--- a/TAN/pycache/utils.cpython-35.pyc
+++ b/TAN/pycache/utils.cpython-35.pyc
--- a/TAN/pycache/utils.cpython-37.pyc
+++ b/TAN/pycache/utils.cpython-37.pyc
--- a/TAN/early_stopping_training.py
+++ b/TAN/early_stopping_training.py
@ -0,0 +1,215 @@
+import sys
+import csv
+import copy
+import numpy as np
+import re
+import itertools
+from collections import Counter
+from utils import *
+import torch
+import torch.autograd as autograd
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import sys
+from networks import *
+import pickle
+from datetime import datetime
+import random
+from statistics import mode
+import copy
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# Placeholder global; it is not referenced anywhere else in this script.
+D = None
+
+# Seed Python's RNG so the shuffle of the training set is repeatable.
+random.seed(42)
+
+# Exactly two positional arguments are required: the dataset code and the
+# network variant.
+if len(sys.argv) != 3:
+    # Report the name this script was actually invoked with.
+    print("Usage :- python {} <dataset name> <attention variant>".format(sys.argv[0]))
+    sys.exit(-1)
+
+dataset = sys.argv[1]  # dataset code, later handed to load_dataset
+version = sys.argv[2]  # network variant, later handed to LSTM_TAN
+
+def f_score(table):
+    """Return the mean F1 (in percent) of the two classes described by ``table``.
+
+    Each of the first two rows of ``table`` holds, for one class,
+    ``[correct predictions, predicted count, gold count]``.  For such a row
+    ``2 * row[0] / (row[1] + row[2])`` is the class F1, so
+    ``100 * row[0] / (row[1] + row[2])`` is half of it in percent and the sum
+    over both rows is their average.
+
+    A class that was neither predicted nor present in the gold labels
+    contributes 0 instead of causing a division by zero.
+
+    Returns:
+        str: the score formatted with two decimals (``"%.2f"``).
+    """
+    total = 0.0
+    for row in (table[0], table[1]):
+        denominator = row[1] + row[2]
+        if denominator != 0:
+            total += 100 * row[0] / denominator
+    return "%.2f" % total
+
+
+def _update_conf_matrix(conf_matrix, label, gold):
+    """Record one prediction in a 2x3 table of [correct, predicted, gold] counts.
+
+    Only classes 0 and 1 own a row; any other class index is ignored.
+    """
+    if label <= 1:
+        conf_matrix[label][1] += 1
+        if label == gold:
+            conf_matrix[label][0] += 1
+    if gold <= 1:
+        conf_matrix[gold][2] += 1
+
+
+def train_bagging_tan_CV(version="tan-",n_epochs=50,batch_size=50,l2=0,dropout = 0.5,n_folds=5):
+    """Cross-validate LSTM_TAN models and score their majority vote on the test set.
+
+    Reads the module-level globals ``dataset``, ``x_train``, ``y_train``,
+    ``x_test``, ``y_test``, ``embedding_matrix``, ``vector_target`` and
+    ``device``.
+
+    For every fold a fresh model is trained on all samples outside the fold
+    and evaluated on the fold after each epoch.  Epochs are grouped in
+    windows of ten; a deep copy of the model is stored after each epoch and
+    the copies belonging to the window with the highest summed validation
+    score are kept.  The kept copies of all folds then vote on every test
+    sample (ties go to the lowest class index).
+
+    Args:
+        version: variant string forwarded to ``LSTM_TAN``.
+        n_epochs: number of training epochs per fold.
+        batch_size: gradients are applied whenever ``(sample index + 1)`` is a
+            multiple of this value, and once more at the end of the epoch.
+        l2: weight decay given to Adam.
+        dropout: dropout probability forwarded to ``LSTM_TAN``.
+        n_folds: number of cross-validation folds; the last fold absorbs the
+            remainder of the integer division.
+
+    Returns:
+        numpy.ndarray: 2x3 table of ``[correct, predicted, gold]`` counts for
+        classes 0 and 1 on the test set.
+    """
+    loss_fn = nn.NLLLoss()
+    print("\n\n starting cross validation \n\n")
+    print("class : ",dataset, " :-")
+
+    n_samples = len(x_train)
+    fold_sz = n_samples//n_folds
+    ensemble_models = []
+    # The target indices never change, so the tensor is built once.
+    target = torch.tensor(vector_target,dtype=torch.long).to(device)
+    print("dataset size :- ",n_samples)
+    for fold_no in range(n_folds):
+        print("Fold number {}".format(fold_no+1))
+        model = LSTM_TAN(version,300,100,len(embedding_matrix),3,embedding_matrix,dropout=dropout).to(device)
+        optimizer = optim.Adam(filter(lambda p: p.requires_grad,model.parameters()),lr=0.0005,weight_decay = l2)
+        ll = fold_no*fold_sz
+        ul = n_samples if fold_no == n_folds-1 else (fold_no+1)*fold_sz
+        print("ll : {}, ul : {}".format(ll,ul))
+        # Everything before and after the held-out slice [ll, ul) is training data.
+        train_indices = list(range(ll)) + list(range(ul,n_samples))
+
+        best_ensemble = []
+        best_score = 0
+        temp_ensemble = []
+        temp_score = 0
+        for epoch in range(n_epochs):
+            # training
+            model.train()
+            optimizer.zero_grad()
+            loss = 0
+            pending = 0  # samples accumulated in ``loss`` since the last backward pass
+            for i in train_indices:
+                model.hidden = model.init_hidden()
+
+                x = torch.tensor(np.array(x_train[i]),dtype=torch.long).to(device)
+                y = torch.tensor([y_train[i]],dtype=torch.long).to(device)
+
+                preds = model(x,target,verbose=False)
+
+                loss += loss_fn(preds,y)
+                pending += 1
+                if (i+1) % batch_size == 0:
+                    loss.backward()
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    loss = 0
+                    pending = 0
+
+            # Back-propagate the trailing partial batch; without this the
+            # final step of the epoch would be taken on empty gradients and
+            # those samples would never contribute to training.
+            if pending:
+                loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+
+            # validation on the held-out fold
+            model.eval()
+            conf_matrix = np.zeros((2,3))
+            with torch.no_grad():
+                for j in range(ll,ul):
+                    x = torch.tensor(np.array(x_train[j]),dtype=torch.long).to(device)
+                    preds = model(x,target,verbose=False)
+                    label = np.argmax(preds.cpu().numpy(),axis=1)[0]
+                    _update_conf_matrix(conf_matrix,label,y_train[j])
+            val_f_str = f_score(conf_matrix)
+            val_f_score = float(val_f_str)
+
+            # A new ten-epoch window starts: judge the one that just ended.
+            if epoch%10 == 0 and epoch != 0:
+                print("current last 10- score ",temp_score*1.0/10)
+                if temp_score > best_score:
+                    best_score = temp_score
+                    best_ensemble = temp_ensemble
+                    print("this is current best score now")
+                temp_ensemble = []
+                temp_score = 0
+
+            temp_ensemble.append(copy.deepcopy(model))
+            temp_score += val_f_score
+
+            print("epoch number {} , val_f_score {}".format(epoch+1,val_f_str))
+
+        # Judge the last (possibly shorter) window of this fold.
+        print("current last 10- score ",temp_score*1.0/10)
+        if temp_score > best_score:
+            best_score = temp_score
+            best_ensemble = temp_ensemble
+            print("this is current best score now")
+
+        ensemble_models.extend(best_ensemble)
+
+    # test: majority vote of every retained model
+    for member in ensemble_models:
+        member.eval()
+    conf_matrix = np.zeros((2,3))
+    with torch.no_grad():
+        for j in range(len(x_test)):
+            x = torch.tensor(np.array(x_test[j]),dtype=torch.long).to(device)
+            cnts = [0,0,0]
+            for member in ensemble_models:
+                cnts[np.argmax(member(x,target).cpu().numpy(),axis=1)[0]] += 1
+            label = np.argmax(cnts)
+            _update_conf_matrix(conf_matrix,label,y_test[j])
+
+    print("test_f_score {}".format(f_score(conf_matrix)))
+    print(conf_matrix)
+    return conf_matrix
+
+
+
+# Dataset code taken from the first command-line argument.
+dataset = sys.argv[1]
+
+# Accumulator for the 2x3 test table returned by the training run.
+fin_matrix = np.zeros((2,3))
+
+# Use the GPU when one is available, otherwise run on the CPU instead of
+# failing on machines without CUDA.
+run_device = "cuda" if torch.cuda.is_available() else "cpu"
+
+stances, word2emb, word_ind, ind_word, embedding_matrix, device,\
+    x_train, y_train, x_test, y_test, vector_target, train_tweets, test_tweets  = load_dataset(dataset,dev = run_device)
+
+
+# Shuffle sentences and labels together (seeded above) so the folds are not
+# tied to the order of the CSV file; both containers are updated in place.
+combined = list(zip(x_train, y_train))
+random.shuffle(combined)
+x_train[:], y_train[:] = zip(*combined)
+
+
+fin_matrix += train_bagging_tan_CV(version=version,n_epochs=100,batch_size=50,l2=0.0,dropout = 0.6,n_folds=10)
+
+print(f_score(fin_matrix))
--- a/TAN/emnlp_dict.txt
+++ b/TAN/emnlp_dict.txt
--- a/TAN/networks.py
+++ b/TAN/networks.py
@ -0,0 +1,108 @@
+import torch
+import torch.autograd as autograd
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import numpy as np
+import sys
+
+
+
+
+class LSTM_TAN(nn.Module):
+    """LSTM sentence classifier with optional attention.
+
+    ``version`` selects the variant:
+
+    * ``"tan-"`` - attention scores are computed from the word embeddings alone;
+    * ``"tan"``  - attention scores are computed from each word embedding
+      concatenated with the mean embedding of the target words;
+    * ``"lstm"`` - no attention, the final LSTM hidden state is classified.
+    """
+    def __init__(self,version,embedding_dim, hidden_dim, vocab_size, n_targets,embedding_matrix,dropout = 0.5):
+        """Build the layers for the requested variant.
+
+        Args:
+            version: one of ``"tan-"``, ``"tan"``, ``"lstm"``; any other value
+                prints a message and exits the process.
+            embedding_dim: size of a word embedding.
+            hidden_dim: size of the LSTM hidden state (per direction).
+            vocab_size: number of rows of the embedding table.
+            n_targets: number of output classes.
+            embedding_matrix: initial embedding weights; they stay trainable.
+            dropout: dropout probability applied before the output layer.
+        """
+        super(LSTM_TAN, self).__init__()
+        if version not in ["tan-","tan","lstm"]:
+            print("Version is tan-,tan,lstm")
+            sys.exit(-1)
+
+        self.hidden_dim = hidden_dim
+        self.embedding_dim = embedding_dim
+        self.version = version
+
+        #WORD_EMBEDDINGS
+        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
+        self.word_embeddings.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float))
+        self.word_embeddings.weight.requires_grad=True
+
+        #ATTENTION
+        if version == "tan-":
+            self.attention = nn.Linear(embedding_dim,1)
+        elif version == "tan":
+            self.attention = nn.Linear(2*embedding_dim,1)
+
+        #LSTM
+        # The LSTM takes word embeddings as inputs, and outputs hidden states
+        # with dimensionality hidden_dim.  The attention variants feed
+        # 2*hidden_dim features per word into the output layer, which is what
+        # a bidirectional LSTM emits; the plain "lstm" variant classifies a
+        # single hidden_dim-sized final state, so it stays unidirectional.
+        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=(version != "lstm"))
+
+        self.dropout = nn.Dropout(dropout)
+
+        #FINAL_LAYER
+        if version !="lstm":
+            self.hidden2target = nn.Linear(2*self.hidden_dim, n_targets)
+        else:
+            self.hidden2target = nn.Linear(self.hidden_dim, n_targets)
+
+        self.hidden = self.init_hidden()
+
+    def init_hidden(self):
+        """Return a zero (h, c) pair of shape (1, 1, hidden_dim) each.
+
+        The axes semantics are (num_layers, minibatch_size, hidden_dim).
+        ``forward`` does not pass this state to the LSTM; the attribute is
+        kept because callers reset ``model.hidden`` between samples.
+        """
+        return (torch.zeros(1, 1, self.hidden_dim),
+                torch.zeros(1, 1, self.hidden_dim))
+
+    def forward(self, sentence, target,verbose=False):
+        """Classify one sentence.
+
+        Args:
+            sentence: 1-D tensor of word indices.
+            target: 1-D tensor of word indices of the target phrase; only the
+                ``"tan"`` variant reads it.
+            verbose: accepted for interface compatibility; unused.
+
+        Returns:
+            Tensor of shape (1, n_targets) holding log-probabilities.
+        """
+        x_emb = self.word_embeddings(sentence)
+        seq_len = len(sentence)
+        lstm_in = x_emb.view(seq_len, 1 , self.embedding_dim)
+
+        if self.version == "lstm":
+            _, hidden_state = self.lstm(lstm_in)
+            final_hidden_state = hidden_state[0].view(-1,self.hidden_dim)
+        else:
+            if self.version == "tan":
+                # Average the target word embeddings and append the result to
+                # every word embedding of the sentence.
+                t_emb = torch.mean(self.word_embeddings(target),dim=0,keepdim=True)
+                attention_input = torch.cat((x_emb,t_emb.expand(seq_len,-1)),dim=1)
+            else:
+                attention_input = x_emb
+
+            lstm_out, _ = self.lstm(lstm_in)
+            a = self.attention(attention_input)
+            # Attention-weighted sum of the per-word LSTM outputs.
+            final_hidden_state = torch.mm(F.softmax(a.view(1,-1),dim=1),lstm_out.view(seq_len,-1))
+
+        target_space = self.hidden2target(self.dropout(final_hidden_state))
+        target_scores = F.log_softmax(target_space, dim=1)
+
+        return target_scores
+
+# In[26]:
--- a/TAN/noslang_data.json
+++ b/TAN/noslang_data.json
--- a/TAN/utils.py
+++ b/TAN/utils.py
@ -0,0 +1,333 @@
+import csv
+import copy
+import numpy as np
+import re
+import itertools
+from collections import Counter,defaultdict
+import torch
+import json
+from collections import Counter
+import wordninja
+"""
+
+Tokenization/string cleaning for all datasets.
+Every dataset is lower cased.
+Original taken from https://github.com/dennybritz/cnn-text-classification-tf
+
+string = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", string)
+string = re.sub(r"#SemST", "", string)
+string = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", string)
+#string = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", string)
+string =  re.sub(r"([A-Z])", r" \1", string)
+string = re.sub(r"\'s", " \'s", string)
+string = re.sub(r"\'ve", " \'ve", string)
+string = re.sub(r"n\'t", " n\'t", string)
+string = re.sub(r"\'re", " \'re", string)
+string = re.sub(r"\'d", " \'d", string)
+string = re.sub(r"\'ll", " \'ll", string)
+string = re.sub(r",", " , ", string)
+string = re.sub(r"!", " ! ", string)
+string = re.sub(r"\(", " ( ", string)
+string = re.sub(r"\)", " ) ", string)
+string = re.sub(r"\?", " ? ", string)
+string = re.sub(r"\s{2,}", " ", string)
+return string.strip() if TREC else string.strip().lower()
+
+"""
+# (pattern, replacement) pairs applied in order by clean_str2.
+_CLEAN_STR2_RULES = (
+    (r"[^A-Za-z0-9(),!?\'\`]", " "),
+    (r"\'s", " 's"),
+    (r"\'ve", " 've"),
+    (r"n\'t", " n't"),
+    (r"\'re", " 're"),
+    (r"\'d", " 'd"),
+    (r"\'ll", " 'll"),
+    (r",", " , "),
+    (r"!", " ! "),
+    # The replacements below are plain text: a backslash in a replacement
+    # template would be copied into the output.
+    (r"\(", " ( "),
+    (r"\)", " ) "),
+    (r"\?", " ? "),
+    (r"\s{2,}", " "),
+)
+
+
+def clean_str2(string, TREC=False):
+    """
+    Tokenization/string cleaning for all datasets.
+    Every dataset is lower cased unless ``TREC`` is true.
+    Original taken from https://github.com/dennybritz/cnn-text-classification-tf
+
+    Characters outside ``A-Za-z0-9(),!?'`` and the backtick become spaces,
+    contraction suffixes and punctuation are split off as separate tokens,
+    and runs of whitespace are collapsed.  Parentheses and question marks are
+    emitted without a leading backslash, matching ``clean_str``.
+    """
+    for pattern, replacement in _CLEAN_STR2_RULES:
+        string = re.sub(pattern, replacement, string)
+    return string.strip() if TREC else string.strip().lower()
+
+
+
+def clean_str(string, TREC=False):
+    """
+    Tokenization/string cleaning for tweets.
+    The result is lower cased unless ``TREC`` is true.
+    Original taken from https://github.com/dennybritz/cnn-text-classification-tf
+
+    On top of the generic cleaning, the ``#SemST`` tag is removed and every
+    other hashtag ``#word`` is rewritten as ``# word #``.
+    """
+    # Ordered (pattern, replacement) pairs; later rules see the output of
+    # earlier ones.
+    substitutions = (
+        (r"[^A-Za-z0-9(),!?\'\`#]", " "),
+        (r"#SemST", ""),
+        (r"#([A-Za-z0-9]*)", r"# \1 #"),
+        (r"\'s", " \'s"),
+        (r"\'ve", " \'ve"),
+        (r"n\'t", " n\'t"),
+        (r"\'re", " \'re"),
+        (r"\'d", " \'d"),
+        (r"\'ll", " \'ll"),
+        (r",", " , "),
+        (r"!", " ! "),
+        (r"\(", " ( "),
+        (r"\)", " ) "),
+        (r"\?", " ? "),
+        (r"\s{2,}", " "),
+    )
+    for pattern, replacement in substitutions:
+        string = re.sub(pattern, replacement, string)
+
+    cleaned = string.strip()
+    if TREC:
+        return cleaned
+    return cleaned.lower()
+
+
+def create_normalise_dict(no_slang_data = "./noslang_data.json", emnlp_dict = "./emnlp_dict.txt"):
+    """Build the token-normalisation lookup from two lexicon files.
+
+    Args:
+        no_slang_data: path of a JSON file holding a mapping from a token to
+            its replacement text.
+        emnlp_dict: path of a text file with one ``token<TAB>replacement``
+            entry per line.  Lines without a tab (for example blank lines)
+            are skipped.
+
+    Returns:
+        dict: the JSON mapping updated with the tab-separated entries, so an
+        entry of ``emnlp_dict`` wins when both files define the same token.
+    """
+    print("Creating Normalization Dictionary")
+    with open(no_slang_data, "r") as f:
+        normalization_dict = dict(json.load(f))
+
+    with open(emnlp_dict,"r") as f:
+        for line in f:
+            row = line.split('\t')
+            if len(row) < 2:
+                continue
+            normalization_dict[row[0]] = row[1].rstrip()
+
+    return normalization_dict
+
+def normalise(normalization_dict,sentence):
+    """Return the lower-cased tokens of ``sentence`` after dictionary expansion.
+
+    The sentence is split on whitespace.  A token found in
+    ``normalization_dict`` (exact, case-sensitive match) is replaced by the
+    space-separated pieces of its lower-cased entry; any other token is just
+    lower-cased.
+    """
+    tokens = []
+    for raw in sentence.split():
+        if raw not in normalization_dict:
+            tokens.append(raw.lower())
+            continue
+        tokens += normalization_dict[raw].lower().split(" ")
+    return tokens
+
+
+def _read_stance_rows(path, stances):
+    """Read a preprocessed CSV and return (token lists, stance ids) of its labelled rows.
+
+    Rows whose ``Stance`` value is not a key of ``stances`` are skipped.
+    """
+    sentences = []
+    labels = []
+    with open(path,"r",encoding='latin-1') as f:
+        for row in csv.DictReader(f, delimiter=','):
+            if row['Stance'] in stances:
+                sentences.append(row['Tweet'].split(' '))
+                labels.append(stances[row['Stance']])
+    return sentences, labels
+
+
+def _vectorise_sentences(sentences, word_ind, unk):
+    """Map every token to its vocabulary index, using ``unk`` for unknown tokens.
+
+    Returns the index lists and the list of tokens that fell back to ``unk``.
+    """
+    vectorised = []
+    missing = []
+    for sent in sentences:
+        indices = []
+        for word in sent:
+            if word in word_ind:
+                indices.append(word_ind[word])
+            else:
+                indices.append(unk)
+                missing.append(word)
+        vectorised.append(indices)
+    return vectorised, missing
+
+
+def load_dataset(dataset,dev = "cuda"):
+    """Load one stance dataset and turn it into index sequences plus embeddings.
+
+    Reads ``train_preprocessed.csv`` and ``test_preprocessed.csv`` from
+    ``../Preprocessing/<folder>/<dataset>/``, loads the GloVe vectors, builds
+    a vocabulary from the words of both splits and of the topic that have a
+    GloVe vector, and vectorises sentences and topic with it.
+
+    Args:
+        dataset: dataset code; must be one of the codes accepted by the
+            assertion below.
+        dev: device string passed to ``torch.device``.
+
+    Returns:
+        tuple: ``(stances, word2emb, word_ind, ind_word, embedding_matrix,
+        device, x_train, y_train, x_test, y_test, vector_target, train_x,
+        test_x)`` where ``x_*`` are lists of index lists, ``y_*`` are numpy
+        arrays of stance ids, ``vector_target`` is the vectorised topic and
+        ``train_x`` / ``test_x`` are the raw token lists.
+    """
+    assert dataset in ['VC', 'HC', 'HRT', 'LA', 'CC', 'SC', 'EC', 'MMR', 'AT', 'FM'], "unknown dataset"
+
+    # Topics whose data lives in the Data_MPCHI_P folder.
+    mpchi_topics = {
+        'EC': 'E-ciggarettes are safer than normal ciggarettes',
+        'SC': 'Sun exposure can lead to skin cancer',
+        'VC': 'Vitamin C prevents common cold',
+        'HRT': 'Women should take HRT post menopause',
+        'MMR': 'MMR vaccine can cause autism',
+    }
+    # Topics whose data lives in the Data_SemE_P folder.  'VCA' and 'VTI' are
+    # listed for completeness but are currently rejected by the assertion.
+    seme_topics = {
+        'AT': "atheism",
+        'HC': "hillary clinton",
+        'LA': "legalization of abortion",
+        'CC': "climate change is a real concern",
+        'FM': "feminist movement",
+        'VCA': "vaccines cause autism",
+        'VTI': "vaccines treat influenza",
+    }
+    if dataset in mpchi_topics:
+        topic = mpchi_topics[dataset]
+        folder = "Data_MPCHI_P"
+    else:
+        topic = seme_topics[dataset]
+        folder = "Data_SemE_P"
+    print(topic)
+
+    # The atheism data is stored under its full name rather than its code.
+    if dataset == "AT":
+        dataset = "Atheism"
+
+    # Build the normalisation lookup here; no module-level dictionary exists.
+    normalization_dict = create_normalise_dict()
+    target = normalise(normalization_dict,clean_str(topic))
+    stances = {'FAVOR' : 0, 'AGAINST' : 1, 'NONE' : 2}
+
+    train_x, train_y = _read_stance_rows(
+        "../Preprocessing/{}/{}/train_preprocessed.csv".format(folder,dataset),stances)
+    test_x, test_y = _read_stance_rows(
+        "../Preprocessing/{}/{}/test_preprocessed.csv".format(folder,dataset),stances)
+
+    word2emb = load_glove_embeddings()
+
+    # Vocabulary: words are numbered in order of first appearance (training
+    # split, then test split, then topic); words without a GloVe vector are
+    # left out and later map to UNK.
+    word_ind = {}
+    all_words = itertools.chain(
+        itertools.chain.from_iterable(train_x),
+        itertools.chain.from_iterable(test_x),
+        target)
+    for word in all_words:
+        if word not in word_ind and word in word2emb:
+            word_ind[word] = len(word_ind)
+
+    # Index used for every out-of-vocabulary word.  The embedding matrix has
+    # one further row (index UNK + 1) that stays all-zero.
+    UNK = len(word_ind)
+
+    ind_word = {v:k for k,v in word_ind.items()}
+
+    print("Number of words - {}".format(len(ind_word)))
+
+    x_train, oovs = _vectorise_sentences(train_x,word_ind,UNK)
+
+    print("OOV words :- ",len(oovs))
+    print(Counter(oovs))
+
+    y_train = np.array(train_y)
+    y_test = np.array(test_y)
+
+    x_test, _ = _vectorise_sentences(test_x,word_ind,UNK)
+
+    embedding_matrix = np.zeros((len(word_ind) + 2, 300))
+    embedding_matrix[UNK] = np.random.randn((300))
+    for word, index in word_ind.items():
+        embedding_matrix[index] = word2emb[word]
+
+    print("Number of training examples :- ",len(x_train))
+    print("Sample vectorised sentence :- ",x_train[0])
+
+    device = torch.device(dev)
+    print("Using this device :- ", device)
+
+    vector_target = [word_ind.get(w,UNK) for w in target]
+
+    print("vectorised target:-")
+    print(vector_target)
+
+    return stances, word2emb, word_ind, ind_word, embedding_matrix, device,\
+     x_train, y_train, x_test, y_test, vector_target, train_x, test_x
+
+
+
+
+
+
+def load_glove_embeddings(path = "../Preprocessing/glove.6B.300d.txt"):
+    """Read a GloVe text file into a ``word -> float32 vector`` dictionary.
+
+    Every non-blank line is split on whitespace: the first field is the word
+    and the remaining fields are the vector components.
+
+    Args:
+        path: location of the embeddings file; defaults to the 300-dimensional
+            GloVe 6B file next to the preprocessing data.
+
+    Returns:
+        dict: maps each word to a ``numpy.ndarray`` of dtype ``float32``.
+    """
+    word2emb = {}
+    # ``with`` closes the file even if a line fails to parse.
+    with open(path,"r",encoding="utf-8") as fglove:
+        for line in fglove:
+            cols = line.strip().split()
+            if not cols:
+                continue
+            word2emb[cols[0]] = np.array(cols[1:],dtype="float32")
+    return word2emb