added TAN repository
This commit is contained in:
parent
3004ad2a7e
commit
180aedfe5a
10
TAN/README.md
Normal file
10
TAN/README.md
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
# Here is the code for TAN, LSTM and LSTM+SVM
|
||||||
|
### Different classes for different models
|
||||||
|
*networks.py* file contains the codes for different networks - <br>
|
||||||
|
* TAN : LSTM_TAN class (called with version = 0 for basic TAN)<br>
|
||||||
|
* LSTM : LSTM_TAN class (called with version = 2)<br>
|
||||||
|
* LSTM-SVM - LSTM_TAN_svm_features class (called with version = 2)<br><br>
|
||||||
|
|
||||||
|
### Running the code
|
||||||
|
To run the code you have to run *get_training_plots.py* or *early_stopping_training.py*<br>
|
||||||
|
To change models, call the classes with specific versions as mentioned above<br>
|
BIN
TAN/__pycache__/networks.cpython-35.pyc
Normal file
BIN
TAN/__pycache__/networks.cpython-35.pyc
Normal file
Binary file not shown.
BIN
TAN/__pycache__/utils.cpython-35.pyc
Normal file
BIN
TAN/__pycache__/utils.cpython-35.pyc
Normal file
Binary file not shown.
BIN
TAN/__pycache__/utils.cpython-37.pyc
Normal file
BIN
TAN/__pycache__/utils.cpython-37.pyc
Normal file
Binary file not shown.
215
TAN/early_stopping_training.py
Normal file
215
TAN/early_stopping_training.py
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
import sys
|
||||||
|
import csv
|
||||||
|
import copy
|
||||||
|
import numpy as np
|
||||||
|
import re
|
||||||
|
import itertools
|
||||||
|
from collections import Counter
|
||||||
|
from utils import *
|
||||||
|
import torch
|
||||||
|
import torch.autograd as autograd
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import torch.optim as optim
|
||||||
|
import sys
|
||||||
|
from networks import *
|
||||||
|
import pickle
|
||||||
|
from datetime import datetime
|
||||||
|
import random
|
||||||
|
from statistics import mode
|
||||||
|
import copy
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
D = None

# Fixed seed so the shuffle of the training data further down is repeatable.
random.seed(42)

# The script takes exactly two positional arguments: the dataset key and the
# model/attention variant that is handed to the network constructor.
if len(sys.argv) != 3:
    print("Usage :- python early_stopping_training.py <dataset name> <attention variant>")
    sys.exit(-1)

version = sys.argv[2]
dataset = sys.argv[1]
|
||||||
|
|
||||||
|
def f_score(table):
    """Return the summed per-class score of the first two rows, as a string.

    For each of ``table[0]`` and ``table[1]`` the term
    ``100 * row[0] / (row[1] + row[2])`` is computed and the two terms are
    added.  When the callers in this file fill the rows as
    ``[correct, predicted, gold]`` counts, each term is half of that class's
    F1 in percent, so the sum is the mean F1 of the two classes.

    A row whose ``row[1] + row[2]`` is zero contributes 0 instead of raising
    ``ZeroDivisionError`` (or producing ``nan`` for numpy input).

    Args:
        table: indexable with at least two rows of at least three numbers.

    Returns:
        The score formatted with two decimals, e.g. ``"66.67"``.
    """
    total = 0.0
    for row in (table[0], table[1]):
        denominator = row[1] + row[2]
        # Skip classes that were neither predicted nor present in the gold
        # labels: their contribution is defined as zero.
        if denominator:
            total += 100 * row[0] / denominator
    return "%.2f" % total
|
||||||
|
|
||||||
|
|
||||||
|
def _update_conf_matrix(conf_matrix, predicted, gold):
    """Add one (predicted, gold) pair to a 2x3 count matrix for classes 0 and 1.

    Column 0 counts correct predictions, column 1 counts predictions of the
    class and column 2 counts gold occurrences of the class.  Labels greater
    than 1 are not tracked.
    """
    if predicted == gold and predicted <= 1:
        conf_matrix[predicted][0] += 1
    if gold <= 1:
        conf_matrix[gold][2] += 1
    if predicted <= 1:
        conf_matrix[predicted][1] += 1


def train_bagging_tan_CV(version="tan-",n_epochs=50,batch_size=50,l2=0,dropout = 0.5,n_folds=5):
    """Train one model per cross-validation fold and evaluate the ensemble.

    Reads the module-level ``x_train``, ``y_train``, ``x_test``, ``y_test``,
    ``vector_target``, ``embedding_matrix``, ``device`` and ``dataset``.

    For every fold a fresh ``LSTM_TAN`` is trained for ``n_epochs`` epochs on
    all training items outside the fold and scored on the fold after each
    epoch.  Epochs are grouped in windows of ten; the per-epoch model copies
    of the window with the highest summed validation score are kept.  The kept
    models of all folds then vote (majority of arg-max labels) on every test
    item.

    Args:
        version: model variant passed to ``LSTM_TAN``.
        n_epochs: number of training epochs per fold.
        batch_size: gradients are applied whenever ``(index + 1)`` of the
            current training item is a multiple of this value.
        l2: weight decay given to Adam.
        dropout: dropout probability given to ``LSTM_TAN``.
        n_folds: number of cross-validation folds.

    Returns:
        The 2x3 numpy count matrix of the test predictions, in the layout
        consumed by ``f_score``.
    """
    loss_fn = nn.NLLLoss()
    print("\n\n starting cross validation \n\n")
    print("class : ",dataset, " :-")

    fold_sz = len(x_train)//n_folds
    ensemble_models = []

    # The target never changes, so build its tensor once.  Creating it here
    # also guarantees it exists for the test loop when no epoch is run.
    target = torch.tensor(vector_target,dtype=torch.long).to(device)

    print("dataset size :- ",len(x_train))
    for fold_no in range(n_folds):
        print("Fold number {}".format(fold_no+1))
        model = LSTM_TAN(version,300,100,len(embedding_matrix),3,embedding_matrix,dropout=dropout).to(device)
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,model.parameters()),lr=0.0005,weight_decay = l2)

        # The last fold absorbs the remainder of the integer division.
        ll = fold_no*fold_sz
        if fold_no == n_folds-1:
            ul = len(x_train)
        else:
            ul = (fold_no+1)*fold_sz
        print("ll : {}, ul : {}".format(ll,ul))

        # Everything outside [ll, ul) is training data for this fold.
        train_indices = list(range(ll)) + list(range(ul,len(x_train)))

        best_ensemble = []
        best_score = 0
        temp_ensemble = []
        temp_score = 0

        for epoch in range(n_epochs):
            # training
            optimizer.zero_grad()
            model.train()
            loss = 0
            pending = False  # True while `loss` holds items not yet backpropagated

            for i in train_indices:
                model.hidden = model.init_hidden()

                x = torch.tensor(np.array(x_train[i]),dtype=torch.long).to(device)
                y = torch.tensor([y_train[i]],dtype=torch.long).to(device)

                preds = model(x,target,verbose=False)
                loss = loss + loss_fn(preds,y)
                pending = True

                if (i+1) % batch_size == 0:
                    loss.backward()
                    loss = 0
                    pending = False
                    optimizer.step()
                    optimizer.zero_grad()

            # Flush the last, partial batch.  Without the backward() call the
            # gradients of these items would be dropped silently.
            if pending:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            # validation
            model.eval()
            conf_matrix = np.zeros((2,3))
            with torch.no_grad():
                for j in range(ll,ul):
                    x = torch.tensor(np.array(x_train[j]),dtype=torch.long).to(device)
                    preds = model(x,target,verbose=False)
                    label = np.argmax(preds.cpu().numpy(),axis=1)[0]
                    _update_conf_matrix(conf_matrix, label, y_train[j])
            val_f_score = float(f_score(conf_matrix))

            # Every tenth epoch close the current window and compare it with
            # the best window seen so far in this fold.
            if epoch%10 ==0 and epoch != 0:
                print("current last 10- score ",temp_score*1.0/10)
                if temp_score > best_score:
                    best_score = temp_score
                    best_ensemble = temp_ensemble
                    print("this is current best score now")
                temp_ensemble = []
                temp_score = 0

            temp_ensemble.append(copy.deepcopy(model))
            temp_score += val_f_score

            print("epoch number {} , val_f_score {}".\
                  format(epoch+1,f_score(conf_matrix)))

        # Close the window that was still open when training stopped.
        print("current last 10- score ",temp_score*1.0/10)
        if temp_score > best_score:
            best_score = temp_score
            best_ensemble = temp_ensemble
            print("this is current best score now")

        ensemble_models.extend(best_ensemble)

    # Test-time majority vote over every kept model of every fold.
    for model in ensemble_models:
        model.eval()

    conf_matrix = np.zeros((2,3))
    with torch.no_grad():
        for j in range(len(x_test)):
            x = torch.tensor(np.array(x_test[j]),dtype=torch.long).to(device)
            cnts = [0,0,0]
            for model in ensemble_models:
                prediction = np.argmax(model(x,target).cpu().numpy(),axis=1)[0]
                cnts[prediction]+=1
            label = np.argmax(cnts)
            _update_conf_matrix(conf_matrix, label, y_test[j])

    print("test_f_score {}".format(f_score(conf_matrix)))
    print(conf_matrix)
    return conf_matrix
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
dataset = sys.argv[1]

# Accumulates the count matrix returned by the training run below.
fin_matrix = np.zeros((2,3))

# Use the GPU when one is present, otherwise run on the CPU instead of
# failing while moving the model to a non-existent CUDA device.
dev = "cuda" if torch.cuda.is_available() else "cpu"

stances, word2emb, word_ind, ind_word, embedding_matrix, device,\
    x_train, y_train, x_test, y_test, vector_target, train_tweets, test_tweets = load_dataset(dataset,dev = dev)

# Shuffle sentences and labels together so the pairs stay aligned; the slice
# assignment writes the new order back into the existing containers.
combined = list(zip(x_train, y_train))
random.shuffle(combined)
x_train[:], y_train[:] = zip(*combined)

fin_matrix += train_bagging_tan_CV(version=version,n_epochs=100,batch_size=50,l2=0.0,dropout = 0.6,n_folds=10)

print(f_score(fin_matrix))
|
41181
TAN/emnlp_dict.txt
Normal file
41181
TAN/emnlp_dict.txt
Normal file
File diff suppressed because it is too large
Load Diff
108
TAN/networks.py
Normal file
108
TAN/networks.py
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
import torch
|
||||||
|
import torch.autograd as autograd
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import torch.optim as optim
|
||||||
|
import numpy as np
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class LSTM_TAN(nn.Module):
    """Sentence classifier with three variants selected by ``version``.

    * ``"tan-"``: attention scores are computed from the word embeddings only.
    * ``"tan"``: attention scores are computed from each word embedding
      concatenated with the mean embedding of the target words.
    * ``"lstm"``: no attention; the final LSTM hidden state is classified.

    ``forward`` processes one sentence at a time (batch size 1) and returns
    log-probabilities over ``n_targets`` classes.
    """

    def __init__(self,version,embedding_dim, hidden_dim, vocab_size, n_targets,embedding_matrix,dropout = 0.5):
        """Build the layers.

        Args:
            version: one of ``"tan-"``, ``"tan"``, ``"lstm"``; any other value
                prints a message and exits the process.
            embedding_dim: width of the word embeddings.
            hidden_dim: LSTM hidden size per direction.
            vocab_size: number of rows of the embedding table.
            n_targets: number of output classes.
            embedding_matrix: initial embedding weights, convertible by
                ``torch.tensor``; the weights stay trainable.
            dropout: dropout probability applied before the output layer.
        """
        super(LSTM_TAN, self).__init__()
        if version not in ["tan-","tan","lstm"]:
            print("Version is tan-,tan,lstm")
            sys.exit(-1)

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.version = version

        #WORD_EMBEDDINGS
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float))
        self.word_embeddings.weight.requires_grad=True

        # Attention scorer: one scalar per word ("lstm" has no attention).
        if version == "tan-":
            self.attention = nn.Linear(embedding_dim,1)
        elif version == "tan":
            self.attention = nn.Linear(2*embedding_dim,1)

        #LSTM
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        # NOTE(review): this layer was used by forward() but never created.
        # It is bidirectional for the attention variants because their output
        # layer expects 2*hidden_dim features, and unidirectional for "lstm",
        # whose output layer expects hidden_dim -- confirm against the
        # configuration used for the reported experiments.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=(version != "lstm"))

        self.dropout = nn.Dropout(dropout)

        #FINAL_LAYER
        if version !="lstm":
            self.hidden2target = nn.Linear(2*self.hidden_dim, n_targets)
        else:
            self.hidden2target = nn.Linear(self.hidden_dim, n_targets)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a pair of zero tensors of shape (1, 1, hidden_dim).

        The result is stored in ``self.hidden`` by the constructor; forward()
        does not pass it to the LSTM.
        """
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence, target,verbose=False):
        """Classify a single sentence.

        Args:
            sentence: 1-D long tensor of word indices.
            target: 1-D long tensor of target word indices; only read by the
                ``"tan"`` variant.
            verbose: accepted for interface compatibility; not used.

        Returns:
            Tensor of shape (1, n_targets) holding log-probabilities.
        """
        x_emb = self.word_embeddings(sentence)
        n_words = len(sentence)
        lstm_input = x_emb.view(n_words, 1, self.embedding_dim)

        if self.version == "lstm":
            # Classify the last hidden state; no attention involved.
            _, hidden_state = self.lstm(lstm_input)
            final_hidden_state = hidden_state[0].view(-1,self.hidden_dim)
        else:
            if self.version == "tan":
                # Append the mean target embedding to every word embedding.
                t_emb = torch.mean(self.word_embeddings(target),dim=0,keepdim=True)
                attention_input = torch.cat((x_emb,t_emb.expand(n_words,-1)),dim=1)
            else:
                attention_input = x_emb

            a = self.attention(attention_input)
            lstm_out, _ = self.lstm(lstm_input)

            # Attention-weighted sum of the per-word LSTM outputs.
            final_hidden_state = torch.mm(F.softmax(a.view(1,-1),dim=1),lstm_out.view(n_words,-1))

        target_space = self.hidden2target(self.dropout(final_hidden_state))
        target_scores = F.log_softmax(target_space, dim=1)

        return target_scores
|
||||||
|
|
||||||
|
#t_emb = self.word_embeddings(target)
|
||||||
|
#print(t_emb)
|
||||||
|
#print(torch.mean(t_emb,dim=0,keepdim=True).shape)
|
||||||
|
#t_emb = torch.mean(t_emb,dim=0,keep dim=True)
|
||||||
|
|
||||||
|
#xt_emb = torch.cat((x_emb,t_emb.expand(len(sentence),-1)),dim=1)
|
||||||
|
#print(xt_emb)
|
||||||
|
|
||||||
|
# In[26]:
|
1
TAN/noslang_data.json
Normal file
1
TAN/noslang_data.json
Normal file
File diff suppressed because one or more lines are too long
333
TAN/utils.py
Normal file
333
TAN/utils.py
Normal file
@ -0,0 +1,333 @@
|
|||||||
|
import csv
|
||||||
|
import copy
|
||||||
|
import numpy as np
|
||||||
|
import re
|
||||||
|
import itertools
|
||||||
|
from collections import Counter,defaultdict
|
||||||
|
import torch
|
||||||
|
import json
|
||||||
|
from collections import Counter
|
||||||
|
import wordninja
|
||||||
|
"""
|
||||||
|
|
||||||
|
Tokenization/string cleaning for all datasets.
|
||||||
|
Every dataset is lower cased.
|
||||||
|
Original taken from https://github.com/dennybritz/cnn-text-classification-tf
|
||||||
|
|
||||||
|
string = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", string)
|
||||||
|
string = re.sub(r"#SemST", "", string)
|
||||||
|
string = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", string)
|
||||||
|
#string = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", string)
|
||||||
|
string = re.sub(r"([A-Z])", r" \1", string)
|
||||||
|
string = re.sub(r"\'s", " \'s", string)
|
||||||
|
string = re.sub(r"\'ve", " \'ve", string)
|
||||||
|
string = re.sub(r"n\'t", " n\'t", string)
|
||||||
|
string = re.sub(r"\'re", " \'re", string)
|
||||||
|
string = re.sub(r"\'d", " \'d", string)
|
||||||
|
string = re.sub(r"\'ll", " \'ll", string)
|
||||||
|
string = re.sub(r",", " , ", string)
|
||||||
|
string = re.sub(r"!", " ! ", string)
|
||||||
|
string = re.sub(r"\(", " ( ", string)
|
||||||
|
string = re.sub(r"\)", " ) ", string)
|
||||||
|
string = re.sub(r"\?", " ? ", string)
|
||||||
|
string = re.sub(r"\s{2,}", " ", string)
|
||||||
|
return string.strip() if TREC else string.strip().lower()
|
||||||
|
|
||||||
|
"""
|
||||||
|
def clean_str2(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets.
    Every dataset is lower cased unless TREC is true.
    Original taken from https://github.com/dennybritz/cnn-text-classification-tf

    Characters outside ``A-Za-z0-9(),!?'`` and the backtick are replaced by
    spaces, English clitics ('s, 've, n't, 're, 'd, 'll) are split off, and
    ``, ! ( ) ?`` are surrounded by spaces.  Runs of whitespace are collapsed.

    Args:
        string: text to clean.
        TREC: when true the result keeps its case.

    Returns:
        The cleaned, stripped string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacements are plain text, not patterns: escaping the brackets
    # and the question mark there would insert literal backslashes into the
    # output, so they are written unescaped (as clean_str does).
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def clean_str(string, TREC=False):
    """
    Clean and tokenize a tweet-like string.

    The substitutions below are applied in order: unsupported characters
    become spaces, the ``#SemST`` tag is dropped, every hashtag ``#word`` is
    rewritten as ``# word #``, English clitics are split from their host
    word, and ``, ! ( ) ?`` are padded with spaces.  Finally runs of
    whitespace are collapsed and the ends are stripped.

    Adapted from https://github.com/dennybritz/cnn-text-classification-tf

    Args:
        string: text to clean.
        TREC: when true the case is preserved, otherwise the result is
            lower-cased.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`#]", " "),
        (r"#SemST", ""),
        (r"#([A-Za-z0-9]*)", r"# \1 #"),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)

    cleaned = string.strip()
    if TREC:
        return cleaned
    return cleaned.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def create_normalise_dict(no_slang_data = "./noslang_data.json", emnlp_dict = "./emnlp_dict.txt"):
    """Build the token normalization dictionary from two lexicon files.

    Args:
        no_slang_data: path of a JSON file holding one object that maps a
            token to its replacement.
        emnlp_dict: path of a text file with one ``token<TAB>replacement``
            pair per line.  Lines without a tab (for example blank lines) are
            skipped.

    Returns:
        A dict containing the entries of both files; on a duplicate key the
        entry from ``emnlp_dict`` wins.
    """
    print("Creating Normalization Dictionary")
    with open(no_slang_data, "r") as f:
        data1 = json.load(f)

    data2 = {}
    with open(emnlp_dict,"r") as f:
        for line in f:
            row = line.split('\t')
            # A line without a tab has no replacement column; indexing it
            # would raise IndexError, so ignore it.
            if len(row) < 2:
                continue
            data2[row[0]] = row[1].rstrip()

    normalization_dict = {**data1,**data2}
    return normalization_dict
|
||||||
|
|
||||||
|
def normalise(normalization_dict,sentence):
    """Replace known tokens of a sentence by their normalized form.

    The sentence is split on whitespace.  A token found in
    ``normalization_dict`` (looked up exactly as written) is replaced by the
    lower-cased dictionary value split on single spaces; any other token is
    just lower-cased.

    Args:
        normalization_dict: mapping from token to replacement string.
        sentence: text to normalize.

    Returns:
        The list of resulting tokens.
    """
    result = []
    for token in sentence.split():
        replacement = normalization_dict[token] if token in normalization_dict else None
        if replacement is None and token not in normalization_dict:
            result.append(token.lower())
        else:
            result += normalization_dict[token].lower().split(" ")
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _read_stance_csv(path, stances):
    """Read tweets and stance ids from a preprocessed CSV file.

    Rows whose ``Stance`` value is not a key of *stances* are ignored.
    Returns ``(sentences, labels)`` where each sentence is the ``Tweet``
    column split on single spaces.
    """
    sentences = []
    labels = []
    with open(path,"r",encoding='latin-1') as f:
        for row in csv.DictReader(f, delimiter=','):
            if row['Stance'] in stances:
                sentences.append(row['Tweet'].split(' '))
                labels.append(stances[row['Stance']])
    return sentences, labels


def _vectorise_sentences(sentences, word_ind, unk):
    """Map every word to its index, using *unk* for unknown words.

    Returns ``(vectors, oov_words)``; ``oov_words`` lists each unknown
    occurrence in reading order.
    """
    vectors = []
    oov_words = []
    for sent in sentences:
        ids = []
        for word in sent:
            if word in word_ind:
                ids.append(word_ind[word])
            else:
                ids.append(unk)
                oov_words.append(word)
        vectors.append(ids)
    return vectors, oov_words


def load_dataset(dataset,dev = "cuda"):
    """Load one stance dataset and turn it into index sequences.

    Reads ``train_preprocessed.csv`` and ``test_preprocessed.csv`` from
    ``../Preprocessing/<folder>/<dataset>/``, loads the GloVe vectors through
    ``load_glove_embeddings`` and builds a vocabulary from every word of the
    training sentences, test sentences and target phrase that has a GloVe
    vector.  Words without a vector map to a shared UNK index.

    Args:
        dataset: one of 'VC', 'HC', 'HRT', 'LA', 'CC', 'SC', 'EC', 'MMR',
            'AT', 'FM'.
        dev: device string handed to ``torch.device``.

    Returns:
        The tuple ``(stances, word2emb, word_ind, ind_word, embedding_matrix,
        device, x_train, y_train, x_test, y_test, vector_target, train_x,
        test_x)`` where ``x_*`` are lists of index lists, ``y_*`` are numpy
        label arrays, ``vector_target`` is the indexed target phrase and
        ``train_x`` / ``test_x`` are the tokenized sentences.

    Raises:
        AssertionError: if *dataset* is not one of the accepted keys.
    """
    assert dataset in ['VC', 'HC', 'HRT', 'LA', 'CC', 'SC', 'EC', 'MMR', 'AT', 'FM'], "unknown dataset"

    # Target phrase per dataset key.  'VCA' and 'VTI' are listed for
    # completeness but are not accepted by the check above.
    topics = {
        'EC': 'E-ciggarettes are safer than normal ciggarettes',
        'SC': 'Sun exposure can lead to skin cancer',
        'VC': 'Vitamin C prevents common cold',
        'HRT': 'Women should take HRT post menopause',
        'MMR': 'MMR vaccine can cause autism',
        'AT': "atheism",
        'HC': "hillary clinton",
        'LA': "legalization of abortion",
        'CC': "climate change is a real concern",
        'FM': "feminist movement",
        'VCA': "vaccines cause autism",
        'VTI': "vaccines treat influenza",
    }
    topic = topics[dataset]
    if dataset in ('EC', 'SC', 'VC', 'HRT', 'MMR'):
        folder = "Data_MPCHI_P"
    else:
        folder = "Data_SemE_P"
    print(topic)

    # The atheism data lives in a directory named after the full word.
    if dataset == "AT":
        dataset = "Atheism"

    # The normalization dictionary is not defined in the visible part of this
    # module; reuse a module-level one if it exists, otherwise build it so the
    # lookup below cannot fail with NameError.
    norm_dict = globals().get("normalization_dict")
    if norm_dict is None:
        norm_dict = create_normalise_dict()

    target = normalise(norm_dict,clean_str(topic))
    stances = {'FAVOR' : 0, 'AGAINST' : 1, 'NONE' : 2}

    train_x, train_y = _read_stance_csv(
        "../Preprocessing/{}/{}/train_preprocessed.csv".format(folder,dataset), stances)
    test_x, test_y = _read_stance_csv(
        "../Preprocessing/{}/{}/test_preprocessed.csv".format(folder,dataset), stances)

    word2emb = load_glove_embeddings()

    # Vocabulary: indices are assigned in first-seen order over the training
    # sentences, then the test sentences, then the target phrase.
    word_ind = {}
    for sent in train_x + test_x + [target]:
        for word in sent:
            if word not in word_ind and word in word2emb:
                word_ind[word] = len(word_ind)

    UNK = len(word_ind)

    ind_word = {v:k for k,v in word_ind.items()}
    print("Number of words - {}".format(len(ind_word)))

    x_train, oovs = _vectorise_sentences(train_x, word_ind, UNK)
    print("OOV words :- ",len(oovs))
    print(Counter(oovs))

    y_train = np.array(train_y)
    y_test = np.array(test_y)

    x_test, _ = _vectorise_sentences(test_x, word_ind, UNK)

    # Two extra rows follow the vocabulary: the UNK row gets a random vector,
    # the last row stays zero.
    embedding_matrix = np.zeros((len(word_ind) + 2, 300))
    embedding_matrix[len(word_ind)] = np.random.randn((300))
    for word in word_ind:
        embedding_matrix[word_ind[word]] = word2emb[word]

    print("Number of training examples :- ",len(x_train))
    print("Sample vectorised sentence :- ",x_train[0])

    device = torch.device(dev)
    print("Using this device :- ", device)

    vector_target = [word_ind[w] if w in word_ind else UNK for w in target]

    print("vectorised target:-")
    print(vector_target)

    return stances, word2emb, word_ind, ind_word, embedding_matrix, device,\
        x_train, y_train, x_test, y_test, vector_target, train_x, test_x
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def load_glove_embeddings(path="../Preprocessing/glove.6B.300d.txt"):
    """Load word vectors from a GloVe-style text file.

    Each non-empty line is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        path: location of the embeddings file; defaults to the path the
            project has always used.

    Returns:
        A dict mapping each word to a ``float32`` numpy vector.
    """
    word2emb = {}
    # The context manager closes the file even when a line fails to parse.
    with open(path,"r",encoding="utf-8") as fglove:
        for line in fglove:
            cols = line.strip().split()
            if not cols:
                # Blank line: nothing to index.
                continue
            word2emb[cols[0]] = np.array(cols[1:],dtype="float32")
    return word2emb
|
Loading…
Reference in New Issue
Block a user