Added SEN-SVM code

This commit is contained in:
Siddharth 2019-06-18 22:16:04 +05:30
parent b524243043
commit a86f074dca
6 changed files with 1652 additions and 21 deletions

SEN-SVM/STA_features.py Normal file

@@ -0,0 +1,533 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import time
import numpy as np
import pandas as pd
import string
import csv
from scipy import stats
import random
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import wordninja
from collections import defaultdict, Counter
import math
import sys
# In[ ]:
def load_glove_embeddings_set():
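# Return the set of words covered by the GloVe vocabulary (only the words are kept; the vectors themselves are discarded).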
word2emb = []
WORD2VEC_MODEL = "glove.6B.300d.txt"
fglove = open(WORD2VEC_MODEL,"r")
for line in fglove:
cols = line.strip().split()
word = cols[0]
word2emb.append(word)
fglove.close()
return set(word2emb)
def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"):
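# Merge the NoSlang JSON lexicon and the EMNLP normalisation lexicon into one word -> replacement dictionary.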
print("Creating Normalization Dictionary")
with open(no_slang_data, "r") as f:
data1 = json.load(f)
data2 = {}
with open(emnlp_dict,"r") as f:
lines = f.readlines()
for line in lines:
row = line.split('\t')
data2[row[0]] = row[1].rstrip()
normalization_dict = {**data1,**data2}
#print(normalization_dict)
return normalization_dict
word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict()
# In[ ]:
def sent_process(sent):
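# Tweet preprocessing: strip unsupported characters, pad punctuation and hashtags with spaces,
# apply the normalisation dictionary, lowercase, and split any token not in the GloVe vocabulary with wordninja.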
sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent)
sent = re.sub(r"#SemST", "", sent)
sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent)
#sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent)
#sent = re.sub(r"([A-Z])", r" \1", sent)
sent = re.sub(r"\'s", " \'s", sent)
sent = re.sub(r"\'ve", " \'ve", sent)
sent = re.sub(r"n\'t", " n\'t", sent)
sent = re.sub(r"\'re", " \'re", sent)
sent = re.sub(r"\'d", " \'d", sent)
sent = re.sub(r"\'ll", " \'ll", sent)
sent = re.sub(r",", " , ", sent)
sent = re.sub(r"!", " ! ", sent)
sent = re.sub(r"\(", " ( ", sent)
sent = re.sub(r"\)", " ) ", sent)
sent = re.sub(r"\?", " ? ", sent)
sent = re.sub(r"\s{2,}", " ", sent)
sent = sent.strip()
word_tokens = sent.split()
normalised_tokens = []
for word in word_tokens:
if word in norm_dict:
#if False:
normalised_tokens.extend(norm_dict[word].lower().split(" "))
#print(word," normalised to ",norm_dict[word])
else:
normalised_tokens.append(word.lower())
wordninja_tokens = []
for word in normalised_tokens:
if word in word_dict:
wordninja_tokens+=[word]
else:
wordninja_tokens+=wordninja.split(word)
return " ".join(wordninja_tokens)
# In[ ]:
# In[13]:
def build_lexicon(name):
def pmi(x,y,z,t):
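# PMI-style score: x is the smoothed joint count, y the smoothed count of the word,
# z the count of the class (or second word) and t the total count; the sqrt term in the
# denominator appears to act as a confidence correction on the joint count.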
res=(x/(y*(z/t)+(math.sqrt(x)*math.sqrt(math.log(0.9)/(-2)))))
return math.log(res,2)
def prob(word1,nava,total):
count_prob=0
if word1 in nava:
count_prob += nava[word1]
return((count_prob+1))
def prob_cond(word1,seed,stance_seed,stance,total):
count_prob=0
for i in range(len(seed)):
if(seed[i]==word1):
if(stance_seed[i]==stance):
count_prob=count_prob+1
return((count_prob+1))
def prob_cond1(word1,word2,Features,total):
count_prob=0
#for i in range(length_Features):
# flag1=0
# flag2=0
# for word in Features['co_relation'][i]:
# if(word==word1):
# flag1=1
# if(word==word2):
# flag2=1
# if(flag1==1 and flag2==1):
# count_prob=count_prob+1
#seed and non-seed lexicon formation
return((co_relation[(word1,word2)]+1))
print("building lexicon for ", name)
raw=pd.read_csv('./MPHI_Preprocessed/'+name+'/train.csv')
#Features Extraction
porter=PorterStemmer()
Stop_words=set(stopwords.words('english'))
Features=raw[['sentence']].copy() # work on a copy so new columns can be added without a SettingWithCopyWarning
Tweet=Features['sentence'].copy()
Features['sentence']=Tweet.apply(sent_process)
Features['tokenized_sents'] = Features.apply(lambda row: (row['sentence'].split()), axis=1)
Features['pos_tag']=Features.apply(lambda row:nltk.pos_tag(row['tokenized_sents'],tagset='universal'),axis=1)
Features['stance']=raw['stance']
length_Features=len(Features['sentence'])
co_relation=defaultdict(int)
co_relation2 = []
for i in range(length_Features):
line=[]
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
line.append(porter.stem(word))
# separate index names so the outer loop variable i is not shadowed
for a in range(len(line)):
for b in range(a+1,len(line)):
co_relation[(line[a],line[b])]+=1
co_relation[(line[b],line[a])]+=1
co_relation2.append(line)
Features['co_relation']=co_relation2
FAVOR=[]
AGAINST=[]
NONE=[]
for i in range(length_Features):
if(Features['stance'][i]=='support'):
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
FAVOR.append(porter.stem(word))
else:
if(Features['stance'][i]=='oppose'):
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
AGAINST.append(porter.stem(word))
else:
if(Features['stance'][i]=='neutral'):
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
NONE.append(porter.stem(word))
len_sup=len(FAVOR)
len_opp=len(AGAINST)
len_nut=len(NONE)
len_co=[]
for i in range(length_Features):
len_co.append(len(Features['co_relation'][i]))
Features['len_nava']=len_co
nava=[]
for i in range(length_Features):
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
nava.append(word.lower())
nava_stem=[]
for word in nava:
nava_stem.append(porter.stem(word))
uni_nava_stem=list(set(nava_stem))
nava_stem = Counter(nava_stem)
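# len() of a Counter counts distinct keys, so 'total' below equals the number of unique stems
# (the same value as 'length'); use sum(nava_stem.values()) if the total token count is intended.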
total=len(nava_stem)
length=len(uni_nava_stem)
print(total,length)
seed=[]
non_seed=[]
seed_stance=[]
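# The first 75% of each tweet's NAVA stems become seed words (labelled with the tweet's stance);
# the remaining 25% become non-seed words, scored later against the seed lexicon.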
for i in range(len(Features)):
for j in range(int(0.75*Features['len_nava'][i])):
seed.append(Features['co_relation'][i][j])
seed_stance.append(Features['stance'][i])
for j in range(int(0.75*Features['len_nava'][i]),Features['len_nava'][i]):
non_seed.append(Features['co_relation'][i][j])
uni_seed=list(set(seed))
uni_non_seed=list(set(non_seed))
'''for i in range(len(Features)):
x=[]
x=random.sample(Features['co_relation'][i],int(0.75*Features['len_nava'][i]))
for j in range(len(x)):
seed.append(x[j])
seed_stance.append(Features['stance'][i])
for j in range(Features['len_nava'][i]):
if(Features['co_relation'][i][j] not in x):
non_seed.append(Features['co_relation'][i][j])
uni_seed=list(set(seed))
uni_non_seed=list(set(non_seed))'''
len_seed=len(seed)
len_uni_seed=len(uni_seed)
len_non_seed=len(non_seed)
len_uni_non_seed=len(uni_non_seed)
len_seed_sup=0
len_seed_opp=0
len_seed_nut=0
for i in range(len(seed_stance)):
if(seed_stance[i]=='support'):
len_seed_sup=len_seed_sup+1
else:
if(seed_stance[i]=='oppose'):
len_seed_opp=len_seed_opp+1
else:
len_seed_nut=len_seed_nut+1
print(len_seed_nut,len_seed_opp,len_seed_sup)
prob_sup=len_seed_sup/(len_seed_sup+len_seed_opp+len_seed_nut)
prob_opp=len_seed_opp/(len_seed_sup+len_seed_opp+len_seed_nut)
prob_nut=len_seed_nut/(len_seed_sup+len_seed_opp+len_seed_nut)
prob_word=[]
for word in uni_seed:
prob_word.append(prob(word,nava_stem,total))
prob_cond_word={}
prob_supp_word=[]
prob_opp_word=[]
prob_neu_word=[]
for word in uni_seed:
prob_supp_word.append(prob_cond(word,seed,seed_stance,'support',(len_seed_sup+len_seed_opp+len_seed_nut)))
prob_opp_word.append(prob_cond(word,seed,seed_stance,'oppose',(len_seed_sup+len_seed_opp+len_seed_nut)))
prob_neu_word.append(prob_cond(word,seed,seed_stance,'neutral',(len_seed_sup+len_seed_opp+len_seed_nut)))
prob_cond_word={'word':list(uni_seed),'prob_word':prob_word,'prob_supp_word':prob_supp_word,'prob_opp_word':prob_opp_word,'prob_neu_word':prob_neu_word}
Seed_lexicon = pd.DataFrame(data=prob_cond_word)
print(Seed_lexicon)
pmi_AGAINST=[]
pmi_FAVOR=[]
pmi_NONE=[]
'''for i in range(len_uni_seed):
pmi_AGAINST.append(pmi(prob_opp_word[i],prob_word[i],prob_opp))
pmi_FAVOR.append(pmi(prob_supp_word[i],prob_word[i],prob_sup))
pmi_NONE.append(pmi(prob_neu_word[i],prob_word[i],prob_nut))'''
for i in range(len_uni_seed):
pmi_AGAINST.append(pmi(prob_opp_word[i],prob_word[i],len_seed_opp,len_seed))
pmi_FAVOR.append(pmi(prob_supp_word[i],prob_word[i],len_seed_sup,len_seed))
pmi_NONE.append(pmi(prob_neu_word[i],prob_word[i],len_seed_nut,len_seed))
Seed_lexicon['pmi_AGAINST']=list(pmi_AGAINST)
Seed_lexicon['pmi_FAVOR']=list(pmi_FAVOR)
Seed_lexicon['pmi_NONE']=list(pmi_NONE)
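# Assign each seed word the stance with the strictly highest PMI; ties fall through to 'neutral'.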
stance=[]
for i in range(len_uni_seed):
if((Seed_lexicon['pmi_FAVOR'][i] > Seed_lexicon['pmi_AGAINST'][i]) and (Seed_lexicon['pmi_FAVOR'][i] > Seed_lexicon['pmi_NONE'][i])):
stance.append('support')
else:
if((Seed_lexicon['pmi_AGAINST'][i] > Seed_lexicon['pmi_FAVOR'][i]) & (Seed_lexicon['pmi_AGAINST'][i] > Seed_lexicon['pmi_NONE'][i])):
stance.append('oppose')
else:
stance.append('neutral')
Seed_lexicon['Stance']=list(stance)
#NON SEED LEXICON
score_non_seed_opp=[]
score_non_seed_sup=[]
score_non_seed_nut=[]
opp_seed_word=[]
nut_seed_word=[]
sup_seed_word=[]
for i in range(len_uni_seed):
if(Seed_lexicon['Stance'][i]=='support'):
sup_seed_word.append(Seed_lexicon['word'][i])
else:
if(Seed_lexicon['Stance'][i]=='oppose'):
opp_seed_word.append(Seed_lexicon['word'][i])
else:
nut_seed_word.append(Seed_lexicon['word'][i])
#opp_seed_word=set(opp_seed_word)
#nut_seed_word=set(nut_seed_word)
#sup_seed_word=set(sup_seed_word)
len_opp_words=len(opp_seed_word)
len_nut_words=len(nut_seed_word)
len_sup_words=len(sup_seed_word)
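# Score every non-seed word against each class: take its PMI with every seed word of that class
# (clamping negative PMIs to 1) and aggregate with a geometric mean.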
pmi_non_seed={}
start1=time.time()
print("COMPUTING...")
k=0
for word in uni_non_seed:
list_=[]
for i in range(len_sup_words):
l=pmi(prob_cond1(word,sup_seed_word[i],Features,total),prob(word,nava_stem,total),prob(sup_seed_word[i],nava_stem,total),total)
if(l<0):
list_.append(1)
else:
list_.append(l)
score_non_seed_sup.append(stats.gmean(list_))
#print(k)
k=k+1
print("score_non_seed_sup_complete :)")
end1=time.time()
time1=end1-start1
print(time1)
start2=time.time()
k=0
for word in uni_non_seed:
list_=[]
for i in range(len_opp_words):
l=pmi(prob_cond1(word,opp_seed_word[i],Features,total),prob(word,nava_stem,total),prob(opp_seed_word[i],nava_stem,total),total)
if(l<0):
list_.append(1)
else:
list_.append(l)
score_non_seed_opp.append(stats.gmean(list_))
#print(k)
k=k+1
print("score_non_seed_opp_complete :)")
end2=time.time()
time2=end2-start2
print(time2)
start3=time.time()
k=0
#print("~~~~",nut_seed_word)
print(len(uni_non_seed),len_nut_words)
for word in uni_non_seed:
list_=[]
#s2 = time.time()
for i in range(len_nut_words):
#s1 = time.time()
l=pmi(prob_cond1(word,nut_seed_word[i],Features,total), prob(word,nava_stem,total), prob(nut_seed_word[i],nava_stem,total),total)
#print(time.time()-s1)
if(l<0):
list_.append(1)
else:
list_.append(l)
score_non_seed_nut.append(stats.gmean(list_))
#print(time.time()-s2)
#print(k)
k=k+1
print("score_non_seed_nut_complete :)")
end3=time.time()
print("Process Complete :)")
time3=end3-start3
print(time3)
total_time=time1+time2+time3
print(total_time)
prob_cond_word={'word':list(uni_non_seed),'score_non_seed_opp':score_non_seed_opp,'score_non_seed_sup':score_non_seed_sup,'score_non_seed_nut':score_non_seed_nut}
NonSeed_lexicon = pd.DataFrame(data=prob_cond_word)
#Tweet Vector Formation
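# Merge seed and non-seed scores into a single lexicon: word -> {pmi_sup, pmi_opp, pmi_nut}.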
lex_word=[]
lex_word.extend(list(Seed_lexicon['word']))
lex_word.extend(list(NonSeed_lexicon['word']))
pmi_sup=[]
pmi_sup.extend(list(Seed_lexicon['pmi_FAVOR']))
pmi_sup.extend(list(NonSeed_lexicon['score_non_seed_sup']))
pmi_opp=[]
pmi_opp.extend(list(Seed_lexicon['pmi_AGAINST']))
pmi_opp.extend(list(NonSeed_lexicon['score_non_seed_opp']))
pmi_nut=[]
pmi_nut.extend(list(Seed_lexicon['pmi_NONE']))
pmi_nut.extend(list(NonSeed_lexicon['score_non_seed_nut']))
Lexicon = dict()
for i in range(len(lex_word)):
Lexicon[lex_word[i]] = {'pmi_sup':pmi_sup[i],'pmi_opp':pmi_opp[i],'pmi_nut':pmi_nut[i]}
print("Lexicon formed")
return Lexicon
#Lexicon={'word':lex_word,'pmi_sup':pmi_sup,'pmi_opp':pmi_opp,'pmi_nut':pmi_nut}
#Lexicon = pd.DataFrame(data=Lexicon)
# In[14]:
#Lexicon = build_lexicon('SC')
# In[26]:
def produce_features(name,Lexicon):
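# For each train/test sentence, average the pmi_sup/pmi_opp/pmi_nut scores of its stemmed tokens
# (over the tokens found in the lexicon) and write the result to ./pmi/pmi_<name>_<split>1.csv.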
#train_features
for l in ['train','test']:
raw=pd.read_csv('./MPHI_Preprocessed/'+name+'/{}.csv'.format(l))
Stop_words=set(stopwords.words('english'))
Features=raw[['sentence']].copy() # work on a copy so new columns can be added without a SettingWithCopyWarning
Tweet=Features['sentence'].copy()
Features['preprocessed_sentence']=Tweet.apply(sent_process)
Features['tokenized_sents'] = Features.apply(lambda row: (row['preprocessed_sentence'].split()), axis=1)
porter = PorterStemmer()
start=time.time()
#word_sup_vect=[]
#word_opp_vect=[]
#word_nut_vect=[]
data = [['sentence','pmi_sup','pmi_opp','pmi_nut']]
len_lexicon_word=len(Lexicon)
for i in range(len(Features['sentence'])):
sum1=0
sum2=0
sum3=0
total_lex=0
temp = []
for word in Features['tokenized_sents'][i]:
#for j in range(len_lexicon_word):
w = porter.stem(word)
if w in Lexicon:
sum1=sum1+Lexicon[w]['pmi_sup']
sum2=sum2+Lexicon[w]['pmi_opp']
sum3=sum3+Lexicon[w]['pmi_nut']
total_lex=total_lex+1
#word_sup_vect.append(sum1/total_lex)
#word_opp_vect.append(sum2/total_lex)
#word_nut_vect.append(sum3/total_lex)
# max(total_lex,1) guards against sentences with no lexicon hits (avoids a ZeroDivisionError)
data.append([Features['sentence'][i],sum1/max(total_lex,1),sum2/max(total_lex,1),sum3/max(total_lex,1)])
my_df = pd.DataFrame(data)
my_df.to_csv('./pmi/pmi_{}_{}.csv'.format(name,l+'1'),header=False,index=False)
end=time.time()
print(end-start)
# In[27]:
produce_features('HRT',build_lexicon('HRT'))
'''for dataset in ['AT','LA','CC','HC','FM']:
produce_features(dataset,build_lexicon(dataset))'''
# In[ ]:
# In[ ]:

SEN-SVM/SVM.py Normal file

@@ -0,0 +1,318 @@
#!/usr/bin/env python
# coding: utf-8
# In[9]:
import csv
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import wordninja
import re
import json
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import pandas as pd
# In[2]:
stemmer = PorterStemmer()
def load_glove_embeddings_set():
word2emb = []
WORD2VEC_MODEL = "../glove.6B.300d.txt"
fglove = open(WORD2VEC_MODEL,"r")
for line in fglove:
cols = line.strip().split()
word = cols[0]
word2emb.append(word)
fglove.close()
return set(word2emb)
def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"):
print("Creating Normalization Dictionary")
with open(no_slang_data, "r") as f:
data1 = json.load(f)
data2 = {}
with open(emnlp_dict,"r") as f:
lines = f.readlines()
for line in lines:
row = line.split('\t')
data2[row[0]] = row[1].rstrip()
normalization_dict = {**data1,**data2}
#print(normalization_dict)
return normalization_dict
word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict()
# In[3]:
def sent_process(sent):
sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent)
sent = re.sub(r"#SemST", "", sent)
sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent)
#sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent)
#sent = re.sub(r"([A-Z])", r" \1", sent)
sent = re.sub(r"\'s", " \'s", sent)
sent = re.sub(r"\'ve", " \'ve", sent)
sent = re.sub(r"n\'t", " n\'t", sent)
sent = re.sub(r"\'re", " \'re", sent)
sent = re.sub(r"\'d", " \'d", sent)
sent = re.sub(r"\'ll", " \'ll", sent)
sent = re.sub(r",", " , ", sent)
sent = re.sub(r"!", " ! ", sent)
sent = re.sub(r"\(", " ( ", sent)
sent = re.sub(r"\)", " ) ", sent)
sent = re.sub(r"\?", " ? ", sent)
sent = re.sub(r"\s{2,}", " ", sent)
sent = sent.strip()
word_tokens = sent.split()
normalised_tokens = []
for word in word_tokens:
if word in norm_dict:
#if False:
normalised_tokens.extend(norm_dict[word].lower().split(" "))
print(word," normalised to ",norm_dict[word])
else:
normalised_tokens.append(word.lower())
wordninja_tokens = []
for word in normalised_tokens:
if word in word_dict:
wordninja_tokens+=[word]
else:
wordninja_tokens+=wordninja.split(word)
return " ".join(wordninja_tokens)
# In[4]:
def svc_param_selection(X, y, nfolds):
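# Grid-search C and gamma over RBF and linear kernels with nfolds-fold cross-validation and return the best parameter set.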
Cs = [0.001, 0.01, 0.1, 1, 10,100 ]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = [{'C': Cs, 'gamma' : gammas , 'kernel' : ['rbf']},{'C': Cs , 'gamma' : gammas , 'kernel' : ['linear']}]
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
grid_search.fit(X, y)
grid_search.best_params_
return grid_search.best_params_
# In[21]:
def train(topic,bow=False,senti=False,sta=False,ent=False):
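# Train an SVM on the selected feature groups (bag-of-words, entailment, STA lexicon, sentiment)
# read from ./final_feature_set/<topic>_{train,test}.csv; test predictions and class probabilities
# are written to <save_file>.csv and (y_pred, y_test) is returned.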
save_file = topic+"_"
print(topic)
y_train = []
y_test = []
sentences = []
features_ent = []
features_sta = []
features_senti = []
senti_dict = {'Neutral' : 0, 'Positive' : 1, 'Negative' : 2}
with open("./final_feature_set/{}_train.csv".format(topic),"r",encoding='latin-1') as f:
reader = csv.DictReader(f, delimiter=',')
done = False
for row in reader:
sentences.append(row['sentence'])
if ent:
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
if not done:
save_file = save_file + "ent_"
if sta:
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
if not done:
save_file = save_file + "sta_"
if senti:
features_senti.append([senti_dict[row['senti']]])
if not done:
save_file = save_file + "senti_"
done = True
y_train.append(row['label'])
L = len(sentences)
with open("./final_feature_set/{}_test.csv".format(topic),"r",encoding='latin-1') as f:
reader = csv.DictReader(f, delimiter=',')
for row in reader:
sentences.append(row['sentence'])
if ent:
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
if sta:
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
if senti:
features_senti.append([senti_dict[row['senti']]])
y_test.append(row['label'])
all_features = []
if bow:
new_sentences = []
for sent in sentences:
tokens = word_tokenize(sent)
tokens = [stemmer.stem(token) for token in tokens]
ret = " ".join(w for w in tokens)
new_sentences.append(ret)
save_file = save_file + "bow_"
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,1),min_df = 2)
features_bow = vectorizer.fit_transform(new_sentences)
all_features.append(features_bow.toarray())
#features_bow_train = np.array(features_bow[:L].toarray())
#features_bow_test = np.array(features_bow[L:].toarray())
if ent:
#features_ent_train = np.array(features_ent[:L])
#features_ent_test = np.array(features_ent[L:])
all_features.append(features_ent)
if sta:
#features_ent_train = np.array(features_ent[:L])
#features_ent_test = np.array(features_ent[L:])
all_features.append(features_sta)
if senti:
#features_senti_train = np.array(features_senti[:L])
#features_senti_test = np.array(features_senti[L:])
all_features.append(features_senti)
dataset = np.concatenate(all_features,axis=1)
train_dataset = dataset[:L]
test_dataset = dataset[L:]
best_params = svc_param_selection(train_dataset,y_train,nfolds=5)
print(best_params)
if best_params['kernel'] == 'rbf':
model = svm.SVC(kernel='rbf' ,C = best_params['C'], gamma = best_params['gamma'],probability=True)
else:
model = svm.SVC(kernel='linear' ,C = best_params['C'],probability=True)
model.fit(train_dataset,y_train)
y_pred = model.predict(test_dataset)
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
#cm = confusion_matrix(y_test,y_pred,labels=['0','1','2'])
conf_score = model.predict_proba(test_dataset)
#print
df = pd.DataFrame(np.concatenate([np.array(sentences[L:]).reshape(-1,1),np.array(y_pred).reshape(-1,1),np.array(conf_score)],axis=1))
df.to_csv(save_file+".csv",header=False,index=False)
return y_pred,y_test
# In[22]:
'''
#BOW
print ('BOW---------')
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,bow = True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))'''
# In[23]:
print ("STA---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[7]:
print("ENT----------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[8]:
print("SENTI---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,senti=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[9]:
print("ENT_SENTI--------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True,senti=True,sta=False)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[10]:
print("ENT_STA---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True,senti=False,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[11]:
print("SENTI_STA-----------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=False,senti=True,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[12]:
print("ENT_SENTI_STA---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True,senti=True,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[ ]:

SEN-SVM/SVM_mpchi.py Normal file

@@ -0,0 +1,380 @@
#!/usr/bin/env python
# coding: utf-8
# In[9]:
import csv
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import wordninja
import re
import json
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import pandas as pd
# In[2]:
stemmer = PorterStemmer()
def load_glove_embeddings_set():
word2emb = []
WORD2VEC_MODEL = "../glove.6B.300d.txt"
fglove = open(WORD2VEC_MODEL,"r")
for line in fglove:
cols = line.strip().split()
word = cols[0]
word2emb.append(word)
fglove.close()
return set(word2emb)
def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"):
print("Creating Normalization Dictionary")
with open(no_slang_data, "r") as f:
data1 = json.load(f)
data2 = {}
with open(emnlp_dict,"r") as f:
lines = f.readlines()
for line in lines:
row = line.split('\t')
data2[row[0]] = row[1].rstrip()
normalization_dict = {**data1,**data2}
#print(normalization_dict)
return normalization_dict
word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict()
# In[3]:
def sent_process(sent):
sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent)
sent = re.sub(r"#SemST", "", sent)
sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent)
#sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent)
#sent = re.sub(r"([A-Z])", r" \1", sent)
sent = re.sub(r"\'s", " \'s", sent)
sent = re.sub(r"\'ve", " \'ve", sent)
sent = re.sub(r"n\'t", " n\'t", sent)
sent = re.sub(r"\'re", " \'re", sent)
sent = re.sub(r"\'d", " \'d", sent)
sent = re.sub(r"\'ll", " \'ll", sent)
sent = re.sub(r",", " , ", sent)
sent = re.sub(r"!", " ! ", sent)
sent = re.sub(r"\(", " ( ", sent)
sent = re.sub(r"\)", " ) ", sent)
sent = re.sub(r"\?", " ? ", sent)
sent = re.sub(r"\s{2,}", " ", sent)
sent = sent.strip()
word_tokens = sent.split()
normalised_tokens = []
for word in word_tokens:
if word in norm_dict:
#if False:
normalised_tokens.extend(norm_dict[word].lower().split(" "))
print(word," normalised to ",norm_dict[word])
else:
normalised_tokens.append(word.lower())
wordninja_tokens = []
for word in normalised_tokens:
if word in word_dict:
wordninja_tokens+=[word]
else:
wordninja_tokens+=wordninja.split(word)
return " ".join(wordninja_tokens)
# In[4]:
#{'C': c, 'gamma' : gammas , 'kernel' : ['rbf']},
def svc_param_selection(X, y, nfolds):
Cs = [0.001, 0.01, 0.1, 1, 10,100 ]
c = [1]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = [{'C': c , 'gamma' : gammas , 'kernel' : ['linear']}]
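# NOTE: only the linear kernel with C = 1 is searched here (gamma has no effect for a linear kernel);
# the Cs list above is defined but unused.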
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
grid_search.fit(X, y)
grid_search.best_params_
return grid_search.best_params_
# In[21]:
def train(topic,bow=False,senti=False,sta=False,ent=False,med = False):
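# Same as train() in SVM.py, with an additional 'med' feature group (med_aff / med_treat columns) for the MPCHI data.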
save_file = topic+"_"
print(topic)
y_train = []
y_test = []
sentences = []
features_ent = []
features_sta = []
features_senti = []
features_med = []
senti_dict = {'Neutral' : 0, 'Positive' : 1, 'Negative' : 2}
with open("./final_feature_set/{}_train.csv".format(topic),"r",encoding='latin-1') as f:
reader = csv.DictReader(f, delimiter=',')
done = False
for row in reader:
sentences.append(row['sentence'])
if ent:
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
if not done:
save_file = save_file + "ent_"
if sta:
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
if not done:
save_file = save_file + "sta_"
if senti:
features_senti.append([senti_dict[row['senti']]])
if not done:
save_file = save_file + "senti_"
if med:
features_med.append([row['med_aff'],row['med_treat']])
if not done:
save_file = save_file + "med"
done = True
y_train.append(row['label'])
L = len(sentences)
with open("./final_feature_set/{}_test.csv".format(topic),"r",encoding='latin-1') as f:
reader = csv.DictReader(f, delimiter=',')
for row in reader:
sentences.append(row['sentence'])
if ent:
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
if sta:
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
if senti:
features_senti.append([senti_dict[row['senti']]])
if med:
features_med.append([row['med_aff'],row['med_treat']])
y_test.append(row['label'])
all_features = []
if bow:
new_sentences = []
for sent in sentences:
tokens = word_tokenize(sent)
tokens = [stemmer.stem(token) for token in tokens]
ret = " ".join(w for w in tokens)
new_sentences.append(ret)
save_file = save_file + "bow_"
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,1),min_df = 2)
features_bow = vectorizer.fit_transform(new_sentences)
all_features.append(features_bow.toarray())
#features_bow_train = np.array(features_bow[:L].toarray())
#features_bow_test = np.array(features_bow[L:].toarray())
if ent:
#features_ent_train = np.array(features_ent[:L])
#features_ent_test = np.array(features_ent[L:])
all_features.append(features_ent)
if sta:
#features_ent_train = np.array(features_ent[:L])
#features_ent_test = np.array(features_ent[L:])
all_features.append(features_sta)
if senti:
#features_senti_train = np.array(features_senti[:L])
#features_senti_test = np.array(features_senti[L:])
all_features.append(features_senti)
if med:
all_features.append(features_med)
dataset = np.concatenate(all_features,axis=1)
train_dataset = dataset[:L]
test_dataset = dataset[L:]
best_params = svc_param_selection(train_dataset,y_train,nfolds=5)
print(best_params)
if best_params['kernel'] == 'rbf':
model = svm.SVC(kernel='rbf' ,C = best_params['C'], gamma = best_params['gamma'],probability=True)
else:
model = svm.SVC(kernel='linear' ,C = best_params['C'], gamma = best_params['gamma'],probability=True)
model.fit(train_dataset,y_train)
y_pred = model.predict(test_dataset)
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
#cm = confusion_matrix(y_test,y_pred,labels=['0','1','2'])
conf_score = model.predict_proba(test_dataset)
#print
df = pd.DataFrame(np.concatenate([np.array(sentences[L:]).reshape(-1,1),np.array(y_pred).reshape(-1,1),np.array(conf_score)],axis=1))
df.to_csv(save_file+".csv",header=False,index=False)
return y_pred,y_test
# In[22]:
'''
#BOW
print ('BOW---------')
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,bow = True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[23]:
print ("STA---------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[7]:
print("ENT----------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,ent=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[8]:
print("SENTI---------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,senti=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[9]:
print("ENT_SENTI--------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset ,senti=True , ent=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[10]:
print("ENT_STA---------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,ent=True,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[11]:
print("SENTI_STA-----------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,senti=True,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[12]:
print("ENT_SENTI_STA---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True,senti=True,sta=True,bow = True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[ ]:
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta = True, med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,senti = True, med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta = True, senti = True, med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta = True, senti = True, med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))'''
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta = True, senti = True, med=True, ent = True, bow= True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))


@@ -0,0 +1,78 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 03:05:06 2018
@author: shalmoli
"""
import pandas as pd
import csv
from pycorenlp import StanfordCoreNLP
import json
#reading Input file
data=pd.read_csv('HRT2.csv')
#Extracting Sentences
sentence=data['sentences']
length=len(sentence)
#storing Result
# Before executing this, the Stanford CoreNLP server must be started from a terminal.
'''The server returns a sentiment label per sentence.
If an abstract has more than one sentence, we take the majority sentiment across its sentences; ties fall through to Positive in the code below.'''
result=[]
# Reuse a single client for all requests; the CoreNLP server must already be running on localhost:9000.
nlp = StanfordCoreNLP('http://localhost:9000')
for i in range(length):
res = nlp.annotate(sentence[i],
properties={
'annotators': 'sentiment',
'outputFormat': 'json',
'timeout': 100000,
})
print (res)
count=0# counting number of sentences in input
count_1=0#counting Negative Sentiment
count_2=0#counting Neutral Sentiment
count_3=0#counting Positive Sentiment
for s in (res['sentences']):
if(s["sentimentValue"]):
count=count+1
if(s["sentiment"]=='Negative' or s["sentiment"]=='Verynegative'):
count_1=count_1+1
else:
if(s["sentiment"]=='Positive' or s["sentiment"]=='Verypositive'):
count_3=count_3+1
else:
count_2=count_2+1
if(count>1):
if(count_1 > count_2 and count_1 > count_3):
result.append("Negative")
else:
if(count_2 > count_1 and count_2 > count_3):
result.append('Neutral')
else:
result.append('Positive')
else:
if(s["sentiment"]=='Negative' or s["sentiment"]=='Verynegative'):
result.append("Negative")
else:
if(s["sentiment"]=='Positive' or s["sentiment"]=='Verypositive'):
result.append("Positive")
else:
result.append('Neutral')
# Storing the output to a file; each sentiment class gets a binary column plus the final sentiment label.
file = open('Sentiment_HRT2.csv','a')
fields = ('sentence','positive','negative','neutral','sentiment')
wr = csv.DictWriter(file, fieldnames=fields, lineterminator = '\n')
wr.writeheader()
for i in range(length):
if(result[i]=='Negative'):
wr.writerow({'sentence':sentence[i],'positive':0,'negative':1,'neutral':0,'sentiment':result[i]})
else:
if(result[i]=='Positive'):
wr.writerow({'sentence':sentence[i],'positive':1,'negative':0,'neutral':0,'sentiment':result[i]})
else:
wr.writerow({'sentence':sentence[i],'positive':0,'negative':0,'neutral':1,'sentiment':result[i]})
file.close()

SEN-SVM/te_f.py Normal file

@@ -0,0 +1,342 @@
def isnan(value):
try:
import math
return math.isnan(float(value))
except:
return False
#matplotlib inline
import tensorflow as tf
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import matplotlib.ticker as ticker
import urllib
import sys
import os
import csv
import zipfile
glove_zip_file = "glove.6B.zip"
glove_vectors_file = "glove.6B.50d.txt"
snli_zip_file = "snli_1.0.zip"
snli_dev_file = "snli_1.0_dev.txt"
snli_full_dataset_file = "snli_1.0_train.txt"
from six.moves.urllib.request import urlretrieve
#large file - 862 MB
if (not os.path.isfile(glove_zip_file) and
not os.path.isfile(glove_vectors_file)):
urlretrieve ("http://nlp.stanford.edu/data/glove.6B.zip",
glove_zip_file)
#medium-sized file - 94.6 MB
if (not os.path.isfile(snli_zip_file) and
not os.path.isfile(snli_dev_file)):
urlretrieve ("https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
snli_zip_file)
def unzip_single_file(zip_file_name, output_file_name):
"""
If the outFile is already created, don't recreate
If the outFile does not exist, create it from the zipFile
"""
if not os.path.isfile(output_file_name):
with open(output_file_name, 'wb') as out_file:
with zipfile.ZipFile(zip_file_name) as zipped:
for info in zipped.infolist():
if output_file_name in info.filename:
with zipped.open(info) as requested_file:
out_file.write(requested_file.read())
return
unzip_single_file(glove_zip_file, glove_vectors_file)
unzip_single_file(snli_zip_file, snli_dev_file)
glove_wordmap = {}
with open(glove_vectors_file, "r") as glove:
for line in glove:
name, vector = tuple(line.split(" ", 1))
glove_wordmap[name] = np.fromstring(vector, sep=" ")
def sentence2sequence(sentence):
"""
- Turns an input sentence into an (n,d) matrix,
where n is the number of tokens in the sentence
and d is the number of dimensions each word vector has.
"""
tokens = sentence.lower().split(" ")
rows = []
words = []
#Greedy search for tokens
for token in tokens:
i = len(token)
while len(token) > 0 and i > 0:
word = token[:i]
if word in glove_wordmap:
rows.append(glove_wordmap[word])
words.append(word)
token = token[i:]
i = len(token)
else:
i = i-1
return rows, words
rnn_size = 64
rnn = tf.contrib.rnn.BasicRNNCell(rnn_size)
#Constants setup
max_hypothesis_length, max_evidence_length = 60, 50
batch_size, vector_size, hidden_size = 128, 50, 64
lstm_size = hidden_size
weight_decay = 0.0001
learning_rate = 1
input_p, output_p = 0.5, 0.5
training_iterations_count = 100000
display_step = 10
def score_setup(row):
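# Convert the five annotator labels of an SNLI row into a normalised (entailment, neutral, contradiction) score vector.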
convert_dict = {
'entailment': 0,
'neutral': 1,
'contradiction': 2
}
score = np.zeros((3,))
for x in range(1,6):
tag = row["label"+str(x)]
if tag in convert_dict: score[convert_dict[tag]] += 1
return score / (1.0*np.sum(score))
def fit_to_size(matrix, shape):
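# Zero-pad (or truncate) a (tokens, dims) matrix to the requested shape.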
res = np.zeros(shape)
slices = tuple(slice(0,min(dim,shape[e])) for e, dim in enumerate(matrix.shape)) # tuple indexing; a list of slices is deprecated in NumPy
res[slices] = matrix[slices]
return res
def split_data_into_scores():
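# Read the SNLI dev split and turn each sentence pair into fixed-size GloVe matrices plus a label distribution.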
import csv
with open("snli_1.0_dev.txt","r") as data:
train = csv.DictReader(data, delimiter='\t')
evi_sentences = []
hyp_sentences = []
labels = []
scores = []
for row in train:
hyp_sentences.append(np.vstack(
sentence2sequence(row["sentence1"].lower())[0]))
evi_sentences.append(np.vstack(
sentence2sequence(row["sentence2"].lower())[0]))
labels.append(row["gold_label"])
scores.append(score_setup(row))
hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
for x in hyp_sentences])
evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
for x in evi_sentences])
return (hyp_sentences, evi_sentences), labels, np.array(scores)
data_feature_list, correct_values, correct_scores = split_data_into_scores()
l_h, l_e = max_hypothesis_length, max_evidence_length
N, D, H = batch_size, vector_size, hidden_size
l_seq = l_h + l_e
tf.reset_default_graph()
# lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
lstm = tf.nn.rnn_cell.LSTMCell(lstm_size)
lstm_drop = tf.contrib.rnn.DropoutWrapper(lstm, input_p, output_p)
# N: The number of elements in each of our batches,
# which we use to train subsets of data for efficiency's sake.
# l_h: The maximum length of a hypothesis, or the second sentence. This is
# used because training an RNN is extraordinarily difficult without
# rolling it out to a fixed length.
# l_e: The maximum length of evidence, the first sentence. This is used
# because training an RNN is extraordinarily difficult without
# rolling it out to a fixed length.
# D: The size of our used GloVe or other vectors.
hyp = tf.placeholder(tf.float32, [N, l_h, D], 'hypothesis')
evi = tf.placeholder(tf.float32, [N, l_e, D], 'evidence')
y = tf.placeholder(tf.float32, [N, 3], 'label')
# hyp: Where the hypotheses will be stored during training.
# evi: Where the evidences will be stored during training.
# y: Where correct scores will be stored during training.
# lstm_size: the size of the gates in the LSTM,
# as in the first LSTM layer's initialization.
# lstm_back = tf.contrib.rnn.BasicLSTMCell(lstm_size)
lstm_back = tf.nn.rnn_cell.LSTMCell(lstm_size)
# lstm_back: The LSTM used for looking backwards
# through the sentences, similar to lstm.
# input_p: the probability that inputs to the LSTM will be retained at each
# iteration of dropout.
# output_p: the probability that outputs from the LSTM will be retained at
# each iteration of dropout.
lstm_drop_back = tf.contrib.rnn.DropoutWrapper(lstm_back, input_p, output_p)
# lstm_drop_back: A dropout wrapper for lstm_back, like lstm_drop.
fc_initializer = tf.random_normal_initializer(stddev=0.1)
# fc_initializer: initial values for the fully connected layer's weights.
# hidden_size: the size of the outputs from each lstm layer.
# Multiplied by 2 to account for the two LSTMs.
fc_weight = tf.get_variable('fc_weight', [2*hidden_size, 3],
initializer = fc_initializer)
# fc_weight: Storage for the fully connected layer's weights.
fc_bias = tf.get_variable('bias', [3])
# fc_bias: Storage for the fully connected layer's bias.
# tf.GraphKeys.REGULARIZATION_LOSSES: A key to a collection in the graph
# designated for losses due to regularization.
# In this case, this portion of loss is regularization on the weights
# for the fully connected layer.
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
tf.nn.l2_loss(fc_weight))
x = tf.concat([hyp, evi], 1) # N, (Lh+Le), d
# Permuting batch_size and n_steps
x = tf.transpose(x, [1, 0, 2]) # (Le+Lh), N, d
# Reshaping to (n_steps*batch_size, n_input)
x = tf.reshape(x, [-1, vector_size]) # (Le+Lh)*N, d
# Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
x = tf.split(x, l_seq,)
# x: the inputs to the bidirectional_rnn
# tf.contrib.rnn.static_bidirectional_rnn: Runs the input through
# two recurrent networks, one that runs the inputs forward and one
# that runs the inputs in reversed order, combining the outputs.
rnn_outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm, lstm_back,x, dtype=tf.float32)
# rnn_outputs: the list of LSTM outputs, as a list.
# What we want is the latest output, rnn_outputs[-1]
classification_scores = tf.matmul(rnn_outputs[-1], fc_weight) + fc_bias
# The scores are relative certainties for how likely the output matches
# a certain entailment:
# 0: Positive entailment
# 1: Neutral entailment
# 2: Negative entailment
with tf.variable_scope('Accuracy'):
predicts = tf.cast(tf.argmax(classification_scores, 1), 'int32')
y_label = tf.cast(tf.argmax(y, 1), 'int32')
corrects = tf.equal(predicts, y_label)
num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))
with tf.variable_scope("loss"):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
logits = classification_scores, labels = y)
loss = tf.reduce_mean(cross_entropy)
total_loss = loss + weight_decay * tf.add_n(
tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
opt_op = optimizer.minimize(total_loss)
# Initialize variables
init = tf.global_variables_initializer()
# Use TQDM if installed
tqdm_installed = False
try:
from tqdm import tqdm
tqdm_installed = True
except ImportError:
pass
# Launch the Tensorflow session
sess = tf.Session()
sess.run(init)
# training_iterations_count: The number of data pieces to train on in total
# batch_size: The number of data pieces per batch
training_iterations = range(0,training_iterations_count,batch_size)
if tqdm_installed:
# Add a progress bar if TQDM is installed
training_iterations = tqdm(training_iterations)
for i in training_iterations:
# Select indices for a random data subset
batch = np.random.randint(data_feature_list[0].shape[0], size=batch_size)
# Use the selected subset indices to initialize the graph's
# placeholder values
hyps, evis, ys = (data_feature_list[0][batch,:],
data_feature_list[1][batch,:],
correct_scores[batch])
# Run the optimization with these initialized values
sess.run([opt_op], feed_dict={hyp: hyps, evi: evis, y: ys})
# display_step: how often the accuracy and loss should
# be tested and displayed.
if (i/batch_size) % display_step == 0:
# Calculate batch accuracy
acc = sess.run(accuracy, feed_dict={hyp: hyps, evi: evis, y: ys})
# Calculate batch loss
tmp_loss = sess.run(loss, feed_dict={hyp: hyps, evi: evis, y: ys})
# Display results
print("Iter " + str(i/batch_size) + ", Minibatch Loss= " + \
"{:.6f}".format(tmp_loss) + ", Training Accuracy= " + \
"{:.5f}".format(acc))
Features_pmh=pd.read_csv('Climate1.csv')
length_features=len(Features_pmh)
result=[]
pred=[]
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer
tokenizer = PunktSentenceTokenizer()
text=Features_pmh['Tweet'].copy()
def sent_process(sent):
sent = sent.translate(str.maketrans('', '', string.punctuation))
sent = [word for word in sent.split() if word.lower() not in stopwords.words('english')]
return " ".join(sent)
Features_pmh['Tweet']=text.apply(sent_process)
file = open('Climate_t.csv','a')
fields = ('Text','hypotheses','result','pos_scr','neg_scr','nut_scr')
wr = csv.DictWriter(file, fieldnames=fields, lineterminator = '\n')
wr.writeheader()
for i in range(length_features):
# NOTE: both branches are identical, so this NaN check currently has no effect;
# NaN tweets are passed through unchanged.
if(isnan(Features_pmh['Tweet'][i])==False):
evidences = [Features_pmh['Tweet'][i]]
else:
evidences = [Features_pmh['Tweet'][i]]
hypotheses = ["Climate Change is a Real Concern"]
sentence1 = [fit_to_size(np.vstack(sentence2sequence(evidence)[0]),(60, 50)) for evidence in evidences]
sentence2 = [fit_to_size(np.vstack(sentence2sequence(hypothesis)[0]),(50,50)) for hypothesis in hypotheses]
prediction = sess.run(classification_scores, feed_dict={hyp: (sentence1 * N),evi: (sentence2 * N),y: [[0,0,0]]*N})
#print(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])]+" entailment")
result.append(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])])
pred.append(prediction[0])
# score order is [entailment (Positive), neutral, contradiction (Negative)]
wr.writerow({'Text':Features_pmh['Tweet'][i],'hypotheses':hypotheses,"result" :result[i],'pos_scr':pred[i][0],'neg_scr':pred[i][2],'nut_scr':pred[i][1]})
file.close()


@@ -18,7 +18,7 @@ class LSTM_TAN(nn.Module):
self.hidden_dim = hidden_dim
self.embedding_dim = embedding_dim
#WORD_EMBEDDINGS
self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
self.word_embeddings.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float))
self.word_embeddings.weight.requires_grad=True
@@ -30,14 +30,10 @@ class LSTM_TAN(nn.Module):
self.attention = nn.Linear(2*embedding_dim,1)
#LSTM
# The LSTM takes word embeddings as inputs, and outputs hidden states
# with dimensionality hidden_dim.
self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=(version!="lstm"))
self.dropout = nn.Dropout(dropout)
#FINAL_LAYER
if version !="lstm":
self.hidden2target = nn.Linear(2*self.hidden_dim, n_targets)
else:
@@ -46,10 +42,6 @@ class LSTM_TAN(nn.Module):
self.hidden = self.init_hidden()
def init_hidden(self):
# Before we've done anything, we dont have any hidden state.
# Refer to the Pytorch documentation to see exactly
# why they have this dimensionality.
# The axes semantics are (num_layers, minibatch_size, hidden_dim)
return (torch.zeros(1, 1, self.hidden_dim),
torch.zeros(1, 1, self.hidden_dim))
@@ -62,11 +54,8 @@ class LSTM_TAN(nn.Module):
if version != "tan-":
t_emb = self.word_embeddings(target)
#print(t_emb)
#print(torch.mean(t_emb,dim=0,keepdim=True).shape)
t_emb = torch.mean(t_emb,dim=0,keepdim=True)
xt_emb = torch.cat((x_emb,t_emb.expand(len(sentence),-1)),dim=1)
#print(xt_emb)
if version == "tan-":
lstm_out, _ = self.lstm(
@@ -96,12 +85,3 @@ class LSTM_TAN(nn.Module):
return target_scores
#t_emb = self.word_embeddings(target)
#print(t_emb)
#print(torch.mean(t_emb,dim=0,keepdim=True).shape)
#t_emb = torch.mean(t_emb,dim=0,keep dim=True)
#xt_emb = torch.cat((x_emb,t_emb.expand(len(sentence),-1)),dim=1)
#print(xt_emb)
# In[26]: