added SEN-SVM codes

parent b524243043
commit a86f074dca

533  SEN-SVM/STA_features.py  Normal file
@@ -0,0 +1,533 @@
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
import time
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import string
|
||||
import csv
|
||||
from scipy import stats
|
||||
import random
|
||||
import json
|
||||
import nltk
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import PorterStemmer
|
||||
import re
|
||||
import wordninja
|
||||
from collections import defaultdict, Counter
|
||||
import math
|
||||
import sys
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
def load_glove_embeddings_set():
|
||||
word2emb = []
|
||||
WORD2VEC_MODEL = "glove.6B.300d.txt"
|
||||
fglove = open(WORD2VEC_MODEL,"r")
|
||||
for line in fglove:
|
||||
cols = line.strip().split()
|
||||
word = cols[0]
|
||||
word2emb.append(word)
|
||||
fglove.close()
|
||||
return set(word2emb)
|
||||
|
||||
def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"):
|
||||
print("Creating Normalization Dictionary")
|
||||
with open(no_slang_data, "r") as f:
|
||||
data1 = json.load(f)
|
||||
|
||||
data2 = {}
|
||||
|
||||
with open(emnlp_dict,"r") as f:
|
||||
lines = f.readlines()
|
||||
for line in lines:
|
||||
row = line.split('\t')
|
||||
data2[row[0]] = row[1].rstrip()
|
||||
|
||||
normalization_dict = {**data1,**data2}
|
||||
#print(normalization_dict)
|
||||
return normalization_dict
|
||||
|
||||
word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict()
|
||||
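# Illustrative stand-ins (assumptions, hypothetical entries): the code above treats
# noslang_data.json as a flat JSON object of {variant: expansion} and emnlp_dict.txt as
# tab-separated "variant<TAB>canonical" lines, while word_dict is simply the GloVe vocabulary.
# A miniature version for a quick smoke test could look like:
#   norm_dict_example = {"b4": "before", "u": "you"}
#   word_dict_example = {"before", "you", "climate", "change"}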
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
def sent_process(sent):
|
||||
sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent)
|
||||
sent = re.sub(r"#SemST", "", sent)
|
||||
sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent)
|
||||
#sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent)
|
||||
#sent = re.sub(r"([A-Z])", r" \1", sent)
|
||||
sent = re.sub(r"\'s", " \'s", sent)
|
||||
sent = re.sub(r"\'ve", " \'ve", sent)
|
||||
sent = re.sub(r"n\'t", " n\'t", sent)
|
||||
sent = re.sub(r"\'re", " \'re", sent)
|
||||
sent = re.sub(r"\'d", " \'d", sent)
|
||||
sent = re.sub(r"\'ll", " \'ll", sent)
|
||||
sent = re.sub(r",", " , ", sent)
|
||||
sent = re.sub(r"!", " ! ", sent)
|
||||
sent = re.sub(r"\(", " ( ", sent)
|
||||
sent = re.sub(r"\)", " ) ", sent)
|
||||
sent = re.sub(r"\?", " ? ", sent)
|
||||
sent = re.sub(r"\s{2,}", " ", sent)
|
||||
sent = sent.strip()
|
||||
word_tokens = sent.split()
|
||||
normalised_tokens = []
|
||||
for word in word_tokens:
|
||||
if word in norm_dict:
|
||||
#if False:
|
||||
normalised_tokens.extend(norm_dict[word].lower().split(" "))
|
||||
#print(word," normalised to ",norm_dict[word])
|
||||
else:
|
||||
normalised_tokens.append(word.lower())
|
||||
wordninja_tokens = []
|
||||
for word in normalised_tokens:
|
||||
if word in word_dict:
|
||||
wordninja_tokens+=[word]
|
||||
else:
|
||||
wordninja_tokens+=wordninja.split(word)
|
||||
return " ".join(wordninja_tokens)
|
||||
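# Illustrative usage of sent_process (output is indicative only; exact tokens depend on the
# GloVe vocabulary and normalization dictionaries loaded above):
#   sent_process("I don't think #ClimateChange is real!")
#   # -> roughly "i do n't think # climate change # is real !"
# i.e. hashtags are isolated and split, contractions and punctuation are separated,
# slang is normalised via norm_dict, and unknown tokens fall back to wordninja.split.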
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# In[13]:
|
||||
|
||||
|
||||
|
||||
|
||||
def build_lexicon(name):
|
||||
def pmi(x,y,z,t):
|
||||
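# (Assumed reading) PMI-style score: count x of the word/stance co-occurrence over its
# expected value under independence, y*(z/t), with an extra sqrt(x)*sqrt(-ln(0.9)/2) term in
# the denominator that discounts low-frequency co-occurrences (a Hoeffding-style 90% bound);
# the result is returned in log base 2.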
res=(x/(y*(z/t)+(math.sqrt(x)*math.sqrt(math.log(0.9)/(-2)))))
|
||||
return math.log(res,2)
|
||||
|
||||
|
||||
|
||||
def prob(word1,nava,total):
|
||||
count_prob=0
|
||||
if word1 in nava:
|
||||
count_prob += nava[word1]
|
||||
return((count_prob+1))
|
||||
|
||||
def prob_cond(word1,seed,stance_seed,stance,total):
|
||||
count_prob=0
|
||||
for i in range(len(seed)):
|
||||
if(seed[i]==word1):
|
||||
if(stance_seed[i]==stance):
|
||||
count_prob=count_prob+1
|
||||
return((count_prob+1))
|
||||
|
||||
|
||||
def prob_cond1(word1,word2,Features,total):
|
||||
count_prob=0
|
||||
#for i in range(length_Features):
|
||||
# flag1=0
|
||||
# flag2=0
|
||||
# for word in Features['co_relation'][i]:
|
||||
# if(word==word1):
|
||||
# flag1=1
|
||||
# if(word==word2):
|
||||
# flag2=1
|
||||
# if(flag1==1 and flag2==1):
|
||||
# count_prob=count_prob+1
|
||||
#seed and non-seed lexicon formation
|
||||
return((co_relation[(word1,word2)]+1))
|
||||
|
||||
print("building lexicon for ", name)
|
||||
raw=pd.read_csv('./MPHI_Preprocessed/'+name+'/train.csv')
|
||||
|
||||
#Features Extraction
|
||||
porter=PorterStemmer()
|
||||
|
||||
Stop_words=set(stopwords.words('english'))
|
||||
Features=raw[['sentence']]
|
||||
Tweet=Features['sentence'].copy()
|
||||
|
||||
Features['sentence']=Tweet.apply(sent_process)
|
||||
Features['tokenized_sents'] = Features.apply(lambda row: (row['sentence'].split()), axis=1)
|
||||
Features['pos_tag']=Features.apply(lambda row:nltk.pos_tag(row['tokenized_sents'],tagset='universal'),axis=1)
|
||||
Features['stance']=raw['stance']
|
||||
length_Features=len(Features['sentence'])
|
||||
|
||||
co_relation=defaultdict(int)
|
||||
co_relation2 = []
|
||||
for i in range(length_Features):
|
||||
line=[]
|
||||
for word,tag in Features['pos_tag'][i]:
|
||||
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
|
||||
if(word not in Stop_words):
|
||||
line.append(porter.stem(word))
|
||||
for i in range(len(line)):
|
||||
for j in range(i+1,len(line)):
|
||||
co_relation[(line[i],line[j])]+=1
|
||||
co_relation[(line[j],line[i])]+=1
|
||||
co_relation2.append(line)
|
||||
|
||||
Features['co_relation']=co_relation2
|
||||
|
||||
FAVOR=[]
|
||||
AGAINST=[]
|
||||
NONE=[]
|
||||
for i in range(length_Features):
|
||||
if(Features['stance'][i]=='support'):
|
||||
for word,tag in Features['pos_tag'][i]:
|
||||
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
|
||||
if(word not in Stop_words):
|
||||
FAVOR.append(porter.stem(word))
|
||||
else:
|
||||
if(Features['stance'][i]=='oppose'):
|
||||
for word,tag in Features['pos_tag'][i]:
|
||||
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
|
||||
if(word not in Stop_words):
|
||||
AGAINST.append(porter.stem(word))
|
||||
else:
|
||||
if(Features['stance'][i]=='neutral'):
|
||||
for word,tag in Features['pos_tag'][i]:
|
||||
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
|
||||
if(word not in Stop_words):
|
||||
NONE.append(porter.stem(word))
|
||||
|
||||
len_sup=len(FAVOR)
|
||||
len_opp=len(AGAINST)
|
||||
len_nut=len(NONE)
|
||||
|
||||
len_co=[]
|
||||
for i in range(length_Features):
|
||||
len_co.append(len(Features['co_relation'][i]))
|
||||
|
||||
Features['len_nava']=len_co
|
||||
|
||||
nava=[]
|
||||
for i in range(length_Features):
|
||||
for word,tag in Features['pos_tag'][i]:
|
||||
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
|
||||
if(word not in Stop_words):
|
||||
nava.append(word.lower())
|
||||
nava_stem=[]
|
||||
for word in nava:
|
||||
nava_stem.append(porter.stem(word))
|
||||
uni_nava_stem=list(set(nava_stem))
|
||||
nava_stem = Counter(nava_stem)
|
||||
|
||||
|
||||
total=len(nava_stem)
|
||||
length=len(uni_nava_stem)
|
||||
|
||||
print(total,length)
|
||||
|
||||
seed=[]
|
||||
non_seed=[]
|
||||
seed_stance=[]
|
||||
for i in range(len(Features)):
|
||||
for j in range(int(0.75*Features['len_nava'][i])):
|
||||
seed.append(Features['co_relation'][i][j])
|
||||
seed_stance.append(Features['stance'][i])
|
||||
for j in range(int(0.75*Features['len_nava'][i]),Features['len_nava'][i]):
|
||||
non_seed.append(Features['co_relation'][i][j])
|
||||
uni_seed=list(set(seed))
|
||||
uni_non_seed=list(set(non_seed))
|
||||
|
||||
'''for i in range(len(Features)):
|
||||
x=[]
|
||||
x=random.sample(Features['co_relation'][i],int(0.75*Features['len_nava'][i]))
|
||||
for j in range(len(x)):
|
||||
seed.append(x[j])
|
||||
seed_stance.append(Features['stance'][i])
|
||||
for j in range(Features['len_nava'][i]):
|
||||
if(Features['co_relation'][i][j] not in x):
|
||||
non_seed.append(Features['co_relation'][i][j])
|
||||
uni_seed=list(set(seed))
|
||||
uni_non_seed=list(set(non_seed))'''
|
||||
|
||||
len_seed=len(seed)
|
||||
len_uni_seed=len(uni_seed)
|
||||
len_non_seed=len(non_seed)
|
||||
len_uni_non_seed=len(uni_non_seed)
|
||||
|
||||
len_seed_sup=0
|
||||
len_seed_opp=0
|
||||
len_seed_nut=0
|
||||
for i in range(len(seed_stance)):
|
||||
if(seed_stance[i]=='support'):
|
||||
len_seed_sup=len_seed_sup+1
|
||||
else:
|
||||
if(seed_stance[i]=='oppose'):
|
||||
len_seed_opp=len_seed_opp+1
|
||||
else:
|
||||
len_seed_nut=len_seed_nut+1
|
||||
print(len_seed_nut,len_seed_opp,len_seed_sup)
|
||||
|
||||
|
||||
|
||||
|
||||
prob_sup=len_seed_sup/(len_seed_sup+len_seed_opp+len_seed_nut)
|
||||
prob_opp=len_seed_opp/(len_seed_sup+len_seed_opp+len_seed_nut)
|
||||
prob_nut=len_seed_nut/(len_seed_sup+len_seed_opp+len_seed_nut)
|
||||
|
||||
prob_word=[]
|
||||
for word in uni_seed:
|
||||
prob_word.append(prob(word,nava_stem,total))
|
||||
|
||||
prob_cond_word={}
|
||||
prob_supp_word=[]
|
||||
prob_opp_word=[]
|
||||
prob_neu_word=[]
|
||||
|
||||
for word in uni_seed:
|
||||
prob_supp_word.append(prob_cond(word,seed,seed_stance,'support',(len_seed_sup+len_seed_opp+len_seed_nut)))
|
||||
prob_opp_word.append(prob_cond(word,seed,seed_stance,'oppose',(len_seed_sup+len_seed_opp+len_seed_nut)))
|
||||
prob_neu_word.append(prob_cond(word,seed,seed_stance,'neutral',(len_seed_sup+len_seed_opp+len_seed_nut)))
|
||||
|
||||
prob_cond_word={'word':list(uni_seed),'prob_word':prob_word,'prob_supp_word':prob_supp_word,'prob_opp_word':prob_opp_word,'prob_neu_word':prob_neu_word}
|
||||
Seed_lexicon = pd.DataFrame(data=prob_cond_word)
|
||||
|
||||
|
||||
|
||||
print(Seed_lexicon)
|
||||
|
||||
pmi_AGAINST=[]
|
||||
pmi_FAVOR=[]
|
||||
pmi_NONE=[]
|
||||
'''for i in range(len_uni_seed):
|
||||
pmi_AGAINST.append(pmi(prob_opp_word[i],prob_word[i],prob_opp))
|
||||
pmi_FAVOR.append(pmi(prob_supp_word[i],prob_word[i],prob_sup))
|
||||
pmi_NONE.append(pmi(prob_neu_word[i],prob_word[i],prob_nut))'''
|
||||
|
||||
for i in range(len_uni_seed):
|
||||
pmi_AGAINST.append(pmi(prob_opp_word[i],prob_word[i],len_seed_opp,len_seed))
|
||||
pmi_FAVOR.append(pmi(prob_supp_word[i],prob_word[i],len_seed_sup,len_seed))
|
||||
pmi_NONE.append(pmi(prob_neu_word[i],prob_word[i],len_seed_nut,len_seed))
|
||||
|
||||
|
||||
Seed_lexicon['pmi_AGAINST']=list(pmi_AGAINST)
|
||||
Seed_lexicon['pmi_FAVOR']=list(pmi_FAVOR)
|
||||
Seed_lexicon['pmi_NONE']=list(pmi_NONE)
|
||||
|
||||
stance=[]
|
||||
for i in range(len_uni_seed):
|
||||
if((Seed_lexicon['pmi_FAVOR'][i] > Seed_lexicon['pmi_AGAINST'][i]) and (Seed_lexicon['pmi_FAVOR'][i] > Seed_lexicon['pmi_NONE'][i])):
|
||||
stance.append('support')
|
||||
else:
|
||||
if((Seed_lexicon['pmi_AGAINST'][i] > Seed_lexicon['pmi_FAVOR'][i]) & (Seed_lexicon['pmi_AGAINST'][i] > Seed_lexicon['pmi_NONE'][i])):
|
||||
stance.append('oppose')
|
||||
else:
|
||||
stance.append('neutral')
|
||||
|
||||
Seed_lexicon['Stance']=list(stance)
|
||||
|
||||
#NON SEED LEXICON
|
||||
score_non_seed_opp=[]
|
||||
score_non_seed_sup=[]
|
||||
score_non_seed_nut=[]
|
||||
|
||||
opp_seed_word=[]
|
||||
nut_seed_word=[]
|
||||
sup_seed_word=[]
|
||||
for i in range(len_uni_seed):
|
||||
if(Seed_lexicon['Stance'][i]=='support'):
|
||||
sup_seed_word.append(Seed_lexicon['word'][i])
|
||||
else:
|
||||
if(Seed_lexicon['Stance'][i]=='oppose'):
|
||||
opp_seed_word.append(Seed_lexicon['word'][i])
|
||||
else:
|
||||
nut_seed_word.append(Seed_lexicon['word'][i])
|
||||
|
||||
#opp_seed_word=set(opp_seed_word)
|
||||
#nut_seed_word=set(nut_seed_word)
|
||||
#sup_seed_word=set(sup_seed_word)
|
||||
|
||||
len_opp_words=len(opp_seed_word)
|
||||
len_nut_words=len(nut_seed_word)
|
||||
len_sup_words=len(sup_seed_word)
|
||||
|
||||
pmi_non_seed={}
|
||||
|
||||
start1=time.time()
|
||||
print("COMPUTING...")
|
||||
k=0
|
||||
for word in uni_non_seed:
|
||||
list_=[]
|
||||
for i in range(len_sup_words):
|
||||
l=pmi(prob_cond1(word,sup_seed_word[i],Features,total),prob(word,nava_stem,total),prob(sup_seed_word[i],nava_stem,total),total)
|
||||
if(l<0):
|
||||
list_.append(1)
|
||||
else:
|
||||
list_.append(l)
|
||||
score_non_seed_sup.append(stats.gmean(list_))
|
||||
#print(k)
|
||||
k=k+1
|
||||
print("score_non_seed_sup_complete :)")
|
||||
end1=time.time()
|
||||
time1=end1-start1
|
||||
print(time1)
|
||||
|
||||
start2=time.time()
|
||||
k=0
|
||||
for word in uni_non_seed:
|
||||
list_=[]
|
||||
for i in range(len_opp_words):
|
||||
l=pmi(prob_cond1(word,opp_seed_word[i],Features,total),prob(word,nava_stem,total),prob(opp_seed_word[i],nava_stem,total),total)
|
||||
if(l<0):
|
||||
list_.append(1)
|
||||
else:
|
||||
list_.append(l)
|
||||
score_non_seed_opp.append(stats.gmean(list_))
|
||||
#print(k)
|
||||
k=k+1
|
||||
print("score_non_seed_opp_complete :)")
|
||||
end2=time.time()
|
||||
time2=end2-start2
|
||||
print(time2)
|
||||
|
||||
start3=time.time()
|
||||
k=0
|
||||
|
||||
#print("~~~~",nut_seed_word)
|
||||
print(len(uni_non_seed),len_nut_words)
|
||||
for word in uni_non_seed:
|
||||
list_=[]
|
||||
#s2 = time.time()
|
||||
for i in range(len_nut_words):
|
||||
#s1 = time.time()
|
||||
l=pmi(prob_cond1(word,nut_seed_word[i],Features,total), prob(word,nava_stem,total), prob(nut_seed_word[i],nava_stem,total),total)
|
||||
#print(time.time()-s1)
|
||||
if(l<0):
|
||||
list_.append(1)
|
||||
else:
|
||||
list_.append(l)
|
||||
score_non_seed_nut.append(stats.gmean(list_))
|
||||
#print(time.time()-s2)
|
||||
#print(k)
|
||||
k=k+1
|
||||
print("score_non_seed_nut_complete :)")
|
||||
end3=time.time()
|
||||
print("Process Complete :)")
|
||||
time3=end3-start3
|
||||
print(time3)
|
||||
|
||||
total_time=time1+time2+time3
|
||||
print(total_time)
|
||||
|
||||
prob_cond_word={'word':list(uni_non_seed),'score_non_seed_opp':score_non_seed_opp,'score_non_seed_sup':score_non_seed_sup,'score_non_seed_nut':score_non_seed_nut}
|
||||
NonSeed_lexicon = pd.DataFrame(data=prob_cond_word)
|
||||
|
||||
#Tweet Vector Formation
|
||||
lex_word=[]
|
||||
lex_word.extend(list(Seed_lexicon['word']))
|
||||
lex_word.extend(list(NonSeed_lexicon['word']))
|
||||
|
||||
pmi_sup=[]
|
||||
pmi_sup.extend(list(Seed_lexicon['pmi_FAVOR']))
|
||||
pmi_sup.extend(list(NonSeed_lexicon['score_non_seed_sup']))
|
||||
|
||||
pmi_opp=[]
|
||||
pmi_opp.extend(list(Seed_lexicon['pmi_AGAINST']))
|
||||
pmi_opp.extend(list(NonSeed_lexicon['score_non_seed_opp']))
|
||||
|
||||
pmi_nut=[]
|
||||
pmi_nut.extend(list(Seed_lexicon['pmi_NONE']))
|
||||
pmi_nut.extend(list(NonSeed_lexicon['score_non_seed_nut']))
|
||||
|
||||
Lexicon = dict()
|
||||
for i in range(len(lex_word)):
|
||||
Lexicon[lex_word[i]] = {'pmi_sup':pmi_sup[i],'pmi_opp':pmi_opp[i],'pmi_nut':pmi_nut[i]}
|
||||
|
||||
print("Lexicon formed")
|
||||
return Lexicon
|
||||
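# Shape of the returned lexicon (as constructed above): a dict keyed by stemmed word,
#   Lexicon[word] == {'pmi_sup': float, 'pmi_opp': float, 'pmi_nut': float}
# where seed words carry their seed PMI scores and non-seed words carry the geometric-mean
# scores against the seed words of each stance; produce_features() below averages these
# per sentence to obtain the three STA feature columns.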
|
||||
#Lexicon={'word':lex_word,'pmi_sup':pmi_sup,'pmi_opp':pmi_opp,'pmi_nut':pmi_nut}
|
||||
#Lexicon = pd.DataFrame(data=Lexicon)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# In[14]:
|
||||
|
||||
|
||||
#Lexicon = build_lexicon('SC')
|
||||
|
||||
|
||||
# In[26]:
|
||||
|
||||
|
||||
def produce_features(name,Lexicon):
|
||||
#train_features
|
||||
for l in ['train','test']:
|
||||
raw=pd.read_csv('./MPHI_Preprocessed/'+name+'/{}.csv'.format(l))
|
||||
Stop_words=set(stopwords.words('english'))
|
||||
Features=raw[['sentence']]
|
||||
Tweet=Features['sentence'].copy()
|
||||
|
||||
Features['preprocessed_sentence']=Tweet.apply(sent_process)
|
||||
Features['tokenized_sents'] = Features.apply(lambda row: (row['preprocessed_sentence'].split()), axis=1)
|
||||
|
||||
porter = PorterStemmer()
|
||||
start=time.time()
|
||||
#word_sup_vect=[]
|
||||
#word_opp_vect=[]
|
||||
#word_nut_vect=[]
|
||||
|
||||
data = [['sentence','pmi_sup','pmi_opp','pmi_nut']]
|
||||
len_lexicon_word=len(Lexicon)
|
||||
|
||||
for i in range(len(Features['sentence'])):
|
||||
sum1=0
|
||||
sum2=0
|
||||
sum3=0
|
||||
total_lex=0
|
||||
temp = []
|
||||
for word in Features['tokenized_sents'][i]:
|
||||
#for j in range(len_lexicon_word):
|
||||
|
||||
w = porter.stem(word)
|
||||
if w in Lexicon:
|
||||
sum1=sum1+Lexicon[w]['pmi_sup']
|
||||
sum2=sum2+Lexicon[w]['pmi_opp']
|
||||
sum3=sum3+Lexicon[w]['pmi_nut']
|
||||
total_lex=total_lex+1
|
||||
#word_sup_vect.append(sum1/total_lex)
|
||||
#word_opp_vect.append(sum2/total_lex)
|
||||
#word_nut_vect.append(sum3/total_lex)
|
||||
data.append([Features['sentence'][i],sum1/total_lex,sum2/total_lex,sum3/total_lex])
|
||||
|
||||
my_df = pd.DataFrame(data)
|
||||
my_df.to_csv('./pmi/pmi_{}_{}.csv'.format(name,l+'1'),header=False,index=False)
|
||||
|
||||
|
||||
end=time.time()
|
||||
print(end-start)
|
||||
|
||||
|
||||
# In[27]:
|
||||
produce_features('HRT',build_lexicon('HRT'))
|
||||
|
||||
'''for dataset in ['AT','LA','CC','HC','FM']:
|
||||
produce_features(dataset,build_lexicon(dataset))'''
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
|
||||
|
318  SEN-SVM/SVM.py  Normal file
@@ -0,0 +1,318 @@
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[9]:
|
||||
|
||||
|
||||
import csv
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import PorterStemmer
|
||||
import string
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
import wordninja
|
||||
import re
|
||||
import json
|
||||
from sklearn import svm
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.metrics import classification_report,confusion_matrix
|
||||
import numpy as np
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.stem.porter import PorterStemmer
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# In[2]:
|
||||
stemmer = PorterStemmer()
|
||||
|
||||
def load_glove_embeddings_set():
|
||||
word2emb = []
|
||||
WORD2VEC_MODEL = "../glove.6B.300d.txt"
|
||||
fglove = open(WORD2VEC_MODEL,"r")
|
||||
for line in fglove:
|
||||
cols = line.strip().split()
|
||||
word = cols[0]
|
||||
word2emb.append(word)
|
||||
fglove.close()
|
||||
return set(word2emb)
|
||||
|
||||
def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"):
|
||||
print("Creating Normalization Dictionary")
|
||||
with open(no_slang_data, "r") as f:
|
||||
data1 = json.load(f)
|
||||
|
||||
data2 = {}
|
||||
|
||||
with open(emnlp_dict,"r") as f:
|
||||
lines = f.readlines()
|
||||
for line in lines:
|
||||
row = line.split('\t')
|
||||
data2[row[0]] = row[1].rstrip()
|
||||
|
||||
normalization_dict = {**data1,**data2}
|
||||
#print(normalization_dict)
|
||||
return normalization_dict
|
||||
|
||||
word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict()
|
||||
|
||||
|
||||
# In[3]:
|
||||
|
||||
|
||||
def sent_process(sent):
|
||||
sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent)
|
||||
sent = re.sub(r"#SemST", "", sent)
|
||||
sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent)
|
||||
#sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent)
|
||||
#sent = re.sub(r"([A-Z])", r" \1", sent)
|
||||
sent = re.sub(r"\'s", " \'s", sent)
|
||||
sent = re.sub(r"\'ve", " \'ve", sent)
|
||||
sent = re.sub(r"n\'t", " n\'t", sent)
|
||||
sent = re.sub(r"\'re", " \'re", sent)
|
||||
sent = re.sub(r"\'d", " \'d", sent)
|
||||
sent = re.sub(r"\'ll", " \'ll", sent)
|
||||
sent = re.sub(r",", " , ", sent)
|
||||
sent = re.sub(r"!", " ! ", sent)
|
||||
sent = re.sub(r"\(", " ( ", sent)
|
||||
sent = re.sub(r"\)", " ) ", sent)
|
||||
sent = re.sub(r"\?", " ? ", sent)
|
||||
sent = re.sub(r"\s{2,}", " ", sent)
|
||||
sent = sent.strip()
|
||||
word_tokens = sent.split()
|
||||
normalised_tokens = []
|
||||
for word in word_tokens:
|
||||
if word in norm_dict:
|
||||
#if False:
|
||||
normalised_tokens.extend(norm_dict[word].lower().split(" "))
|
||||
print(word," normalised to ",norm_dict[word])
|
||||
else:
|
||||
normalised_tokens.append(word.lower())
|
||||
wordninja_tokens = []
|
||||
for word in normalised_tokens:
|
||||
if word in word_dict:
|
||||
wordninja_tokens+=[word]
|
||||
else:
|
||||
wordninja_tokens+=wordninja.split(word)
|
||||
return " ".join(wordninja_tokens)
|
||||
|
||||
|
||||
# In[4]:
|
||||
|
||||
|
||||
|
||||
def svc_param_selection(X, y, nfolds):
|
||||
Cs = [0.001, 0.01, 0.1, 1, 10,100 ]
|
||||
gammas = [0.001, 0.01, 0.1, 1]
|
||||
param_grid = [{'C': Cs, 'gamma' : gammas , 'kernel' : ['rbf']},{'C': Cs , 'gamma' : gammas , 'kernel' : ['linear']}]
|
||||
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
|
||||
grid_search.fit(X, y)
|
||||
grid_search.best_params_
|
||||
return grid_search.best_params_
|
||||
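# Illustrative call (the returned values are hypothetical): the grid search is used below as
#   best = svc_param_selection(train_dataset, y_train, nfolds=5)
#   # e.g. best == {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
# GridSearchCV evaluates an SVC for every (C, gamma, kernel) combination with 5-fold
# cross-validation and returns the best-scoring parameter set.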
|
||||
|
||||
# In[21]:
|
||||
|
||||
|
||||
def train(topic,bow=False,senti=False,sta=False,ent=False):
|
||||
save_file = topic+"_"
|
||||
print(topic)
|
||||
y_train = []
|
||||
y_test = []
|
||||
sentences = []
|
||||
features_ent = []
|
||||
features_sta = []
|
||||
features_senti = []
|
||||
senti_dict = {'Neutral' : 0, 'Positive' : 1, 'Negative' : 2}
|
||||
with open("./final_feature_set/{}_train.csv".format(topic),"r",encoding='latin-1') as f:
|
||||
reader = csv.DictReader(f, delimiter=',')
|
||||
done = False
|
||||
for row in reader:
|
||||
sentences.append(row['sentence'])
|
||||
if ent:
|
||||
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
|
||||
if not done:
|
||||
save_file = save_file + "ent_"
|
||||
if sta:
|
||||
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
|
||||
if not done:
|
||||
save_file = save_file + "sta_"
|
||||
if senti:
|
||||
features_senti.append([senti_dict[row['senti']]])
|
||||
if not done:
|
||||
save_file = save_file + "senti_"
|
||||
done = True
|
||||
y_train.append(row['label'])
|
||||
L = len(sentences)
|
||||
with open("./final_feature_set/{}_test.csv".format(topic),"r",encoding='latin-1') as f:
|
||||
reader = csv.DictReader(f, delimiter=',')
|
||||
for row in reader:
|
||||
sentences.append(row['sentence'])
|
||||
if ent:
|
||||
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
|
||||
if sta:
|
||||
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
|
||||
if senti:
|
||||
features_senti.append([senti_dict[row['senti']]])
|
||||
y_test.append(row['label'])
|
||||
|
||||
all_features = []
|
||||
if bow:
|
||||
new_sentences = []
|
||||
for sent in sentences:
|
||||
tokens = word_tokenize(sent)
|
||||
tokens = [stemmer.stem(token) for token in tokens]
|
||||
ret = " ".join(w for w in tokens)
|
||||
new_sentences.append(ret)
|
||||
save_file = save_file + "bow_"
|
||||
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,1),min_df = 2)
|
||||
features_bow = vectorizer.fit_transform(new_sentences)
|
||||
all_features.append(features_bow.toarray())
|
||||
#features_bow_train = np.array(features_bow[:L].toarray())
|
||||
#features_bow_test = np.array(features_bow[L:].toarray())
|
||||
|
||||
if ent:
|
||||
#features_ent_train = np.array(features_ent[:L])
|
||||
#features_ent_test = np.array(features_ent[L:])
|
||||
all_features.append(features_ent)
|
||||
|
||||
if sta:
|
||||
#features_ent_train = np.array(features_ent[:L])
|
||||
#features_ent_test = np.array(features_ent[L:])
|
||||
all_features.append(features_sta)
|
||||
|
||||
|
||||
if senti:
|
||||
#features_senti_train = np.array(features_senti[:L])
|
||||
#features_senti_test = np.array(features_senti[L:])
|
||||
all_features.append(features_senti)
|
||||
|
||||
dataset = np.concatenate(all_features,axis=1)
|
||||
train_dataset = dataset[:L]
|
||||
test_dataset = dataset[L:]
|
||||
|
||||
best_params = svc_param_selection(train_dataset,y_train,nfolds=5)
|
||||
print(best_params)
|
||||
if best_params['kernel'] == 'rbf':
|
||||
model = svm.SVC(kernel='rbf' ,C = best_params['C'], gamma = best_params['gamma'],probability=True)
|
||||
else:
|
||||
model = svm.SVC(kernel='linear' ,C = best_params['C'],probability=True)
|
||||
|
||||
|
||||
model.fit(train_dataset,y_train)
|
||||
|
||||
y_pred = model.predict(test_dataset)
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
#cm = confusion_matrix(y_test,y_pred,labels=['0','1','2'])
|
||||
conf_score = model.predict_proba(test_dataset)
|
||||
|
||||
#print
|
||||
df = pd.DataFrame(np.concatenate([np.array(sentences[L:]).reshape(-1,1),np.array(y_pred).reshape(-1,1),np.array(conf_score)],axis=1))
|
||||
df.to_csv(save_file+".csv",header=False,index=False)
|
||||
return y_pred,y_test
|
||||
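# Resulting design matrix (as built above): every enabled feature block is concatenated
# column-wise, e.g. with bow and sta a row is [bow_0 ... bow_k, sta_nut, sta_sup, sta_opp];
# rows [:L] form the training split and rows [L:] the test split, since the train and test
# CSVs were appended into the same `sentences` list.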
|
||||
|
||||
|
||||
# In[22]:
|
||||
|
||||
'''
|
||||
#BOW
|
||||
print ('BOW---------')
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['AT','CC','FM','LA','HC']:
|
||||
a,b = train(dataset,bow = True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))'''
|
||||
|
||||
|
||||
# In[23]:
|
||||
|
||||
print ("STA---------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['AT','CC','FM','LA','HC']:
|
||||
a,b = train(dataset,sta=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[7]:
|
||||
|
||||
print("ENT----------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['AT','CC','FM','LA','HC']:
|
||||
a,b = train(dataset,ent=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[8]:
|
||||
|
||||
print("SENTI---------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['AT','CC','FM','LA','HC']:
|
||||
a,b = train(dataset,senti=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[9]:
|
||||
|
||||
print("ENT_SENTI--------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['AT','CC','FM','LA','HC']:
|
||||
a,b = train(dataset,ent=True,senti=True,sta=False)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[10]:
|
||||
|
||||
print("ENT_STA---------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['AT','CC','FM','LA','HC']:
|
||||
a,b = train(dataset,ent=True,senti=False,sta=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[11]:
|
||||
|
||||
print("SENTI_STA-----------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['AT','CC','FM','LA','HC']:
|
||||
a,b = train(dataset,ent=False,senti=True,sta=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[12]:
|
||||
|
||||
print("ENT_SENTI_STA---------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['AT','CC','FM','LA','HC']:
|
||||
a,b = train(dataset,ent=True,senti=True,sta=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
|
||||
|
380  SEN-SVM/SVM_mpchi.py  Normal file
@@ -0,0 +1,380 @@
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[9]:
|
||||
|
||||
|
||||
import csv
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import PorterStemmer
|
||||
import string
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
import wordninja
|
||||
import re
|
||||
import json
|
||||
from sklearn import svm
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.metrics import classification_report,confusion_matrix
|
||||
import numpy as np
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.stem.porter import PorterStemmer
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# In[2]:
|
||||
stemmer = PorterStemmer()
|
||||
|
||||
def load_glove_embeddings_set():
|
||||
word2emb = []
|
||||
WORD2VEC_MODEL = "../glove.6B.300d.txt"
|
||||
fglove = open(WORD2VEC_MODEL,"r")
|
||||
for line in fglove:
|
||||
cols = line.strip().split()
|
||||
word = cols[0]
|
||||
word2emb.append(word)
|
||||
fglove.close()
|
||||
return set(word2emb)
|
||||
|
||||
def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"):
|
||||
print("Creating Normalization Dictionary")
|
||||
with open(no_slang_data, "r") as f:
|
||||
data1 = json.load(f)
|
||||
|
||||
data2 = {}
|
||||
|
||||
with open(emnlp_dict,"r") as f:
|
||||
lines = f.readlines()
|
||||
for line in lines:
|
||||
row = line.split('\t')
|
||||
data2[row[0]] = row[1].rstrip()
|
||||
|
||||
normalization_dict = {**data1,**data2}
|
||||
#print(normalization_dict)
|
||||
return normalization_dict
|
||||
|
||||
word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict()
|
||||
|
||||
|
||||
# In[3]:
|
||||
|
||||
|
||||
def sent_process(sent):
|
||||
sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent)
|
||||
sent = re.sub(r"#SemST", "", sent)
|
||||
sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent)
|
||||
#sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent)
|
||||
#sent = re.sub(r"([A-Z])", r" \1", sent)
|
||||
sent = re.sub(r"\'s", " \'s", sent)
|
||||
sent = re.sub(r"\'ve", " \'ve", sent)
|
||||
sent = re.sub(r"n\'t", " n\'t", sent)
|
||||
sent = re.sub(r"\'re", " \'re", sent)
|
||||
sent = re.sub(r"\'d", " \'d", sent)
|
||||
sent = re.sub(r"\'ll", " \'ll", sent)
|
||||
sent = re.sub(r",", " , ", sent)
|
||||
sent = re.sub(r"!", " ! ", sent)
|
||||
sent = re.sub(r"\(", " ( ", sent)
|
||||
sent = re.sub(r"\)", " ) ", sent)
|
||||
sent = re.sub(r"\?", " ? ", sent)
|
||||
sent = re.sub(r"\s{2,}", " ", sent)
|
||||
sent = sent.strip()
|
||||
word_tokens = sent.split()
|
||||
normalised_tokens = []
|
||||
for word in word_tokens:
|
||||
if word in norm_dict:
|
||||
#if False:
|
||||
normalised_tokens.extend(norm_dict[word].lower().split(" "))
|
||||
print(word," normalised to ",norm_dict[word])
|
||||
else:
|
||||
normalised_tokens.append(word.lower())
|
||||
wordninja_tokens = []
|
||||
for word in normalised_tokens:
|
||||
if word in word_dict:
|
||||
wordninja_tokens+=[word]
|
||||
else:
|
||||
wordninja_tokens+=wordninja.split(word)
|
||||
return " ".join(wordninja_tokens)
|
||||
|
||||
|
||||
# In[4]:
|
||||
|
||||
#{'C': c, 'gamma' : gammas , 'kernel' : ['rbf']},
|
||||
|
||||
def svc_param_selection(X, y, nfolds):
|
||||
Cs = [0.001, 0.01, 0.1, 1, 10,100 ]
|
||||
c = [1]
|
||||
gammas = [0.001, 0.01, 0.1, 1]
|
||||
param_grid = [{'C': c , 'gamma' : gammas , 'kernel' : ['linear']}]
|
||||
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
|
||||
grid_search.fit(X, y)
|
||||
grid_search.best_params_
|
||||
return grid_search.best_params_
|
||||
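# Note: unlike SVM.py, this grid only searches the linear kernel with C fixed to 1, so
# best_params['kernel'] is always 'linear' here; the gamma values are carried along but
# have no effect on a linear SVC, and the rbf branch in train() below is never taken.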
|
||||
|
||||
# In[21]:
|
||||
|
||||
|
||||
def train(topic,bow=False,senti=False,sta=False,ent=False,med = False):
|
||||
save_file = topic+"_"
|
||||
print(topic)
|
||||
y_train = []
|
||||
y_test = []
|
||||
sentences = []
|
||||
features_ent = []
|
||||
features_sta = []
|
||||
features_senti = []
|
||||
features_med = []
|
||||
senti_dict = {'Neutral' : 0, 'Positive' : 1, 'Negative' : 2}
|
||||
with open("./final_feature_set/{}_train.csv".format(topic),"r",encoding='latin-1') as f:
|
||||
reader = csv.DictReader(f, delimiter=',')
|
||||
done = False
|
||||
for row in reader:
|
||||
sentences.append(row['sentence'])
|
||||
if ent:
|
||||
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
|
||||
if not done:
|
||||
save_file = save_file + "ent_"
|
||||
if sta:
|
||||
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
|
||||
if not done:
|
||||
save_file = save_file + "sta_"
|
||||
if senti:
|
||||
features_senti.append([senti_dict[row['senti']]])
|
||||
if not done:
|
||||
save_file = save_file + "senti_"
|
||||
if med:
|
||||
features_med.append([row['med_aff'],row['med_treat']])
|
||||
if not done:
|
||||
save_file = save_file + "med"
|
||||
done = True
|
||||
y_train.append(row['label'])
|
||||
L = len(sentences)
|
||||
with open("./final_feature_set/{}_test.csv".format(topic),"r",encoding='latin-1') as f:
|
||||
reader = csv.DictReader(f, delimiter=',')
|
||||
for row in reader:
|
||||
sentences.append(row['sentence'])
|
||||
if ent:
|
||||
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
|
||||
if sta:
|
||||
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
|
||||
if senti:
|
||||
features_senti.append([senti_dict[row['senti']]])
|
||||
if med:
|
||||
features_med.append([row['med_aff'],row['med_treat']])
|
||||
y_test.append(row['label'])
|
||||
|
||||
all_features = []
|
||||
if bow:
|
||||
new_sentences = []
|
||||
for sent in sentences:
|
||||
tokens = word_tokenize(sent)
|
||||
tokens = [stemmer.stem(token) for token in tokens]
|
||||
ret = " ".join(w for w in tokens)
|
||||
new_sentences.append(ret)
|
||||
save_file = save_file + "bow_"
|
||||
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,1),min_df = 2)
|
||||
features_bow = vectorizer.fit_transform(new_sentences)
|
||||
all_features.append(features_bow.toarray())
|
||||
#features_bow_train = np.array(features_bow[:L].toarray())
|
||||
#features_bow_test = np.array(features_bow[L:].toarray())
|
||||
|
||||
if ent:
|
||||
#features_ent_train = np.array(features_ent[:L])
|
||||
#features_ent_test = np.array(features_ent[L:])
|
||||
all_features.append(features_ent)
|
||||
|
||||
if sta:
|
||||
#features_ent_train = np.array(features_ent[:L])
|
||||
#features_ent_test = np.array(features_ent[L:])
|
||||
all_features.append(features_sta)
|
||||
|
||||
|
||||
if senti:
|
||||
#features_senti_train = np.array(features_senti[:L])
|
||||
#features_senti_test = np.array(features_senti[L:])
|
||||
all_features.append(features_senti)
|
||||
|
||||
if med:
|
||||
all_features.append(features_med)
|
||||
dataset = np.concatenate(all_features,axis=1)
|
||||
train_dataset = dataset[:L]
|
||||
test_dataset = dataset[L:]
|
||||
|
||||
best_params = svc_param_selection(train_dataset,y_train,nfolds=5)
|
||||
print(best_params)
|
||||
if best_params['kernel'] == 'rbf':
|
||||
model = svm.SVC(kernel='rbf' ,C = best_params['C'], gamma = best_params['gamma'],probability=True)
|
||||
else:
|
||||
model = svm.SVC(kernel='linear' ,C = best_params['C'], gamma = best_params['gamma'],probability=True)
|
||||
|
||||
|
||||
model.fit(train_dataset,y_train)
|
||||
|
||||
y_pred = model.predict(test_dataset)
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
#cm = confusion_matrix(y_test,y_pred,labels=['0','1','2'])
|
||||
conf_score = model.predict_proba(test_dataset)
|
||||
|
||||
#print
|
||||
df = pd.DataFrame(np.concatenate([np.array(sentences[L:]).reshape(-1,1),np.array(y_pred).reshape(-1,1),np.array(conf_score)],axis=1))
|
||||
df.to_csv(save_file+".csv",header=False,index=False)
|
||||
return y_pred,y_test
|
||||
|
||||
|
||||
|
||||
# In[22]:
|
||||
|
||||
'''
|
||||
#BOW
|
||||
print ('BOW---------')
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,bow = True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[23]:
|
||||
|
||||
print ("STA---------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,sta=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[7]:
|
||||
|
||||
print("ENT----------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,ent=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[8]:
|
||||
|
||||
print("SENTI---------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,senti=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[9]:
|
||||
|
||||
print("ENT_SENTI--------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset ,senti=True , ent=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[10]:
|
||||
|
||||
print("ENT_STA---------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,ent=True,sta=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[11]:
|
||||
|
||||
print("SENTI_STA-----------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,senti=True,sta=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[12]:
|
||||
|
||||
print("ENT_SENTI_STA---------")
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['AT','CC','FM','LA','HC']:
|
||||
a,b = train(dataset,ent=True,senti=True,sta=True,bow = True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,med=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,sta = True, med=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,senti = True, med=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,sta = True, senti = True, med=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
||||
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,sta = True, senti = True, med=True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))'''
|
||||
|
||||
y_pred,y_test = [],[]
|
||||
for dataset in ['MMR','HRT','EC','VC','SC']:
|
||||
a,b = train(dataset,sta = True, senti = True, med=True, ent = True, bow= True)
|
||||
y_pred.extend(a)
|
||||
y_test.extend(b)
|
||||
print(len(a),len(b))
|
||||
|
||||
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
|
78  SEN-SVM/sentiment_api_2.py  Normal file
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Jun 13 03:05:06 2018
|
||||
|
||||
@author: shalmoli
|
||||
"""
|
||||
import pandas as pd
|
||||
import csv
|
||||
from pycorenlp import StanfordCoreNLP
|
||||
import json
|
||||
|
||||
#reading Input file
|
||||
data=pd.read_csv('HRT2.csv')
|
||||
#Extracting Sentences
|
||||
sentence=data['sentences']
|
||||
length=len(sentence)
|
||||
|
||||
#storing Result
|
||||
# Before executing this, the Stanford CoreNLP server must be started from the terminal.
|
||||
'''CoreNLP returns a sentiment per sentence.
If an abstract contains more than one sentence, we conclude its sentiment by taking the majority vote and breaking ties randomly.'''
|
||||
result=[]
|
||||
for i in range(length):
|
||||
nlp = StanfordCoreNLP('http://localhost:9000')
|
||||
res = nlp.annotate(sentence[i],
|
||||
properties={
|
||||
'annotators': 'sentiment',
|
||||
'outputFormat': 'json',
|
||||
'timeout': 100000,
|
||||
})
|
||||
print (res)
|
||||
count=0    # number of sentences in the input abstract
count_1=0  # count of Negative sentences
count_2=0  # count of Neutral sentences
count_3=0  # count of Positive sentences
|
||||
for s in (res['sentences']):
|
||||
if(s["sentimentValue"]):
|
||||
count=count+1
|
||||
if(s["sentiment"]=='Negative' or s["sentiment"]=='Verynegative'):
|
||||
count_1=count_1+1
|
||||
else:
|
||||
if(s["sentiment"]=='Positive' or s["sentiment"]=='Verypositive'):
|
||||
count_3=count_3+1
|
||||
else:
|
||||
count_2=count_2+1
|
||||
if(count>1):
|
||||
if(count_1 > count_2 and count_1 > count_3):
|
||||
result.append("Negative")
|
||||
else:
|
||||
if(count_2 > count_1 and count_2 > count_3):
|
||||
result.append('Neutral')
|
||||
else:
|
||||
result.append('Positive')
|
||||
else:
|
||||
if(s["sentiment"]=='Negative' or s["sentiment"]=='Verynegative'):
|
||||
result.append("Negative")
|
||||
else:
|
||||
if(s["sentiment"]=='Positive' or s["sentiment"]=='Verypositive'):
|
||||
result.append("Positive")
|
||||
else:
|
||||
result.append('Neutral')
|
||||
|
||||
# Storing output to file; the output is a binary indicator for each class plus the sentiment label.
|
||||
file = open('Sentiment_HRT2.csv','a')
|
||||
fields = ('sentence','positive','negative','neutral','sentiment')
|
||||
wr = csv.DictWriter(file, fieldnames=fields, lineterminator = '\n')
|
||||
wr.writeheader()
|
||||
for i in range(length):
|
||||
if(result[i]=='Negative'):
|
||||
wr.writerow({'sentence':sentence[i],'positive':0,'negative':1,'neutral':0,'sentiment':result[i]})
|
||||
else:
|
||||
if(result[i]=='Positive'):
|
||||
wr.writerow({'sentence':sentence[i],'positive':1,'negative':0,'neutral':0,'sentiment':result[i]})
|
||||
else:
|
||||
wr.writerow({'sentence':sentence[i],'positive':0,'negative':0,'neutral':1,'sentiment':result[i]})
|
||||
|
||||
file.close()
|
342  SEN-SVM/te_f.py  Normal file
@@ -0,0 +1,342 @@
def isnan(value):
|
||||
try:
|
||||
import math
|
||||
return math.isnan(float(value))
|
||||
except:
|
||||
return False
|
||||
|
||||
#matplotlib inline
|
||||
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
#import matplotlib.pyplot as plt
|
||||
#import matplotlib.ticker as ticker
|
||||
import urllib
|
||||
import sys
|
||||
import os
|
||||
import csv
|
||||
import zipfile
|
||||
|
||||
glove_zip_file = "glove.6B.zip"
|
||||
glove_vectors_file = "glove.6B.50d.txt"
|
||||
|
||||
snli_zip_file = "snli_1.0.zip"
|
||||
snli_dev_file = "snli_1.0_dev.txt"
|
||||
snli_full_dataset_file = "snli_1.0_train.txt"
|
||||
|
||||
from six.moves.urllib.request import urlretrieve
|
||||
|
||||
#large file - 862 MB
|
||||
if (not os.path.isfile(glove_zip_file) and
|
||||
not os.path.isfile(glove_vectors_file)):
|
||||
urlretrieve ("http://nlp.stanford.edu/data/glove.6B.zip",
|
||||
glove_zip_file)
|
||||
|
||||
#medium-sized file - 94.6 MB
|
||||
if (not os.path.isfile(snli_zip_file) and
|
||||
not os.path.isfile(snli_dev_file)):
|
||||
urlretrieve ("https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
|
||||
snli_zip_file)
|
||||
|
||||
def unzip_single_file(zip_file_name, output_file_name):
|
||||
"""
|
||||
If the outFile is already created, don't recreate
|
||||
If the outFile does not exist, create it from the zipFile
|
||||
"""
|
||||
if not os.path.isfile(output_file_name):
|
||||
with open(output_file_name, 'wb') as out_file:
|
||||
with zipfile.ZipFile(zip_file_name) as zipped:
|
||||
for info in zipped.infolist():
|
||||
if output_file_name in info.filename:
|
||||
with zipped.open(info) as requested_file:
|
||||
out_file.write(requested_file.read())
|
||||
return
|
||||
|
||||
unzip_single_file(glove_zip_file, glove_vectors_file)
|
||||
unzip_single_file(snli_zip_file, snli_dev_file)
|
||||
|
||||
glove_wordmap = {}
|
||||
with open(glove_vectors_file, "r") as glove:
|
||||
for line in glove:
|
||||
name, vector = tuple(line.split(" ", 1))
|
||||
glove_wordmap[name] = np.fromstring(vector, sep=" ")
|
||||
|
||||
|
||||
def sentence2sequence(sentence):
|
||||
"""
|
||||
|
||||
- Turns an input sentence into an (n,d) matrix,
|
||||
where n is the number of tokens in the sentence
|
||||
and d is the number of dimensions each word vector has.
|
||||
|
||||
"""
|
||||
tokens = sentence.lower().split(" ")
|
||||
rows = []
|
||||
words = []
|
||||
#Greedy search for tokens
|
||||
for token in tokens:
|
||||
i = len(token)
|
||||
while len(token) > 0 and i > 0:
|
||||
word = token[:i]
|
||||
if word in glove_wordmap:
|
||||
rows.append(glove_wordmap[word])
|
||||
words.append(word)
|
||||
token = token[i:]
|
||||
i = len(token)
|
||||
else:
|
||||
i = i-1
|
||||
return rows, words
|
||||
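# Greedy longest-prefix lookup (illustrative; the actual splits depend on the 50-d GloVe vocab):
#   sentence2sequence("climate change")  -> ([vec("climate"), vec("change")], ["climate", "change"])
#   sentence2sequence("climatechange")   -> the token is consumed prefix-by-prefix, e.g. into
#                                           "climate" + "change" if both are in glove_wordmap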
|
||||
rnn_size = 64
|
||||
rnn = tf.contrib.rnn.BasicRNNCell(rnn_size)
|
||||
|
||||
#Constants setup
|
||||
max_hypothesis_length, max_evidence_length = 60, 50
|
||||
batch_size, vector_size, hidden_size = 128, 50, 64
|
||||
|
||||
lstm_size = hidden_size
|
||||
|
||||
weight_decay = 0.0001
|
||||
|
||||
learning_rate = 1
|
||||
|
||||
input_p, output_p = 0.5, 0.5
|
||||
|
||||
training_iterations_count = 100000
|
||||
|
||||
display_step = 10
|
||||
|
||||
def score_setup(row):
|
||||
convert_dict = {
|
||||
'entailment': 0,
|
||||
'neutral': 1,
|
||||
'contradiction': 2
|
||||
}
|
||||
score = np.zeros((3,))
|
||||
for x in range(1,6):
|
||||
tag = row["label"+str(x)]
|
||||
if tag in convert_dict: score[convert_dict[tag]] += 1
|
||||
return score / (1.0*np.sum(score))
|
||||
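# score_setup converts the five annotator labels (label1..label5) into a normalised vote
# vector over (entailment, neutral, contradiction); e.g. three "entailment" and two
# "neutral" votes give [0.6, 0.4, 0.0].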
|
||||
def fit_to_size(matrix, shape):
|
||||
res = np.zeros(shape)
|
||||
slices = [slice(0,min(dim,shape[e])) for e, dim in enumerate(matrix.shape)]
|
||||
res[slices] = matrix[slices]
|
||||
return res
|
||||
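# fit_to_size zero-pads or truncates a matrix to the requested shape, e.g.
#   fit_to_size(np.ones((3, 50)), (60, 50)).shape  == (60, 50)   # rows 3..59 are zero
#   fit_to_size(np.ones((70, 50)), (60, 50)).shape == (60, 50)   # truncated to 60 rows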
|
||||
|
||||
def split_data_into_scores():
|
||||
import csv
|
||||
with open("snli_1.0_dev.txt","r") as data:
|
||||
train = csv.DictReader(data, delimiter='\t')
|
||||
evi_sentences = []
|
||||
hyp_sentences = []
|
||||
labels = []
|
||||
scores = []
|
||||
for row in train:
|
||||
hyp_sentences.append(np.vstack(
|
||||
sentence2sequence(row["sentence1"].lower())[0]))
|
||||
evi_sentences.append(np.vstack(
|
||||
sentence2sequence(row["sentence2"].lower())[0]))
|
||||
labels.append(row["gold_label"])
|
||||
scores.append(score_setup(row))
|
||||
|
||||
hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
|
||||
for x in hyp_sentences])
|
||||
evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
|
||||
for x in evi_sentences])
|
||||
|
||||
return (hyp_sentences, evi_sentences), labels, np.array(scores)
|
||||
|
||||
data_feature_list, correct_values, correct_scores = split_data_into_scores()
|
||||
|
||||
l_h, l_e = max_hypothesis_length, max_evidence_length
|
||||
N, D, H = batch_size, vector_size, hidden_size
|
||||
l_seq = l_h + l_e
|
||||
|
||||
tf.reset_default_graph()
|
||||
# lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
|
||||
lstm = tf.nn.rnn_cell.LSTMCell(lstm_size)
|
||||
lstm_drop = tf.contrib.rnn.DropoutWrapper(lstm, input_p, output_p)
|
||||
|
||||
|
||||
# N: The number of elements in each of our batches,
|
||||
# which we use to train subsets of data for efficiency's sake.
|
||||
# l_h: The maximum length of a hypothesis, or the second sentence. This is
|
||||
# used because training an RNN is extraordinarily difficult without
|
||||
# rolling it out to a fixed length.
|
||||
# l_e: The maximum length of evidence, the first sentence. This is used
|
||||
# because training an RNN is extraordinarily difficult without
|
||||
# rolling it out to a fixed length.
|
||||
# D: The size of our used GloVe or other vectors.
|
||||
hyp = tf.placeholder(tf.float32, [N, l_h, D], 'hypothesis')
|
||||
evi = tf.placeholder(tf.float32, [N, l_e, D], 'evidence')
|
||||
y = tf.placeholder(tf.float32, [N, 3], 'label')
|
||||
# hyp: Where the hypotheses will be stored during training.
|
||||
# evi: Where the evidences will be stored during training.
|
||||
# y: Where correct scores will be stored during training.
|
||||
|
||||
# lstm_size: the size of the gates in the LSTM,
|
||||
# as in the first LSTM layer's initialization.
|
||||
# lstm_back = tf.contrib.rnn.BasicLSTMCell(lstm_size)
|
||||
lstm_back = tf.nn.rnn_cell.LSTMCell(lstm_size)
|
||||
# lstm_back: The LSTM used for looking backwards
|
||||
# through the sentences, similar to lstm.
|
||||
|
||||
# input_p: the probability that inputs to the LSTM will be retained at each
|
||||
# iteration of dropout.
|
||||
# output_p: the probability that outputs from the LSTM will be retained at
|
||||
# each iteration of dropout.
|
||||
lstm_drop_back = tf.contrib.rnn.DropoutWrapper(lstm_back, input_p, output_p)
|
||||
# lstm_drop_back: A dropout wrapper for lstm_back, like lstm_drop.
|
||||
|
||||
|
||||
fc_initializer = tf.random_normal_initializer(stddev=0.1)
|
||||
# fc_initializer: initial values for the fully connected layer's weights.
|
||||
# hidden_size: the size of the outputs from each lstm layer.
|
||||
# Multiplied by 2 to account for the two LSTMs.
|
||||
fc_weight = tf.get_variable('fc_weight', [2*hidden_size, 3],
|
||||
initializer = fc_initializer)
|
||||
# fc_weight: Storage for the fully connected layer's weights.
|
||||
fc_bias = tf.get_variable('bias', [3])
|
||||
# fc_bias: Storage for the fully connected layer's bias.
|
||||
|
||||
# tf.GraphKeys.REGULARIZATION_LOSSES: A key to a collection in the graph
|
||||
# designated for losses due to regularization.
|
||||
# In this case, this portion of loss is regularization on the weights
|
||||
# for the fully connected layer.
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
|
||||
tf.nn.l2_loss(fc_weight))
|
||||
|
||||
x = tf.concat([hyp, evi], 1) # N, (Lh+Le), d
|
||||
# Permuting batch_size and n_steps
|
||||
x = tf.transpose(x, [1, 0, 2]) # (Le+Lh), N, d
|
||||
# Reshaping to (n_steps*batch_size, n_input)
|
||||
x = tf.reshape(x, [-1, vector_size]) # (Le+Lh)*N, d
|
||||
# Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
|
||||
x = tf.split(x, l_seq,)
|
||||
|
||||
# x: the inputs to the bidirectional_rnn
|
||||
|
||||
|
||||
# tf.contrib.rnn.static_bidirectional_rnn: Runs the input through
|
||||
# two recurrent networks, one that runs the inputs forward and one
|
||||
# that runs the inputs in reversed order, combining the outputs.
|
||||
rnn_outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm, lstm_back,x, dtype=tf.float32)
|
||||
# rnn_outputs: the list of LSTM outputs, as a list.
|
||||
# What we want is the latest output, rnn_outputs[-1]
|
||||
|
||||
classification_scores = tf.matmul(rnn_outputs[-1], fc_weight) + fc_bias
|
||||
# The scores are relative certainties for how likely the output matches
|
||||
# a certain entailment:
|
||||
# 0: Positive entailment
|
||||
# 1: Neutral entailment
|
||||
# 2: Negative entailment
|
||||
|
||||
with tf.variable_scope('Accuracy'):
|
||||
predicts = tf.cast(tf.argmax(classification_scores, 1), 'int32')
|
||||
y_label = tf.cast(tf.argmax(y, 1), 'int32')
|
||||
corrects = tf.equal(predicts, y_label)
|
||||
num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
|
||||
accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))
|
||||
|
||||
with tf.variable_scope("loss"):
|
||||
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
|
||||
logits = classification_scores, labels = y)
|
||||
loss = tf.reduce_mean(cross_entropy)
|
||||
total_loss = loss + weight_decay * tf.add_n(
|
||||
tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
|
||||
|
||||
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
|
||||
|
||||
opt_op = optimizer.minimize(total_loss)
|
||||
|
||||
# Initialize variables
|
||||
init = tf.global_variables_initializer()
|
||||
|
||||
# Use TQDM if installed
|
||||
tqdm_installed = False
|
||||
try:
|
||||
from tqdm import tqdm
|
||||
tqdm_installed = True
|
||||
except:
|
||||
pass
|
||||
|
||||
# Launch the Tensorflow session
|
||||
sess = tf.Session()
|
||||
sess.run(init)
|
||||
|
||||
# training_iterations_count: The number of data pieces to train on in total
|
||||
# batch_size: The number of data pieces per batch
|
||||
training_iterations = range(0,training_iterations_count,batch_size)
|
||||
if tqdm_installed:
|
||||
# Add a progress bar if TQDM is installed
|
||||
training_iterations = tqdm(training_iterations)
|
||||
|
||||
for i in training_iterations:
|
||||
|
||||
# Select indices for a random data subset
|
||||
batch = np.random.randint(data_feature_list[0].shape[0], size=batch_size)
|
||||
|
||||
# Use the selected subset indices to initialize the graph's
|
||||
# placeholder values
|
||||
hyps, evis, ys = (data_feature_list[0][batch,:],
|
||||
data_feature_list[1][batch,:],
|
||||
correct_scores[batch])
|
||||
|
||||
# Run the optimization with these initialized values
|
||||
sess.run([opt_op], feed_dict={hyp: hyps, evi: evis, y: ys})
|
||||
# display_step: how often the accuracy and loss should
|
||||
# be tested and displayed.
|
||||
if (i/batch_size) % display_step == 0:
|
||||
# Calculate batch accuracy
|
||||
acc = sess.run(accuracy, feed_dict={hyp: hyps, evi: evis, y: ys})
|
||||
# Calculate batch loss
|
||||
tmp_loss = sess.run(loss, feed_dict={hyp: hyps, evi: evis, y: ys})
|
||||
# Display results
|
||||
print("Iter " + str(i/batch_size) + ", Minibatch Loss= " + \
|
||||
"{:.6f}".format(tmp_loss) + ", Training Accuracy= " + \
|
||||
"{:.5f}".format(acc))
|
||||
Features_pmh=pd.read_csv('Climate1.csv')
|
||||
|
||||
length_features=len(Features_pmh)
|
||||
result=[]
|
||||
pred=[]
|
||||
|
||||
import string
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.tokenize import sent_tokenize,word_tokenize
|
||||
from nltk.tokenize.punkt import PunktSentenceTokenizer
|
||||
tokenizer = PunktSentenceTokenizer()
|
||||
|
||||
text=Features_pmh['Tweet'].copy()
|
||||
def sent_process(sent):
|
||||
sent = sent.translate(str.maketrans('', '', string.punctuation))
|
||||
sent = [word for word in sent.split() if word.lower() not in stopwords.words('english')]
|
||||
return " ".join(sent)
|
||||
Features_pmh['Tweet']=text.apply(sent_process)
|
||||
file = open('Climate_t.csv','a')
|
||||
fields = ('Text','hypotheses','result','pos_scr','neg_scr','nut_scr')
|
||||
wr = csv.DictWriter(file, fieldnames=fields, lineterminator = '\n')
|
||||
wr.writeheader()
|
||||
|
||||
for i in range(length_features):
|
||||
if(isnan(Features_pmh['Tweet'][i])==False):
|
||||
evidences = [Features_pmh['Tweet'][i]]
|
||||
else:
|
||||
evidences = [Features_pmh['Tweet'][i]]
|
||||
|
||||
hypotheses = ["Climate Change is a Real Concern"]
|
||||
|
||||
sentence1 = [fit_to_size(np.vstack(sentence2sequence(evidence)[0]),(60, 50)) for evidence in evidences]
|
||||
|
||||
sentence2 = [fit_to_size(np.vstack(sentence2sequence(hypothesis)[0]),(50,50)) for hypothesis in hypotheses]
|
||||
|
||||
prediction = sess.run(classification_scores, feed_dict={hyp: (sentence1 * N),evi: (sentence2 * N),y: [[0,0,0]]*N})
|
||||
#print(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])]+" entailment")
|
||||
result.append(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])])
|
||||
pred.append(prediction[0])
|
||||
wr.writerow({'Text':Features_pmh['Tweet'][i],'hypotheses':hypotheses,"result" :result[i],'pos_scr':pred[i][0],'neg_scr':pred[i][1],'nut_scr':pred[i][2]})
|
||||
|
||||
file.close()
|
@@ -18,7 +18,7 @@ class LSTM_TAN(nn.Module):
|
||||
|
||||
self.hidden_dim = hidden_dim
|
||||
self.embedding_dim = embedding_dim
|
||||
#WORD_EMBEDDINGS
|
||||
|
||||
self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
|
||||
self.word_embeddings.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float))
|
||||
self.word_embeddings.weight.requires_grad=True
|
||||
@@ -30,14 +30,10 @@ class LSTM_TAN(nn.Module):
|
||||
self.attention = nn.Linear(2*embedding_dim,1)
|
||||
|
||||
|
||||
#LSTM
|
||||
# The LSTM takes word embeddings as inputs, and outputs hidden states
|
||||
# with dimensionality hidden_dim.
|
||||
self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=(version!="lstm"))
|
||||
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
|
||||
#FINAL_LAYER
|
||||
if version !="lstm":
|
||||
self.hidden2target = nn.Linear(2*self.hidden_dim, n_targets)
|
||||
else:
|
||||
@@ -46,10 +42,6 @@ class LSTM_TAN(nn.Module):
|
||||
self.hidden = self.init_hidden()
|
||||
|
||||
def init_hidden(self):
|
||||
# Before we've done anything, we dont have any hidden state.
|
||||
# Refer to the Pytorch documentation to see exactly
|
||||
# why they have this dimensionality.
|
||||
# The axes semantics are (num_layers, minibatch_size, hidden_dim)
|
||||
return (torch.zeros(1, 1, self.hidden_dim),
|
||||
torch.zeros(1, 1, self.hidden_dim))
|
||||
|
||||
@@ -62,11 +54,8 @@ class LSTM_TAN(nn.Module):
|
||||
|
||||
if version != "tan-":
|
||||
t_emb = self.word_embeddings(target)
|
||||
#print(t_emb)
|
||||
#print(torch.mean(t_emb,dim=0,keepdim=True).shape)
|
||||
t_emb = torch.mean(t_emb,dim=0,keepdim=True)
|
||||
xt_emb = torch.cat((x_emb,t_emb.expand(len(sentence),-1)),dim=1)
|
||||
#print(xt_emb)
|
||||
|
||||
if version == "tan-":
|
||||
lstm_out, _ = self.lstm(
|
||||
@@ -96,12 +85,3 @@ class LSTM_TAN(nn.Module):
|
||||
|
||||
return target_scores
|
||||
|
||||
#t_emb = self.word_embeddings(target)
|
||||
#print(t_emb)
|
||||
#print(torch.mean(t_emb,dim=0,keepdim=True).shape)
|
||||
#t_emb = torch.mean(t_emb,dim=0,keep dim=True)
|
||||
|
||||
#xt_emb = torch.cat((x_emb,t_emb.expand(len(sentence),-1)),dim=1)
|
||||
#print(xt_emb)
|
||||
|
||||
# In[26]:
|
||||
|