From a86f074dca04befc990674883bf2b3fbc73714ed Mon Sep 17 00:00:00 2001 From: Siddharth Date: Tue, 18 Jun 2019 22:16:04 +0530 Subject: [PATCH] added SEN-SVM codes --- SEN-SVM/STA_features.py | 533 +++++++++++++++++++++++++++++++++++++ SEN-SVM/SVM.py | 318 ++++++++++++++++++++++ SEN-SVM/SVM_mpchi.py | 380 ++++++++++++++++++++++++++ SEN-SVM/sentiment_api_2.py | 78 ++++++ SEN-SVM/te_f.py | 342 ++++++++++++++++++++++++ TAN/networks.py | 22 +- 6 files changed, 1652 insertions(+), 21 deletions(-) create mode 100644 SEN-SVM/STA_features.py create mode 100644 SEN-SVM/SVM.py create mode 100644 SEN-SVM/SVM_mpchi.py create mode 100644 SEN-SVM/sentiment_api_2.py create mode 100644 SEN-SVM/te_f.py diff --git a/SEN-SVM/STA_features.py b/SEN-SVM/STA_features.py new file mode 100644 index 0000000..afd9a32 --- /dev/null +++ b/SEN-SVM/STA_features.py @@ -0,0 +1,533 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +import time +import numpy as np +import pandas as pd +import string +import csv +from scipy import stats +import random +import json +import nltk +from nltk.tokenize import word_tokenize +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +import re +import wordninja +from collections import defaultdict, Counter +import math +import sys + + +# In[ ]: + + +def load_glove_embeddings_set(): + word2emb = [] + WORD2VEC_MODEL = "glove.6B.300d.txt" + fglove = open(WORD2VEC_MODEL,"r") + for line in fglove: + cols = line.strip().split() + word = cols[0] + word2emb.append(word) + fglove.close() + return set(word2emb) + +def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"): + print("Creating Normalization Dictionary") + with open(no_slang_data, "r") as f: + data1 = json.load(f) + + data2 = {} + + with open(emnlp_dict,"r") as f: + lines = f.readlines() + for line in lines: + row = line.split('\t') + data2[row[0]] = row[1].rstrip() + + normalization_dict = {**data1,**data2} + #print(normalization_dict) + return normalization_dict + +word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict() + + +# In[ ]: + + +def sent_process(sent): + sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent) + sent = re.sub(r"#SemST", "", sent) + sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent) + #sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent) + #sent = re.sub(r"([A-Z])", r" \1", sent) + sent = re.sub(r"\'s", " \'s", sent) + sent = re.sub(r"\'ve", " \'ve", sent) + sent = re.sub(r"n\'t", " n\'t", sent) + sent = re.sub(r"\'re", " \'re", sent) + sent = re.sub(r"\'d", " \'d", sent) + sent = re.sub(r"\'ll", " \'ll", sent) + sent = re.sub(r",", " , ", sent) + sent = re.sub(r"!", " ! ", sent) + sent = re.sub(r"\(", " ( ", sent) + sent = re.sub(r"\)", " ) ", sent) + sent = re.sub(r"\?", " ? 
", sent) + sent = re.sub(r"\s{2,}", " ", sent) + sent = sent.strip() + word_tokens = sent.split() + normalised_tokens = [] + for word in word_tokens: + if word in norm_dict: + #if False: + normalised_tokens.extend(norm_dict[word].lower().split(" ")) + #print(word," normalised to ",norm_dict[word]) + else: + normalised_tokens.append(word.lower()) + wordninja_tokens = [] + for word in normalised_tokens: + if word in word_dict: + wordninja_tokens+=[word] + else: + wordninja_tokens+=wordninja.split(word) + return " ".join(wordninja_tokens) + + +# In[ ]: + + + + + +# In[13]: + + + + +def build_lexicon(name): + def pmi(x,y,z,t): + res=(x/(y*(z/t)+(math.sqrt(x)*math.sqrt(math.log(0.9)/(-2))))) + return math.log(res,2) + + + + def prob(word1,nava,total): + count_prob=0 + if word1 in nava: + count_prob += nava[word1] + return((count_prob+1)) + + def prob_cond(word1,seed,stance_seed,stance,total): + count_prob=0 + for i in range(len(seed)): + if(seed[i]==word1): + if(stance_seed[i]==stance): + count_prob=count_prob+1 + return((count_prob+1)) + + + def prob_cond1(word1,word2,Features,total): + count_prob=0 + #for i in range(length_Features): + # flag1=0 + # flag2=0 + # for word in Features['co_relation'][i]: + # if(word==word1): + # flag1=1 + # if(word==word2): + # flag2=1 + # if(flag1==1 and flag2==1): + # count_prob=count_prob+1 + #seed and non-seed lexicon formation + return((co_relation[(word1,word2)]+1)) + + print("building lexicon for ", name) + raw=pd.read_csv('./MPHI_Preprocessed/'+name+'/train.csv') + + #Features Extraction + porter=PorterStemmer() + + Stop_words=set(stopwords.words('english')) + Features=raw[['sentence']] + Tweet=Features['sentence'].copy() + + Features['sentence']=Tweet.apply(sent_process) + Features['tokenized_sents'] = Features.apply(lambda row: (row['sentence'].split()), axis=1) + Features['pos_tag']=Features.apply(lambda row:nltk.pos_tag(row['tokenized_sents'],tagset='universal'),axis=1) + Features['stance']=raw['stance'] + length_Features=len(Features['sentence']) + + co_relation=defaultdict(int) + co_relation2 = [] + for i in range(length_Features): + line=[] + for word,tag in Features['pos_tag'][i]: + if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'): + if(word not in Stop_words): + line.append(porter.stem(word)) + for i in range(len(line)): + for j in range(i+1,len(line)): + co_relation[(line[i],line[j])]+=1 + co_relation[(line[j],line[i])]+=1 + co_relation2.append(line) + + Features['co_relation']=co_relation2 + + FAVOR=[] + AGAINST=[] + NONE=[] + for i in range(length_Features): + if(Features['stance'][i]=='support'): + for word,tag in Features['pos_tag'][i]: + if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'): + if(word not in Stop_words): + FAVOR.append(porter.stem(word)) + else: + if(Features['stance'][i]=='oppose'): + for word,tag in Features['pos_tag'][i]: + if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'): + if(word not in Stop_words): + AGAINST.append(porter.stem(word)) + else: + if(Features['stance'][i]=='neutral'): + for word,tag in Features['pos_tag'][i]: + if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'): + if(word not in Stop_words): + NONE.append(porter.stem(word)) + + len_sup=len(FAVOR) + len_opp=len(AGAINST) + len_nut=len(NONE) + + len_co=[] + for i in range(length_Features): + len_co.append(len(Features['co_relation'][i])) + + Features['len_nava']=len_co + + nava=[] + for i in range(length_Features): + for word,tag in Features['pos_tag'][i]: + if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'): + 
if(word not in Stop_words): + nava.append(word.lower()) + nava_stem=[] + for word in nava: + nava_stem.append(porter.stem(word)) + uni_nava_stem=list(set(nava_stem)) + nava_stem = Counter(nava_stem) + + + total=len(nava_stem) + length=len(uni_nava_stem) + + print(total,length) + + seed=[] + non_seed=[] + seed_stance=[] + for i in range(len(Features)): + for j in range(int(0.75*Features['len_nava'][i])): + seed.append(Features['co_relation'][i][j]) + seed_stance.append(Features['stance'][i]) + for j in range(int(0.75*Features['len_nava'][i]),Features['len_nava'][i]): + non_seed.append(Features['co_relation'][i][j]) + uni_seed=list(set(seed)) + uni_non_seed=list(set(non_seed)) + + '''for i in range(len(Features)): + x=[] + x=random.sample(Features['co_relation'][i],int(0.75*Features['len_nava'][i])) + for j in range(len(x)): + seed.append(x[j]) + seed_stance.append(Features['stance'][i]) + for j in range(Features['len_nava'][i]): + if(Features['co_relation'][i][j] not in x): + non_seed.append(Features['co_relation'][i][j]) + uni_seed=list(set(seed)) + uni_non_seed=list(set(non_seed))''' + + len_seed=len(seed) + len_uni_seed=len(uni_seed) + len_non_seed=len(non_seed) + len_uni_non_seed=len(uni_non_seed) + + len_seed_sup=0 + len_seed_opp=0 + len_seed_nut=0 + for i in range(len(seed_stance)): + if(seed_stance[i]=='support'): + len_seed_sup=len_seed_sup+1 + else: + if(seed_stance[i]=='oppose'): + len_seed_opp=len_seed_opp+1 + else: + len_seed_nut=len_seed_nut+1 + print(len_seed_nut,len_seed_opp,len_seed_sup) + + + + + prob_sup=len_seed_sup/(len_seed_sup+len_seed_opp+len_seed_nut) + prob_opp=len_seed_opp/(len_seed_sup+len_seed_opp+len_seed_nut) + prob_nut=len_seed_nut/(len_seed_sup+len_seed_opp+len_seed_nut) + + prob_word=[] + for word in uni_seed: + prob_word.append(prob(word,nava_stem,total)) + + prob_cond_word={} + prob_supp_word=[] + prob_opp_word=[] + prob_neu_word=[] + + for word in uni_seed: + prob_supp_word.append(prob_cond(word,seed,seed_stance,'support',(len_seed_sup+len_seed_opp+len_seed_nut))) + prob_opp_word.append(prob_cond(word,seed,seed_stance,'oppose',(len_seed_sup+len_seed_opp+len_seed_nut))) + prob_neu_word.append(prob_cond(word,seed,seed_stance,'neutral',(len_seed_sup+len_seed_opp+len_seed_nut))) + + prob_cond_word={'word':list(uni_seed),'prob_word':prob_word,'prob_supp_word':prob_supp_word,'prob_opp_word':prob_opp_word,'prob_neu_word':prob_neu_word} + Seed_lexicon = pd.DataFrame(data=prob_cond_word) + + + + print(Seed_lexicon) + + pmi_AGAINST=[] + pmi_FAVOR=[] + pmi_NONE=[] + '''for i in range(len_uni_seed): + pmi_AGAINST.append(pmi(prob_opp_word[i],prob_word[i],prob_opp)) + pmi_FAVOR.append(pmi(prob_supp_word[i],prob_word[i],prob_sup)) + pmi_NONE.append(pmi(prob_neu_word[i],prob_word[i],prob_nut))''' + + for i in range(len_uni_seed): + pmi_AGAINST.append(pmi(prob_opp_word[i],prob_word[i],len_seed_opp,len_seed)) + pmi_FAVOR.append(pmi(prob_supp_word[i],prob_word[i],len_seed_sup,len_seed)) + pmi_NONE.append(pmi(prob_neu_word[i],prob_word[i],len_seed_nut,len_seed)) + + + Seed_lexicon['pmi_AGAINST']=list(pmi_AGAINST) + Seed_lexicon['pmi_FAVOR']=list(pmi_FAVOR) + Seed_lexicon['pmi_NONE']=list(pmi_NONE) + + stance=[] + for i in range(len_uni_seed): + if((Seed_lexicon['pmi_FAVOR'][i] > Seed_lexicon['pmi_AGAINST'][i]) and (Seed_lexicon['pmi_FAVOR'][i] > Seed_lexicon['pmi_NONE'][i])): + stance.append('support') + else: + if((Seed_lexicon['pmi_AGAINST'][i] > Seed_lexicon['pmi_FAVOR'][i]) & (Seed_lexicon['pmi_AGAINST'][i] > Seed_lexicon['pmi_NONE'][i])): + stance.append('oppose') + 
else: + stance.append('neutral') + + Seed_lexicon['Stance']=list(stance) + + #NON SEED LEXICON + score_non_seed_opp=[] + score_non_seed_sup=[] + score_non_seed_nut=[] + + opp_seed_word=[] + nut_seed_word=[] + sup_seed_word=[] + for i in range(len_uni_seed): + if(Seed_lexicon['Stance'][i]=='support'): + sup_seed_word.append(Seed_lexicon['word'][i]) + else: + if(Seed_lexicon['Stance'][i]=='oppose'): + opp_seed_word.append(Seed_lexicon['word'][i]) + else: + nut_seed_word.append(Seed_lexicon['word'][i]) + + #opp_seed_word=set(opp_seed_word) + #nut_seed_word=set(nut_seed_word) + #sup_seed_word=set(sup_seed_word) + + len_opp_words=len(opp_seed_word) + len_nut_words=len(nut_seed_word) + len_sup_words=len(sup_seed_word) + + pmi_non_seed={} + + start1=time.time() + print("COMPUTING...") + k=0 + for word in uni_non_seed: + list_=[] + for i in range(len_sup_words): + l=pmi(prob_cond1(word,sup_seed_word[i],Features,total),prob(word,nava_stem,total),prob(sup_seed_word[i],nava_stem,total),total) + if(l<0): + list_.append(1) + else: + list_.append(l) + score_non_seed_sup.append(stats.gmean(list_)) + #print(k) + k=k+1 + print("score_non_seed_sup_complete :)") + end1=time.time() + time1=end1-start1 + print(time1) + + start2=time.time() + k=0 + for word in uni_non_seed: + list_=[] + for i in range(len_opp_words): + l=pmi(prob_cond1(word,opp_seed_word[i],Features,total),prob(word,nava_stem,total),prob(opp_seed_word[i],nava_stem,total),total) + if(l<0): + list_.append(1) + else: + list_.append(l) + score_non_seed_opp.append(stats.gmean(list_)) + #print(k) + k=k+1 + print("score_non_seed_opp_complete :)") + end2=time.time() + time2=end2-start2 + print(time2) + + start3=time.time() + k=0 + + #print("~~~~",nut_seed_word) + print(len(uni_non_seed),len_nut_words) + for word in uni_non_seed: + list_=[] + #s2 = time.time() + for i in range(len_nut_words): + #s1 = time.time() + l=pmi(prob_cond1(word,nut_seed_word[i],Features,total), prob(word,nava_stem,total), prob(nut_seed_word[i],nava_stem,total),total) + #print(time.time()-s1) + if(l<0): + list_.append(1) + else: + list_.append(l) + score_non_seed_nut.append(stats.gmean(list_)) + #print(time.time()-s2) + #print(k) + k=k+1 + print("score_non_seed_nut_complete :)") + end3=time.time() + print("Process Complete :)") + time3=end3-start3 + print(time3) + + total_time=time1+time2+time3 + print(total_time) + + prob_cond_word={'word':list(uni_non_seed),'score_non_seed_opp':score_non_seed_opp,'score_non_seed_sup':score_non_seed_sup,'score_non_seed_nut':score_non_seed_nut} + NonSeed_lexicon = pd.DataFrame(data=prob_cond_word) + + #Tweet Vector Formation + lex_word=[] + lex_word.extend(list(Seed_lexicon['word'])) + lex_word.extend(list(NonSeed_lexicon['word'])) + + pmi_sup=[] + pmi_sup.extend(list(Seed_lexicon['pmi_FAVOR'])) + pmi_sup.extend(list(NonSeed_lexicon['score_non_seed_sup'])) + + pmi_opp=[] + pmi_opp.extend(list(Seed_lexicon['pmi_AGAINST'])) + pmi_opp.extend(list(NonSeed_lexicon['score_non_seed_opp'])) + + pmi_nut=[] + pmi_nut.extend(list(Seed_lexicon['pmi_NONE'])) + pmi_nut.extend(list(NonSeed_lexicon['score_non_seed_nut'])) + + Lexicon = dict() + for i in range(len(lex_word)): + Lexicon[lex_word[i]] = {'pmi_sup':pmi_sup[i],'pmi_opp':pmi_opp[i],'pmi_nut':pmi_nut[i]} + + print("Lexicon formed") + return Lexicon + + #Lexicon={'word':lex_word,'pmi_sup':pmi_sup,'pmi_opp':pmi_opp,'pmi_nut':pmi_nut} + #Lexicon = pd.DataFrame(data=Lexicon) + + + + + + + +# In[14]: + + +#Lexicon = build_lexicon('SC') + + +# In[26]: + + +def produce_features(name,Lexicon): + #train_features 
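+    # The pmi_sup / pmi_opp / pmi_nut columns written below are the averages
+    # of Lexicon[w]['pmi_sup'] / ['pmi_opp'] / ['pmi_nut'] over the stemmed
+    # tokens w of the sentence that appear in the lexicon (total_lex of them).
+    # A sentence with no lexicon hits leaves total_lex at 0; a guard such as
+    # `if total_lex == 0: total_lex = 1` (a suggested safeguard, not part of
+    # this script) would avoid the resulting ZeroDivisionError.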
+ for l in ['train','test']: + raw=pd.read_csv('./MPHI_Preprocessed/'+name+'/{}.csv'.format(l)) + Stop_words=set(stopwords.words('english')) + Features=raw[['sentence']] + Tweet=Features['sentence'].copy() + + Features['preprocessed_sentence']=Tweet.apply(sent_process) + Features['tokenized_sents'] = Features.apply(lambda row: (row['preprocessed_sentence'].split()), axis=1) + + porter = PorterStemmer() + start=time.time() + #word_sup_vect=[] + #word_opp_vect=[] + #word_nut_vect=[] + + data = [['sentence','pmi_sup','pmi_opp','pmi_nut']] + len_lexicon_word=len(Lexicon) + + for i in range(len(Features['sentence'])): + sum1=0 + sum2=0 + sum3=0 + total_lex=0 + temp = [] + for word in Features['tokenized_sents'][i]: + #for j in range(len_lexicon_word): + + w = porter.stem(word) + if w in Lexicon: + sum1=sum1+Lexicon[w]['pmi_sup'] + sum2=sum2+Lexicon[w]['pmi_opp'] + sum3=sum3+Lexicon[w]['pmi_nut'] + total_lex=total_lex+1 + #word_sup_vect.append(sum1/total_lex) + #word_opp_vect.append(sum2/total_lex) + #word_nut_vect.append(sum3/total_lex) + data.append([Features['sentence'][i],sum1/total_lex,sum2/total_lex,sum3/total_lex]) + + my_df = pd.DataFrame(data) + my_df.to_csv('./pmi/pmi_{}_{}.csv'.format(name,l+'1'),header=False,index=False) + + + end=time.time() + print(end-start) + + +# In[27]: +produce_features('HRT',build_lexicon('HRT')) + +'''for dataset in ['AT','LA','CC','HC','FM']: + produce_features(dataset,build_lexicon(dataset))''' + + +# In[ ]: + + + + + +# In[ ]: + + + + diff --git a/SEN-SVM/SVM.py b/SEN-SVM/SVM.py new file mode 100644 index 0000000..0393865 --- /dev/null +++ b/SEN-SVM/SVM.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[9]: + + +import csv +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +import string +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfVectorizer +import wordninja +import re +import json +from sklearn import svm +from sklearn.model_selection import GridSearchCV +from sklearn.metrics import classification_report,confusion_matrix +import numpy as np +from nltk.tokenize import word_tokenize +from nltk.stem.porter import PorterStemmer +import pandas as pd + + +# In[2]: +stemmer = PorterStemmer() + +def load_glove_embeddings_set(): + word2emb = [] + WORD2VEC_MODEL = "../glove.6B.300d.txt" + fglove = open(WORD2VEC_MODEL,"r") + for line in fglove: + cols = line.strip().split() + word = cols[0] + word2emb.append(word) + fglove.close() + return set(word2emb) + +def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"): + print("Creating Normalization Dictionary") + with open(no_slang_data, "r") as f: + data1 = json.load(f) + + data2 = {} + + with open(emnlp_dict,"r") as f: + lines = f.readlines() + for line in lines: + row = line.split('\t') + data2[row[0]] = row[1].rstrip() + + normalization_dict = {**data1,**data2} + #print(normalization_dict) + return normalization_dict + +word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict() + + +# In[3]: + + +def sent_process(sent): + sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent) + sent = re.sub(r"#SemST", "", sent) + sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent) + #sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent) + #sent = re.sub(r"([A-Z])", r" \1", sent) + sent = re.sub(r"\'s", " \'s", sent) + sent = re.sub(r"\'ve", " \'ve", sent) + sent = re.sub(r"n\'t", " n\'t", sent) + sent = re.sub(r"\'re", " \'re", sent) + sent = re.sub(r"\'d", " 
\'d", sent) + sent = re.sub(r"\'ll", " \'ll", sent) + sent = re.sub(r",", " , ", sent) + sent = re.sub(r"!", " ! ", sent) + sent = re.sub(r"\(", " ( ", sent) + sent = re.sub(r"\)", " ) ", sent) + sent = re.sub(r"\?", " ? ", sent) + sent = re.sub(r"\s{2,}", " ", sent) + sent = sent.strip() + word_tokens = sent.split() + normalised_tokens = [] + for word in word_tokens: + if word in norm_dict: + #if False: + normalised_tokens.extend(norm_dict[word].lower().split(" ")) + print(word," normalised to ",norm_dict[word]) + else: + normalised_tokens.append(word.lower()) + wordninja_tokens = [] + for word in normalised_tokens: + if word in word_dict: + wordninja_tokens+=[word] + else: + wordninja_tokens+=wordninja.split(word) + return " ".join(wordninja_tokens) + + +# In[4]: + + + +def svc_param_selection(X, y, nfolds): + Cs = [0.001, 0.01, 0.1, 1, 10,100 ] + gammas = [0.001, 0.01, 0.1, 1] + param_grid = [{'C': Cs, 'gamma' : gammas , 'kernel' : ['rbf']},{'C': Cs , 'gamma' : gammas , 'kernel' : ['linear']}] + grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds) + grid_search.fit(X, y) + grid_search.best_params_ + return grid_search.best_params_ + + +# In[21]: + + +def train(topic,bow=False,senti=False,sta=False,ent=False): + save_file = topic+"_" + print(topic) + y_train = [] + y_test = [] + sentences = [] + features_ent = [] + features_sta = [] + features_senti = [] + senti_dict = {'Neutral' : 0, 'Positive' : 1, 'Negative' : 2} + with open("./final_feature_set/{}_train.csv".format(topic),"r",encoding='latin-1') as f: + reader = csv.DictReader(f, delimiter=',') + done = False + for row in reader: + sentences.append(row['sentence']) + if ent: + features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']]) + if not done: + save_file = save_file + "ent_" + if sta: + features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']]) + if not done: + save_file = save_file + "sta_" + if senti: + features_senti.append([senti_dict[row['senti']]]) + if not done: + save_file = save_file + "senti_" + done = True + y_train.append(row['label']) + L = len(sentences) + with open("./final_feature_set/{}_test.csv".format(topic),"r",encoding='latin-1') as f: + reader = csv.DictReader(f, delimiter=',') + for row in reader: + sentences.append(row['sentence']) + if ent: + features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']]) + if sta: + features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']]) + if senti: + features_senti.append([senti_dict[row['senti']]]) + y_test.append(row['label']) + + all_features = [] + if bow: + new_sentences = [] + for sent in sentences: + tokens = word_tokenize(sent) + tokens = [stemmer.stem(token) for token in tokens] + ret = " ".join(w for w in tokens) + new_sentences.append(ret) + save_file = save_file + "bow_" + vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,1),min_df = 2) + features_bow = vectorizer.fit_transform(new_sentences) + all_features.append(features_bow.toarray()) + #features_bow_train = np.array(features_bow[:L].toarray()) + #features_bow_test = np.array(features_bow[L:].toarray()) + + if ent: + #features_ent_train = np.array(features_ent[:L]) + #features_ent_test = np.array(features_ent[L:]) + all_features.append(features_ent) + + if sta: + #features_ent_train = np.array(features_ent[:L]) + #features_ent_test = np.array(features_ent[L:]) + all_features.append(features_sta) + + + if senti: + #features_senti_train = np.array(features_senti[:L]) + #features_senti_test = np.array(features_senti[L:]) + 
all_features.append(features_senti) + + dataset = np.concatenate(all_features,axis=1) + train_dataset = dataset[:L] + test_dataset = dataset[L:] + + best_params = svc_param_selection(train_dataset,y_train,nfolds=5) + print(best_params) + if best_params['kernel'] == 'rbf': + model = svm.SVC(kernel='rbf' ,C = best_params['C'], gamma = best_params['gamma'],probability=True) + else: + model = svm.SVC(kernel='linear' ,C = best_params['C'],probability=True) + + + model.fit(train_dataset,y_train) + + y_pred = model.predict(test_dataset) + print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + #cm = confusion_matrix(y_test,y_pred,labels=['0','1','2']) + conf_score = model.predict_proba(test_dataset) + + #print + df = pd.DataFrame(np.concatenate([np.array(sentences[L:]).reshape(-1,1),np.array(y_pred).reshape(-1,1),np.array(conf_score)],axis=1)) + df.to_csv(save_file+".csv",header=False,index=False) + return y_pred,y_test + + + +# In[22]: + +''' +#BOW +print ('BOW---------') +y_pred,y_test = [],[] +for dataset in ['AT','CC','FM','LA','HC']: + a,b = train(dataset,bow = True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))''' + + +# In[23]: + +print ("STA---------") +y_pred,y_test = [],[] +for dataset in ['AT','CC','FM','LA','HC']: + a,b = train(dataset,sta=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) + +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[7]: + +print("ENT----------") +y_pred,y_test = [],[] +for dataset in ['AT','CC','FM','LA','HC']: + a,b = train(dataset,ent=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[8]: + +print("SENTI---------") +y_pred,y_test = [],[] +for dataset in ['AT','CC','FM','LA','HC']: + a,b = train(dataset,senti=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[9]: + +print("ENT_SENTI--------") +y_pred,y_test = [],[] +for dataset in ['AT','CC','FM','LA','HC']: + a,b = train(dataset,ent=True,senti=True,sta=False) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[10]: + +print("ENT_STA---------") +y_pred,y_test = [],[] +for dataset in ['AT','CC','FM','LA','HC']: + a,b = train(dataset,ent=True,senti=False,sta=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[11]: + +print("SENTI_STA-----------") +y_pred,y_test = [],[] +for dataset in ['AT','CC','FM','LA','HC']: + a,b = train(dataset,ent=False,senti=True,sta=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[12]: + +print("ENT_SENTI_STA---------") +y_pred,y_test = [],[] +for dataset in ['AT','CC','FM','LA','HC']: + a,b = train(dataset,ent=True,senti=True,sta=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) 
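+# Note: the report below pools the predictions of the five per-target models
+# ('AT','CC','FM','LA','HC'); restricting labels to ['support','oppose'] means
+# any other label (e.g. neutral) is left out of the reported metrics.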
+print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[ ]: + + + + diff --git a/SEN-SVM/SVM_mpchi.py b/SEN-SVM/SVM_mpchi.py new file mode 100644 index 0000000..8bc9b28 --- /dev/null +++ b/SEN-SVM/SVM_mpchi.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[9]: + + +import csv +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +import string +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfVectorizer +import wordninja +import re +import json +from sklearn import svm +from sklearn.model_selection import GridSearchCV +from sklearn.metrics import classification_report,confusion_matrix +import numpy as np +from nltk.tokenize import word_tokenize +from nltk.stem.porter import PorterStemmer +import pandas as pd + + +# In[2]: +stemmer = PorterStemmer() + +def load_glove_embeddings_set(): + word2emb = [] + WORD2VEC_MODEL = "../glove.6B.300d.txt" + fglove = open(WORD2VEC_MODEL,"r") + for line in fglove: + cols = line.strip().split() + word = cols[0] + word2emb.append(word) + fglove.close() + return set(word2emb) + +def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"): + print("Creating Normalization Dictionary") + with open(no_slang_data, "r") as f: + data1 = json.load(f) + + data2 = {} + + with open(emnlp_dict,"r") as f: + lines = f.readlines() + for line in lines: + row = line.split('\t') + data2[row[0]] = row[1].rstrip() + + normalization_dict = {**data1,**data2} + #print(normalization_dict) + return normalization_dict + +word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict() + + +# In[3]: + + +def sent_process(sent): + sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent) + sent = re.sub(r"#SemST", "", sent) + sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent) + #sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent) + #sent = re.sub(r"([A-Z])", r" \1", sent) + sent = re.sub(r"\'s", " \'s", sent) + sent = re.sub(r"\'ve", " \'ve", sent) + sent = re.sub(r"n\'t", " n\'t", sent) + sent = re.sub(r"\'re", " \'re", sent) + sent = re.sub(r"\'d", " \'d", sent) + sent = re.sub(r"\'ll", " \'ll", sent) + sent = re.sub(r",", " , ", sent) + sent = re.sub(r"!", " ! ", sent) + sent = re.sub(r"\(", " ( ", sent) + sent = re.sub(r"\)", " ) ", sent) + sent = re.sub(r"\?", " ? 
", sent) + sent = re.sub(r"\s{2,}", " ", sent) + sent = sent.strip() + word_tokens = sent.split() + normalised_tokens = [] + for word in word_tokens: + if word in norm_dict: + #if False: + normalised_tokens.extend(norm_dict[word].lower().split(" ")) + print(word," normalised to ",norm_dict[word]) + else: + normalised_tokens.append(word.lower()) + wordninja_tokens = [] + for word in normalised_tokens: + if word in word_dict: + wordninja_tokens+=[word] + else: + wordninja_tokens+=wordninja.split(word) + return " ".join(wordninja_tokens) + + +# In[4]: + +#{'C': c, 'gamma' : gammas , 'kernel' : ['rbf']}, + +def svc_param_selection(X, y, nfolds): + Cs = [0.001, 0.01, 0.1, 1, 10,100 ] + c = [1] + gammas = [0.001, 0.01, 0.1, 1] + param_grid = [{'C': c , 'gamma' : gammas , 'kernel' : ['linear']}] + grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds) + grid_search.fit(X, y) + grid_search.best_params_ + return grid_search.best_params_ + + +# In[21]: + + +def train(topic,bow=False,senti=False,sta=False,ent=False,med = False): + save_file = topic+"_" + print(topic) + y_train = [] + y_test = [] + sentences = [] + features_ent = [] + features_sta = [] + features_senti = [] + features_med = [] + senti_dict = {'Neutral' : 0, 'Positive' : 1, 'Negative' : 2} + with open("./final_feature_set/{}_train.csv".format(topic),"r",encoding='latin-1') as f: + reader = csv.DictReader(f, delimiter=',') + done = False + for row in reader: + sentences.append(row['sentence']) + if ent: + features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']]) + if not done: + save_file = save_file + "ent_" + if sta: + features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']]) + if not done: + save_file = save_file + "sta_" + if senti: + features_senti.append([senti_dict[row['senti']]]) + if not done: + save_file = save_file + "senti_" + if med: + features_med.append([row['med_aff'],row['med_treat']]) + if not done: + save_file = save_file + "med" + done = True + y_train.append(row['label']) + L = len(sentences) + with open("./final_feature_set/{}_test.csv".format(topic),"r",encoding='latin-1') as f: + reader = csv.DictReader(f, delimiter=',') + for row in reader: + sentences.append(row['sentence']) + if ent: + features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']]) + if sta: + features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']]) + if senti: + features_senti.append([senti_dict[row['senti']]]) + if med: + features_med.append([row['med_aff'],row['med_treat']]) + y_test.append(row['label']) + + all_features = [] + if bow: + new_sentences = [] + for sent in sentences: + tokens = word_tokenize(sent) + tokens = [stemmer.stem(token) for token in tokens] + ret = " ".join(w for w in tokens) + new_sentences.append(ret) + save_file = save_file + "bow_" + vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,1),min_df = 2) + features_bow = vectorizer.fit_transform(new_sentences) + all_features.append(features_bow.toarray()) + #features_bow_train = np.array(features_bow[:L].toarray()) + #features_bow_test = np.array(features_bow[L:].toarray()) + + if ent: + #features_ent_train = np.array(features_ent[:L]) + #features_ent_test = np.array(features_ent[L:]) + all_features.append(features_ent) + + if sta: + #features_ent_train = np.array(features_ent[:L]) + #features_ent_test = np.array(features_ent[L:]) + all_features.append(features_sta) + + + if senti: + #features_senti_train = np.array(features_senti[:L]) + #features_senti_test = np.array(features_senti[L:]) + 
all_features.append(features_senti) + + if med: + all_features.append(features_med) + dataset = np.concatenate(all_features,axis=1) + train_dataset = dataset[:L] + test_dataset = dataset[L:] + + best_params = svc_param_selection(train_dataset,y_train,nfolds=5) + print(best_params) + if best_params['kernel'] == 'rbf': + model = svm.SVC(kernel='rbf' ,C = best_params['C'], gamma = best_params['gamma'],probability=True) + else: + model = svm.SVC(kernel='linear' ,C = best_params['C'], gamma = best_params['gamma'],probability=True) + + + model.fit(train_dataset,y_train) + + y_pred = model.predict(test_dataset) + print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + #cm = confusion_matrix(y_test,y_pred,labels=['0','1','2']) + conf_score = model.predict_proba(test_dataset) + + #print + df = pd.DataFrame(np.concatenate([np.array(sentences[L:]).reshape(-1,1),np.array(y_pred).reshape(-1,1),np.array(conf_score)],axis=1)) + df.to_csv(save_file+".csv",header=False,index=False) + return y_pred,y_test + + + +# In[22]: + +''' +#BOW +print ('BOW---------') +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,bow = True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[23]: + +print ("STA---------") +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,sta=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) + +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[7]: + +print("ENT----------") +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,ent=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[8]: + +print("SENTI---------") +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,senti=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[9]: + +print("ENT_SENTI--------") +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset ,senti=True , ent=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[10]: + +print("ENT_STA---------") +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,ent=True,sta=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[11]: + +print("SENTI_STA-----------") +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,senti=True,sta=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[12]: + +print("ENT_SENTI_STA---------") +y_pred,y_test = [],[] +for dataset in ['AT','CC','FM','LA','HC']: + a,b = train(dataset,ent=True,senti=True,sta=True,bow = True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) 
+print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +# In[ ]: + + +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,med=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) + +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + + +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,sta = True, med=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) + +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,senti = True, med=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) + +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,sta = True, senti = True, med=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) + +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) + +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,sta = True, senti = True, med=True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) + +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))''' + +y_pred,y_test = [],[] +for dataset in ['MMR','HRT','EC','VC','SC']: + a,b = train(dataset,sta = True, senti = True, med=True, ent = True, bow= True) + y_pred.extend(a) + y_test.extend(b) + print(len(a),len(b)) + +print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose'])) \ No newline at end of file diff --git a/SEN-SVM/sentiment_api_2.py b/SEN-SVM/sentiment_api_2.py new file mode 100644 index 0000000..f6ef210 --- /dev/null +++ b/SEN-SVM/sentiment_api_2.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Jun 13 03:05:06 2018 + +@author: shalmoli +""" +import pandas as pd +import csv +from pycorenlp import StanfordCoreNLP +import json + +#reading Input file +data=pd.read_csv('HRT2.csv') +#Extracting Sentences +sentence=data['sentences'] +length=len(sentence) + +#storing Result +# before Executing this,we have to start the Stanford Server through Terminal +'''It's returns sentiment sentences wise. 
+If we have have abstract more than one sentence ,we conclude the sentiment for that sentence by considering the majority and breaking tie randomly.''' +result=[] +for i in range(length): + nlp = StanfordCoreNLP('http://localhost:9000') + res = nlp.annotate(sentence[i], + properties={ + 'annotators': 'sentiment', + 'outputFormat': 'json', + 'timeout': 100000, + }) + print (res) + count=0# counting number of sentences in input + count_1=0#counting Negative Sentiment + count_2=0#counting Neutral Sentiment + count_3=0#counting Positive Sentiment + for s in (res['sentences']): + if(s["sentimentValue"]): + count=count+1 + if(s["sentiment"]=='Negative' or s["sentiment"]=='Verynegative'): + count_1=count_1+1 + else: + if(s["sentiment"]=='Positive' or s["sentiment"]=='Verypositive'): + count_3=count_3+1 + else: + count_2=count_2+1 + if(count>1): + if(count_1 > count_2 and count_1 > count_3): + result.append("Negative") + else: + if(count_2 > count_1 and count_2 > count_3): + result.append('Neutral') + else: + result.append('Positive') + else: + if(s["sentiment"]=='Negative' or s["sentiment"]=='Verynegative'): + result.append("Negative") + else: + if(s["sentiment"]=='Positive' or s["sentiment"]=='Verypositive'): + result.append("Positive") + else: + result.append('Neutral') + +#Storing Output to file Oytput is binary for each class individually +file = open('Sentiment_HRT2.csv','a') +fields = ('sentence','positive','negative','neutral','sentiment') +wr = csv.DictWriter(file, fieldnames=fields, lineterminator = '\n') +wr.writeheader() +for i in range(length): + if(result[i]=='Negative'): + wr.writerow({'sentence':sentence[i],'positive':0,'negative':1,'neutral':0,'sentiment':result[i]}) + else: + if(result[i]=='Positive'): + wr.writerow({'sentence':sentence[i],'positive':1,'negative':0,'neutral':0,'sentiment':result[i]}) + else: + wr.writerow({'sentence':sentence[i],'positive':0,'negative':0,'neutral':1,'sentiment':result[i]}) + +file.close() \ No newline at end of file diff --git a/SEN-SVM/te_f.py b/SEN-SVM/te_f.py new file mode 100644 index 0000000..93a1049 --- /dev/null +++ b/SEN-SVM/te_f.py @@ -0,0 +1,342 @@ +def isnan(value): + try: + import math + return math.isnan(float(value)) + except: + return False + +#matplotlib inline + +import tensorflow as tf +import numpy as np +import pandas as pd +#import matplotlib.pyplot as plt +#import matplotlib.ticker as ticker +import urllib +import sys +import os +import csv +import zipfile + +glove_zip_file = "glove.6B.zip" +glove_vectors_file = "glove.6B.50d.txt" + +snli_zip_file = "snli_1.0.zip" +snli_dev_file = "snli_1.0_dev.txt" +snli_full_dataset_file = "snli_1.0_train.txt" + +from six.moves.urllib.request import urlretrieve + +#large file - 862 MB +if (not os.path.isfile(glove_zip_file) and + not os.path.isfile(glove_vectors_file)): + urlretrieve ("http://nlp.stanford.edu/data/glove.6B.zip", + glove_zip_file) + +#medium-sized file - 94.6 MB +if (not os.path.isfile(snli_zip_file) and + not os.path.isfile(snli_dev_file)): + urlretrieve ("https://nlp.stanford.edu/projects/snli/snli_1.0.zip", + snli_zip_file) + +def unzip_single_file(zip_file_name, output_file_name): + """ + If the outFile is already created, don't recreate + If the outFile does not exist, create it from the zipFile + """ + if not os.path.isfile(output_file_name): + with open(output_file_name, 'wb') as out_file: + with zipfile.ZipFile(zip_file_name) as zipped: + for info in zipped.infolist(): + if output_file_name in info.filename: + with zipped.open(info) as requested_file: + 
out_file.write(requested_file.read()) + return + +unzip_single_file(glove_zip_file, glove_vectors_file) +unzip_single_file(snli_zip_file, snli_dev_file) + +glove_wordmap = {} +with open(glove_vectors_file, "r") as glove: + for line in glove: + name, vector = tuple(line.split(" ", 1)) + glove_wordmap[name] = np.fromstring(vector, sep=" ") + + +def sentence2sequence(sentence): + """ + + - Turns an input sentence into an (n,d) matrix, + where n is the number of tokens in the sentence + and d is the number of dimensions each word vector has. + + """ + tokens = sentence.lower().split(" ") + rows = [] + words = [] + #Greedy search for tokens + for token in tokens: + i = len(token) + while len(token) > 0 and i > 0: + word = token[:i] + if word in glove_wordmap: + rows.append(glove_wordmap[word]) + words.append(word) + token = token[i:] + i = len(token) + else: + i = i-1 + return rows, words + +rnn_size = 64 +rnn = tf.contrib.rnn.BasicRNNCell(rnn_size) + +#Constants setup +max_hypothesis_length, max_evidence_length = 60, 50 +batch_size, vector_size, hidden_size = 128, 50, 64 + +lstm_size = hidden_size + +weight_decay = 0.0001 + +learning_rate = 1 + +input_p, output_p = 0.5, 0.5 + +training_iterations_count = 100000 + +display_step = 10 + +def score_setup(row): + convert_dict = { + 'entailment': 0, + 'neutral': 1, + 'contradiction': 2 + } + score = np.zeros((3,)) + for x in range(1,6): + tag = row["label"+str(x)] + if tag in convert_dict: score[convert_dict[tag]] += 1 + return score / (1.0*np.sum(score)) + +def fit_to_size(matrix, shape): + res = np.zeros(shape) + slices = [slice(0,min(dim,shape[e])) for e, dim in enumerate(matrix.shape)] + res[slices] = matrix[slices] + return res + + +def split_data_into_scores(): + import csv + with open("snli_1.0_dev.txt","r") as data: + train = csv.DictReader(data, delimiter='\t') + evi_sentences = [] + hyp_sentences = [] + labels = [] + scores = [] + for row in train: + hyp_sentences.append(np.vstack( + sentence2sequence(row["sentence1"].lower())[0])) + evi_sentences.append(np.vstack( + sentence2sequence(row["sentence2"].lower())[0])) + labels.append(row["gold_label"]) + scores.append(score_setup(row)) + + hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size)) + for x in hyp_sentences]) + evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size)) + for x in evi_sentences]) + + return (hyp_sentences, evi_sentences), labels, np.array(scores) + +data_feature_list, correct_values, correct_scores = split_data_into_scores() + +l_h, l_e = max_hypothesis_length, max_evidence_length +N, D, H = batch_size, vector_size, hidden_size +l_seq = l_h + l_e + +tf.reset_default_graph() +# lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size) +lstm = tf.nn.rnn_cell.LSTMCell(lstm_size) +lstm_drop = tf.contrib.rnn.DropoutWrapper(lstm, input_p, output_p) + + +# N: The number of elements in each of our batches, +# which we use to train subsets of data for efficiency's sake. +# l_h: The maximum length of a hypothesis, or the second sentence. This is +# used because training an RNN is extraordinarily difficult without +# rolling it out to a fixed length. +# l_e: The maximum length of evidence, the first sentence. This is used +# because training an RNN is extraordinarily difficult without +# rolling it out to a fixed length. +# D: The size of our used GloVe or other vectors. 
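+# Overview of the graph assembled below: hyp is (N, l_h, D) and evi is
+# (N, l_e, D); they are concatenated along the time axis, unrolled through a
+# bidirectional LSTM, and the last output is projected by a
+# (2*hidden_size, 3) fully connected layer into entailment / neutral /
+# contradiction scores.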
+hyp = tf.placeholder(tf.float32, [N, l_h, D], 'hypothesis') +evi = tf.placeholder(tf.float32, [N, l_e, D], 'evidence') +y = tf.placeholder(tf.float32, [N, 3], 'label') +# hyp: Where the hypotheses will be stored during training. +# evi: Where the evidences will be stored during training. +# y: Where correct scores will be stored during training. + +# lstm_size: the size of the gates in the LSTM, +# as in the first LSTM layer's initialization. +# lstm_back = tf.contrib.rnn.BasicLSTMCell(lstm_size) +lstm_back = tf.nn.rnn_cell.LSTMCell(lstm_size) +# lstm_back: The LSTM used for looking backwards +# through the sentences, similar to lstm. + +# input_p: the probability that inputs to the LSTM will be retained at each +# iteration of dropout. +# output_p: the probability that outputs from the LSTM will be retained at +# each iteration of dropout. +lstm_drop_back = tf.contrib.rnn.DropoutWrapper(lstm_back, input_p, output_p) +# lstm_drop_back: A dropout wrapper for lstm_back, like lstm_drop. + + +fc_initializer = tf.random_normal_initializer(stddev=0.1) +# fc_initializer: initial values for the fully connected layer's weights. +# hidden_size: the size of the outputs from each lstm layer. +# Multiplied by 2 to account for the two LSTMs. +fc_weight = tf.get_variable('fc_weight', [2*hidden_size, 3], + initializer = fc_initializer) +# fc_weight: Storage for the fully connected layer's weights. +fc_bias = tf.get_variable('bias', [3]) +# fc_bias: Storage for the fully connected layer's bias. + +# tf.GraphKeys.REGULARIZATION_LOSSES: A key to a collection in the graph +# designated for losses due to regularization. +# In this case, this portion of loss is regularization on the weights +# for the fully connected layer. +tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, + tf.nn.l2_loss(fc_weight)) + +x = tf.concat([hyp, evi], 1) # N, (Lh+Le), d +# Permuting batch_size and n_steps +x = tf.transpose(x, [1, 0, 2]) # (Le+Lh), N, d +# Reshaping to (n_steps*batch_size, n_input) +x = tf.reshape(x, [-1, vector_size]) # (Le+Lh)*N, d +# Split to get a list of 'n_steps' tensors of shape (batch_size, n_input) +x = tf.split(x, l_seq,) + +# x: the inputs to the bidirectional_rnn + + +# tf.contrib.rnn.static_bidirectional_rnn: Runs the input through +# two recurrent networks, one that runs the inputs forward and one +# that runs the inputs in reversed order, combining the outputs. +rnn_outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm, lstm_back,x, dtype=tf.float32) +# rnn_outputs: the list of LSTM outputs, as a list. 
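+# (one tensor per time step, each of shape (batch_size, 2*hidden_size) since
+# the forward and backward outputs are concatenated)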
+# What we want is the latest output, rnn_outputs[-1] + +classification_scores = tf.matmul(rnn_outputs[-1], fc_weight) + fc_bias +# The scores are relative certainties for how likely the output matches +# a certain entailment: +# 0: Positive entailment +# 1: Neutral entailment +# 2: Negative entailment + +with tf.variable_scope('Accuracy'): + predicts = tf.cast(tf.argmax(classification_scores, 1), 'int32') + y_label = tf.cast(tf.argmax(y, 1), 'int32') + corrects = tf.equal(predicts, y_label) + num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32)) + accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32)) + +with tf.variable_scope("loss"): + cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2( + logits = classification_scores, labels = y) + loss = tf.reduce_mean(cross_entropy) + total_loss = loss + weight_decay * tf.add_n( + tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) + +optimizer = tf.train.GradientDescentOptimizer(learning_rate) + +opt_op = optimizer.minimize(total_loss) + +# Initialize variables +init = tf.global_variables_initializer() + +# Use TQDM if installed +tqdm_installed = False +try: + from tqdm import tqdm + tqdm_installed = True +except: + pass + +# Launch the Tensorflow session +sess = tf.Session() +sess.run(init) + +# training_iterations_count: The number of data pieces to train on in total +# batch_size: The number of data pieces per batch +training_iterations = range(0,training_iterations_count,batch_size) +if tqdm_installed: + # Add a progress bar if TQDM is installed + training_iterations = tqdm(training_iterations) + +for i in training_iterations: + + # Select indices for a random data subset + batch = np.random.randint(data_feature_list[0].shape[0], size=batch_size) + + # Use the selected subset indices to initialize the graph's + # placeholder values + hyps, evis, ys = (data_feature_list[0][batch,:], + data_feature_list[1][batch,:], + correct_scores[batch]) + + # Run the optimization with these initialized values + sess.run([opt_op], feed_dict={hyp: hyps, evi: evis, y: ys}) + # display_step: how often the accuracy and loss should + # be tested and displayed. 
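+    # Note that the accuracy and loss printed here are measured on the same
+    # random training minibatch that was just optimized, not on held-out data.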
+ if (i/batch_size) % display_step == 0: + # Calculate batch accuracy + acc = sess.run(accuracy, feed_dict={hyp: hyps, evi: evis, y: ys}) + # Calculate batch loss + tmp_loss = sess.run(loss, feed_dict={hyp: hyps, evi: evis, y: ys}) + # Display results + print("Iter " + str(i/batch_size) + ", Minibatch Loss= " + \ + "{:.6f}".format(tmp_loss) + ", Training Accuracy= " + \ + "{:.5f}".format(acc)) +Features_pmh=pd.read_csv('Climate1.csv') + +length_features=len(Features_pmh) +result=[] +pred=[] + +import string +from nltk.corpus import stopwords +from nltk.tokenize import sent_tokenize,word_tokenize +from nltk.tokenize.punkt import PunktSentenceTokenizer +tokenizer = PunktSentenceTokenizer() + +text=Features_pmh['Tweet'].copy() +def sent_process(sent): + sent = sent.translate(str.maketrans('', '', string.punctuation)) + sent = [word for word in sent.split() if word.lower() not in stopwords.words('english')] + return " ".join(sent) +Features_pmh['Tweet']=text.apply(sent_process) +file = open('Climate_t.csv','a') +fields = ('Text','hypotheses','result','pos_scr','neg_scr','nut_scr') +wr = csv.DictWriter(file, fieldnames=fields, lineterminator = '\n') +wr.writeheader() + +for i in range(length_features): + if(isnan(Features_pmh['Tweet'][i])==False): + evidences = [Features_pmh['Tweet'][i]] + else: + evidences = [Features_pmh['Tweet'][i]] + + hypotheses = ["Climate Change is a Real Concern"] + + sentence1 = [fit_to_size(np.vstack(sentence2sequence(evidence)[0]),(60, 50)) for evidence in evidences] + + sentence2 = [fit_to_size(np.vstack(sentence2sequence(hypothesis)[0]),(50,50)) for hypothesis in hypotheses] + + prediction = sess.run(classification_scores, feed_dict={hyp: (sentence1 * N),evi: (sentence2 * N),y: [[0,0,0]]*N}) + #print(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])]+" entailment") + result.append(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])]) + pred.append(prediction[0]) + wr.writerow({'Text':Features_pmh['Tweet'][i],'hypotheses':hypotheses,"result" :result[i],'pos_scr':pred[i][0],'neg_scr':pred[i][1],'nut_scr':pred[i][2]}) + +file.close() \ No newline at end of file diff --git a/TAN/networks.py b/TAN/networks.py index 4ecbb4d..0e67117 100644 --- a/TAN/networks.py +++ b/TAN/networks.py @@ -18,7 +18,7 @@ class LSTM_TAN(nn.Module): self.hidden_dim = hidden_dim self.embedding_dim = embedding_dim - #WORD_EMBEDDINGS + self.word_embeddings = nn.Embedding(vocab_size, embedding_dim) self.word_embeddings.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float)) self.word_embeddings.weight.requires_grad=True @@ -30,14 +30,10 @@ class LSTM_TAN(nn.Module): self.attention = nn.Linear(2*embedding_dim,1) - #LSTM - # The LSTM takes word embeddings as inputs, and outputs hidden states - # with dimensionality hidden_dim. self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=(version!="lstm")) self.dropout = nn.Dropout(dropout) - #FINAL_LAYER if version !="lstm": self.hidden2target = nn.Linear(2*self.hidden_dim, n_targets) else: @@ -46,10 +42,6 @@ class LSTM_TAN(nn.Module): self.hidden = self.init_hidden() def init_hidden(self): - # Before we've done anything, we dont have any hidden state. - # Refer to the Pytorch documentation to see exactly - # why they have this dimensionality. 
- # The axes semantics are (num_layers, minibatch_size, hidden_dim) return (torch.zeros(1, 1, self.hidden_dim), torch.zeros(1, 1, self.hidden_dim)) @@ -62,11 +54,8 @@ class LSTM_TAN(nn.Module): if version != "tan-": t_emb = self.word_embeddings(target) - #print(t_emb) - #print(torch.mean(t_emb,dim=0,keepdim=True).shape) t_emb = torch.mean(t_emb,dim=0,keepdim=True) xt_emb = torch.cat((x_emb,t_emb.expand(len(sentence),-1)),dim=1) - #print(xt_emb) if version == "tan-": lstm_out, _ = self.lstm( @@ -96,12 +85,3 @@ class LSTM_TAN(nn.Module): return target_scores - #t_emb = self.word_embeddings(target) - #print(t_emb) - #print(torch.mean(t_emb,dim=0,keepdim=True).shape) - #t_emb = torch.mean(t_emb,dim=0,keep dim=True) - - #xt_emb = torch.cat((x_emb,t_emb.expand(len(sentence),-1)),dim=1) - #print(xt_emb) - -# In[26]: