Added SEN-SVM code

This commit is contained in:
Siddharth 2019-06-18 22:16:04 +05:30
parent b524243043
commit a86f074dca
6 changed files with 1652 additions and 21 deletions

SEN-SVM/STA_features.py Normal file

@@ -0,0 +1,533 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import time
import numpy as np
import pandas as pd
import string
import csv
from scipy import stats
import random
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import wordninja
from collections import defaultdict, Counter
import math
import sys
# In[ ]:
def load_glove_embeddings_set():
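# Return the set of words covered by the GloVe vocabulary (only the words are kept; the vectors themselves are discarded).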
word2emb = []
WORD2VEC_MODEL = "glove.6B.300d.txt"
fglove = open(WORD2VEC_MODEL,"r")
for line in fglove:
cols = line.strip().split()
word = cols[0]
word2emb.append(word)
fglove.close()
return set(word2emb)
def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"):
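# Merge the NoSlang JSON lexicon and the EMNLP normalisation lexicon into one word -> replacement dictionary.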
print("Creating Normalization Dictionary")
with open(no_slang_data, "r") as f:
data1 = json.load(f)
data2 = {}
with open(emnlp_dict,"r") as f:
lines = f.readlines()
for line in lines:
row = line.split('\t')
data2[row[0]] = row[1].rstrip()
normalization_dict = {**data1,**data2}
#print(normalization_dict)
return normalization_dict
word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict()
# In[ ]:
def sent_process(sent):
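# Tweet preprocessing: strip unsupported characters, pad punctuation and hashtags with spaces,
# apply the normalisation dictionary, lowercase, and split any token not in the GloVe vocabulary with wordninja.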
sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent)
sent = re.sub(r"#SemST", "", sent)
sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent)
#sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent)
#sent = re.sub(r"([A-Z])", r" \1", sent)
sent = re.sub(r"\'s", " \'s", sent)
sent = re.sub(r"\'ve", " \'ve", sent)
sent = re.sub(r"n\'t", " n\'t", sent)
sent = re.sub(r"\'re", " \'re", sent)
sent = re.sub(r"\'d", " \'d", sent)
sent = re.sub(r"\'ll", " \'ll", sent)
sent = re.sub(r",", " , ", sent)
sent = re.sub(r"!", " ! ", sent)
sent = re.sub(r"\(", " ( ", sent)
sent = re.sub(r"\)", " ) ", sent)
sent = re.sub(r"\?", " ? ", sent)
sent = re.sub(r"\s{2,}", " ", sent)
sent = sent.strip()
word_tokens = sent.split()
normalised_tokens = []
for word in word_tokens:
if word in norm_dict:
#if False:
normalised_tokens.extend(norm_dict[word].lower().split(" "))
#print(word," normalised to ",norm_dict[word])
else:
normalised_tokens.append(word.lower())
wordninja_tokens = []
for word in normalised_tokens:
if word in word_dict:
wordninja_tokens+=[word]
else:
wordninja_tokens+=wordninja.split(word)
return " ".join(wordninja_tokens)
# In[ ]:
# In[13]:
def build_lexicon(name):
def pmi(x,y,z,t):
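# PMI-style score: x is the smoothed joint count, y the smoothed count of the word,
# z the count of the class (or second word) and t the total count; the sqrt term in the
# denominator appears to act as a confidence correction on the joint count.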
res=(x/(y*(z/t)+(math.sqrt(x)*math.sqrt(math.log(0.9)/(-2)))))
return math.log(res,2)
def prob(word1,nava,total):
count_prob=0
if word1 in nava:
count_prob += nava[word1]
return((count_prob+1))
def prob_cond(word1,seed,stance_seed,stance,total):
count_prob=0
for i in range(len(seed)):
if(seed[i]==word1):
if(stance_seed[i]==stance):
count_prob=count_prob+1
return((count_prob+1))
def prob_cond1(word1,word2,Features,total):
count_prob=0
#for i in range(length_Features):
# flag1=0
# flag2=0
# for word in Features['co_relation'][i]:
# if(word==word1):
# flag1=1
# if(word==word2):
# flag2=1
# if(flag1==1 and flag2==1):
# count_prob=count_prob+1
#seed and non-seed lexicon formation
return((co_relation[(word1,word2)]+1))
print("building lexicon for ", name)
raw=pd.read_csv('./MPHI_Preprocessed/'+name+'/train.csv')
#Features Extraction
porter=PorterStemmer()
Stop_words=set(stopwords.words('english'))
Features=raw[['sentence']].copy() # work on a copy so new columns can be added without a SettingWithCopyWarning
Tweet=Features['sentence'].copy()
Features['sentence']=Tweet.apply(sent_process)
Features['tokenized_sents'] = Features.apply(lambda row: (row['sentence'].split()), axis=1)
Features['pos_tag']=Features.apply(lambda row:nltk.pos_tag(row['tokenized_sents'],tagset='universal'),axis=1)
Features['stance']=raw['stance']
length_Features=len(Features['sentence'])
co_relation=defaultdict(int)
co_relation2 = []
for i in range(length_Features):
line=[]
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
line.append(porter.stem(word))
# separate index names so the outer loop variable i is not shadowed
for a in range(len(line)):
for b in range(a+1,len(line)):
co_relation[(line[a],line[b])]+=1
co_relation[(line[b],line[a])]+=1
co_relation2.append(line)
Features['co_relation']=co_relation2
FAVOR=[]
AGAINST=[]
NONE=[]
for i in range(length_Features):
if(Features['stance'][i]=='support'):
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
FAVOR.append(porter.stem(word))
else:
if(Features['stance'][i]=='oppose'):
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
AGAINST.append(porter.stem(word))
else:
if(Features['stance'][i]=='neutral'):
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
NONE.append(porter.stem(word))
len_sup=len(FAVOR)
len_opp=len(AGAINST)
len_nut=len(NONE)
len_co=[]
for i in range(length_Features):
len_co.append(len(Features['co_relation'][i]))
Features['len_nava']=len_co
nava=[]
for i in range(length_Features):
for word,tag in Features['pos_tag'][i]:
if(tag=='NOUN' or tag=='ADJ' or tag=='VERB' or tag=='ADV'):
if(word not in Stop_words):
nava.append(word.lower())
nava_stem=[]
for word in nava:
nava_stem.append(porter.stem(word))
uni_nava_stem=list(set(nava_stem))
nava_stem = Counter(nava_stem)
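# len() of a Counter counts distinct keys, so 'total' below equals the number of unique stems
# (the same value as 'length'); use sum(nava_stem.values()) if the total token count is intended.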
total=len(nava_stem)
length=len(uni_nava_stem)
print(total,length)
seed=[]
non_seed=[]
seed_stance=[]
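# The first 75% of each tweet's NAVA stems become seed words (labelled with the tweet's stance);
# the remaining 25% become non-seed words, scored later against the seed lexicon.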
for i in range(len(Features)):
for j in range(int(0.75*Features['len_nava'][i])):
seed.append(Features['co_relation'][i][j])
seed_stance.append(Features['stance'][i])
for j in range(int(0.75*Features['len_nava'][i]),Features['len_nava'][i]):
non_seed.append(Features['co_relation'][i][j])
uni_seed=list(set(seed))
uni_non_seed=list(set(non_seed))
'''for i in range(len(Features)):
x=[]
x=random.sample(Features['co_relation'][i],int(0.75*Features['len_nava'][i]))
for j in range(len(x)):
seed.append(x[j])
seed_stance.append(Features['stance'][i])
for j in range(Features['len_nava'][i]):
if(Features['co_relation'][i][j] not in x):
non_seed.append(Features['co_relation'][i][j])
uni_seed=list(set(seed))
uni_non_seed=list(set(non_seed))'''
len_seed=len(seed)
len_uni_seed=len(uni_seed)
len_non_seed=len(non_seed)
len_uni_non_seed=len(uni_non_seed)
len_seed_sup=0
len_seed_opp=0
len_seed_nut=0
for i in range(len(seed_stance)):
if(seed_stance[i]=='support'):
len_seed_sup=len_seed_sup+1
else:
if(seed_stance[i]=='oppose'):
len_seed_opp=len_seed_opp+1
else:
len_seed_nut=len_seed_nut+1
print(len_seed_nut,len_seed_opp,len_seed_sup)
prob_sup=len_seed_sup/(len_seed_sup+len_seed_opp+len_seed_nut)
prob_opp=len_seed_opp/(len_seed_sup+len_seed_opp+len_seed_nut)
prob_nut=len_seed_nut/(len_seed_sup+len_seed_opp+len_seed_nut)
prob_word=[]
for word in uni_seed:
prob_word.append(prob(word,nava_stem,total))
prob_cond_word={}
prob_supp_word=[]
prob_opp_word=[]
prob_neu_word=[]
for word in uni_seed:
prob_supp_word.append(prob_cond(word,seed,seed_stance,'support',(len_seed_sup+len_seed_opp+len_seed_nut)))
prob_opp_word.append(prob_cond(word,seed,seed_stance,'oppose',(len_seed_sup+len_seed_opp+len_seed_nut)))
prob_neu_word.append(prob_cond(word,seed,seed_stance,'neutral',(len_seed_sup+len_seed_opp+len_seed_nut)))
prob_cond_word={'word':list(uni_seed),'prob_word':prob_word,'prob_supp_word':prob_supp_word,'prob_opp_word':prob_opp_word,'prob_neu_word':prob_neu_word}
Seed_lexicon = pd.DataFrame(data=prob_cond_word)
print(Seed_lexicon)
pmi_AGAINST=[]
pmi_FAVOR=[]
pmi_NONE=[]
'''for i in range(len_uni_seed):
pmi_AGAINST.append(pmi(prob_opp_word[i],prob_word[i],prob_opp))
pmi_FAVOR.append(pmi(prob_supp_word[i],prob_word[i],prob_sup))
pmi_NONE.append(pmi(prob_neu_word[i],prob_word[i],prob_nut))'''
for i in range(len_uni_seed):
pmi_AGAINST.append(pmi(prob_opp_word[i],prob_word[i],len_seed_opp,len_seed))
pmi_FAVOR.append(pmi(prob_supp_word[i],prob_word[i],len_seed_sup,len_seed))
pmi_NONE.append(pmi(prob_neu_word[i],prob_word[i],len_seed_nut,len_seed))
Seed_lexicon['pmi_AGAINST']=list(pmi_AGAINST)
Seed_lexicon['pmi_FAVOR']=list(pmi_FAVOR)
Seed_lexicon['pmi_NONE']=list(pmi_NONE)
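# Assign each seed word the stance with the strictly highest PMI; ties fall through to 'neutral'.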
stance=[]
for i in range(len_uni_seed):
if((Seed_lexicon['pmi_FAVOR'][i] > Seed_lexicon['pmi_AGAINST'][i]) and (Seed_lexicon['pmi_FAVOR'][i] > Seed_lexicon['pmi_NONE'][i])):
stance.append('support')
else:
if((Seed_lexicon['pmi_AGAINST'][i] > Seed_lexicon['pmi_FAVOR'][i]) & (Seed_lexicon['pmi_AGAINST'][i] > Seed_lexicon['pmi_NONE'][i])):
stance.append('oppose')
else:
stance.append('neutral')
Seed_lexicon['Stance']=list(stance)
#NON SEED LEXICON
score_non_seed_opp=[]
score_non_seed_sup=[]
score_non_seed_nut=[]
opp_seed_word=[]
nut_seed_word=[]
sup_seed_word=[]
for i in range(len_uni_seed):
if(Seed_lexicon['Stance'][i]=='support'):
sup_seed_word.append(Seed_lexicon['word'][i])
else:
if(Seed_lexicon['Stance'][i]=='oppose'):
opp_seed_word.append(Seed_lexicon['word'][i])
else:
nut_seed_word.append(Seed_lexicon['word'][i])
#opp_seed_word=set(opp_seed_word)
#nut_seed_word=set(nut_seed_word)
#sup_seed_word=set(sup_seed_word)
len_opp_words=len(opp_seed_word)
len_nut_words=len(nut_seed_word)
len_sup_words=len(sup_seed_word)
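# Score every non-seed word against each class: take its PMI with every seed word of that class
# (clamping negative PMIs to 1) and aggregate with a geometric mean.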
pmi_non_seed={}
start1=time.time()
print("COMPUTING...")
k=0
for word in uni_non_seed:
list_=[]
for i in range(len_sup_words):
l=pmi(prob_cond1(word,sup_seed_word[i],Features,total),prob(word,nava_stem,total),prob(sup_seed_word[i],nava_stem,total),total)
if(l<0):
list_.append(1)
else:
list_.append(l)
score_non_seed_sup.append(stats.gmean(list_))
#print(k)
k=k+1
print("score_non_seed_sup_complete :)")
end1=time.time()
time1=end1-start1
print(time1)
start2=time.time()
k=0
for word in uni_non_seed:
list_=[]
for i in range(len_opp_words):
l=pmi(prob_cond1(word,opp_seed_word[i],Features,total),prob(word,nava_stem,total),prob(opp_seed_word[i],nava_stem,total),total)
if(l<0):
list_.append(1)
else:
list_.append(l)
score_non_seed_opp.append(stats.gmean(list_))
#print(k)
k=k+1
print("score_non_seed_opp_complete :)")
end2=time.time()
time2=end2-start2
print(time2)
start3=time.time()
k=0
#print("~~~~",nut_seed_word)
print(len(uni_non_seed),len_nut_words)
for word in uni_non_seed:
list_=[]
#s2 = time.time()
for i in range(len_nut_words):
#s1 = time.time()
l=pmi(prob_cond1(word,nut_seed_word[i],Features,total), prob(word,nava_stem,total), prob(nut_seed_word[i],nava_stem,total),total)
#print(time.time()-s1)
if(l<0):
list_.append(1)
else:
list_.append(l)
score_non_seed_nut.append(stats.gmean(list_))
#print(time.time()-s2)
#print(k)
k=k+1
print("score_non_seed_nut_complete :)")
end3=time.time()
print("Process Complete :)")
time3=end3-start3
print(time3)
total_time=time1+time2+time3
print(total_time)
prob_cond_word={'word':list(uni_non_seed),'score_non_seed_opp':score_non_seed_opp,'score_non_seed_sup':score_non_seed_sup,'score_non_seed_nut':score_non_seed_nut}
NonSeed_lexicon = pd.DataFrame(data=prob_cond_word)
#Tweet Vector Formation
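# Merge seed and non-seed scores into a single lexicon: word -> {pmi_sup, pmi_opp, pmi_nut}.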
lex_word=[]
lex_word.extend(list(Seed_lexicon['word']))
lex_word.extend(list(NonSeed_lexicon['word']))
pmi_sup=[]
pmi_sup.extend(list(Seed_lexicon['pmi_FAVOR']))
pmi_sup.extend(list(NonSeed_lexicon['score_non_seed_sup']))
pmi_opp=[]
pmi_opp.extend(list(Seed_lexicon['pmi_AGAINST']))
pmi_opp.extend(list(NonSeed_lexicon['score_non_seed_opp']))
pmi_nut=[]
pmi_nut.extend(list(Seed_lexicon['pmi_NONE']))
pmi_nut.extend(list(NonSeed_lexicon['score_non_seed_nut']))
Lexicon = dict()
for i in range(len(lex_word)):
Lexicon[lex_word[i]] = {'pmi_sup':pmi_sup[i],'pmi_opp':pmi_opp[i],'pmi_nut':pmi_nut[i]}
print("Lexicon formed")
return Lexicon
#Lexicon={'word':lex_word,'pmi_sup':pmi_sup,'pmi_opp':pmi_opp,'pmi_nut':pmi_nut}
#Lexicon = pd.DataFrame(data=Lexicon)
# In[14]:
#Lexicon = build_lexicon('SC')
# In[26]:
def produce_features(name,Lexicon):
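# For each train/test sentence, average the pmi_sup/pmi_opp/pmi_nut scores of its stemmed tokens
# (over the tokens found in the lexicon) and write the result to ./pmi/pmi_<name>_<split>1.csv.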
#train_features
for l in ['train','test']:
raw=pd.read_csv('./MPHI_Preprocessed/'+name+'/{}.csv'.format(l))
Stop_words=set(stopwords.words('english'))
Features=raw[['sentence']].copy() # work on a copy so new columns can be added without a SettingWithCopyWarning
Tweet=Features['sentence'].copy()
Features['preprocessed_sentence']=Tweet.apply(sent_process)
Features['tokenized_sents'] = Features.apply(lambda row: (row['preprocessed_sentence'].split()), axis=1)
porter = PorterStemmer()
start=time.time()
#word_sup_vect=[]
#word_opp_vect=[]
#word_nut_vect=[]
data = [['sentence','pmi_sup','pmi_opp','pmi_nut']]
len_lexicon_word=len(Lexicon)
for i in range(len(Features['sentence'])):
sum1=0
sum2=0
sum3=0
total_lex=0
temp = []
for word in Features['tokenized_sents'][i]:
#for j in range(len_lexicon_word):
w = porter.stem(word)
if w in Lexicon:
sum1=sum1+Lexicon[w]['pmi_sup']
sum2=sum2+Lexicon[w]['pmi_opp']
sum3=sum3+Lexicon[w]['pmi_nut']
total_lex=total_lex+1
#word_sup_vect.append(sum1/total_lex)
#word_opp_vect.append(sum2/total_lex)
#word_nut_vect.append(sum3/total_lex)
# max(total_lex,1) guards against sentences with no lexicon hits (avoids a ZeroDivisionError)
data.append([Features['sentence'][i],sum1/max(total_lex,1),sum2/max(total_lex,1),sum3/max(total_lex,1)])
my_df = pd.DataFrame(data)
my_df.to_csv('./pmi/pmi_{}_{}.csv'.format(name,l+'1'),header=False,index=False)
end=time.time()
print(end-start)
# In[27]:
produce_features('HRT',build_lexicon('HRT'))
'''for dataset in ['AT','LA','CC','HC','FM']:
produce_features(dataset,build_lexicon(dataset))'''
# In[ ]:
# In[ ]:

SEN-SVM/SVM.py Normal file

@@ -0,0 +1,318 @@
#!/usr/bin/env python
# coding: utf-8
# In[9]:
import csv
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import wordninja
import re
import json
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import pandas as pd
# In[2]:
stemmer = PorterStemmer()
def load_glove_embeddings_set():
word2emb = []
WORD2VEC_MODEL = "../glove.6B.300d.txt"
fglove = open(WORD2VEC_MODEL,"r")
for line in fglove:
cols = line.strip().split()
word = cols[0]
word2emb.append(word)
fglove.close()
return set(word2emb)
def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"):
print("Creating Normalization Dictionary")
with open(no_slang_data, "r") as f:
data1 = json.load(f)
data2 = {}
with open(emnlp_dict,"r") as f:
lines = f.readlines()
for line in lines:
row = line.split('\t')
data2[row[0]] = row[1].rstrip()
normalization_dict = {**data1,**data2}
#print(normalization_dict)
return normalization_dict
word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict()
# In[3]:
def sent_process(sent):
sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent)
sent = re.sub(r"#SemST", "", sent)
sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent)
#sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent)
#sent = re.sub(r"([A-Z])", r" \1", sent)
sent = re.sub(r"\'s", " \'s", sent)
sent = re.sub(r"\'ve", " \'ve", sent)
sent = re.sub(r"n\'t", " n\'t", sent)
sent = re.sub(r"\'re", " \'re", sent)
sent = re.sub(r"\'d", " \'d", sent)
sent = re.sub(r"\'ll", " \'ll", sent)
sent = re.sub(r",", " , ", sent)
sent = re.sub(r"!", " ! ", sent)
sent = re.sub(r"\(", " ( ", sent)
sent = re.sub(r"\)", " ) ", sent)
sent = re.sub(r"\?", " ? ", sent)
sent = re.sub(r"\s{2,}", " ", sent)
sent = sent.strip()
word_tokens = sent.split()
normalised_tokens = []
for word in word_tokens:
if word in norm_dict:
#if False:
normalised_tokens.extend(norm_dict[word].lower().split(" "))
print(word," normalised to ",norm_dict[word])
else:
normalised_tokens.append(word.lower())
wordninja_tokens = []
for word in normalised_tokens:
if word in word_dict:
wordninja_tokens+=[word]
else:
wordninja_tokens+=wordninja.split(word)
return " ".join(wordninja_tokens)
# In[4]:
def svc_param_selection(X, y, nfolds):
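# Grid-search C and gamma over RBF and linear kernels with nfolds-fold cross-validation and return the best parameter set.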
Cs = [0.001, 0.01, 0.1, 1, 10,100 ]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = [{'C': Cs, 'gamma' : gammas , 'kernel' : ['rbf']},{'C': Cs , 'gamma' : gammas , 'kernel' : ['linear']}]
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
grid_search.fit(X, y)
grid_search.best_params_
return grid_search.best_params_
# In[21]:
def train(topic,bow=False,senti=False,sta=False,ent=False):
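# Train an SVM on the selected feature groups (bag-of-words, entailment, STA lexicon, sentiment)
# read from ./final_feature_set/<topic>_{train,test}.csv; test predictions and class probabilities
# are written to <save_file>.csv and (y_pred, y_test) is returned.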
save_file = topic+"_"
print(topic)
y_train = []
y_test = []
sentences = []
features_ent = []
features_sta = []
features_senti = []
senti_dict = {'Neutral' : 0, 'Positive' : 1, 'Negative' : 2}
with open("./final_feature_set/{}_train.csv".format(topic),"r",encoding='latin-1') as f:
reader = csv.DictReader(f, delimiter=',')
done = False
for row in reader:
sentences.append(row['sentence'])
if ent:
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
if not done:
save_file = save_file + "ent_"
if sta:
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
if not done:
save_file = save_file + "sta_"
if senti:
features_senti.append([senti_dict[row['senti']]])
if not done:
save_file = save_file + "senti_"
done = True
y_train.append(row['label'])
L = len(sentences)
with open("./final_feature_set/{}_test.csv".format(topic),"r",encoding='latin-1') as f:
reader = csv.DictReader(f, delimiter=',')
for row in reader:
sentences.append(row['sentence'])
if ent:
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
if sta:
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
if senti:
features_senti.append([senti_dict[row['senti']]])
y_test.append(row['label'])
all_features = []
if bow:
new_sentences = []
for sent in sentences:
tokens = word_tokenize(sent)
tokens = [stemmer.stem(token) for token in tokens]
ret = " ".join(w for w in tokens)
new_sentences.append(ret)
save_file = save_file + "bow_"
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,1),min_df = 2)
features_bow = vectorizer.fit_transform(new_sentences)
all_features.append(features_bow.toarray())
#features_bow_train = np.array(features_bow[:L].toarray())
#features_bow_test = np.array(features_bow[L:].toarray())
if ent:
#features_ent_train = np.array(features_ent[:L])
#features_ent_test = np.array(features_ent[L:])
all_features.append(features_ent)
if sta:
#features_ent_train = np.array(features_ent[:L])
#features_ent_test = np.array(features_ent[L:])
all_features.append(features_sta)
if senti:
#features_senti_train = np.array(features_senti[:L])
#features_senti_test = np.array(features_senti[L:])
all_features.append(features_senti)
dataset = np.concatenate(all_features,axis=1)
train_dataset = dataset[:L]
test_dataset = dataset[L:]
best_params = svc_param_selection(train_dataset,y_train,nfolds=5)
print(best_params)
if best_params['kernel'] == 'rbf':
model = svm.SVC(kernel='rbf' ,C = best_params['C'], gamma = best_params['gamma'],probability=True)
else:
model = svm.SVC(kernel='linear' ,C = best_params['C'],probability=True)
model.fit(train_dataset,y_train)
y_pred = model.predict(test_dataset)
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
#cm = confusion_matrix(y_test,y_pred,labels=['0','1','2'])
conf_score = model.predict_proba(test_dataset)
#print
df = pd.DataFrame(np.concatenate([np.array(sentences[L:]).reshape(-1,1),np.array(y_pred).reshape(-1,1),np.array(conf_score)],axis=1))
df.to_csv(save_file+".csv",header=False,index=False)
return y_pred,y_test
# In[22]:
'''
#BOW
print ('BOW---------')
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,bow = True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))'''
# In[23]:
print ("STA---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[7]:
print("ENT----------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[8]:
print("SENTI---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,senti=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[9]:
print("ENT_SENTI--------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True,senti=True,sta=False)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[10]:
print("ENT_STA---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True,senti=False,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[11]:
print("SENTI_STA-----------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=False,senti=True,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[12]:
print("ENT_SENTI_STA---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True,senti=True,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[ ]:

SEN-SVM/SVM_mpchi.py Normal file

@@ -0,0 +1,380 @@
#!/usr/bin/env python
# coding: utf-8
# In[9]:
import csv
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import wordninja
import re
import json
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import pandas as pd
# In[2]:
stemmer = PorterStemmer()
def load_glove_embeddings_set():
word2emb = []
WORD2VEC_MODEL = "../glove.6B.300d.txt"
fglove = open(WORD2VEC_MODEL,"r")
for line in fglove:
cols = line.strip().split()
word = cols[0]
word2emb.append(word)
fglove.close()
return set(word2emb)
def create_normalise_dict(no_slang_data = "noslang_data.json", emnlp_dict = "emnlp_dict.txt"):
print("Creating Normalization Dictionary")
with open(no_slang_data, "r") as f:
data1 = json.load(f)
data2 = {}
with open(emnlp_dict,"r") as f:
lines = f.readlines()
for line in lines:
row = line.split('\t')
data2[row[0]] = row[1].rstrip()
normalization_dict = {**data1,**data2}
#print(normalization_dict)
return normalization_dict
word_dict,norm_dict = load_glove_embeddings_set(),create_normalise_dict()
# In[3]:
def sent_process(sent):
sent = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", sent)
sent = re.sub(r"#SemST", "", sent)
sent = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", sent)
#sent = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", sent)
#sent = re.sub(r"([A-Z])", r" \1", sent)
sent = re.sub(r"\'s", " \'s", sent)
sent = re.sub(r"\'ve", " \'ve", sent)
sent = re.sub(r"n\'t", " n\'t", sent)
sent = re.sub(r"\'re", " \'re", sent)
sent = re.sub(r"\'d", " \'d", sent)
sent = re.sub(r"\'ll", " \'ll", sent)
sent = re.sub(r",", " , ", sent)
sent = re.sub(r"!", " ! ", sent)
sent = re.sub(r"\(", " ( ", sent)
sent = re.sub(r"\)", " ) ", sent)
sent = re.sub(r"\?", " ? ", sent)
sent = re.sub(r"\s{2,}", " ", sent)
sent = sent.strip()
word_tokens = sent.split()
normalised_tokens = []
for word in word_tokens:
if word in norm_dict:
#if False:
normalised_tokens.extend(norm_dict[word].lower().split(" "))
print(word," normalised to ",norm_dict[word])
else:
normalised_tokens.append(word.lower())
wordninja_tokens = []
for word in normalised_tokens:
if word in word_dict:
wordninja_tokens+=[word]
else:
wordninja_tokens+=wordninja.split(word)
return " ".join(wordninja_tokens)
# In[4]:
#{'C': c, 'gamma' : gammas , 'kernel' : ['rbf']},
def svc_param_selection(X, y, nfolds):
Cs = [0.001, 0.01, 0.1, 1, 10,100 ]
c = [1]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = [{'C': c , 'gamma' : gammas , 'kernel' : ['linear']}]
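# NOTE: only the linear kernel with C = 1 is searched here (gamma has no effect for a linear kernel);
# the Cs list above is defined but unused.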
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
grid_search.fit(X, y)
grid_search.best_params_
return grid_search.best_params_
# In[21]:
def train(topic,bow=False,senti=False,sta=False,ent=False,med = False):
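# Same as train() in SVM.py, with an additional 'med' feature group (med_aff / med_treat columns) for the MPCHI data.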
save_file = topic+"_"
print(topic)
y_train = []
y_test = []
sentences = []
features_ent = []
features_sta = []
features_senti = []
features_med = []
senti_dict = {'Neutral' : 0, 'Positive' : 1, 'Negative' : 2}
with open("./final_feature_set/{}_train.csv".format(topic),"r",encoding='latin-1') as f:
reader = csv.DictReader(f, delimiter=',')
done = False
for row in reader:
sentences.append(row['sentence'])
if ent:
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
if not done:
save_file = save_file + "ent_"
if sta:
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
if not done:
save_file = save_file + "sta_"
if senti:
features_senti.append([senti_dict[row['senti']]])
if not done:
save_file = save_file + "senti_"
if med:
features_med.append([row['med_aff'],row['med_treat']])
if not done:
save_file = save_file + "med"
done = True
y_train.append(row['label'])
L = len(sentences)
with open("./final_feature_set/{}_test.csv".format(topic),"r",encoding='latin-1') as f:
reader = csv.DictReader(f, delimiter=',')
for row in reader:
sentences.append(row['sentence'])
if ent:
features_ent.append([row['ent_nut'],row['ent_pos'],row['ent_neg']])
if sta:
features_sta.append([row['sta_nut'],row['sta_sup'],row['sta_opp']])
if senti:
features_senti.append([senti_dict[row['senti']]])
if med:
features_med.append([row['med_aff'],row['med_treat']])
y_test.append(row['label'])
all_features = []
if bow:
new_sentences = []
for sent in sentences:
tokens = word_tokenize(sent)
tokens = [stemmer.stem(token) for token in tokens]
ret = " ".join(w for w in tokens)
new_sentences.append(ret)
save_file = save_file + "bow_"
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,1),min_df = 2)
features_bow = vectorizer.fit_transform(new_sentences)
all_features.append(features_bow.toarray())
#features_bow_train = np.array(features_bow[:L].toarray())
#features_bow_test = np.array(features_bow[L:].toarray())
if ent:
#features_ent_train = np.array(features_ent[:L])
#features_ent_test = np.array(features_ent[L:])
all_features.append(features_ent)
if sta:
#features_ent_train = np.array(features_ent[:L])
#features_ent_test = np.array(features_ent[L:])
all_features.append(features_sta)
if senti:
#features_senti_train = np.array(features_senti[:L])
#features_senti_test = np.array(features_senti[L:])
all_features.append(features_senti)
if med:
all_features.append(features_med)
dataset = np.concatenate(all_features,axis=1)
train_dataset = dataset[:L]
test_dataset = dataset[L:]
best_params = svc_param_selection(train_dataset,y_train,nfolds=5)
print(best_params)
if best_params['kernel'] == 'rbf':
model = svm.SVC(kernel='rbf' ,C = best_params['C'], gamma = best_params['gamma'],probability=True)
else:
model = svm.SVC(kernel='linear' ,C = best_params['C'], gamma = best_params['gamma'],probability=True)
model.fit(train_dataset,y_train)
y_pred = model.predict(test_dataset)
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
#cm = confusion_matrix(y_test,y_pred,labels=['0','1','2'])
conf_score = model.predict_proba(test_dataset)
#print
df = pd.DataFrame(np.concatenate([np.array(sentences[L:]).reshape(-1,1),np.array(y_pred).reshape(-1,1),np.array(conf_score)],axis=1))
df.to_csv(save_file+".csv",header=False,index=False)
return y_pred,y_test
# In[22]:
'''
#BOW
print ('BOW---------')
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,bow = True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[23]:
print ("STA---------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[7]:
print("ENT----------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,ent=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[8]:
print("SENTI---------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,senti=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[9]:
print("ENT_SENTI--------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset ,senti=True , ent=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[10]:
print("ENT_STA---------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,ent=True,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[11]:
print("SENTI_STA-----------")
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,senti=True,sta=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[12]:
print("ENT_SENTI_STA---------")
y_pred,y_test = [],[]
for dataset in ['AT','CC','FM','LA','HC']:
a,b = train(dataset,ent=True,senti=True,sta=True,bow = True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
# In[ ]:
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta = True, med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,senti = True, med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta = True, senti = True, med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta = True, senti = True, med=True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))'''
y_pred,y_test = [],[]
for dataset in ['MMR','HRT','EC','VC','SC']:
a,b = train(dataset,sta = True, senti = True, med=True, ent = True, bow= True)
y_pred.extend(a)
y_test.extend(b)
print(len(a),len(b))
print(classification_report(y_test,y_pred,labels=['support','oppose'],target_names=['support','oppose']))


@@ -0,0 +1,78 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 03:05:06 2018
@author: shalmoli
"""
import pandas as pd
import csv
from pycorenlp import StanfordCoreNLP
import json
#reading Input file
data=pd.read_csv('HRT2.csv')
#Extracting Sentences
sentence=data['sentences']
length=len(sentence)
#storing Result
# Before executing this, the Stanford CoreNLP server must be started from a terminal.
'''The server returns a sentiment label per sentence.
If an abstract has more than one sentence, we take the majority sentiment across its sentences; ties fall through to Positive in the code below.'''
result=[]
# Reuse a single client for all requests; the CoreNLP server must already be running on localhost:9000.
nlp = StanfordCoreNLP('http://localhost:9000')
for i in range(length):
res = nlp.annotate(sentence[i],
properties={
'annotators': 'sentiment',
'outputFormat': 'json',
'timeout': 100000,
})
print (res)
count=0# counting number of sentences in input
count_1=0#counting Negative Sentiment
count_2=0#counting Neutral Sentiment
count_3=0#counting Positive Sentiment
for s in (res['sentences']):
if(s["sentimentValue"]):
count=count+1
if(s["sentiment"]=='Negative' or s["sentiment"]=='Verynegative'):
count_1=count_1+1
else:
if(s["sentiment"]=='Positive' or s["sentiment"]=='Verypositive'):
count_3=count_3+1
else:
count_2=count_2+1
if(count>1):
if(count_1 > count_2 and count_1 > count_3):
result.append("Negative")
else:
if(count_2 > count_1 and count_2 > count_3):
result.append('Neutral')
else:
result.append('Positive')
else:
if(s["sentiment"]=='Negative' or s["sentiment"]=='Verynegative'):
result.append("Negative")
else:
if(s["sentiment"]=='Positive' or s["sentiment"]=='Verypositive'):
result.append("Positive")
else:
result.append('Neutral')
# Storing the output to a file; each sentiment class gets a binary column plus the final sentiment label.
file = open('Sentiment_HRT2.csv','a')
fields = ('sentence','positive','negative','neutral','sentiment')
wr = csv.DictWriter(file, fieldnames=fields, lineterminator = '\n')
wr.writeheader()
for i in range(length):
if(result[i]=='Negative'):
wr.writerow({'sentence':sentence[i],'positive':0,'negative':1,'neutral':0,'sentiment':result[i]})
else:
if(result[i]=='Positive'):
wr.writerow({'sentence':sentence[i],'positive':1,'negative':0,'neutral':0,'sentiment':result[i]})
else:
wr.writerow({'sentence':sentence[i],'positive':0,'negative':0,'neutral':1,'sentiment':result[i]})
file.close()

SEN-SVM/te_f.py Normal file

@@ -0,0 +1,342 @@
def isnan(value):
try:
import math
return math.isnan(float(value))
except:
return False
#matplotlib inline
import tensorflow as tf
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import matplotlib.ticker as ticker
import urllib
import sys
import os
import csv
import zipfile
glove_zip_file = "glove.6B.zip"
glove_vectors_file = "glove.6B.50d.txt"
snli_zip_file = "snli_1.0.zip"
snli_dev_file = "snli_1.0_dev.txt"
snli_full_dataset_file = "snli_1.0_train.txt"
from six.moves.urllib.request import urlretrieve
#large file - 862 MB
if (not os.path.isfile(glove_zip_file) and
not os.path.isfile(glove_vectors_file)):
urlretrieve ("http://nlp.stanford.edu/data/glove.6B.zip",
glove_zip_file)
#medium-sized file - 94.6 MB
if (not os.path.isfile(snli_zip_file) and
not os.path.isfile(snli_dev_file)):
urlretrieve ("https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
snli_zip_file)
def unzip_single_file(zip_file_name, output_file_name):
"""
If the outFile is already created, don't recreate
If the outFile does not exist, create it from the zipFile
"""
if not os.path.isfile(output_file_name):
with open(output_file_name, 'wb') as out_file:
with zipfile.ZipFile(zip_file_name) as zipped:
for info in zipped.infolist():
if output_file_name in info.filename:
with zipped.open(info) as requested_file:
out_file.write(requested_file.read())
return
unzip_single_file(glove_zip_file, glove_vectors_file)
unzip_single_file(snli_zip_file, snli_dev_file)
glove_wordmap = {}
with open(glove_vectors_file, "r") as glove:
for line in glove:
name, vector = tuple(line.split(" ", 1))
glove_wordmap[name] = np.fromstring(vector, sep=" ")
def sentence2sequence(sentence):
"""
- Turns an input sentence into an (n,d) matrix,
where n is the number of tokens in the sentence
and d is the number of dimensions each word vector has.
"""
tokens = sentence.lower().split(" ")
rows = []
words = []
#Greedy search for tokens
for token in tokens:
i = len(token)
while len(token) > 0 and i > 0:
word = token[:i]
if word in glove_wordmap:
rows.append(glove_wordmap[word])
words.append(word)
token = token[i:]
i = len(token)
else:
i = i-1
return rows, words
rnn_size = 64
rnn = tf.contrib.rnn.BasicRNNCell(rnn_size)
#Constants setup
max_hypothesis_length, max_evidence_length = 60, 50
batch_size, vector_size, hidden_size = 128, 50, 64
lstm_size = hidden_size
weight_decay = 0.0001
learning_rate = 1
input_p, output_p = 0.5, 0.5
training_iterations_count = 100000
display_step = 10
def score_setup(row):
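# Convert the five annotator labels of an SNLI row into a normalised (entailment, neutral, contradiction) score vector.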
convert_dict = {
'entailment': 0,
'neutral': 1,
'contradiction': 2
}
score = np.zeros((3,))
for x in range(1,6):
tag = row["label"+str(x)]
if tag in convert_dict: score[convert_dict[tag]] += 1
return score / (1.0*np.sum(score))
def fit_to_size(matrix, shape):
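# Zero-pad (or truncate) a (tokens, dims) matrix to the requested shape.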
res = np.zeros(shape)
slices = tuple(slice(0,min(dim,shape[e])) for e, dim in enumerate(matrix.shape)) # tuple indexing; a list of slices is deprecated in NumPy
res[slices] = matrix[slices]
return res
def split_data_into_scores():
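# Read the SNLI dev split and turn each sentence pair into fixed-size GloVe matrices plus a label distribution.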
import csv
with open("snli_1.0_dev.txt","r") as data:
train = csv.DictReader(data, delimiter='\t')
evi_sentences = []
hyp_sentences = []
labels = []
scores = []
for row in train:
hyp_sentences.append(np.vstack(
sentence2sequence(row["sentence1"].lower())[0]))
evi_sentences.append(np.vstack(
sentence2sequence(row["sentence2"].lower())[0]))
labels.append(row["gold_label"])
scores.append(score_setup(row))
hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
for x in hyp_sentences])
evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
for x in evi_sentences])
return (hyp_sentences, evi_sentences), labels, np.array(scores)
data_feature_list, correct_values, correct_scores = split_data_into_scores()
l_h, l_e = max_hypothesis_length, max_evidence_length
N, D, H = batch_size, vector_size, hidden_size
l_seq = l_h + l_e
tf.reset_default_graph()
# lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
lstm = tf.nn.rnn_cell.LSTMCell(lstm_size)
lstm_drop = tf.contrib.rnn.DropoutWrapper(lstm, input_p, output_p)
# N: The number of elements in each of our batches,
# which we use to train subsets of data for efficiency's sake.
# l_h: The maximum length of a hypothesis, or the second sentence. This is
# used because training an RNN is extraordinarily difficult without
# rolling it out to a fixed length.
# l_e: The maximum length of evidence, the first sentence. This is used
# because training an RNN is extraordinarily difficult without
# rolling it out to a fixed length.
# D: The size of our used GloVe or other vectors.
hyp = tf.placeholder(tf.float32, [N, l_h, D], 'hypothesis')
evi = tf.placeholder(tf.float32, [N, l_e, D], 'evidence')
y = tf.placeholder(tf.float32, [N, 3], 'label')
# hyp: Where the hypotheses will be stored during training.
# evi: Where the evidences will be stored during training.
# y: Where correct scores will be stored during training.
# lstm_size: the size of the gates in the LSTM,
# as in the first LSTM layer's initialization.
# lstm_back = tf.contrib.rnn.BasicLSTMCell(lstm_size)
lstm_back = tf.nn.rnn_cell.LSTMCell(lstm_size)
# lstm_back: The LSTM used for looking backwards
# through the sentences, similar to lstm.
# input_p: the probability that inputs to the LSTM will be retained at each
# iteration of dropout.
# output_p: the probability that outputs from the LSTM will be retained at
# each iteration of dropout.
lstm_drop_back = tf.contrib.rnn.DropoutWrapper(lstm_back, input_p, output_p)
# lstm_drop_back: A dropout wrapper for lstm_back, like lstm_drop.
fc_initializer = tf.random_normal_initializer(stddev=0.1)
# fc_initializer: initial values for the fully connected layer's weights.
# hidden_size: the size of the outputs from each lstm layer.
# Multiplied by 2 to account for the two LSTMs.
fc_weight = tf.get_variable('fc_weight', [2*hidden_size, 3],
initializer = fc_initializer)
# fc_weight: Storage for the fully connected layer's weights.
fc_bias = tf.get_variable('bias', [3])
# fc_bias: Storage for the fully connected layer's bias.
# tf.GraphKeys.REGULARIZATION_LOSSES: A key to a collection in the graph
# designated for losses due to regularization.
# In this case, this portion of loss is regularization on the weights
# for the fully connected layer.
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
tf.nn.l2_loss(fc_weight))
x = tf.concat([hyp, evi], 1) # N, (Lh+Le), d
# Permuting batch_size and n_steps
x = tf.transpose(x, [1, 0, 2]) # (Le+Lh), N, d
# Reshaping to (n_steps*batch_size, n_input)
x = tf.reshape(x, [-1, vector_size]) # (Le+Lh)*N, d
# Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
x = tf.split(x, l_seq,)
# x: the inputs to the bidirectional_rnn
# tf.contrib.rnn.static_bidirectional_rnn: Runs the input through
# two recurrent networks, one that runs the inputs forward and one
# that runs the inputs in reversed order, combining the outputs.
rnn_outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm, lstm_back,x, dtype=tf.float32)
# rnn_outputs: the list of LSTM outputs, as a list.
# What we want is the latest output, rnn_outputs[-1]
classification_scores = tf.matmul(rnn_outputs[-1], fc_weight) + fc_bias
# The scores are relative certainties for how likely the output matches
# a certain entailment:
# 0: Positive entailment
# 1: Neutral entailment
# 2: Negative entailment
with tf.variable_scope('Accuracy'):
predicts = tf.cast(tf.argmax(classification_scores, 1), 'int32')
y_label = tf.cast(tf.argmax(y, 1), 'int32')
corrects = tf.equal(predicts, y_label)
num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))
with tf.variable_scope("loss"):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
logits = classification_scores, labels = y)
loss = tf.reduce_mean(cross_entropy)
total_loss = loss + weight_decay * tf.add_n(
tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
opt_op = optimizer.minimize(total_loss)
# Initialize variables
init = tf.global_variables_initializer()
# Use TQDM if installed
tqdm_installed = False
try:
from tqdm import tqdm
tqdm_installed = True
except ImportError:
pass
# Launch the Tensorflow session
sess = tf.Session()
sess.run(init)
# training_iterations_count: The number of data pieces to train on in total
# batch_size: The number of data pieces per batch
training_iterations = range(0,training_iterations_count,batch_size)
if tqdm_installed:
# Add a progress bar if TQDM is installed
training_iterations = tqdm(training_iterations)
for i in training_iterations:
# Select indices for a random data subset
batch = np.random.randint(data_feature_list[0].shape[0], size=batch_size)
# Use the selected subset indices to initialize the graph's
# placeholder values
hyps, evis, ys = (data_feature_list[0][batch,:],
data_feature_list[1][batch,:],
correct_scores[batch])
# Run the optimization with these initialized values
sess.run([opt_op], feed_dict={hyp: hyps, evi: evis, y: ys})
# display_step: how often the accuracy and loss should
# be tested and displayed.
if (i/batch_size) % display_step == 0:
# Calculate batch accuracy
acc = sess.run(accuracy, feed_dict={hyp: hyps, evi: evis, y: ys})
# Calculate batch loss
tmp_loss = sess.run(loss, feed_dict={hyp: hyps, evi: evis, y: ys})
# Display results
print("Iter " + str(i/batch_size) + ", Minibatch Loss= " + \
"{:.6f}".format(tmp_loss) + ", Training Accuracy= " + \
"{:.5f}".format(acc))
Features_pmh=pd.read_csv('Climate1.csv')
length_features=len(Features_pmh)
result=[]
pred=[]
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer
tokenizer = PunktSentenceTokenizer()
text=Features_pmh['Tweet'].copy()
def sent_process(sent):
sent = sent.translate(str.maketrans('', '', string.punctuation))
sent = [word for word in sent.split() if word.lower() not in stopwords.words('english')]
return " ".join(sent)
Features_pmh['Tweet']=text.apply(sent_process)
file = open('Climate_t.csv','a')
fields = ('Text','hypotheses','result','pos_scr','neg_scr','nut_scr')
wr = csv.DictWriter(file, fieldnames=fields, lineterminator = '\n')
wr.writeheader()
for i in range(length_features):
# NOTE: both branches are identical, so this NaN check currently has no effect;
# NaN tweets are passed through unchanged.
if(isnan(Features_pmh['Tweet'][i])==False):
evidences = [Features_pmh['Tweet'][i]]
else:
evidences = [Features_pmh['Tweet'][i]]
hypotheses = ["Climate Change is a Real Concern"]
sentence1 = [fit_to_size(np.vstack(sentence2sequence(evidence)[0]),(60, 50)) for evidence in evidences]
sentence2 = [fit_to_size(np.vstack(sentence2sequence(hypothesis)[0]),(50,50)) for hypothesis in hypotheses]
prediction = sess.run(classification_scores, feed_dict={hyp: (sentence1 * N),evi: (sentence2 * N),y: [[0,0,0]]*N})
#print(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])]+" entailment")
result.append(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])])
pred.append(prediction[0])
# score order is [entailment (Positive), neutral, contradiction (Negative)]
wr.writerow({'Text':Features_pmh['Tweet'][i],'hypotheses':hypotheses,"result" :result[i],'pos_scr':pred[i][0],'neg_scr':pred[i][2],'nut_scr':pred[i][1]})
file.close()


@@ -18,7 +18,7 @@ class LSTM_TAN(nn.Module):
self.hidden_dim = hidden_dim
self.embedding_dim = embedding_dim
#WORD_EMBEDDINGS
self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
self.word_embeddings.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float))
self.word_embeddings.weight.requires_grad=True
@@ -30,14 +30,10 @@ class LSTM_TAN(nn.Module):
self.attention = nn.Linear(2*embedding_dim,1)
#LSTM
# The LSTM takes word embeddings as inputs, and outputs hidden states
# with dimensionality hidden_dim.
self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=(version!="lstm"))
self.dropout = nn.Dropout(dropout)
#FINAL_LAYER
if version !="lstm":
self.hidden2target = nn.Linear(2*self.hidden_dim, n_targets)
else:
@@ -46,10 +42,6 @@ class LSTM_TAN(nn.Module):
self.hidden = self.init_hidden()
def init_hidden(self):
# Before we've done anything, we dont have any hidden state.
# Refer to the Pytorch documentation to see exactly
# why they have this dimensionality.
# The axes semantics are (num_layers, minibatch_size, hidden_dim)
return (torch.zeros(1, 1, self.hidden_dim),
torch.zeros(1, 1, self.hidden_dim))
@@ -62,11 +54,8 @@ class LSTM_TAN(nn.Module):
if version != "tan-":
t_emb = self.word_embeddings(target)
#print(t_emb)
#print(torch.mean(t_emb,dim=0,keepdim=True).shape)
t_emb = torch.mean(t_emb,dim=0,keepdim=True)
xt_emb = torch.cat((x_emb,t_emb.expand(len(sentence),-1)),dim=1)
#print(xt_emb)
if version == "tan-":
lstm_out, _ = self.lstm(
@@ -96,12 +85,3 @@ class LSTM_TAN(nn.Module):
return target_scores
#t_emb = self.word_embeddings(target)
#print(t_emb)
#print(torch.mean(t_emb,dim=0,keepdim=True).shape)
#t_emb = torch.mean(t_emb,dim=0,keep dim=True)
#xt_emb = torch.cat((x_emb,t_emb.expand(len(sentence),-1)),dim=1)
#print(xt_emb)
# In[26]: