def isnan(value):
    try:
        import math
        return math.isnan(float(value))
    except (TypeError, ValueError):
        return False

# %matplotlib inline

import tensorflow as tf
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import matplotlib.ticker as ticker
import urllib
import sys
import os
import csv
import zipfile

glove_zip_file = "glove.6B.zip"
glove_vectors_file = "glove.6B.50d.txt"

snli_zip_file = "snli_1.0.zip"
snli_dev_file = "snli_1.0_dev.txt"
snli_full_dataset_file = "snli_1.0_train.txt"

from six.moves.urllib.request import urlretrieve

# large file - 862 MB
if (not os.path.isfile(glove_zip_file) and
        not os.path.isfile(glove_vectors_file)):
    urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip",
                glove_zip_file)

# medium-sized file - 94.6 MB
if (not os.path.isfile(snli_zip_file) and
        not os.path.isfile(snli_dev_file)):
    urlretrieve("https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
                snli_zip_file)

def unzip_single_file(zip_file_name, output_file_name):
    """
    If the output file already exists, don't recreate it.
    If it does not exist, extract it from the zip file.
    """
    if not os.path.isfile(output_file_name):
        with open(output_file_name, 'wb') as out_file:
            with zipfile.ZipFile(zip_file_name) as zipped:
                for info in zipped.infolist():
                    if output_file_name in info.filename:
                        with zipped.open(info) as requested_file:
                            out_file.write(requested_file.read())
                            return

unzip_single_file(glove_zip_file, glove_vectors_file)
unzip_single_file(snli_zip_file, snli_dev_file)

glove_wordmap = {}
with open(glove_vectors_file, "r", encoding="utf-8") as glove:
    for line in glove:
        name, vector = tuple(line.split(" ", 1))
        glove_wordmap[name] = np.fromstring(vector, sep=" ")
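
# Quick sanity check of the loaded embeddings (hedged, illustrative:
# assumes the standard glove.6B vocabulary, where "the" is present and
# the 50d file yields 50-dimensional vectors):
assert glove_wordmap["the"].shape == (50,)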

def sentence2sequence(sentence):
    """
    Turns an input sentence into an (n, d) matrix,
    where n is the number of tokens in the sentence
    and d is the number of dimensions each word vector has.
    """
    tokens = sentence.lower().split(" ")
    rows = []
    words = []
    # Greedy search for tokens
    for token in tokens:
        i = len(token)
        while len(token) > 0 and i > 0:
            word = token[:i]
            if word in glove_wordmap:
                rows.append(glove_wordmap[word])
                words.append(word)
                token = token[i:]
                i = len(token)
            else:
                i = i - 1
    return rows, words
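
# Usage sketch for the greedy matcher (illustrative; the exact split
# depends on the vocabulary that was loaded — for glove.6B all four
# tokens below are common words, so each maps to a single vector):
demo_rows, demo_words = sentence2sequence("the quick brown fox")
# np.vstack(demo_rows).shape -> (4, 50); demo_words -> the four tokens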

rnn_size = 64
rnn = tf.contrib.rnn.BasicRNNCell(rnn_size)

# Constants setup
max_hypothesis_length, max_evidence_length = 60, 50
batch_size, vector_size, hidden_size = 128, 50, 64

lstm_size = hidden_size

weight_decay = 0.0001

learning_rate = 1

input_p, output_p = 0.5, 0.5

training_iterations_count = 100000

display_step = 10

def score_setup(row):
    convert_dict = {
        'entailment': 0,
        'neutral': 1,
        'contradiction': 2
    }
    score = np.zeros((3,))
    for x in range(1, 6):
        tag = row["label" + str(x)]
        if tag in convert_dict:
            score[convert_dict[tag]] += 1
    return score / (1.0 * np.sum(score))
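
# Example, for intuition: if four of a row's five annotator labels are
# "entailment" and one is "neutral", score_setup returns the vote
# distribution [0.8, 0.2, 0.0].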

def fit_to_size(matrix, shape):
    res = np.zeros(shape)
    # One slice per axis; newer NumPy requires a tuple (not a list)
    # for multi-axis indexing
    slices = tuple(slice(0, min(dim, shape[e])) for e, dim in enumerate(matrix.shape))
    res[slices] = matrix[slices]
    return res
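
# fit_to_size pads with zeros or crops, e.g. a 3-token sentence matrix
# grows to the fixed RNN unroll length (an illustrative check):
assert fit_to_size(np.ones((3, 50)), (60, 50)).shape == (60, 50)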

def split_data_into_scores():
    import csv
    with open(snli_dev_file, "r") as data:
        train = csv.DictReader(data, delimiter='\t')
        evi_sentences = []
        hyp_sentences = []
        labels = []
        scores = []
        for row in train:
            hyp_sentences.append(np.vstack(
                sentence2sequence(row["sentence1"].lower())[0]))
            evi_sentences.append(np.vstack(
                sentence2sequence(row["sentence2"].lower())[0]))
            labels.append(row["gold_label"])
            scores.append(score_setup(row))

        hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
                                  for x in hyp_sentences])
        evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
                                  for x in evi_sentences])

        return (hyp_sentences, evi_sentences), labels, np.array(scores)

data_feature_list, correct_values, correct_scores = split_data_into_scores()
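
# Resulting shapes (hedged: assumes the standard SNLI dev split of ~10k pairs):
# data_feature_list[0] -> (num_examples, 60, 50) hypothesis matrices
# data_feature_list[1] -> (num_examples, 50, 50) evidence matrices
# correct_scores       -> (num_examples, 3) soft label distributions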

l_h, l_e = max_hypothesis_length, max_evidence_length
N, D, H = batch_size, vector_size, hidden_size
l_seq = l_h + l_e

tf.reset_default_graph()

# lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
lstm = tf.nn.rnn_cell.LSTMCell(lstm_size)
lstm_drop = tf.contrib.rnn.DropoutWrapper(lstm, input_p, output_p)

# N: The number of elements in each of our batches,
#    which we use to train subsets of data for efficiency's sake.
# l_h: The maximum length of a hypothesis, or the second sentence. This is
#    used because training an RNN is extraordinarily difficult without
#    rolling it out to a fixed length.
# l_e: The maximum length of evidence, the first sentence. This is used
#    because training an RNN is extraordinarily difficult without
#    rolling it out to a fixed length.
# D: The size of our used GloVe or other vectors.
hyp = tf.placeholder(tf.float32, [N, l_h, D], 'hypothesis')
evi = tf.placeholder(tf.float32, [N, l_e, D], 'evidence')
y = tf.placeholder(tf.float32, [N, 3], 'label')
# hyp: Where the hypotheses will be stored during training.
# evi: Where the evidences will be stored during training.
# y: Where correct scores will be stored during training.

# lstm_size: the size of the gates in the LSTM,
#    as in the first LSTM layer's initialization.
# lstm_back = tf.contrib.rnn.BasicLSTMCell(lstm_size)
lstm_back = tf.nn.rnn_cell.LSTMCell(lstm_size)
# lstm_back: The LSTM used for looking backwards
#    through the sentences, similar to lstm.

# input_p: the probability that inputs to the LSTM will be retained at each
#    iteration of dropout.
# output_p: the probability that outputs from the LSTM will be retained at
#    each iteration of dropout.
lstm_drop_back = tf.contrib.rnn.DropoutWrapper(lstm_back, input_p, output_p)
# lstm_drop_back: A dropout wrapper for lstm_back, like lstm_drop.

fc_initializer = tf.random_normal_initializer(stddev=0.1)
# fc_initializer: initial values for the fully connected layer's weights.
# hidden_size: the size of the outputs from each lstm layer.
#    Multiplied by 2 to account for the two LSTMs.
fc_weight = tf.get_variable('fc_weight', [2*hidden_size, 3],
                            initializer=fc_initializer)
# fc_weight: Storage for the fully connected layer's weights.
fc_bias = tf.get_variable('bias', [3])
# fc_bias: Storage for the fully connected layer's bias.

# tf.GraphKeys.REGULARIZATION_LOSSES: A key to a collection in the graph
#    designated for losses due to regularization.
#    In this case, this portion of loss is regularization on the weights
#    for the fully connected layer.
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                     tf.nn.l2_loss(fc_weight))

x = tf.concat([hyp, evi], 1)    # N, (Lh+Le), d
# Permuting batch_size and n_steps
x = tf.transpose(x, [1, 0, 2])  # (Le+Lh), N, d
# Reshaping to (n_steps*batch_size, n_input)
x = tf.reshape(x, [-1, vector_size])  # (Le+Lh)*N, d
# Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
x = tf.split(x, l_seq)

# x: the inputs to the bidirectional_rnn

# tf.contrib.rnn.static_bidirectional_rnn: Runs the input through
#    two recurrent networks, one that runs the inputs forward and one
#    that runs the inputs in reversed order, combining the outputs.
rnn_outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
    lstm, lstm_back, x, dtype=tf.float32)
# rnn_outputs: the list of LSTM outputs, as a list.
#    What we want is the latest output, rnn_outputs[-1]

classification_scores = tf.matmul(rnn_outputs[-1], fc_weight) + fc_bias
# The scores are relative certainties for how likely the output matches
# a certain entailment:
#    0: Positive entailment
#    1: Neutral entailment
#    2: Negative entailment
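
# classification_scores are unnormalized logits; a probability view
# (illustrative, not used elsewhere in this script) would be:
# probs = tf.nn.softmax(classification_scores)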

with tf.variable_scope('Accuracy'):
    predicts = tf.cast(tf.argmax(classification_scores, 1), 'int32')
    y_label = tf.cast(tf.argmax(y, 1), 'int32')
    corrects = tf.equal(predicts, y_label)
    num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
    accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))

with tf.variable_scope("loss"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=classification_scores, labels=y)
    loss = tf.reduce_mean(cross_entropy)
    total_loss = loss + weight_decay * tf.add_n(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
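
# Note: softmax_cross_entropy_with_logits_v2 accepts "soft" label
# distributions, which is why score_setup returns annotator vote
# fractions rather than one-hot labels.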

optimizer = tf.train.GradientDescentOptimizer(learning_rate)

opt_op = optimizer.minimize(total_loss)

# Initialize variables
init = tf.global_variables_initializer()

# Use TQDM if installed
tqdm_installed = False
try:
    from tqdm import tqdm
    tqdm_installed = True
except ImportError:
    pass

# Launch the Tensorflow session
sess = tf.Session()
sess.run(init)

# training_iterations_count: The number of data pieces to train on in total
# batch_size: The number of data pieces per batch
training_iterations = range(0, training_iterations_count, batch_size)
if tqdm_installed:
    # Add a progress bar if TQDM is installed
    training_iterations = tqdm(training_iterations)

for i in training_iterations:

    # Select indices for a random data subset
    batch = np.random.randint(data_feature_list[0].shape[0], size=batch_size)

    # Use the selected subset indices to initialize the graph's
    # placeholder values
    hyps, evis, ys = (data_feature_list[0][batch, :],
                      data_feature_list[1][batch, :],
                      correct_scores[batch])

    # Run the optimization with these initialized values
    sess.run([opt_op], feed_dict={hyp: hyps, evi: evis, y: ys})
    # display_step: how often the accuracy and loss should
    #    be tested and displayed.
    if (i // batch_size) % display_step == 0:
        # Calculate batch accuracy
        acc = sess.run(accuracy, feed_dict={hyp: hyps, evi: evis, y: ys})
        # Calculate batch loss
        tmp_loss = sess.run(loss, feed_dict={hyp: hyps, evi: evis, y: ys})
        # Display results
        print("Iter " + str(i // batch_size) + ", Minibatch Loss= " +
              "{:.6f}".format(tmp_loss) + ", Training Accuracy= " +
              "{:.5f}".format(acc))

Features_pmh = pd.read_csv('Climate1.csv')

length_features = len(Features_pmh)
result = []
pred = []

import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer
tokenizer = PunktSentenceTokenizer()

text = Features_pmh['Tweet'].copy()

def sent_process(sent):
    # Leave NaN/non-string entries untouched; they are skipped later
    if not isinstance(sent, str):
        return sent
    # Strip punctuation, then drop English stopwords
    sent = sent.translate(str.maketrans('', '', string.punctuation))
    sent = [word for word in sent.split()
            if word.lower() not in stopwords.words('english')]
    return " ".join(sent)

Features_pmh['Tweet'] = text.apply(sent_process)
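
# Worked example of sent_process (hedged: depends on NLTK's English
# stopword list): "Climate change is real!" -> punctuation stripped ->
# "is" dropped as a stopword -> "Climate change real".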

file = open('Climate_t.csv', 'a')
fields = ('Text', 'hypotheses', 'result', 'pos_scr', 'neg_scr', 'nut_scr')
wr = csv.DictWriter(file, fieldnames=fields, lineterminator='\n')
wr.writeheader()

for i in range(length_features):
    # Skip rows whose Tweet field is NaN/empty
    if isnan(Features_pmh['Tweet'][i]):
        continue
    evidences = [Features_pmh['Tweet'][i]]

    hypotheses = ["Climate Change is a Real Concern"]

    sentence1 = [fit_to_size(np.vstack(sentence2sequence(evidence)[0]),
                             (max_hypothesis_length, vector_size))
                 for evidence in evidences]

    sentence2 = [fit_to_size(np.vstack(sentence2sequence(hypothesis)[0]),
                             (max_evidence_length, vector_size))
                 for hypothesis in hypotheses]

    # (sentence1 * N) repeats the single example across the fixed batch
    # dimension N of the placeholders; y is fed a dummy value.
    prediction = sess.run(classification_scores,
                          feed_dict={hyp: (sentence1 * N),
                                     evi: (sentence2 * N),
                                     y: [[0, 0, 0]] * N})
    # print(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])] + " entailment")
    result.append(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])])
    pred.append(prediction[0])
    # Score indices: 0 = positive (entailment), 1 = neutral, 2 = negative
    # (contradiction), matched here to the pos/neg/nut column names
    wr.writerow({'Text': Features_pmh['Tweet'][i], 'hypotheses': hypotheses,
                 'result': result[-1], 'pos_scr': pred[-1][0],
                 'neg_scr': pred[-1][2], 'nut_scr': pred[-1][1]})

file.close()