# -*- coding: utf-8 -*-
import numpy as np
import cPickle
from collections import defaultdict
import sys, re
import pandas as pd

MAX_SENTENCE_LEN = 100
EMBEDDING_DIMENSION = 300
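
# Usage sketch (the script file name is assumed; it is not given in the source):
#   python process_data.py <w2v_file1> <w2v_file2 or "null"> <test_file or "null"> <data_dir>
# <data_dir> must contain T6SA_stance.pos, T6SA_stance.neg and T6SA_stance.none;
# the processed dataset is pickled to <data_dir>/mr.p (see __main__ below).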


def process_T6SA_data(line):
    """Parse one tab-separated line of the T6SA test file into [message, stance]."""
    tmp = line.split('\t')
    # Column 2 holds the tweet text, column 3 the stance label.
    msg = tmp[2]
    msg = msg.replace("#SemST", "")
    msg = msg.replace("\r\n", "\n")
    msg = msg.lower()
    msg = msg + "\n"
    # Map the stance label to an integer class id (strip() makes the comparison
    # robust to trailing newlines / carriage returns).
    stance = tmp[3].strip()
    if stance == "AGAINST":
        stance = 0
    if stance == "FAVOR":
        stance = 1
    if stance == "NONE":
        stance = 2
    if stance == "UNKNOWN":
        stance = 3
    # Tweets marked "Not Available" are blanked out and skipped by the caller.
    if msg == "not available\n":
        msg = ""
    return [msg, stance]
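
# Illustrative example (made-up input line in the expected 4-column format):
#   process_T6SA_data("101\tAtheism\tWe are all children of God #SemST\tAGAINST\n")
#   -> ["we are all children of god \n", 0]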


def build_data_cv(data_folder, test_folder, cv=10, clean_string=True):
    """
    Loads the data and splits it into `cv` folds (10 by default).
    The training data should be preprocessed into three files: pos, neg and none.
    """
    revs, revs_test = [], []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    cen_file = data_folder[2]
    print pos_file
    print neg_file
    print cen_file
    vocab = defaultdict(float)

    # Each training file holds one tweet per line; the class label is implied by
    # the file: pos -> 1, neg -> 0, none -> 2.
    for fname, label in [(pos_file, 1), (neg_file, 0), (cen_file, 2)]:
        with open(fname, "rb") as f:
            for line in f:
                rev = [line.strip()]
                if clean_string:
                    orig_rev = clean_str(" ".join(rev))
                else:
                    orig_rev = " ".join(rev).lower()
                words = set(orig_rev.split())
                for word in words:
                    vocab[word] += 1
                datum = {"y": label,
                         "text": orig_rev,
                         "num_words": len(orig_rev.split()),
                         "split": np.random.randint(0, cv)}
                revs.append(datum)

    # If test_folder is "null", 10% of the dataset (one of the cv folds) is used
    # for testing. Otherwise the given test file is loaded so that prediction can
    # be performed on it.
    if test_folder != "null":
        print("Test Data loading....")
        with open(test_folder, "r") as f:
            for line in f:
                if line.split('\t')[0] == "ID":
                    continue
                # The original code branched on the test file's name here, but the
                # check was a tautology, so the T6SA parser is always used.
                msg, stance = process_T6SA_data(line)
                if msg == "":
                    continue
                rev = [msg.strip()]
                if clean_string:
                    orig_rev = clean_str(" ".join(rev))
                else:
                    orig_rev = " ".join(rev).lower()
                words = set(orig_rev.split())
                for word in words:
                    vocab[word] += 1
                datum = {"y": stance,
                         "text": orig_rev,
                         "num_words": len(orig_rev.split()),
                         "split": np.random.randint(0, cv)}
                if datum["num_words"] > MAX_SENTENCE_LEN:
                    continue
                revs_test.append(datum)
    return revs, revs_test, vocab
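
# Each entry of revs / revs_test is a dict of the form (values illustrative):
#   {"y": 1, "text": "we are all children of god", "num_words": 6, "split": 7}
# where "split" is the randomly assigned cross-validation fold in [0, cv).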


def get_W(word_vecs, k=300):
    """
    Get word matrix. W[i] is the vector for the word indexed by i.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k))
    W[0] = np.zeros(k)
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
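
# Minimal usage sketch (hypothetical tokens, not from the source data):
#   W, word_idx_map = get_W(w2v, k=EMBEDDING_DIMENSION)
#   ids = [word_idx_map[w] for w in "hello world".split() if w in word_idx_map]
#   sent_matrix = W[ids]    # shape (len(ids), EMBEDDING_DIMENSION); row 0 of W stays all-zero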


def load_bin_vec(fname1, fname2, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec binary files.
    Two word2vec files can be used together; vectors from fname2 overwrite
    vectors from fname1 when both files contain the same word.
    If only one word2vec file is used, fname2 is "null".
    """
    word_vecs = {}

    def read_bin_file(fname):
        # Binary word2vec layout: an ASCII header "<vocab_size> <dim>\n", then for
        # each word a space-terminated token followed by <dim> float32 values.
        with open(fname, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * layer1_size
            for _ in xrange(vocab_size):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':
                        word.append(ch)
                # Only keep vectors for words that actually occur in our vocab.
                if word in vocab:
                    word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)

    read_bin_file(fname1)
    if fname2 != "null":
        read_bin_file(fname2)
    return word_vecs


def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.
    0.25 is chosen so the unknown vectors have (approximately) the same variance
    as the pre-trained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
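
# Note: in __main__ below, add_unknown_words() is called after load_bin_vec() and
# before get_W(), so every vocabulary word has some vector (pre-trained where
# available, random otherwise) before the index matrix W is built.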


def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets.
    Every dataset is lower cased.
    Original taken from https://github.com/dennybritz/cnn-text-classification-tf
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", string)
    string = re.sub(r"#SemST", "", string)
    string = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", string)
    #string = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", string)
    #string = re.sub(r"([A-Z])", r" \1", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()
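
# Illustrative behaviour on a made-up string:
#   clean_str("Don't #Stop!")  ->  "do n't # stop # !"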
if __name__=="__main__":
|
||
|
w2v_file1 = sys.argv[1]
|
||
|
w2v_file2 = sys.argv[2]
|
||
|
test_folder = sys.argv[3]
|
||
|
path = sys.argv[4]
|
||
|
data_folder = [path+"/T6SA_stance.pos",path+"/T6SA_stance.neg",path+"/T6SA_stance.none"]
|
||
|
print test_folder
|
||
|
print "loading data...",
|
||
|
revs, revs_test, vocab = build_data_cv(data_folder, test_folder, cv=10, clean_string=True)
|
||
|
print pd.DataFrame(revs)
|
||
|
max_l = np.max(pd.DataFrame(revs)["num_words"])
|
||
|
print "data loaded!"
|
||
|
print "number of train sentences: " + str(len(revs))
|
||
|
print "number of out-test sentences: " + str(len(revs_test))
|
||
|
print "vocab size: " + str(len(vocab))
|
||
|
print "max sentence length: " + str(max_l)
|
||
|
print "loading word2vec vectors...",
|
||
|
w2v = load_bin_vec(w2v_file1, w2v_file2, vocab)
|
||
|
print "word2vec loaded!"
|
||
|
print "num words already in word2vec: " + str(len(w2v))
|
||
|
add_unknown_words(w2v, vocab, k=EMBEDDING_DIMENSION)
|
||
|
W, word_idx_map = get_W(w2v, k=EMBEDDING_DIMENSION)
|
||
|
rand_vecs = {}
|
||
|
add_unknown_words(rand_vecs, vocab,k=EMBEDDING_DIMENSION)
|
||
|
W2, _ = get_W(rand_vecs,k=EMBEDDING_DIMENSION)
|
||
|
|
||
|
cPickle.dump([revs, revs_test, W, W2, word_idx_map, vocab], open(path+"/mr.p", "wb"))
|
||
|
print "dataset created!"
|
||
|
|
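
# Downstream loading sketch (assumed consumer; mirrors the dump order above,
# with `path` standing in for the same data directory):
#   revs, revs_test, W, W2, word_idx_map, vocab = cPickle.load(open(path + "/mr.p", "rb"))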