Stance-Detection-in-Web-and.../CNN/T6SA_process_data.py

# -*- coding: utf-8 -*-
import numpy as np
import cPickle
from collections import defaultdict
import sys, re
import pandas as pd
MAX_SENTANCE_LEN = 100
EMBEDDING_DIMENSION = 300
def process_T6SA_data(line):
    """Parse one tab-separated test line into [message text, stance label].
    The tweet text is in column 3 and the stance string in column 4."""
    tmp = line.split('\t')
    msg = ''.join(tmp[2])
    msg = msg.replace("#SemST", "")
    msg = msg.replace("\r\n", "\n")
    msg = msg.lower()
    msg = msg + "\n"
    stance = ''.join(tmp[3])
    if stance == "AGAINST\n":
        stance = 0
    if stance == "FAVOR\n":
        stance = 1
    if stance == "NONE\n":
        stance = 2
    if stance == "UNKNOWN\n":
        stance = 3
    if msg == "not available\n":
        msg = ""
    return [msg, stance]
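# Illustrative sketch (comments only, not executed): the tab-splitting above
# implies test lines of the form ID<TAB>target<TAB>tweet<TAB>stance (the
# "target" column is an assumption; the code never reads it). A made-up line:
#   >>> process_T6SA_data("1\tAtheism\tWe love science #SemST\tFAVOR\n")
#   ['we love science \n', 1]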
def build_data_cv(data_folder, test_folder, cv=10, clean_string=True):
    """
    Loads the data and splits it into `cv` folds (10 by default).
    """
    revs, revs_test = [], []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    cen_file = data_folder[2]
    print pos_file
    print neg_file
    print cen_file
    vocab = defaultdict(float)
    # The training data should be preprocessed into three files: pos, neg and none.
    with open(pos_file, "rb") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y": 1,
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)
    with open(neg_file, "rb") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y": 0,
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)
    with open(cen_file, "rb") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y": 2,
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)
    # If test_folder is "null", 10% of the dataset is used for testing;
    # otherwise the given test set is loaded so that prediction can be performed.
    if test_folder != "null":
        print("Test Data loading....")
        with open(test_folder, "r") as f:
            for line in f:
                if line.split('\t')[0] == "ID":
                    continue
                rev = []
                # Note: this check is always true; the original comment
                # ("the name of test_folder is specific") suggests it was
                # meant to match a particular test file name.
                if test_folder == test_folder:
                    msg, stance = process_T6SA_data(line)
                else:
                    revs_test = []
                    break
                if msg == "":
                    continue
                rev.append(msg.strip())
                if clean_string:
                    orig_rev = clean_str(" ".join(rev))
                else:
                    orig_rev = " ".join(rev).lower()
                words = set(orig_rev.split())
                for word in words:
                    vocab[word] += 1
                datum = {"y": stance,
                         "text": orig_rev,
                         "num_words": len(orig_rev.split()),
                         "split": np.random.randint(0, cv)}
                if datum["num_words"] > MAX_SENTANCE_LEN:
                    continue
                revs_test.append(datum)
    return revs, revs_test, vocab
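# Illustrative sketch (comments only, not executed): every training line ends
# up in revs as a dict such as
#   {"y": 1, "text": "hillary is a great candidate", "num_words": 5, "split": 7}
# where "y" is the class label (1 = pos, 0 = neg, 2 = none), "split" is a random
# fold id in [0, cv), and the text/values shown here are invented examples.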
def get_W(word_vecs, k=300):
    """
    Get the word matrix. W[i] is the vector for the word indexed by i.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k))
    W[0] = np.zeros(k)
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
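# Illustrative sketch (comments only, not executed): with k=300 and, say,
# word_vecs = {"hello": v1, "world": v2}, get_W returns W of shape (3, 300)
# whose row 0 is all zeros (usable as padding) plus a word_idx_map such as
# {"hello": 1, "world": 2} (the exact indices depend on dict iteration order).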
def load_bin_vec(fname1, fname2, vocab):
    """
    Loads 300-dimensional word vectors from word2vec binary files (e.g. the
    Google/Mikolov vectors). Two files can be used together; vectors from
    fname2 overwrite fname1's vectors for words that appear in both.
    If only one word2vec file is used, pass "null" as fname2.
    """
    word_vecs = {}
    with open(fname1, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for _ in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    if fname2 != "null":
        with open(fname2, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * layer1_size
            for _ in xrange(vocab_size):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':
                        word.append(ch)
                if word in vocab:
                    word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)
    return word_vecs
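# Illustrative call (hypothetical file name, comments only, not executed): the
# reader above expects the standard word2vec binary layout, i.e. a text header
# "<vocab_size> <vector_dim>" followed by, for each entry, the word's bytes up
# to a space and then <vector_dim> float32 values.
#   vocab = {"hillary": 3.0, "climate": 5.0}
#   vecs = load_bin_vec("GoogleNews-vectors-negative300.bin", "null", vocab)
#   # vecs maps each vocab word found in the file to a 300-d float32 array.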
def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.
    0.25 is chosen so the unknown vectors have (approximately) the same variance
    as the pre-trained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
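# Illustrative note (comments only, not executed): any vocab word missing from
# word_vecs gets a random k-dimensional vector drawn from Uniform(-0.25, 0.25);
# that distribution has variance 0.5**2 / 12, about 0.02, which the original
# comment says roughly matches the variance of the pre-trained vectors.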
def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets.
    Every dataset is lower-cased unless TREC=True.
    Original taken from https://github.com/dennybritz/cnn-text-classification-tf
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", string)
    string = re.sub(r"#SemST", "", string)
    string = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", string)
    #string = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", string)
    #string = re.sub(r"([A-Z])", r" \1", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()
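# Illustrative sketch (comments only, not executed): an invented tweet such as
#   "I can't wait for #Election2016, really!"
# would come out of clean_str as
#   "i ca n't wait for # election2016 # , really !"
# i.e. hashtags are wrapped in "# ... #", contractions and punctuation are
# split off as separate tokens, and everything is lower-cased.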
if __name__ == "__main__":
    # Expected invocation (inferred from the argument order below):
    #   python T6SA_process_data.py <w2v_file1> <w2v_file2|"null"> <test_file|"null"> <data_path>
    w2v_file1 = sys.argv[1]
    w2v_file2 = sys.argv[2]
    test_folder = sys.argv[3]
    path = sys.argv[4]
    data_folder = [path + "/T6SA_stance.pos", path + "/T6SA_stance.neg", path + "/T6SA_stance.none"]
    print test_folder
    print "loading data...",
    revs, revs_test, vocab = build_data_cv(data_folder, test_folder, cv=10, clean_string=True)
    print pd.DataFrame(revs)
    max_l = np.max(pd.DataFrame(revs)["num_words"])
    print "data loaded!"
    print "number of train sentences: " + str(len(revs))
    print "number of out-test sentences: " + str(len(revs_test))
    print "vocab size: " + str(len(vocab))
    print "max sentence length: " + str(max_l)
    print "loading word2vec vectors...",
    w2v = load_bin_vec(w2v_file1, w2v_file2, vocab)
    print "word2vec loaded!"
    print "num words already in word2vec: " + str(len(w2v))
    add_unknown_words(w2v, vocab, k=EMBEDDING_DIMENSION)
    W, word_idx_map = get_W(w2v, k=EMBEDDING_DIMENSION)
    rand_vecs = {}
    add_unknown_words(rand_vecs, vocab, k=EMBEDDING_DIMENSION)
    W2, _ = get_W(rand_vecs, k=EMBEDDING_DIMENSION)
    cPickle.dump([revs, revs_test, W, W2, word_idx_map, vocab], open(path + "/mr.p", "wb"))
    print "dataset created!"