#!/usr/bin/env python
# coding: utf-8
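# Preprocesses the raw stance datasets in ./Data_SemE and ./Data_MPCHI into
# ./Data_SemE_P and ./Data_MPCHI_P, writing a *_clean.txt and a
# *_preprocessed.csv per target folder. Example invocation (the script name
# here is only illustrative):
#   python preprocess.py           # cleaning, normalization, wordninja splits
#   python preprocess.py --non-dl  # additionally removes stop words and stems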
import json
import os
import glob
import sys
import re
import csv
import argparse
import shutil
import copy

import numpy as np
import wordninja
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

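# The -nd / --non-dl flag enables the extra steps (stop-word removal and
# Porter stemming) that are applied only when preparing data for the
# non-deep-learning models.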
parser = argparse.ArgumentParser()
parser.add_argument('-nd', '--non-dl', dest='nondl', action='store_true')
args = parser.parse_args()

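# NLTK resources: the stopwords corpus must have been downloaded beforehand
# (nltk.download('stopwords')); the lemmatizer additionally needs the
# 'wordnet' corpus if it is used.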
wnl = WordNetLemmatizer()
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Build the normalization dictionary: slang/abbreviation mappings from
# noslang_data.json merged with the tab-separated emnlp_dict.txt lexicon.
with open("./noslang_data.json", "r") as f:
    data1 = json.load(f)

data2 = {}
with open("./emnlp_dict.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        row = line.split('\t')
        data2[row[0]] = row[1].rstrip()

normalization_dict = {**data1, **data2}

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets.
    Every dataset is lower-cased unless TREC=True.
    Original taken from https://github.com/dennybritz/cnn-text-classification-tf
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`#]", " ", string)
    string = re.sub(r"#SemST", "", string)
    string = re.sub(r"#([A-Za-z0-9]*)", r"# \1 #", string)
    # string = re.sub(r"# ([A-Za-z0-9 ]*)([A-Z])(.*) #", r"# \1 \2\3 #", string)
    # string = re.sub(r"([A-Z])", r" \1", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()

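# Keep a token as-is if it is in the GloVe vocabulary; otherwise let wordninja
# try to split it into known words (useful for hashtag text such as
# "climatechange").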
def split(word, word2emb):
    if word in word2emb:
        return [word]
    return wordninja.split(word)

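# Load the 300-d GloVe vectors (expects ./glove.6B.300d.txt next to this
# script) into a plain dict mapping word -> numpy vector.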
def load_glove_embeddings():
    word2emb = {}
    WORD2VEC_MODEL = "./glove.6B.300d.txt"
    with open(WORD2VEC_MODEL, "r") as fglove:
        for line in fglove:
            cols = line.strip().split()
            word = cols[0]
            embedding = np.array(cols[1:], dtype="float32")
            word2emb[word] = embedding
    return word2emb

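# The GloVe vocabulary is only consulted by split() above: tokens it already
# contains are kept whole, everything else is handed to wordninja.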
word2emb = load_glove_embeddings()

raw_folders = ['./Data_SemE', './Data_MPCHI']
processed_folders = ['./Data_SemE_P', './Data_MPCHI_P']

# Recreate the output folders from scratch so stale files from an earlier run
# cannot survive.
for folder in processed_folders:
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.mkdir(folder)

for dataset, new_folder in zip(raw_folders, processed_folders):
    # Map each category sub-folder to a {split name: file path} dict
    # (split names are the file basenames, e.g. 'train' and 'test').
    f = {}
    print(dataset)
    for root, directories, filenames in os.walk(dataset):
        for directory in directories:
            f[os.path.join(root, directory)] = {}
        for filename in filenames:
            f[root][os.path.splitext(filename)[0]] = os.path.join(root, filename)
    print(f)

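    # 'correct' remembers whether a 3-column (unlabeled) row has been seen yet
    # for this dataset; the first such row gets 'NONE', later ones get 'FAVOR'.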
    correct = 0

    for key in sorted(f.keys()):
        cat = key.replace(dataset, "")
        new_cat_folder = new_folder + "/" + cat
        os.mkdir(new_cat_folder)

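        # n_count and s_words track, per file, how many tokens were replaced by
        # the normalization dictionary and how many stop words were dropped;
        # both are reported in the summary print at the end of each split.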
        for k in ["train", "test"]:
            n_count = 0
            s_words = 0
            new_lines = []
            old_lines = []
            with open(f[key][k], "r") as fp:
                lines = fp.readlines()

            for line in lines:
                # x[0]: index, x[2]: tweet text, x[3]: stance label (the label
                # column may be missing; see the len(x) == 3 handling below).
                x = line.split("\t")
                old_sent = copy.deepcopy(x[2])
                old_lines.append(old_sent)
                sent = clean_str(x[2])
                word_tokens = sent.split(' ')

                # Normalization: replace slang/abbreviations via the merged
                # dictionary, counting how many tokens were rewritten.
                normalized_tokens = []
                for word in word_tokens:
                    if word in normalization_dict:
                        normalized_tokens.append(normalization_dict[word])
                        n_count += 1
                    else:
                        normalized_tokens.append(word)

                # Word Ninja splitting of tokens that GloVe does not know.
                normalized_tokens_s = []
                for word in normalized_tokens:
                    normalized_tokens_s.extend(split(word, word2emb))

                final_tokens = normalized_tokens_s

                if args.nondl:
                    # Stop-word removal (non-deep-learning pipeline only).
                    filtered_tokens = []
                    for w in normalized_tokens_s:
                        if w not in stop_words:
                            filtered_tokens.append(w)
                        else:
                            s_words += 1

                    # Stemming using the Porter stemmer.
                    stemmed_tokens = []
                    for w in filtered_tokens:
                        stemmed_tokens.append(ps.stem(w))
                    final_tokens = stemmed_tokens

                new_sent = ' '.join(final_tokens)
                x[2] = new_sent
                if len(x) == 3:
                    # Rows without a stance column get a default label: 'NONE'
                    # for the first such row, 'FAVOR' for the rest.
                    if correct == 0:
                        x.append('NONE\n')
                        correct += 1
                    else:
                        x.append('FAVOR\n')
                new_line = '\t'.join(x)
                new_lines.append(new_line)

            print("%s %s- (%d,%d)" % (cat, k, n_count, s_words))

            # Write the cleaned lines to a txt file, plus a CSV that also keeps
            # the original (uncleaned) tweet text.
            with open(new_cat_folder + "/" + k + "_clean.txt", "w") as wf:
                wf.writelines(new_lines)
            with open(new_cat_folder + "/" + k + "_preprocessed.csv", "w") as csvf:
                writer = csv.writer(csvf)
                writer.writerow(["Tweet", "Stance", "Index", "Original Tweet"])
                for i, line in enumerate(new_lines):
                    cols = line.split("\t")
                    try:
                        writer.writerow([cols[2], cols[3][:-1], int(cols[0]), old_lines[i]])
                    except (IndexError, ValueError):
                        # Malformed row: print it for inspection and skip it.
                        print(cols)