Create preprocess.py

This commit is contained in:
missQian 2020-10-04 21:23:16 +08:00 committed by GitHub
parent f653d53e04
commit 9e3b1cda22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -0,0 +1,39 @@
import pandas as pd
from utils import *
import pickle
from sklearn.model_selection import train_test_split
def GenerateUnlabeled(vocabs, batch_size, dataset):
    """Convert the unlabeled patch CSV with ``TrainToBags`` and pickle it.

    Reads ``Data/patch_more3.csv``, passes the frame to ``TrainToBags`` and
    writes the result to ``Data/<dataset>/patch.pkl``.

    Args:
        vocabs: Vocabulary object forwarded unchanged to ``TrainToBags``.
        batch_size: Not used by this function; kept in the signature so
            existing callers keep working.
        dataset: Name of the sub-directory of ``Data/`` that receives
            ``patch.pkl``. The directory must already exist.

    Returns:
        The object returned by ``TrainToBags`` for the unlabeled data.
    """
    print("Loading the unlabeled dataset")
    patch = pd.read_csv("Data/patch_more3.csv")
    print("Unlabeled set shape:", patch.shape)

    print("Converting unlabeled dataset to batches")
    # NOTE(review): the third positional argument (True) presumably marks the
    # frame as unlabeled -- confirm against TrainToBags in utils.
    test = TrainToBags(patch, vocabs, True)

    print("Saving the unlabeled dataset into patch.pkl")
    # Use a context manager so the handle is flushed and closed right away
    # instead of whenever the anonymous file object gets garbage-collected.
    with open("Data/" + dataset + "/patch.pkl", "wb") as out_file:
        pickle.dump(test, out_file)
    return test
def GenerateTrain(batch_size, dataset):
    """Build train/dev/test splits, vocabs and embeddings, then pickle them.

    Reads ``Data/<dataset>/train_<dataset>.csv``, builds the vocabulary from
    its ``text`` column, converts the frame with ``TrainToBags``, splits the
    result, loads embeddings via ``read_embedding`` and writes everything to
    ``Data/<dataset>/data.pkl``.

    Args:
        batch_size: Not used by this function; kept in the signature so
            existing callers keep working.
        dataset: Name of the sub-directory of ``Data/`` holding the training
            CSV; ``data.pkl`` is written to the same directory.

    Returns:
        The tuple ``(train, dev, test, vocabs, embedding)`` -- the same
        object layout that is pickled to ``data.pkl``.
    """
    df = pd.read_csv("Data/" + dataset + "/train_" + dataset + ".csv")
    print("Train set includes", df.shape[0], "annotated data points")

    print("Learning vocabs")
    vocabs = get_vocabs(df["text"].tolist())

    print("Converting articles to bags of sentences")
    bags = TrainToBags(df, vocabs)

    print("Splitting into train and dev set")
    # 70% of the bags go to train; the remaining 30% are split again.
    train, dev_test = train_test_split(bags, test_size=0.3, random_state=33)
    # train_test_split returns (kept part, held-out part), so `test` receives
    # ~67% of the 30% hold-out and `dev` the remaining ~33%.
    test, dev = train_test_split(dev_test, test_size=0.33, random_state=33)

    print("Loading pretrained word embeddings")
    embedding = read_embedding(vocabs)

    print("All datasets are saved in data.pkl")
    # Use a context manager so the handle is flushed and closed right away
    # instead of whenever the anonymous file object gets garbage-collected.
    with open("Data/" + dataset + "/data.pkl", "wb") as out_file:
        pickle.dump((train, dev, test, vocabs, embedding), out_file)
    return (train, dev, test, vocabs, embedding)