commit 4d58585ae2116d70d593c2ffcce3f37952cf47e4
Author: wglti <49828433+wglti@users.noreply.github.com>
Date:   Wed Nov 20 21:59:53 2019 -0500

    first commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..377a28d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,36 @@
+## Path Ranking with Attention to Type Hierarchies (Review only)
+This repo contains code for training and testing the proposed models in *Path Ranking with Attention to Type Hierarchies*.
+Due to its large size, the data needs to be downloaded separately from [Dropbox](https://www.dropbox.com/s/0a4o2jljg4imuux/data.zip?dl=0).
+
+## Notes
+1. Code for the baseline models in the paper can be found [here](https://github.com/matt-gardner/pra) (PRA and SFE) and
+[here](https://github.com/rajarshd/ChainsofReasoning) (Path-RNN).
+2. We provide tokenized data for WN18RR and FB15k-237. Our data format follows
+[*ChainsofReasoning*](https://github.com/rajarshd/ChainsofReasoning). The vocabularies used for tokenizing the data are also
+provided for reference.
+3. Raw data for WN18RR and FB15k-237 can be found
+[here](https://github.com/TimDettmers/ConvE). Types for WN18RR entities can be obtained from WordNet. Types for
+FB15k-237 entities can be found [here](https://github.com/thunlp/TKRL).
+
+## Tested platform
+* Hardware: 64GB RAM, 12GB GPU memory
+* Software: Ubuntu 16.04, Python 3.5, CUDA 8
+
+## Setup
+1. Install CUDA.
+2. (Optional) Set up a Python virtual environment by running `virtualenv -p python3 .`
+3. (Optional) Activate the virtual environment by running `source bin/activate`
+4. Install PyTorch with CUDA support.
+5. Install the requirements by running `pip3 install -r requirements.txt`
+
+## Instructions for running the code
+### Data
+1. Download the compressed data file from [Dropbox](https://www.dropbox.com/s/0a4o2jljg4imuux/data.zip?dl=0).
+2. Unzip the file in the root directory of this repo.
+
+### Run the model
+1. Use `run.py` to train and test the model on WN18RR or FB15k-237.
+2. Use `main/playground/model2/CompositionalVectorAlgorithm.py` to modify the training settings and hyperparameters.
+3. Use `main/playground/model2/CompositionalVectorSpaceModel.py` to modify the network design. Different attention methods for
+types and paths can be selected here.
+4. Training progress can be monitored using tensorboardX by running `tensorboard --logdir runs`. Tutorials and details can be found [here](https://github.com/lanpa/tensorboardX).
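+
+### Example
+For reference, the call below mirrors `test_wn()` in `run.py`. Paths assume the Dropbox archive has been unzipped into the repo root, and a CUDA-capable GPU is required since the model allocates its tensors on the GPU. Treat it as a sketch rather than a tested configuration:
+```python
+from main.playground.model2.CompositionalVectorAlgorithm import CompositionalVectorAlgorithm
+
+cvsm = CompositionalVectorAlgorithm("wordnet", experiment_dir="data/wn18rr/cvsm_entity",
+                                    entity_type2vec_filename="data/wn18rr/entity_type2vec.pkl",
+                                    pooling_method="sat", attention_method="sat",
+                                    early_stopping_metric="map")
+cvsm.train_and_test()
+```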
diff --git a/main/__init__.py b/main/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/main/experiments/Metrics.py b/main/experiments/Metrics.py new file mode 100644 index 0000000..65804bf --- /dev/null +++ b/main/experiments/Metrics.py @@ -0,0 +1,82 @@ +import os + + +def score_cvsm(result_filename): + # score_instances should be a tuple of (stuff, label, score) + score_instances = [] + target_relation = None + with open(result_filename, "r") as fh: + for line in fh: + line = line.strip() + if not line: + continue + target_relation, entity_pair_idx, score, label = line.split("\t") + score = float(score) + label = int(label) + score_instances.append(((target_relation, entity_pair_idx), label, score)) + print("Computing AP, RR, ACC for relation", target_relation, "for CVSM") + print("total number of predictions:", len(score_instances)) + ap, rr, acc = compute_scores(score_instances) + print("AP:", ap, "\nRR:", rr, "\nACC:", acc) + return ap, rr, acc + + +def compute_ap_and_rr(score_instances): + """ + Given a list of scored instances [(stuff, label, score)], this method computes AP and RR. + AP is none if no positive instance is in scored instances. + + :param score_instances: + :return: + """ + # sort score instances based on score from highest to lowest + sorted_score_instances = sorted(score_instances, key=lambda score_instance: score_instance[2])[::-1] + total_predictions = 0.0 + total_corrects = 0.0 + total_precisions = [] + first_correct = -1 + for stuff, label, score in sorted_score_instances: + # print(stuff, label, score) + total_predictions += 1 + if label == 1: + total_corrects += 1 + if first_correct == -1: + first_correct = total_predictions + total_precisions.append(total_corrects/total_predictions) + ap = sum(total_precisions) * 1.0 / len(total_precisions) if len(total_precisions) > 0 else None + rr = 0.0 if first_correct == -1 else 1.0 / first_correct + return ap, rr + + +def compute_scores(score_instances): + """ + Given a list of scored instances [(stuff, label, score)], this method computes Average Precision, Reciprocal Rank, + and Accuracy. + AP is none if no positive instance is in scored instances. 
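+    Accuracy counts a prediction as correct when its score is within 0.5 of the binary label.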
+ + :param score_instances: + :return: + """ + # sort score instances based on score from highest to lowest + sorted_score_instances = sorted(score_instances, key=lambda score_instance: score_instance[2])[::-1] + total_predictions = 0.0 + total_correct_pos = 0.0 + total_precisions = [] + first_correct = -1 + total_correct = 0.0 + for stuff, label, score in sorted_score_instances: + # print(stuff, label, score) + if abs(score - label) < 0.5: + total_correct += 1 + total_predictions += 1 + # debug + if label > 0: + # if label == 1: + total_correct_pos += 1 + if first_correct == -1: + first_correct = total_predictions + total_precisions.append(total_correct_pos/total_predictions) + ap = sum(total_precisions) * 1.0 / len(total_precisions) if len(total_precisions) > 0 else None + rr = 0.0 if first_correct == -1 else 1.0 / first_correct + acc = total_correct / len(score_instances) + return ap, rr, acc diff --git a/main/experiments/__init__.py b/main/experiments/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/main/playground/Batcher.py b/main/playground/Batcher.py new file mode 100644 index 0000000..c3c4208 --- /dev/null +++ b/main/playground/Batcher.py @@ -0,0 +1,86 @@ +import torch + + +class Batcher: + def __init__(self, filename, batch_size, shuffle): + self.labels = None + self.inputs = None + self.read_data(filename) + self.number_entity_pairs, self.number_of_paths, self.path_length, self.feature_size = self.inputs.shape + + self.shuffle = shuffle + if shuffle: + self.shuffle_data() + + # how many entity pairs will be bundled together + self.batch_size = batch_size + + # used to point to the current entity pair + self.current_index = 0 + + def read_data(self, filename): + with open(filename, "r") as fh: + inputs = [] + labels = [] + for line in fh: + line = line.strip() + if len(line) != 0: + paths_for_pair = [] + label, paths = line.split("\t") + label = int(label) + labels.append(label) + paths = paths.split(";") + for path in paths: + whole_path_features = [] + # a token can be a index or a list of indices representing a relation, entity, or entity types + steps = path.split(" ") + for step in steps: + features = step.split(",") + features = [int(f) for f in features] + whole_path_features.append(features) + paths_for_pair.append(whole_path_features) + inputs.append(paths_for_pair) + self.inputs = torch.LongTensor(inputs) + self.labels = torch.FloatTensor(labels) + # print(self.inputs.shape) + # print(self.labels.shape) + + def shuffle_data(self): + # only long type or byte type tensor can be used for index + indices = torch.randperm(self.number_entity_pairs).long() + self.inputs = self.inputs[indices] + self.labels = self.labels[indices] + + def get_batch(self): + start_index = self.current_index + if start_index >= self.number_entity_pairs: + return None + end_index = min(start_index+self.batch_size-1, self.number_entity_pairs-1) + batch_inputs = self.inputs[start_index:end_index+1] + batch_labels = self.labels[start_index:end_index+1] + self.current_index = end_index + 1 + return batch_inputs, batch_labels + + def reset(self): + self.current_index = 0 + if self.shuffle: + self.shuffle_data() + + def get_size(self): + return self.number_entity_pairs, self.number_of_paths, self.path_length, self.feature_size + + +if __name__ == "__main__": + batcher = Batcher("/home/weiyu/Research/ChainsOfReasoningWithAbstractEntities/data/_architecture_structure_address/train/train.txt.2.int", 3, False) + finished = False + count = 0 + while not finished: + data = 
batcher.get_batch() + if data is None: + break + inputs, labels = data + print(labels.shape) + print(inputs.shape) + count += 1 + print(count) + diff --git a/main/playground/BatcherFileList.py b/main/playground/BatcherFileList.py new file mode 100644 index 0000000..e9da995 --- /dev/null +++ b/main/playground/BatcherFileList.py @@ -0,0 +1,116 @@ +from main.playground.Batcher import Batcher +import torch +import os + +# Debug: Not finished + +class BatcherFileList: + def __init__(self, data_dir, batch_size, shuffle, max_number_batchers_on_gpu): + self.do_shuffle = shuffle + self.batch_size = batch_size + + # batchers store all batchers + self.batchers = [] + self.initialize_batchers(data_dir) + self.number_batchers_on_gpu = min(max_number_batchers_on_gpu, len(self.batchers)) + if self.do_shuffle: + self.shuffle_batchers() + + self.current_index = 0 + self.current_gpu_index = 0 + self.empty_batcher_indices = set() + + self.gpu_labels = [] + self.gpu_inputs = [] + self.preallocate_gpu() + + def initialize_batchers(self, data_dir): + print("Reading files from", data_dir) + for file in os.listdir(data_dir): + if file[-3:] == "int": + self.batchers.append(Batcher(os.path.join(data_dir, file), self.batch_size, self.do_shuffle)) + + def preallocate_gpu(self): + """ + Preallocate gpu space for data from current indexed batcher to the batcher that makes the total number of + batchers on gpu equal to number_batchers_on_gpu + :return: + """ + self.gpu_labels = [] + self.gpu_inputs = [] + # Important: min(self.current_index + self.number_batchers_on_gpu, len(self.batchers)) is used to deal with + # the last group of batchers that may be less than number_batchers_on_gpu. + # e.g., for example, when we have 100 batchers, the number_batchers_on_gpu is 30, we need to deal + # the last 10 batchers. + for i in range(self.current_index, min(self.current_index + self.number_batchers_on_gpu, len(self.batchers))): + batcher = self.batchers[i] + number_entity_pairs, number_of_paths, path_length, feature_size = batcher.get_size() + # here we create gpu tensors of specified dimensions + self.gpu_inputs.append(torch.cuda.LongTensor(self.batch_size, number_of_paths, path_length, feature_size)) + self.gpu_labels.append(torch.cuda.FloatTensor(self.batch_size, 1)) + self.populate_gpu() + + def populate_gpu(self): + for i in range(self.current_index, min(self.current_index + self.number_batchers_on_gpu, len(self.batchers))): + # current batch was alreday finished + if i in self.empty_batcher_indices: + continue + + batcher = self.batchers[i] + data = batcher.get_batch() + # current batch is finished + if data is None: + self.empty_batcher_indices.add(i) + continue + + # copy data from cpu to gpu + inputs, labels = data + self.gpu_inputs[i % self.number_batchers_on_gpu].resize_(inputs.shape).copy_(inputs) + self.gpu_labels[i % self.number_batchers_on_gpu].resize_(labels.shape).copy_(labels) + + def shuffle_batchers(self): + shuffled_batchers = [] + for i in torch.randperm(len(self.batchers)): + shuffled_batchers.append(self.batchers[i]) + self.batchers = shuffled_batchers + + def get_batch(self): + # Important: the outer loop is to iterate through all data. + # the inner loop is to iterate through current group of batchers we preallocate gpu space for. 
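+        # Once every batcher in the current group is exhausted, the next group of batchers is copied onto the GPU at the end of the outer loop.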
+ while len(self.empty_batcher_indices) < len(self.batchers): + # empty_batcher_indices is for all batchers + # print(len(self.empty_batcher_indices), self.number_batchers_on_gpu + self.current_index) + while len(self.empty_batcher_indices) < min(self.current_index + self.number_batchers_on_gpu, len(self.batchers)): + # one loop through batchers on gpu has finished. This does not mean these batchers are used up. + # It just means we need to get new data from these batchers. + if self.current_gpu_index >= self.number_batchers_on_gpu or self.current_gpu_index + self.current_index >= len(self.batchers): + self.populate_gpu() + self.current_gpu_index = 0 + + # current batcher was already finished + if self.current_index + self.current_gpu_index in self.empty_batcher_indices: + self.current_gpu_index += 1 + continue + + # return the content from the current batcher + inputs, labels = self.gpu_inputs[self.current_gpu_index], self.gpu_labels[self.current_gpu_index] + self.current_gpu_index += 1 + return inputs, labels + # batchers on gpu has all been used up + if len(self.empty_batcher_indices) < len(self.batchers): + self.current_index = self.current_index + self.number_batchers_on_gpu + self.preallocate_gpu() + self.current_gpu_index = 0 + # end of an epoch + self.reset() + return None + + def reset(self): + self.current_index = 0 + self.current_gpu_index = 0 + self.empty_batcher_indices = set() + if self.do_shuffle: + self.shuffle_batchers() + for batcher in self.batchers: + batcher.reset() + self.preallocate_gpu() \ No newline at end of file diff --git a/main/playground/Logger.py b/main/playground/Logger.py new file mode 100644 index 0000000..ba702d6 --- /dev/null +++ b/main/playground/Logger.py @@ -0,0 +1,31 @@ +from tensorboardX import SummaryWriter + + +class Logger: + def __init__(self): + print("Setting up TensorboardX") + self.writer = SummaryWriter() + + def __del__(self): + self.writer.close() + + def log_train_and_validation_accuracy(self, train_acc, val_acc, n_iter, rel): + self.writer.add_scalars(rel + '/Accuracy', {'training': train_acc, 'validation': val_acc}, n_iter) + + def log_train_and_validation_ap(self, train_ap, val_ap, n_iter, rel): + self.writer.add_scalars(rel + '/AP', {'training': train_ap, 'validation': val_ap}, n_iter) + + def log_loss(self, loss, n_iter, rel): + self.writer.add_scalar(rel + '/Loss', loss, n_iter) + + def log_accuracy(self, train_acc, val_acc, test_acc, n_iter, rel): + self.writer.add_scalars(rel + '/Accuracy', {'training': train_acc, 'validation': val_acc, "testing": test_acc}, n_iter) + + def log_ap(self, train_ap, val_ap, test_ap, n_iter, rel): + self.writer.add_scalars(rel + '/AP', {'training': train_ap, 'validation': val_ap, "testing": test_ap}, n_iter) + + def log_param(self, name, param, n_iter): + self.writer.add_histogram(name, param, n_iter) + + def close(self): + self.writer.close() \ No newline at end of file diff --git a/main/playground/Visualizer.py b/main/playground/Visualizer.py new file mode 100644 index 0000000..a9d63dc --- /dev/null +++ b/main/playground/Visualizer.py @@ -0,0 +1,393 @@ +import os +import numpy as np +import shutil +import pickle + + +class Visualizer: + + def __init__(self, idx2entity, idx2entity_type, idx2relation, save_dir, mid2name_filename=None): + self.idx2entity = idx2entity + self.idx2entity_type = idx2entity_type + self.idx2relation = idx2relation + + self.save_dir = save_dir + if not os.path.exists(self.save_dir): + os.mkdir(self.save_dir) + + self.mid2name = None + if mid2name_filename is not None: + 
self.mid2name = pickle.load(open(mid2name_filename, "rb")) + + # this is a dictionary from query relation to another dictionary mapping from relation paths to contradictions + self.rel_path2contradictions = {} + + def visualize_paths(self, inputs, labels, type_weights, path_weights, rel, split, epoch, + filter_negative_example=False, filter_false_prediction=False, probs=None, + top_k_path=None, minimal_path_weight=None): + """ + This method is used to visualize paths with details. Specifically, entity hierarchy for each entity will be + printed. + + :param inputs: + :param labels: + :param type_weights: + :param path_weights: + :param rel: + :param split: + :param epoch: + :param filter_negative_example: + :param filter_false_prediction: + :param probs: + :param top_k_path: + :param minimal_path_weight: + :return: + """ + + num_ent_pairs, num_paths, num_steps, num_types = type_weights.shape + highest_weighted_type_indices = np.argmax(type_weights, axis=3) + + rel_dir = os.path.join(self.save_dir, rel) + if not os.path.exists(rel_dir): + os.mkdir(rel_dir) + rel_split_dir = os.path.join(rel_dir, split) + if not os.path.exists(rel_split_dir): + os.mkdir(rel_split_dir) + file_name = os.path.join(rel_split_dir, str(epoch) + ".detailed.tsv") + + with open(file_name, "a") as fh: + for ent_pairs_idx in range(num_ent_pairs): + paths = [] + subj = None + obj = None + label = labels[ent_pairs_idx] + + # filter out negative examples + if filter_negative_example: + if label == 0: + continue + + # filter out wrong predictions + if filter_false_prediction: + if probs is not None: + prob = probs[ent_pairs_idx] + if abs(prob - label) > 0.5: + continue + + for path_idx in range(num_paths): + # Each path string should be: ent1[type1:weight1,...,typeC:weightC] - rel1 - ent2[type1:weight1,...,typeC:weightC] + + # filter by path weight + if minimal_path_weight is not None and 0 < minimal_path_weight < 1: + if path_weights[ent_pairs_idx, path_idx] < minimal_path_weight: + continue + + # processing a path + path = [] + start = False + for stp in range(num_steps): + feats = inputs[ent_pairs_idx, path_idx, stp] + entity = feats[-2] + entity_name = self.idx2entity[entity] + + # use dict to map freebase mid to name + if self.mid2name is not None: + if entity_name != "#PAD_TOKEN": + entity_name = entity_name.split(":")[1] + if entity_name in self.mid2name: + entity_name = self.mid2name[entity_name] + + # ignore pre-paddings + if not start: + if entity_name != "#PAD_TOKEN": + start = True + if subj is None: + subj = entity_name + else: + assert subj == entity_name + if start: + rel = feats[-1] + types = feats[0:-2] + weights = type_weights[ent_pairs_idx, path_idx, stp] + types_str = [] + for i in range(len(types)): + type_name = self.idx2entity_type[types[i]] + weight = weights[i] + type_str = type_name + ":" + "%.3f" % weight + types_str.append(type_str) + types_str = "[" + ",".join(types_str) + "]" + rel_name = self.idx2relation[rel] + path += [entity_name + types_str] + if rel_name != "#END_RELATION": + path += [rel_name] + if stp == num_steps - 1: + if obj is None: + obj = entity_name + else: + assert obj == entity_name + path_str = "-".join(path) + paths.append((path_str, path_weights[ent_pairs_idx, path_idx])) + + if not paths: + continue + + paths = sorted(paths, key=lambda x: x[1], reverse=True) + # keep only top K paths + if top_k_path is not None and top_k_path > 0: + paths = paths[0:min(len(paths), top_k_path)-1] + + weighted_paths = [p[0] + "," + str(p[1]) for p in paths] + paths_str = " -#- 
".join(weighted_paths) + fh.write(subj + "," + obj + "\t" + str(label) + "\t" + paths_str + "\n") + + def visualize_paths_with_relation_and_type(self, inputs, labels, type_weights, path_weights, rel, split, epoch, + filter_negative_example=False, filter_false_prediction=False, probs=None, + top_k_path=None, minimal_path_weight=None): + """ + This method is used to visualize paths in a compact way. Specifically, only the highest weighted entity type + for each entity will be printed. + + :param inputs: + :param labels: + :param type_weights: + :param path_weights: + :param rel: + :param split: + :param epoch: + :param filter_negative_example: + :param filter_false_prediction: + :param probs: + :param top_k_path: + :param minimal_path_weight: + :return: + """ + num_ent_pairs, num_paths, num_steps, num_types = type_weights.shape + highest_weighted_type_indices = np.argmax(type_weights, axis=3) + + rel_dir = os.path.join(self.save_dir, rel) + if not os.path.exists(rel_dir): + os.mkdir(rel_dir) + rel_split_dir = os.path.join(rel_dir, split) + if not os.path.exists(rel_split_dir): + os.mkdir(rel_split_dir) + file_name = os.path.join(rel_split_dir, str(epoch) + ".tsv") + + with open(file_name, "a") as fh: + for ent_pairs_idx in range(num_ent_pairs): + paths = [] + subj = None + obj = None + label = labels[ent_pairs_idx] + + # filter out negative examples + if filter_negative_example: + if label == 0: + continue + + # filter out wrong predictions + if filter_false_prediction: + if probs is not None: + prob = probs[ent_pairs_idx] + if abs(prob - label) > 0.5: + continue + + for path_idx in range(num_paths): + # Each path string should be: type1 - rel1 - type2 + + # filter by path weight + if minimal_path_weight is not None and 0 < minimal_path_weight < 1: + if path_weights[ent_pairs_idx, path_idx] < minimal_path_weight: + continue + + # processing a path + path = [] + start = False + for stp in range(num_steps): + feats = inputs[ent_pairs_idx, path_idx, stp] + entity = feats[-2] + entity_name = self.idx2entity[entity] + + # use dict to map freebase mid to name + if self.mid2name is not None: + if entity_name != "#PAD_TOKEN": + entity_name = entity_name.split(":")[1] + if entity_name in self.mid2name: + entity_name = self.mid2name[entity_name] + + # ignore pre-paddings + if not start: + if entity_name != "#PAD_TOKEN": + start = True + if subj is None: + subj = entity_name + else: + assert subj == entity_name + + if start: + rel = feats[-1] + types = feats[0:-2] + rel_name = self.idx2relation[rel] + highest_weighted_type = types[highest_weighted_type_indices[ent_pairs_idx, path_idx, stp]] + type_name = self.idx2entity_type[highest_weighted_type] + path += [type_name] + if rel_name != "#END_RELATION": + path += [rel_name] + if stp == num_steps - 1: + if obj is None: + obj = entity_name + else: + assert obj == entity_name + path_str = "-".join(path) + paths.append((path_str, path_weights[ent_pairs_idx, path_idx])) + + if not paths: + continue + + paths = sorted(paths, key=lambda x: x[1], reverse=True) + # keep only top K paths + if top_k_path is not None and top_k_path > 0: + paths = paths[0:min(len(paths), top_k_path)-1] + weighted_paths = [p[0] + "," + str(p[1]) for p in paths] + paths_str = " -#- ".join(weighted_paths) + fh.write(subj + "," + obj + "\t" + str(label) + "\t" + paths_str + "\n") + + def visualize_contradictions(self, inputs, labels, type_weights, path_weights, relation, split, + filter_false_prediction=False, probs=None, minimal_path_weight=None): + """ + This method is used to extract 
contradiction examples. Another method needs to be called to print these examples + + :param inputs: + :param labels: + :param type_weights: + :param path_weights: + :param relation: + :param split: + :param filter_false_prediction: + :param probs: + :param minimal_path_weight: + :return: + """ + + num_ent_pairs, num_paths, num_steps, num_types = type_weights.shape + highest_weighted_type_indices = np.argmax(type_weights, axis=3) + + if split != "test": + print("Skip generation of contradictions for split other than test") + return + + if relation not in self.rel_path2contradictions: + self.rel_path2contradictions[relation] = {} + + for ent_pairs_idx in range(num_ent_pairs): + subj = None + obj = None + label = labels[ent_pairs_idx] + + # filter out wrong predictions + if filter_false_prediction: + if probs is not None: + prob = probs[ent_pairs_idx] + if abs(prob - label) > 0.5: + continue + + for path_idx in range(num_paths): + + # filter by path weight + if minimal_path_weight is not None and 0 < minimal_path_weight < 1: + if path_weights[ent_pairs_idx, path_idx] < minimal_path_weight: + continue + + # processing a path + path = [] + rel_path = [] + start = False + for stp in range(num_steps): + feats = inputs[ent_pairs_idx, path_idx, stp] + entity = feats[-2] + entity_name = self.idx2entity[entity] + + # use dict to map freebase mid to name + if self.mid2name is not None: + if entity_name != "#PAD_TOKEN": + entity_name = entity_name.split(":")[1] + if entity_name in self.mid2name: + entity_name = self.mid2name[entity_name] + + # ignore pre-paddings + if not start: + if entity_name != "#PAD_TOKEN": + start = True + if subj is None: + subj = entity_name + else: + assert subj == entity_name + + if start: + rel = feats[-1] + types = feats[0:-2] + rel_name = self.idx2relation[rel] + highest_weighted_type = types[highest_weighted_type_indices[ent_pairs_idx, path_idx, stp]] + type_name = self.idx2entity_type[highest_weighted_type] + path += [entity_name + "[" + type_name + "]"] + if rel_name != "#END_RELATION": + path += [rel_name] + rel_path += [rel_name] + if stp == num_steps - 1: + if obj is None: + obj = entity_name + else: + assert obj == entity_name + path_str = "-".join(path) + rel_path_str = "-".join(rel_path) + + if rel_path_str not in self.rel_path2contradictions[relation]: + self.rel_path2contradictions[relation][rel_path_str] = [] + # each example will be (subj, obj, label): weight, subj[type1]-ent2[type2]-obj[type3] + example_str = "(" + subj + ", " + obj + ", " + str(label) + "): " + str(path_weights[ent_pairs_idx, path_idx]) + ", " + path_str + if label == 0: + self.rel_path2contradictions[relation][rel_path_str].append(example_str) + else: + self.rel_path2contradictions[relation][rel_path_str].insert(0, example_str) + + def print_contradictions(self, rel): + """ + This method is used to write contradiction examples. 
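+        Examples gathered by visualize_contradictions are grouped by relation path and appended to contradictions.tsv under the relation's test directory.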
+ + :param rel: + :return: + """ + + if rel not in self.rel_path2contradictions: + print("Relation {} does not have any contradictory examples".format(rel)) + return + + rel_dir = os.path.join(self.save_dir, rel) + if not os.path.exists(rel_dir): + os.mkdir(rel_dir) + rel_split_dir = os.path.join(rel_dir, "test") + if not os.path.exists(rel_split_dir): + os.mkdir(rel_split_dir) + file_name = os.path.join(rel_split_dir, "contradictions.tsv") + + with open(file_name, "a") as fh: + for idx, rel_path in enumerate(self.rel_path2contradictions[rel]): + for example in self.rel_path2contradictions[rel][rel_path]: + fh.write(str(idx) + "\t" + rel_path + "\t" + example + "\n") + + def save_space(self, rel, best_epoch): + """ + This method is used to delete visualizations that are not from the best models in order to save disk space. + + :param rel: + :param best_epoch: + :return: + """ + rel_dir = os.path.join(self.save_dir, rel) + for split in os.listdir(rel_dir): + rel_split_dir = os.path.join(rel_dir, split) + for file_name in os.listdir(rel_split_dir): + epoch = int(file_name.split(".")[0]) + if epoch == 0 or epoch == best_epoch or epoch == 29: + continue + # print(file_name) + os.remove(os.path.join(rel_split_dir, file_name)) \ No newline at end of file diff --git a/main/playground/__init__.py b/main/playground/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/main/playground/model2/CompositionalVectorAlgorithm.py b/main/playground/model2/CompositionalVectorAlgorithm.py new file mode 100644 index 0000000..33ea684 --- /dev/null +++ b/main/playground/model2/CompositionalVectorAlgorithm.py @@ -0,0 +1,432 @@ +import time +import numpy as np +np.set_printoptions(threshold=np.inf) +import random +import pickle +from tqdm import tqdm +import os +import json +from collections import OrderedDict, defaultdict +from scipy.stats import kurtosis, skew +from scipy.interpolate import interp1d +import matplotlib.pyplot as plt + +import torch +import torch.optim as optim + +from main.playground.model2.CompositionalVectorSpaceModel import CompositionalVectorSpaceModel +from main.playground.BatcherFileList import BatcherFileList +from main.experiments.Metrics import compute_scores +from main.playground.Logger import Logger +from main.playground.Visualizer import Visualizer + + +class CompositionalVectorAlgorithm: + + def __init__(self, dataset, experiment_dir, entity_type2vec_filename, learning_rate=0.1, weight_decay=0.0001, + number_of_epochs=30, learning_rate_step_size=50, learning_rate_decay=0.5, visualize=False, + best_models=None, pooling_method="sat", attention_method="sat", early_stopping_metric="map", + mid2name_filename=None, calculate_path_attn_stats=False, calculate_type_attn_stats=False): + """ + This class is used to run Attentive Path Ranking algorithm. The training progress is logged in tensorboardx. + + :param dataset: + :param experiment_dir: + :param entity_type2vec_filename: + :param learning_rate: + :param weight_decay: + :param number_of_epochs: + :param learning_rate_step_size: + :param learning_rate_decay: + :param visualize: if set to true, save visualized paths to folder + :param best_models: if provided, models will only be trained to the epochs of the best models. This is mainly + used for visualizing paths after all models have been trained fully once. 
+ :param pooling_method: "sat", "lse", "avg", or "max" + :param attention_method: "sat", "specific", or "abstract" + :param early_stopping_metric: "map" or "accuracy" + :param mid2name_filename: + :param calculate_path_attn_stats: + :param calculate_type_attn_stats: + """ + self.dataset = dataset + assert dataset == "wordnet" or dataset == "freebase" + + self.attention_method = attention_method + self.pooling_method = pooling_method + self.early_stopping_metric = early_stopping_metric + + self.entity_type2vec_filename = entity_type2vec_filename + self.input_dirs = [] + self.entity_vocab = None + self.relation_vocab = None + self.entity_type_vocab = None + self.experiment_dir = experiment_dir + self.load_data(experiment_dir) + + self.logger = Logger() + + # for visualizing results + self.best_models = best_models + self.visualize = visualize + self.calculate_path_attn_stats = calculate_path_attn_stats + self.calculate_type_attn_stats = calculate_type_attn_stats + + if calculate_path_attn_stats: + self.path_weights_dir = os.path.join(self.experiment_dir, "path_weights") + if not os.path.exists(self.path_weights_dir): + os.mkdir(self.path_weights_dir) + + if calculate_type_attn_stats: + self.type_weights_dir = os.path.join(self.experiment_dir, "type_weights") + if not os.path.exists(self.type_weights_dir): + os.mkdir(self.type_weights_dir) + + self.idx2entity = {v: k for k, v in self.entity_vocab.items()} + self.idx2entity_type = {v: k for k, v in self.entity_type_vocab.items()} + self.idx2relation = {v: k for k, v in self.relation_vocab.items()} + self.visualizer = Visualizer(self.idx2entity, self.idx2entity_type, self.idx2relation, + save_dir=os.path.join(experiment_dir, "results"), + mid2name_filename=mid2name_filename) + + self.all_best_epoch_val_test = {} + # best_epoch_val_test = {"epoch": -1, "val_acc": -1, "val_ap": -1, "test_acc": -1, "test_ap": -1} + self.number_of_epochs = number_of_epochs + + def load_data(self, experiment_dir): + data_dir = os.path.join(experiment_dir, "data") + for folder in os.listdir(data_dir): + if "data_output" in folder: + input_dir = os.path.join(data_dir, folder) + for fld in os.listdir(input_dir): + self.input_dirs.append(os.path.join(input_dir, fld)) + if "vocab" in folder: + vocab_dir = os.path.join(data_dir, folder) + for fld in os.listdir(vocab_dir): + if "entity_type_vocab" in fld: + entity_type_vocab_filename = os.path.join(vocab_dir, fld) + entity_type_vocab = json.load(open(entity_type_vocab_filename, "r")) + self.entity_type_vocab = entity_type_vocab + if "entity_vocab" in fld: + entity_vocab_filename = os.path.join(vocab_dir, fld) + self.entity_vocab = json.load(open(entity_vocab_filename, "r")) + if "relation_vocab" in fld: + relation_vocab_filename = os.path.join(vocab_dir, fld) + self.relation_vocab = json.load(open(relation_vocab_filename, "r")) + + def train_and_test(self): + print(self.input_dirs) + for input_dir in self.input_dirs: + self.train(input_dir) + + # print statistics + print(self.all_best_epoch_val_test) + accs = [] + aps = [] + for rel in self.all_best_epoch_val_test: + best_model_score = self.all_best_epoch_val_test[rel] + accs.append(best_model_score["test_acc"]) + aps.append(best_model_score["test_ap"]) + print("Average Accuracy:", sum(accs)/len(accs)) + print("Mean Average Precision:", sum(aps) / len(aps)) + + def train(self, input_dir): + print("Setting up model") + # default parameters: relation_embedding_dim=50, entity_embedding_dim=0, entity_type_embedding_dim=300, + # attention_dim = 50, relation_encoder_dim=150, 
full_encoder_dim=150 + + if self.dataset == "wordnet": + entity_type_embedding_dim = 300 + else: + entity_type_embedding_dim = 50 + model = CompositionalVectorSpaceModel(relation_vocab_size=len(self.relation_vocab), + entity_vocab_size=len(self.entity_vocab), + entity_type_vocab_size=len(self.entity_type_vocab), + relation_embedding_dim=50, + entity_embedding_dim=0, + entity_type_embedding_dim=entity_type_embedding_dim, + entity_type_vocab=self.entity_type_vocab, + entity_type2vec_filename=self.entity_type2vec_filename, + attention_dim=50, + relation_encoder_dim=150, + full_encoder_dim=150, + pooling_method=self.pooling_method, + attention_method=self.attention_method) + + # self.optimizer = optim.SGD(self.model.parameters(), lr=0.01) + # self.optimizer = optim.Adagrad(self.model.parameters(), lr=learning_rate, weight_decay=weight_decay) + # self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=learning_rate_step_size, gamma=learning_rate_decay) + optimizer = optim.Adam(model.parameters()) + criterion = torch.nn.BCELoss().cuda() + + best_epoch_val_test = {"epoch": -1, "val_acc": -1, "val_ap": -1, "test_acc": -1, "test_ap": -1} + rel = input_dir.split("/")[-1] + train_files_dir = os.path.join(input_dir, "train") + val_files_dir = os.path.join(input_dir, "dev") + test_files_dir = os.path.join(input_dir, "test") + print("Setting up train, validation, and test batcher...") + train_batcher = BatcherFileList(train_files_dir, batch_size=16, shuffle=True, max_number_batchers_on_gpu=100) + val_batcher = BatcherFileList(val_files_dir, batch_size=16, shuffle=False, max_number_batchers_on_gpu=100) + test_batcher = BatcherFileList(test_files_dir, batch_size=16, shuffle=True, max_number_batchers_on_gpu=100) + + count = 0 + while True: + data = train_batcher.get_batch() + if data is None: + break + count += 1 + + run_epochs = 0 + if self.best_models is not None: + run_epochs = self.best_models[rel]["epoch"] + 1 + else: + run_epochs = self.number_of_epochs + + # 1. training process + for epoch in range(run_epochs): + # self.scheduler.step() + total_loss = 0 + start = time.time() + + # for i in tqdm(range(count + 1)): + for i in range(count + 1): + data = train_batcher.get_batch() + if data is not None: + + inputs, labels = data + model.train() + model.zero_grad() + probs, path_weights, type_weights = model(inputs) + loss = criterion(probs, labels) + + loss.backward() + # IMPORTANT: grad clipping is important if loss is large. 
May not be necessary for LSTM + torch.nn.utils.clip_grad_norm_(model.parameters(), 5) + optimizer.step() + total_loss += loss.item() + + time.sleep(1) + print("Epoch", epoch, "spent", time.time() - start, "with total loss:", total_loss) + + # compute scores, record best scores, and generate visualizations on the go + if self.best_models is None: + # compute train, validation, and test scores and log in tensorboardx + train_acc, train_ap = self.score_and_visualize(model, train_batcher, rel, "train", epoch) + val_acc, val_ap = self.score_and_visualize(model, val_batcher, rel, "val", epoch) + test_acc, test_ap = self.score_and_visualize(model, test_batcher, rel, "test", epoch) + # log training progress on tensorboardx + self.logger.log_loss(total_loss, epoch, rel) + self.logger.log_accuracy(train_acc, val_acc, test_acc, epoch, rel) + self.logger.log_ap(train_ap, val_ap, test_ap, epoch, rel) + for name, param in model.named_parameters(): + self.logger.log_param(name, param, epoch) + + # selecting the best model based on performance on validation set + if self.early_stopping_metric == "accuracy": + if val_acc > best_epoch_val_test["val_acc"]: + best_epoch_val_test = {"epoch": epoch, + "val_acc": val_acc, "val_ap": val_ap, + "test_acc": test_acc, "test_ap": test_ap} + elif self.early_stopping_metric == "map": + if val_ap > best_epoch_val_test["val_ap"]: + best_epoch_val_test = {"epoch": epoch, + "val_acc": val_acc, "val_ap": val_ap, + "test_acc": test_acc, "test_ap": test_ap} + else: + raise Exception("Early stopping metric not recognized.") + + # Stop training if loss has reduced to zero + if total_loss == 0: + break + + else: + # only compute train and test scores for the best models + if epoch == self.best_models[rel]["epoch"]: + train_acc, train_ap = self.score_and_visualize(model, train_batcher, rel, "train", epoch) + test_acc, test_ap = self.score_and_visualize(model, test_batcher, rel, "test", epoch) + + # 2. save best model + if self.best_models is None: + print("Best model", best_epoch_val_test) + if self.visualize: + self.visualizer.save_space(rel, best_epoch_val_test["epoch"]) + self.all_best_epoch_val_test[rel] = best_epoch_val_test + + def test(self, input_dir): + test_files_dir = os.path.join(input_dir, "test") + print("Setting up test batcher") + batcher = BatcherFileList(test_files_dir, batch_size=16, shuffle=True, max_number_batchers_on_gpu=100) + + acc, ap = self.score_and_visualize(batcher) + print("Total accuracy for testing set:", acc) + print("AP for this relation:", ap) + + def score_and_visualize(self, model, batcher, rel, split, epoch): + # store groundtruths and predictions + score_instances = [] + # store various path stats for all entity pairs + path_weights_stats = defaultdict(list) + all_path_weights = None + all_type_weights = None + type_weights_sum = None + type_weights_count = 0 + + with torch.no_grad(): + model.eval() + batcher.reset() + while True: + data = batcher.get_batch() + if data is None: + break + inputs, labels = data + probs, path_weights, type_weights = model(inputs) + + if self.visualize and split == "test": + if (self.best_models is None) or (epoch == self.best_models[rel]["epoch"]): + # Visualizations + # (1) show top k paths with highest weighted types. + # (2) show only one path with detailed attention to each type in type hierarchies. + # (3) show examples with same relation paths but different proposed path patterns. 
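+                        # Only (3) is generated below; the calls for (1) and (2) are kept commented out for reference.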
+ + # self.visualizer.visualize_paths_with_relation_and_type(inputs.clone().cpu().data.numpy(), + # labels.clone().cpu().data.numpy(), + # type_weights.clone().cpu().data.numpy(), + # path_weights.clone().cpu().data.numpy(), + # rel, split, epoch, + # filter_negative_example=True, + # filter_false_prediction=True, + # probs=probs.clone().cpu().data.numpy(), + # top_k_path=5, + # minimal_path_weight=0.2) + # self.visualizer.visualize_paths(inputs.clone().cpu().data.numpy(), + # labels.clone().cpu().data.numpy(), + # type_weights.clone().cpu().data.numpy(), + # path_weights.clone().cpu().data.numpy(), + # rel, split, epoch, + # filter_negative_example=True, + # filter_false_prediction=True, + # probs=probs.clone().cpu().data.numpy(), + # top_k_path=5, + # minimal_path_weight=0.2) + + self.visualizer.visualize_contradictions(inputs.clone().cpu().data.numpy(), + labels.clone().cpu().data.numpy(), + type_weights.clone().cpu().data.numpy(), + path_weights.clone().cpu().data.numpy(), + rel, split, + filter_false_prediction=True, + probs=probs.clone().cpu().data.numpy(), + minimal_path_weight=0.15) + + # Visualize attention stats + if self.calculate_type_attn_stats and split == "test": + # type_weights: [num_ent_pairs, num_paths, num_steps, num_types] + num_ent_pairs, num_paths, num_steps, num_types = type_weights.shape + if type_weights_sum is None: + type_weights_sum = torch.sum(type_weights.view(-1, num_types), dim=0) + else: + type_weights_sum += torch.sum(type_weights.view(-1, num_types), dim=0) + type_weights_count += num_ent_pairs * num_paths * num_steps + + # # store all type weights + # type_weights = type_weights.view(-1, num_types).clone().cpu().data.numpy() + # if all_type_weights is None: + # all_type_weights = type_weights + # else: + # all_type_weights = np.vstack([all_type_weights, type_weights]) + + if self.calculate_path_attn_stats and split == "test": + path_weights = path_weights.clone().cpu().data.numpy() + num_ent_pairs, num_paths = path_weights.shape + + # normalize path weights for plotting + if num_paths > 1: + path_weights_sorted = np.sort(path_weights, axis=1) + path_weights_sorted = path_weights_sorted / np.max(path_weights_sorted, axis=1).reshape(num_ent_pairs, 1) + x_old = np.array(range(num_paths)) + x_new = np.linspace(0, num_paths-1, 200) + func = interp1d(x_old, path_weights_sorted, axis=1) + path_weights_normalized = func(x_new) + if all_path_weights is None: + all_path_weights = path_weights_normalized + else: + all_path_weights = np.vstack([all_path_weights, path_weights_normalized]) + + # basic stats + # all_path_weights: [num_ent_pairs, num_paths] + # path_weights_stats["min"].extend(np.nanmin(all_path_weights, axis=1)) + # path_weights_stats["max"].extend(np.nanmax(all_path_weights, axis=1)) + # path_weights_stats["mean"].extend(np.nanmean(all_path_weights, axis=1)) + # path_weights_stats["std"].extend(np.nanstd(all_path_weights, axis=1)) + # + # # + # num_ent_pairs, num_paths = all_path_weights.shape + # for percent in [25, 50, 75]: + # percentile = np.nanpercentile(all_path_weights, percent, axis=1).reshape(num_ent_pairs, -1) + # smaller_paths_percentile = all_path_weights * (all_path_weights < percentile) + # sum_paths_percentile = np.sum(smaller_paths_percentile, axis=1) + # path_weights_stats["paths_" + str(percent)].extend(sum_paths_percentile) + + # measure of tails + # path_weights_stats["skew"].extend(skew(all_path_weights, axis=1)) + # path_weights_stats["kurtosis"].extend(kurtosis(all_path_weights, axis=1)) + + for label, prob in zip(labels, 
probs): + score_instances.append((None, label.item(), prob.item())) + # print("accuracy for this batch of", inputs.shape[0], "examples is", num_correct / inputs.shape[0]) + # print("Total accuracy for training set:", total_num_correct / total_pairs) + + # summarize scores and stats + ap, rr, acc = compute_scores(score_instances) + # print("AP for this relation:", ap) + + if self.visualize and split == "test": + self.visualizer.print_contradictions(rel) + + if self.calculate_type_attn_stats and split == "test": + if type_weights_sum is not None: + print("Average type attention weights for {} {}".format(rel, split), + type_weights_sum / type_weights_count) + + if all_type_weights is not None: + pass + # # save type weights to file + # type_weights_file = os.path.join(self.type_weights_dir, "{}_{}.csv".format(rel, split)) + # np.savetxt(type_weights_file, all_type_weights, delimiter=",", fmt='%.6e') + + if self.calculate_path_attn_stats and split == "test": + path_stats = OrderedDict() + # all_path_weights[all_path_weights == 0] = float("nan") + # path_stats["min"] = np.average(np.array(path_weights_stats["min"])) + # path_stats["max"] = np.average(np.array(path_weights_stats["max"])) + # path_stats["mean_mean"] = np.mean(np.array(path_weights_stats["mean"])) + # path_stats["mean_std"] = np.std(np.array(path_weights_stats["mean"])) + # path_stats["std_mean"] = np.mean(np.array(path_weights_stats["std"])) + # path_stats["std_std"] = np.std(np.array(path_weights_stats["std"])) + # + # for percent in [25, 50, 75]: + # path_stats["paths_" + str(percent) + "_mean"] = np.mean(np.array(path_weights_stats["paths_" + str(percent)])) + # path_stats["paths_" + str(percent) + "_std"] = np.std(np.array(path_weights_stats["paths_" + str(percent)])) + + # path_stats["skew_mean"] = np.average(np.array(path_weights_stats["skew"])) + # path_stats["skew_std"] = np.std(np.array(path_weights_stats["skew"])) + # path_stats["kurtosis_mean"] = np.average(np.array(path_weights_stats["kurtosis"])) + # path_stats["kurtosis_std"] = np.std(np.array(path_weights_stats["kurtosis"])) + # + # print("Path weights stats:", path_stats) + + # plot path weights + if all_path_weights is not None: + # visualize path weights + path_visualization_file = os.path.join(self.path_weights_dir, "{}_{}.png".format(rel, split)) + path_weights_total_avg = np.mean(all_path_weights, axis=0) + print(path_weights_total_avg) + plt.plot(range(200), path_weights_total_avg) + plt.savefig(path_visualization_file) + plt.cla() + plt.close() + + # save path weights to file + path_weights_file = os.path.join(self.path_weights_dir, "{}_{}.csv".format(rel, split)) + np.savetxt(path_weights_file, all_path_weights, delimiter=",", fmt='%.6e') + + return acc, ap \ No newline at end of file diff --git a/main/playground/model2/CompositionalVectorSpaceModel.py b/main/playground/model2/CompositionalVectorSpaceModel.py new file mode 100644 index 0000000..38fffd0 --- /dev/null +++ b/main/playground/model2/CompositionalVectorSpaceModel.py @@ -0,0 +1,283 @@ +import torch +import torch.nn as nn +import torch.nn.functional as functional +import torch.optim as optim + +import collections +import os +import random +import time +import numpy as np +import json + +from main.playground.model2.FeatureEmbedding import FeatureEmbedding + +torch.manual_seed(1) + + +def print_sum(module, grad_input, grad_output): + return print(grad_output[0].flatten().sum()) + + +class RelationEncoder(nn.Module): + def __init__(self, relation_embedding_dim, rnn_hidden_dim): + 
super(RelationEncoder, self).__init__() + + self.rnn_hidden_dim = rnn_hidden_dim + self.lstm = nn.LSTM(relation_embedding_dim, rnn_hidden_dim, batch_first=True).cuda() + + def init_hidden(self, batch_size): + # Hidden state axes semantics are (seq_len, batch, rnn_hidden_dim), even when LSTM is set to batch first + hidden_state = torch.cuda.FloatTensor(1, batch_size, self.rnn_hidden_dim) + hidden_state.copy_(torch.zeros(1, batch_size, self.rnn_hidden_dim)) + cell_state = torch.cuda.FloatTensor(1, batch_size, self.rnn_hidden_dim) + cell_state.copy_(torch.zeros(1, batch_size, self.rnn_hidden_dim)) + return (hidden_state, cell_state) + + def forward(self, relation_embeds): + # relation_embeds: [num_ent_pairs x num_paths, num_steps, num_feats] + reshaped_batch_size, num_steps, num_feats = relation_embeds.shape + + _, (last_hidden, _) = self.lstm(relation_embeds, self.init_hidden(reshaped_batch_size)) + last_hidden = last_hidden.squeeze(dim=0) + # last_hidden: [num_ent_pairs x num_paths, rnn_hidden_dim] + return last_hidden + + +class Attention(nn.Module): + + def __init__(self, types_embedding_dim, full_encoder_dim, attention_dim, attention_method="sat"): + super(Attention, self).__init__() + self.attention_method = attention_method + if self.attention_method == "sat": + self.type_encoder_att = nn.Linear(types_embedding_dim, attention_dim).cuda() + self.full_encoder_att = nn.Linear(full_encoder_dim, attention_dim).cuda() + self.full_att = nn.Linear(attention_dim, 1).cuda() + self.relu = nn.ReLU().cuda() + self.softmax = nn.Softmax(dim=1).cuda() + elif self.attention_method == "general": + self.full_encoder_dim = full_encoder_dim + self.linear_in = nn.Linear(types_embedding_dim, full_encoder_dim, bias=False).cuda() + self.softmax = nn.Softmax(dim=1).cuda() + elif self.attention_method == "abstract" or self.attention_method == "specific" or self.attention_method == "random": + self.type_encoder_att = nn.Linear(types_embedding_dim, attention_dim).cuda() + + def forward(self, types_embeds, full_encoder_hidden): + + if self.attention_method == "abstract": + reshaped_batch_size, num_types, _ = types_embeds.shape + types_embeds = self.type_encoder_att(types_embeds) + attention_weighted_type_embeds = types_embeds[:, -1, :] + alpha = torch.cuda.FloatTensor(reshaped_batch_size, num_types).fill_(0) + alpha[:, -1] = 1.0 + elif self.attention_method == "specific": + reshaped_batch_size, num_types, _ = types_embeds.shape + types_embeds = self.type_encoder_att(types_embeds) + attention_weighted_type_embeds = types_embeds[:, 0, :] + alpha = torch.cuda.FloatTensor(reshaped_batch_size, num_types).fill_(0) + alpha[:, 0] = 1.0 + elif self.attention_method == "random": + reshaped_batch_size, num_types, types_embedding_dim = types_embeds.shape + types_embeds = self.type_encoder_att(types_embeds) + dim1 = torch.cuda.LongTensor(list(range(reshaped_batch_size))) + dim2 = torch.cuda.LongTensor(np.random.randint(0, num_types, size=reshaped_batch_size)) + attention_weighted_type_embeds = types_embeds[dim1, dim2, :] + alpha = torch.cuda.FloatTensor(reshaped_batch_size, num_types).fill_(0) + alpha[dim1, dim2] = 1.0 + elif self.attention_method == "sat": + # type_embeds: [num_ent_pairs x num_paths, num_types, type_encoder_dim] + att1 = self.type_encoder_att(types_embeds) + # full_encoder_hidden: [num_ent_pairs x num_paths, full_encoder_dim] + att2 = self.full_encoder_att(full_encoder_hidden) + att = self.full_att(self.relu(att1 + att2.unsqueeze(1))).squeeze(2) + # att: [num_ent_pairs x num_paths, num_types] + alpha = 
self.softmax(att) + attention_weighted_type_embeds = (att1 * alpha.unsqueeze(2)).sum(dim=1) + elif self.attention_method == "general": + # type_embeds: [num_ent_pairs x num_paths, num_types, type_encoder_dim] + # full_encoder_hidden: [num_ent_pairs x num_paths, full_encoder_dim] + context = self.linear_in(types_embeds) + # context: [num_ent_pairs x num_paths, num_types, full_encoder_dim] + full_encoder_hidden = full_encoder_hidden.unsqueeze(dim=1) + # full_encoder_hidden: [num_ent_pairs x num_paths, 1, full_encoder_dim] + attention_scores = torch.matmul(full_encoder_hidden, context.transpose(1, 2).contiguous()) + # attention_scores: [num_ent_pairs x num_paths, 1, num_types] + alpha = self.softmax(attention_scores.squeeze(dim=1)) + attention_weighted_type_embeds = (types_embeds * alpha.unsqueeze(2)).sum(dim=1) + + return attention_weighted_type_embeds, alpha + + +class CompositionalVectorSpaceModel(nn.Module): + + def __init__(self, relation_vocab_size, entity_vocab_size, entity_type_vocab_size, + relation_embedding_dim, entity_embedding_dim, entity_type_embedding_dim, + entity_type_vocab, entity_type2vec_filename, + attention_dim, relation_encoder_dim, full_encoder_dim, + pooling_method="sat", attention_method="sat"): + + super(CompositionalVectorSpaceModel, self).__init__() + + # params + # relation_vocab_size = relation_vocab_size + # relation_embedding_dim = relation_embedding_dim # 250 + # entity_vocab_size = entity_vocab_size + # entity_embedding_dim = entity_embedding_dim + # entity_type_vocab_size = entity_type_vocab_size + # entity_type_embedding_dim = entity_type_embedding_dim + label_dim = 1 + + # Networks + self.feature_embeddings = FeatureEmbedding(relation_vocab_size, relation_embedding_dim, + entity_vocab_size, entity_embedding_dim, + entity_type_vocab_size, entity_type_embedding_dim, + entity_type_vocab, entity_type2vec_filename) + + self.relation_encoder = RelationEncoder(relation_embedding_dim, relation_encoder_dim) + + self.attention = Attention(entity_type_embedding_dim, full_encoder_dim, attention_dim, + attention_method=attention_method) + + self.full_encoder_step = nn.LSTMCell(attention_dim, full_encoder_dim).cuda() + + # predict initial state for second encoder + self.init_h = nn.Linear(relation_encoder_dim, full_encoder_dim).cuda() + self.init_c = nn.Linear(relation_encoder_dim, full_encoder_dim).cuda() + + # attention gate + self.f_beta = nn.Linear(full_encoder_dim, attention_dim).cuda() + + self.sigmoid = nn.Sigmoid().cuda() + + self.pooling_method = pooling_method + if self.pooling_method == "lse": + self.fc = nn.Linear(full_encoder_dim + relation_encoder_dim, label_dim).cuda() + elif self.pooling_method == "hat": + path_hidden_dim = 100 + self.path_projector = nn.Linear(full_encoder_dim, path_hidden_dim).cuda() + self.tanh = nn.Tanh().cuda() + self.path_context = nn.Parameter(torch.cuda.FloatTensor(path_hidden_dim)) + torch.nn.init.normal_(self.path_context) + self.softmax = nn.Softmax(dim=1).cuda() + self.fc = nn.Linear(full_encoder_dim, label_dim).cuda() + elif self.pooling_method == "sat": + path_hidden_dim = 100 + self.path_context = nn.Parameter(torch.cuda.FloatTensor(path_hidden_dim)) + torch.nn.init.normal_(self.path_context) + self.path_att = nn.Linear(full_encoder_dim + relation_encoder_dim, path_hidden_dim).cuda() + self.att = nn.Linear(path_hidden_dim, 1).cuda() + self.relu = nn.ReLU().cuda() + self.softmax = nn.Softmax(dim=1).cuda() + self.fc = nn.Linear(full_encoder_dim + relation_encoder_dim, label_dim).cuda() + # self.dropout = nn.Dropout(p=0.5) + 
elif self.pooling_method == "max": + self.fc = nn.Linear(full_encoder_dim + relation_encoder_dim, label_dim).cuda() + elif self.pooling_method == "avg": + self.fc = nn.Linear(full_encoder_dim + relation_encoder_dim, label_dim).cuda() + + def init_hidden(self, relation_encoder_out): + # relation_encoder_out: [num_ent_pairs x num_paths, relation_encoder_dim] + h = self.init_h(relation_encoder_out) + c = self.init_c(relation_encoder_out) + return h, c + + def forward(self, x): + # x: [num_ent_pairs, num_paths, num_steps, num_feats] + num_ent_pairs, num_paths, num_steps, num_feats = x.shape + # collide dim 0 and dim 1 + reshaped_batch_size = num_ent_pairs * num_paths + x = x.view(reshaped_batch_size, num_steps, num_feats) + # x: [num_ent_pairs x num_paths, num_steps, num_feats] + + relation_embeds, types_embeds = self.feature_embeddings(x) + # relation_embeds: [num_ent_pairs x num_paths, num_steps, relation_embedding_dim] + # types_embeds: [num_ent_pairs x num_paths, num_steps, num_types, entity_type_embedding_dim] + + relation_encoder_out = self.relation_encoder(relation_embeds) + # relation_encoder_out: [num_ent_pairs x num_paths, relation_encoder_dim] + + h, c = self.init_hidden(relation_encoder_out) + # h or c: [num_ent_pairs x num_paths, full_encoder_dim] + + num_types = types_embeds.shape[2] + alphas = torch.cuda.FloatTensor(reshaped_batch_size, num_steps, num_types) + for t in range(num_steps): + types_embeds_t = types_embeds[:, t, :, :] + # types_embeds_t: [num_ent_pairs x num_paths, num_types, entity_type_embedding_dim] + attention_weighted_encoding, alpha = self.attention(types_embeds_t, h) + # alpha: [num_ent_pairs x num_paths, num_types] + gate = self.sigmoid(self.f_beta(h)) + attention_weighted_encoding = gate * attention_weighted_encoding + # attention_weighted_encoding: [num_ent_pairs x num_paths, entity_type_embedding_dim] + + feats_t = attention_weighted_encoding + + h, c = self.full_encoder_step(feats_t, (h, c)) + alphas[:, t, :] = alpha + + h = torch.cat((h, relation_encoder_out), dim=1) + + path_weights = torch.cuda.FloatTensor(num_ent_pairs, num_paths) + if self.pooling_method == "lse": + path_scores = self.fc(h) + # path_scores: [num_ent_pairs x num_paths, label_dim] + path_scores = path_scores.view(num_ent_pairs, num_paths, -1) + # path_scores: [num_ent_pairs, num_paths, label_dim] + # LogSumExp + maxes, max_indices = torch.max(path_scores, dim=1, keepdim=True) + # print(maxes.squeeze()) + score_minus_maxes = torch.add(path_scores, -1, maxes.expand_as(path_scores)) + exp_score_minus_max = torch.exp(score_minus_maxes) + sum_exp_score_minus_max = torch.sum(exp_score_minus_max, dim=1) + lse_scores = torch.log(sum_exp_score_minus_max) + lse_scores = lse_scores + maxes.squeeze(dim=2) + # print("lse scores shape", lse_scores.shape) + # print("maxes shape", maxes.shape) + probs = self.sigmoid(lse_scores).squeeze(dim=1) + # probs: [num_ent_pairs, 1] + elif self.pooling_method == "max": + path_scores = self.fc(h) + # path_scores: [num_ent_pairs x num_paths, label_dim] + path_scores = path_scores.view(num_ent_pairs, num_paths, -1) + # path_scores: [num_ent_pairs, num_paths, label_dim] + max_path_score, _ = torch.max(path_scores, dim=1) + probs = self.sigmoid(max_path_score).squeeze(dim=1) + elif self.pooling_method == "avg": + path_scores = self.fc(h) + # path_scores: [num_ent_pairs x num_paths, label_dim] + path_scores = path_scores.view(num_ent_pairs, num_paths, -1) + # path_scores: [num_ent_pairs, num_paths, label_dim] + path_score_sum = torch.sum(path_scores, dim=1) + probs = 
self.sigmoid(path_score_sum).squeeze(dim=1) + elif self.pooling_method == "hat": + # h: [num_ent_pairs x num_paths, full_encoder_dim] + paths_projected = self.tanh(self.path_projector(h)) + path_sims = paths_projected.matmul(self.path_context) + path_sims = path_sims.view(num_ent_pairs, num_paths, -1) + path_weights = self.softmax(path_sims) + # path_weights: [num_ent_pairs, num_paths, 1] + paths_feats = h.view(num_ent_pairs, num_paths, -1) + paths_weighted_sum = (paths_feats * path_weights).sum(dim=1) + # paths_weighted_sum: [num_ent_pairs, full_encoder_dim] + scores = self.fc(paths_weighted_sum) + probs = self.sigmoid(scores).squeeze(dim=1) + elif self.pooling_method == "sat": + # h: [num_ent_pairs x num_paths, full_encoder_dim] + path_hiddens = self.path_att(h) + # path_hiddens: [num_ent_pairs x num_paths, path_hidden_dim] + att = self.att(self.relu(path_hiddens + self.path_context)) + # att: [num_ent_pairs x num_paths, 1] + att = att.view(num_ent_pairs, num_paths, -1) + path_weights = self.softmax(att) + paths_feats = h.view(num_ent_pairs, num_paths, -1) + paths_weighted_sum = (paths_feats * path_weights).sum(dim=1) + # paths_weighted_sum: [num_ent_pairs, full_encoder_dim] + scores = self.fc(paths_weighted_sum) + probs = self.sigmoid(scores).squeeze(dim=1) + + # visualization + path_weights = path_weights.view(num_ent_pairs, num_paths) + type_weights = alphas.view(num_ent_pairs, num_paths, num_steps, num_types) + + return probs, path_weights, type_weights + diff --git a/main/playground/model2/FeatureEmbedding.py b/main/playground/model2/FeatureEmbedding.py new file mode 100644 index 0000000..ba18d68 --- /dev/null +++ b/main/playground/model2/FeatureEmbedding.py @@ -0,0 +1,57 @@ +import torch +import torch.nn as nn +import pickle + +torch.manual_seed(1) + + +class FeatureEmbedding(nn.Module): + + def __init__(self, relation_vocab_size, relation_embedding_dim, + entity_vocab_size, entity_embedding_dim, + entity_type_vocab_size, entity_type_embedding_dim, + entity_type_vocab=None, entity_type2vec_filename=None): + super(FeatureEmbedding, self).__init__() + + self.relation_embeddings = nn.Embedding(relation_vocab_size, relation_embedding_dim).cuda() + + if entity_type2vec_filename is not None and entity_type_vocab is not None: + self.entity_types_embeddings = None + self.load_pretrained_entity_types_embeddings(entity_type_vocab, entity_type2vec_filename) + else: + for entity_type in entity_type_vocab: + if entity_type == "#PAD_TOKEN": + pad_index = entity_type_vocab[entity_type] + self.entity_types_embeddings = nn.Embedding(entity_type_vocab_size, entity_type_embedding_dim, padding_idx=pad_index).cuda() + + def load_pretrained_entity_types_embeddings(self, entity_type_vocab, entity_type2vec_filename): + print("loading entity_type2vec from pickle file:", entity_type2vec_filename) + entity_type2vec = pickle.load(open(entity_type2vec_filename, "rb")) + # entity_type2vec doesn't have "#PAD_TOKENS" while entity_type_vocab does + print(len(entity_type2vec), len(entity_type_vocab)) + assert len(entity_type2vec) + 1 == len(entity_type_vocab) + + entity_type_embedding_dim = 0 + for entity_type in entity_type2vec: + entity_type_embedding_dim = len(entity_type2vec[entity_type]) + break + assert entity_type_embedding_dim != 0 + + matrix = torch.FloatTensor(len(entity_type_vocab), entity_type_embedding_dim) + for entity_type in entity_type_vocab: + index = entity_type_vocab[entity_type] + if entity_type == "#PAD_TOKEN": + matrix[index, :] = torch.zeros(1, entity_type_embedding_dim) + else: + 
matrix[index, :] = torch.FloatTensor(entity_type2vec[entity_type]) + + # initialize embedding with the matrix. Turn off training + self.entity_types_embeddings = torch.nn.Embedding.from_pretrained(matrix, freeze=True).cuda() + + def forward(self, x): + # the input dimension is #paths x #steps x #feats + # for each feature, num_entity_types type, 1 entity, 1 relation in order + relation_embeds = self.relation_embeddings(x[:, :, -1]) + types_embeds = self.entity_types_embeddings(x[:, :, :-2]) + + return relation_embeds, types_embeds diff --git a/main/playground/model2/__init__.py b/main/playground/model2/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/main/playground/test/TestBatcherFileList.py b/main/playground/test/TestBatcherFileList.py new file mode 100644 index 0000000..0e73902 --- /dev/null +++ b/main/playground/test/TestBatcherFileList.py @@ -0,0 +1,51 @@ +import unittest +from main.playground.BatcherFileList import BatcherFileList +from tqdm import tqdm + + +class TestBatcherFileList(unittest.TestCase): + def setUp(self): + # need to specify correct absolute path to data + self.files_dir = "data/wordnet18rr/cvsm_entity/data/auto_generated_data_output/also_see/dev" + + def test_shuffled_iterations(self): + batcher = BatcherFileList(self.files_dir, batch_size=32, shuffle=True, max_number_batchers_on_gpu=100) + count = 0 + while True: + data = batcher.get_batch() + if data is None: + break + count += 1 + + count1 = 0 + for i in tqdm(range(0, count)): + data = batcher.get_batch() + count1 += 1 + + assert count == count1 + assert batcher.get_batch() is None + assert batcher.get_batch() is not None + + def test_deterministic_iterations(self): + batcher = BatcherFileList(self.files_dir, batch_size=100, shuffle=False, max_number_batchers_on_gpu=100) + list_path_numbers = [] + while True: + data = batcher.get_batch() + if data is None: + break + list_path_numbers.append(data[0].shape[1]) + + list_path_numbers1 = [] + while True: + data = batcher.get_batch() + if data is None: + break + list_path_numbers1.append(data[0].shape[1]) + assert list_path_numbers == list_path_numbers1 + + + + + + + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c82ffac --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +numpy==1.16.2 +Pillow==6.0.0 +protobuf==3.7.1 +six==1.12.0 +tensorboardX==1.6 +tqdm==4.31.1 diff --git a/run.py b/run.py new file mode 100644 index 0000000..4b868c4 --- /dev/null +++ b/run.py @@ -0,0 +1,19 @@ +from main.playground.model2.CompositionalVectorAlgorithm import CompositionalVectorAlgorithm + + +def test_fb(): + cvsm = CompositionalVectorAlgorithm("freebase", "data/fb15k237/cvsm_entity", + entity_type2vec_filename=None, + pooling_method="sat", attention_method="sat", early_stopping_metric="map") + cvsm.train_and_test() + + +def test_wn(): + cvsm = CompositionalVectorAlgorithm("wordnet", experiment_dir="data/wn18rr/cvsm_entity", + entity_type2vec_filename="data/wn18rr/entity_type2vec.pkl", + pooling_method="sat", attention_method="sat", early_stopping_metric="map") + cvsm.train_and_test() + + +if __name__ == "__main__": + test_wn()
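As a reference for evaluation, the scoring helpers in `main/experiments/Metrics.py` can also be used on their own. The sketch below runs `compute_scores` on a handful of made-up (identifier, label, score) instances; the instance values are purely illustrative:

```python
from main.experiments.Metrics import compute_scores

# (identifier, binary label, predicted score) -- hypothetical predictions
score_instances = [
    (("rel", "pair_0"), 1, 0.91),
    (("rel", "pair_1"), 0, 0.12),
    (("rel", "pair_2"), 1, 0.45),
]

ap, rr, acc = compute_scores(score_instances)
print("AP:", ap, "RR:", rr, "ACC:", acc)
```

`score_cvsm` wraps the same computation for a result file containing one `relation<TAB>entity_pair_idx<TAB>score<TAB>label` line per prediction.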