import random
import sys

import numpy

__author__ = 'wilbur'


# read type index, e.g., Attack -- 14
def read_type_index(file_name):
    type_2_index = {}
    index_2_type = {}

    with open(file_name, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            type_name = parts[2]
            index = parts[0]
            type_2_index[type_name] = index
            index_2_type[index] = type_name
    return type_2_index, index_2_type


# read relation index, e.g., ARG0 -- 1
def read_relation_index(file_name):
    rel_2_index = {}
    index_2_rel = {}

    with open(file_name, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            rel = parts[1]
            index = parts[0]
            rel_2_index[rel] = index
            index_2_rel[index] = rel
    return rel_2_index, index_2_rel


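# A minimal usage sketch for the two index readers above (file names and
# row values are hypothetical; the layout is the tab-separated one the
# parsers expect -- index in column 0, the label in column 2 for types and
# column 1 for relations):
#
#   event_types.tsv line:  "14\tConflict\tattack"
#   relations.tsv line:    "1\tARG0"
#
#   type_2_index, index_2_type = read_type_index('event_types.tsv')
#   rel_2_index, index_2_rel = read_relation_index('relations.tsv')
#   # type_2_index['attack'] == '14', rel_2_index['ARG0'] == '1'
#   # indices stay strings here; callers int() them where needed

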
# load train data to lists
def load_training_data(file_name):
    doc_id_list = []
    type_list = []
    trigger_list = []
    left_word_list = []
    relation_list = []
    right_word_list = []

    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            parts = line.split(' :: ')
            doc_id = parts[0]
            type_name = parts[1]
            trigger = parts[2]

            doc_id_list.append(doc_id)
            type_list.append(type_name)
            trigger_list.append(trigger)

            left_words = []
            right_words = []
            relations = []

            if len(parts) > 3:
                structure = parts[3]
                structure_parts = structure.strip().split(' ')
                for unit in structure_parts:
                    unit_parts = unit.strip().split('##')
                    word1 = unit_parts[0]
                    relation = unit_parts[1]
                    word2 = unit_parts[2]

                    left_words.append(word1)
                    relations.append(relation)
                    right_words.append(word2)

            left_word_list.append(left_words)
            relation_list.append(relations)
            right_word_list.append(right_words)

    return doc_id_list, type_list, trigger_list, left_word_list, relation_list, right_word_list


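# Expected training line (hypothetical values; fields are ' :: '-separated,
# and the optional fourth field holds space-separated word##relation##word
# units):
#
#   "doc_42 :: attack :: bombed :: troops##ARG0##bombed bombed##ARG1##city"
#
#   doc_ids, types, triggers, lefts, rels, rights = load_training_data('train.txt')
#   # lefts[0] == ['troops', 'bombed'], rels[0] == ['ARG0', 'ARG1']

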
# loading the whole word2vec file
def load_word_vec(file_name):
    word_vectors = {}
    vector_size = 0
    with open(file_name, 'r') as f:
        for line in f:
            parts = line.split()
            # skip blank lines and the optional "<vocab_size> <dim>" header
            if len(parts) > 2:
                word = parts[0].lower()
                parts.pop(0)
                word_vectors[word] = parts
                vector_size = len(parts)
    return word_vectors, vector_size


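# Text-format word2vec sketch (hypothetical file): one "word v1 v2 ... vN"
# line per word, optionally preceded by a "<vocab_size> <dim>" header, which
# the len(parts) > 2 check skips.
#
#   word_vectors, vector_size = load_word_vec('vectors.txt')
#   # word_vectors['attack'] is a list of vector_size strings; consumers
#   # call float() on the components when filling matrices

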
# load all type labels
def load_types(type_file):
    all_types = {}
    all_type_structures = {}
    with open(type_file, 'r') as f:
        for line in f:
            parts = line.strip().split("\t")
            index = parts[0]
            trigger = parts[2]
            all_types[index] = trigger
            structures = []
            for j in range(3, len(parts)):
                s = trigger + "\t" + parts[j]
                structures.append(s)
            all_type_structures[index] = structures
    return all_types, all_type_structures


# load all type labels (kept as a separate entry point; the body was a
# verbatim copy of load_types above, so it now delegates)
def load_types_1(type_file):
    return load_types(type_file)


# get the matrix for all input parts
def get_input_context_matrix(left_word, right_word, word_vectors, representation_size, context_size):
    matrix = numpy.zeros(shape=(representation_size * 2, context_size))
    i = 0

    while i < context_size:
        word1 = left_word[i].lower()
        word2 = right_word[i].lower()

        # left word: back off to the prefix before '-', then to <unk>
        if word1 not in word_vectors:
            if word1.find('-') == -1:
                word1 = "<unk>"
            else:
                idx = word1.find('-')
                word1 = word1[:idx]
                if word1 not in word_vectors:
                    word1 = "<unk>"
        cur_vector_1 = word_vectors[word1]
        for j in range(0, representation_size):
            if j >= len(cur_vector_1):
                print("ERROR: mismatch in word vector lengths: " + str(len(cur_vector_1))
                      + " vs " + str(representation_size))
                sys.exit(1)
            elem = float(cur_vector_1[j])
            matrix[j, i] = elem

        # right word: same back-off
        if word2 not in word_vectors:
            if word2.find('-') == -1:
                word2 = "<unk>"
            else:
                idx = word2.find('-')
                word2 = word2[:idx]
                if word2 not in word_vectors:
                    word2 = "<unk>"
        cur_vector_2 = word_vectors[word2]
        for j in range(0, representation_size):
            if j >= len(cur_vector_2):
                print("ERROR: mismatch in word vector lengths: " + str(len(cur_vector_2))
                      + " vs " + str(representation_size))
                sys.exit(1)
            elem = float(cur_vector_2[j])
            matrix[j + representation_size, i] = elem
        i += 1
    return matrix


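# Layout note and toy call (hypothetical 3-dimensional vectors): column i
# stacks the left-word vector (rows 0..representation_size-1) on top of the
# right-word vector (rows representation_size..2*representation_size-1).
#
#   vecs = {'troops': ['0.1', '0.2', '0.3'], '<unk>': ['0', '0', '0']}
#   m = get_input_context_matrix(['troops'], ['city'], vecs, 3, 1)
#   # m[:3, 0] is the 'troops' vector; 'city' is OOV and falls back to <unk>

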
# get the matrix for all triggers
def get_input_trigger_matrix(trigger, word_vectors, representation_size):
    matrix = numpy.zeros(shape=(representation_size, 1))

    trigger = trigger.lower()

    # back off to the prefix before '-', then to <unk>
    if trigger not in word_vectors:
        if trigger.find('-') == -1:
            trigger = "<unk>"
        else:
            idx = trigger.find('-')
            trigger = trigger[:idx]
            if trigger not in word_vectors:
                trigger = "<unk>"
    cur_vector = word_vectors[trigger]
    for j in range(0, representation_size):
        if j >= len(cur_vector):
            print("ERROR: mismatch in word vector lengths: " + str(len(cur_vector))
                  + " vs " + str(representation_size))
            sys.exit(1)
        elem = float(cur_vector[j])
        matrix[j, 0] = elem
    return matrix


# load all initial random relation vectors, one vector per line, stored
# column-wise
def load_relation_vectors(relation_vec_file, representation_size):
    with open(relation_vec_file, 'r') as f:
        lines = f.readlines()
    # one column per relation vector
    matrix = numpy.zeros(shape=(representation_size, len(lines)))
    index = 0
    for line in lines:
        parts = line.strip().split()
        for j in range(0, len(parts)):
            elem = float(parts[j])
            matrix[j, index] = elem
        index += 1
    return matrix


# generate random relation vectors, one row of size*size values per relation
def random_init_rel_vec(relations_file, size):
    index = 0
    with open(relations_file, 'r') as f:
        for _ in f:
            index += 1

    rel_vec_matrix = numpy.empty(shape=(index, size * size))

    for i in range(0, index):
        for j in range(0, size * size):
            rel_vec_matrix[i, j] = random.random() - 0.5
    return rel_vec_matrix


# generate random relation vectors, one row of size values per relation
def random_init_rel_vec_factor(relations_file, size):
    index = 0
    with open(relations_file, 'r') as f:
        for _ in f:
            index += 1

    rel_vec_matrix = numpy.empty(shape=(index, size))

    for i in range(0, index):
        for j in range(0, size):
            rel_vec_matrix[i, j] = random.random() - 0.5
    return rel_vec_matrix


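# Note: both initializers above draw i.i.d. values uniformly from
# [-0.5, 0.5); an equivalent vectorized sketch (assuming numpy's global
# random state is acceptable here) would be:
#
#   rel_vec_matrix = numpy.random.random((index, size)) - 0.5

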
# get the one-hot relation vector for each relation in the context
def get_binary_relation_matrix(relation, relation_size, context_size, rel_2_index):
    matrix = numpy.zeros(shape=(relation_size, context_size))
    i = 0
    while i < context_size:
        rel = relation[i]
        idx = int(rel_2_index[rel])
        matrix[idx, i] = 1
        i += 1
    return matrix


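# One-hot sketch (hypothetical mapping): with rel_2_index = {'ARG0': '0',
# 'ARG1': '1'}, get_binary_relation_matrix(['ARG1'], 2, 1, rel_2_index)
# returns [[0.], [1.]] -- the row given by a relation's index is set to 1
# in that relation's column.

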
# get all input matrices for the nn
def input_matrix(type_list, trigger_list, left_word_list, relation_list, right_word_list, representation_size,
                 context_size, relation_size, type_file, word_2_vec, rel_2_index):
    num_samples = len(left_word_list)

    print("processing " + str(num_samples) + " examples per epoch")

    type_2_index, index_2_type = read_type_index(type_file)
    type_size = len(type_2_index)

    representation_size_1 = representation_size * 2

    input_context_matrix = numpy.empty(shape=(num_samples, representation_size_1 * context_size))
    input_trigger_matrix = numpy.empty(shape=(num_samples, representation_size * 1))
    result_index_matrix = []
    result_vector_matrix = numpy.empty(shape=(num_samples, type_size * 1))
    relation_binary_matrix = numpy.empty(shape=(num_samples, relation_size * context_size))

    for sample in range(0, num_samples):
        left_word = left_word_list[sample]
        relation = relation_list[sample]
        right_word = right_word_list[sample]

        trigger = trigger_list[sample]
        type_name = type_list[sample]

        # one-hot type vector for this sample
        matrix = numpy.zeros(shape=(type_size, 1))
        type_name = type_name.lower()
        if type_name in type_2_index:
            type_index = type_2_index[type_name]
            result_index_matrix.append(type_index)

            type_index_1 = int(type_index) - 1
            matrix[type_index_1, 0] = 1
        else:
            result_index_matrix.append(0)

        # pad the context to a fixed width
        while len(left_word) < context_size:
            left_word.append("PADDING")
            relation.append(":other")
            right_word.append("PADDING")

        matrix_word_pair = \
            get_input_context_matrix(left_word, right_word, word_2_vec, representation_size, context_size)
        matrix_word_pair = numpy.reshape(matrix_word_pair, representation_size_1 * context_size)
        input_context_matrix[sample, :] = matrix_word_pair

        matrix_trigger = get_input_trigger_matrix(trigger, word_2_vec, representation_size)
        matrix_trigger = numpy.reshape(matrix_trigger, representation_size * 1)
        input_trigger_matrix[sample, :] = matrix_trigger

        matrix_relation = get_binary_relation_matrix(relation, relation_size, context_size, rel_2_index)
        matrix_relation = numpy.reshape(matrix_relation, relation_size * context_size)
        relation_binary_matrix[sample, :] = matrix_relation

        matrix = numpy.reshape(matrix, type_size)
        result_vector_matrix[sample, :] = matrix

    return result_index_matrix, result_vector_matrix, input_context_matrix, input_trigger_matrix, relation_binary_matrix


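# End-to-end sketch tying the loaders to the matrix builder (all paths and
# sizes are hypothetical):
#
#   word_2_vec, dim = load_word_vec('vectors.txt')
#   rel_2_index, _ = read_relation_index('relations.tsv')
#   _, types, trigs, lefts, rels, rights = load_training_data('train.txt')
#   outputs = input_matrix(types, trigs, lefts, rels, rights, dim, 5,
#                          len(rel_2_index), 'event_types.tsv', word_2_vec,
#                          rel_2_index)

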
def get_train_type_flag(train_type_flag):
    type_flags = {}
    with open(train_type_flag, 'r') as f:
        for line in f:
            parts = line.strip().split("\t")
            type_name = parts[0]
            type_flags[type_name] = type_name
    return type_flags


# get all input matrices for the nn, adding negative examples
def input_matrix_1(type_list, trigger_list, left_word_list, relation_list, right_word_list, representation_size,
                   context_size, relation_size, type_file, word_2_vec, rel_2_index, train_type_flag):
    num_samples = len(left_word_list)

    train_type_flags = get_train_type_flag(train_type_flag)

    print("processing " + str(num_samples) + " examples per epoch")

    type_2_index, index_2_type = read_type_index(type_file)
    type_size = len(type_2_index)

    representation_size_1 = representation_size * 2

    input_context_matrix = numpy.empty(shape=(num_samples, representation_size_1 * context_size))
    input_trigger_matrix = numpy.empty(shape=(num_samples, representation_size * 1))
    result_index_matrix = []
    result_vector_matrix = numpy.empty(shape=(num_samples, type_size * 1))
    relation_binary_matrix = numpy.empty(shape=(num_samples, relation_size * context_size))
    pos_neg_matrix = numpy.empty(shape=(num_samples, 1))

    for sample in range(0, num_samples):
        left_word = left_word_list[sample]
        relation = relation_list[sample]
        right_word = right_word_list[sample]

        trigger = trigger_list[sample]
        type_name = type_list[sample]

        # one-hot type vector for this sample
        matrix = numpy.zeros(shape=(type_size, 1))

        if type_name in train_type_flags:
            pos_neg_matrix[sample, 0] = 1

            type_name = type_name.lower()
            type_index = type_2_index[type_name]
            result_index_matrix.append(type_index)

            type_index_1 = int(type_index) - 1
            matrix[type_index_1, 0] = 1
        else:
            result_index_matrix.append(0)
            pos_neg_matrix[sample, 0] = 0

        # each relation is annotated as a binary vector among all types
        while len(left_word) < context_size:
            left_word.append("PADDING")
            relation.append(":other")
            right_word.append("PADDING")

        matrix_word_pair = get_input_context_matrix(left_word, right_word, word_2_vec, representation_size,
                                                    context_size)
        matrix_word_pair = numpy.reshape(matrix_word_pair, representation_size_1 * context_size)
        input_context_matrix[sample, :] = matrix_word_pair

        matrix_trigger = get_input_trigger_matrix(trigger, word_2_vec, representation_size)
        matrix_trigger = numpy.reshape(matrix_trigger, representation_size * 1)
        input_trigger_matrix[sample, :] = matrix_trigger

        matrix_relation = get_binary_relation_matrix(relation, relation_size, context_size, rel_2_index)
        matrix_relation = numpy.reshape(matrix_relation, relation_size * context_size)
        relation_binary_matrix[sample, :] = matrix_relation

        matrix = numpy.reshape(matrix, type_size)
        result_vector_matrix[sample, :] = matrix

    return result_index_matrix, result_vector_matrix, input_context_matrix, input_trigger_matrix, \
        relation_binary_matrix, pos_neg_matrix


# get all input matrices for the nn at test time; every example is positive
def input_matrix_1_test(type_list, trigger_list, left_word_list, relation_list, right_word_list, representation_size,
                        context_size, relation_size, type_file, word_2_vec, rel_2_index, train_type_flag):
    num_samples = len(left_word_list)

    train_type_flags = get_train_type_flag(train_type_flag)

    print("processing " + str(num_samples) + " examples per epoch")

    type_2_index, index_2_type = read_type_index(type_file)
    type_size = len(type_2_index)

    representation_size_1 = representation_size * 2

    input_context_matrix = numpy.empty(shape=(num_samples, representation_size_1 * context_size))
    input_trigger_matrix = numpy.empty(shape=(num_samples, representation_size * 1))
    result_index_matrix = []
    result_vector_matrix = numpy.empty(shape=(num_samples, type_size * 1))
    relation_binary_matrix = numpy.empty(shape=(num_samples, relation_size * context_size))
    pos_neg_matrix = numpy.empty(shape=(num_samples, 1))

    for sample in range(0, num_samples):
        left_word = left_word_list[sample]
        relation = relation_list[sample]
        right_word = right_word_list[sample]

        trigger = trigger_list[sample]
        type_name = type_list[sample]

        # one-hot type vector for this sample
        matrix = numpy.zeros(shape=(type_size, 1))

        pos_neg_matrix[sample, 0] = 1

        type_name = type_name.lower()
        type_index = type_2_index[type_name]
        result_index_matrix.append(type_index)

        type_index_1 = int(type_index) - 1
        matrix[type_index_1, 0] = 1

        # each relation is annotated as a binary vector among all types
        while len(left_word) < context_size:
            left_word.append("PADDING")
            relation.append(":other")
            right_word.append("PADDING")

        matrix_word_pair = get_input_context_matrix(left_word, right_word, word_2_vec, representation_size,
                                                    context_size)
        matrix_word_pair = numpy.reshape(matrix_word_pair, representation_size_1 * context_size)
        input_context_matrix[sample, :] = matrix_word_pair

        matrix_trigger = get_input_trigger_matrix(trigger, word_2_vec, representation_size)
        matrix_trigger = numpy.reshape(matrix_trigger, representation_size * 1)
        input_trigger_matrix[sample, :] = matrix_trigger

        matrix_relation = get_binary_relation_matrix(relation, relation_size, context_size, rel_2_index)
        matrix_relation = numpy.reshape(matrix_relation, relation_size * context_size)
        relation_binary_matrix[sample, :] = matrix_relation

        matrix = numpy.reshape(matrix, type_size)
        result_vector_matrix[sample, :] = matrix

    return result_index_matrix, result_vector_matrix, input_context_matrix, input_trigger_matrix, \
        relation_binary_matrix, pos_neg_matrix


def get_word_vec(phrase, word_vectors, vector_size):
    # average the vectors of the known words in the phrase; fall back to a
    # random vector (cached under the last word seen) when none are known
    vec = []
    parts = str(phrase).lower().strip().split(" ")
    for i in range(0, vector_size):
        vec.append(0)

    count = 0
    for i in range(0, len(parts)):
        word = parts[i]
        if word != "<empty>":
            if word in word_vectors:
                count += 1.0
                cur_vector = word_vectors[word]
                for j in range(0, vector_size):
                    elem = float(cur_vector[j])
                    vec[j] += elem

    if count > 0:
        for j in range(0, vector_size):
            vec[j] = vec[j] / count
    else:
        for j in range(0, vector_size):
            elem = random.random()
            vec[j] = float(elem)
        word_vectors[word] = vec
    return vec


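# Averaging sketch (hypothetical 2-dimensional vectors):
#
#   word_vectors = {'armed': ['1', '3'], 'attack': ['3', '1']}
#   get_word_vec('armed attack', word_vectors, 2)  # -> [2.0, 2.0]

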
def type_matrix(all_type_list, all_type_structure_list, word_2_vec_file, context):
    # both dicts are keyed by 1-based string indices ("1", "2", ...), matching
    # the files read by load_types; row i corresponds to key str(i + 1)
    word_vectors, vector_size = load_word_vec(word_2_vec_file)
    num_of_types = len(all_type_list)
    rep_size = vector_size * 2
    input_type_structure_matrix = numpy.empty(shape=(num_of_types, rep_size * context))
    input_type_matrix = numpy.empty(shape=(num_of_types, vector_size))
    for i in range(0, num_of_types):
        type_name = all_type_list[str(i + 1)]
        type_vec = get_word_vec(type_name, word_vectors, vector_size)
        for j in range(0, vector_size):
            input_type_matrix[i, j] = type_vec[j]

        type_structures = all_type_structure_list[str(i + 1)]

        # pad the structure list to the fixed context width
        while len(type_structures) < context:
            type_structures.append("PADDING" + "\t" + "PADDING")

        for j in range(0, context):
            structure = type_structures[j]
            parts = structure.strip().split("\t")
            vec1 = get_word_vec(parts[0], word_vectors, vector_size)
            vec2 = get_word_vec(parts[1], word_vectors, vector_size)

            for m in range(0, vector_size):
                input_type_structure_matrix[i, j * rep_size + m] = vec1[m]
                input_type_structure_matrix[i, j * rep_size + vector_size + m] = vec2[m]

    return input_type_matrix, input_type_structure_matrix


def read_arg_paths(file_name):
    index_2_path = {}
    index_2_arg = {}
    type_2_arg_2_index = {}

    with open(file_name, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            index = parts[0]
            type_name = parts[1]
            nor_type = parts[2]
            arg = parts[3]
            normalize_arg = parts[4]
            path = type_name + "\t" + normalize_arg

            index_2_path[index] = path
            index_2_arg[index] = normalize_arg
            if nor_type in type_2_arg_2_index:
                arg_2_index = type_2_arg_2_index[nor_type]
                arg_2_index[arg] = index
                type_2_arg_2_index[nor_type] = arg_2_index
            else:
                arg_2_index = {arg: index}
                type_2_arg_2_index[nor_type] = arg_2_index
    return index_2_path, index_2_arg, type_2_arg_2_index


def load_arg_data(file_name):
    trigger_list = []
    trigger_type_list = []
    arg_list = []
    arg_path_left_list = []
    arg_path_rel_list = []
    arg_path_right_list = []
    arg_role_list = []

    with open(file_name, 'r') as f:
        for line in f:
            parts = line.strip().split(" :: ")
            # parts[7] is read below, so a well-formed line needs 8+ fields
            if len(parts) < 8:
                print("malformed line: " + str(line))
                continue
            trigger = parts[3]
            trigger_type = parts[2]
            arg = parts[6]
            arg_path = parts[7]
            arg_role = parts[5]

            trigger_list.append(trigger)
            trigger_type_list.append(trigger_type)
            arg_list.append(arg)
            arg_role_list.append(arg_role)

            path_left_list = []
            path_rel_list = []
            path_right_list = []

            paths = arg_path.strip().split('\t')

            for structure in paths:
                structure_array = structure.strip().split('##')
                left_word = structure_array[0]
                rel = structure_array[1]
                right_word = structure_array[2]

                path_left_list.append(left_word)
                path_rel_list.append(rel)
                path_right_list.append(right_word)

            arg_path_left_list.append(path_left_list)
            arg_path_rel_list.append(path_rel_list)
            arg_path_right_list.append(path_right_list)

    return trigger_list, trigger_type_list, arg_list, arg_path_left_list, arg_path_rel_list, arg_path_right_list, \
        arg_role_list


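# Expected argument line (hypothetical values; ' :: '-separated, with field 7
# holding tab-separated word##relation##word path units):
#
#   "doc_42 :: S3 :: attack :: bombed :: e1 :: ARG1 :: city :: city##ARG1##bombed"
#
# Fields 0, 1 and 4 are carried in the file but not read here.

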
def get_types_for_train(train_type_flag, type_label_file):
    type_2_index, index_2_type = read_type_index(type_label_file)
    types = []
    for i in range(len(type_2_index)):
        types.append(0)

    with open(train_type_flag, 'r') as f:
        for line in f:
            parts = line.strip().split("\t")
            type_name = parts[0].lower()
            index = type_2_index[type_name]
            index1 = int(index) - 1
            types[index1] = 1

    return types


def input_arg_matrix(trigger_list, trigger_type_list, arg_list, arg_path_left_list, arg_path_rel_list,
                     arg_path_right_list, arg_role_list, word_2_vec, role_list, trigger_role_2_index, vector_size,
                     context_size, relation_size, rel_2_index, roles_flag, trigger_role_matrix, label_file):
    num_samples = len(arg_list)

    type_2_index, index_2_type = read_type_index(label_file)

    print("processing " + str(num_samples) + " examples per epoch")

    # index2Path, index2Arg, type2Arg2Index = readArgPaths(argPathFile)
    arg_size = len(role_list)

    vector_size_1 = vector_size * 2

    pos_neg_matrix = numpy.empty(shape=(num_samples, 1))

    input_arg_context_matrix = numpy.empty(shape=(num_samples, vector_size_1 * context_size))

    input_arg_matrix = numpy.empty(shape=(num_samples, vector_size * 1))
    role_index_matrix = []
    role_vector_matrix = numpy.empty(shape=(num_samples, arg_size * 1))
    relation_binary_matrix = numpy.empty(shape=(num_samples, relation_size * context_size))

    limited_roles_train_matrix = numpy.empty(shape=(num_samples, arg_size))

    for sample in range(0, num_samples):
        left_word = arg_path_left_list[sample]
        relation = arg_path_rel_list[sample]
        right_word = arg_path_right_list[sample]

        arg = arg_list[sample]
        role = arg_role_list[sample]
        trigger_type = trigger_type_list[sample]

        trigger_type_lower = trigger_type.lower()

        trigger_index = int(type_2_index[trigger_type_lower]) - 1
        limited_roles = trigger_role_matrix[trigger_index, :]
        limited_roles_train_matrix[sample, :] = limited_roles

        # the trigger-specific key wins over the generic role key
        role_lower = role.lower()
        role_index = 0
        key1 = role_lower
        key2 = trigger_type_lower + "##" + role_lower
        if key1 in trigger_role_2_index:
            role_index = trigger_role_2_index[key1]
        if key2 in trigger_role_2_index:
            role_index = trigger_role_2_index[key2]
        role_index_tmp = int(role_index)

        role_index_matrix.append(role_index_tmp)

        # one-hot role vector for this sample
        matrix = numpy.zeros(shape=(arg_size, 1))
        role_index_1 = int(role_index) - 1
        matrix[role_index_1, 0] = 1

        pos_neg_matrix[sample, 0] = roles_flag[role_index_1]

        # each relation is annotated as a binary vector among all types
        while len(left_word) < context_size:
            left_word.append("PADDING")
            relation.append(":other")
            right_word.append("PADDING")

        matrix_word_pair = get_input_context_matrix(left_word, right_word, word_2_vec, vector_size, context_size)
        matrix_word_pair = numpy.reshape(matrix_word_pair, vector_size_1 * context_size)
        input_arg_context_matrix[sample, :] = matrix_word_pair

        matrix_arg = get_input_trigger_matrix(arg, word_2_vec, vector_size)
        matrix_arg = numpy.reshape(matrix_arg, vector_size * 1)
        input_arg_matrix[sample, :] = matrix_arg

        matrix_relation = get_binary_relation_matrix(relation, relation_size, context_size, rel_2_index)
        matrix_relation = numpy.reshape(matrix_relation, relation_size * context_size)
        relation_binary_matrix[sample, :] = matrix_relation

        matrix = numpy.reshape(matrix, arg_size)
        role_vector_matrix[sample, :] = matrix

    return role_index_matrix, role_vector_matrix, input_arg_context_matrix, input_arg_matrix, relation_binary_matrix, \
        pos_neg_matrix, limited_roles_train_matrix


def input_arg_matrix_test(trigger_list, trigger_type_list, arg_list, arg_path_left_list, arg_path_rel_list,
                          arg_path_right_list, arg_role_list, word_2_vec, role_list, trigger_role_2_index, vector_size,
                          context_size, relation_size, rel_2_index, roles_flag, trigger_role_matrix, label_file):
    num_samples = len(arg_list)
    type_2_index, index_2_type = read_type_index(label_file)
    print("processing " + str(num_samples) + " examples per epoch")

    arg_size = len(role_list)
    vector_size_1 = vector_size * 2
    pos_neg_matrix = numpy.empty(shape=(num_samples, 1))
    input_arg_context_matrix = numpy.empty(shape=(num_samples, vector_size_1 * context_size))

    input_arg_matrix = numpy.empty(shape=(num_samples, vector_size * 1))
    role_index_matrix = []
    role_vector_matrix = numpy.empty(shape=(num_samples, arg_size * 1))
    relation_binary_matrix = numpy.empty(shape=(num_samples, relation_size * context_size))
    limited_roles_train_matrix = numpy.empty(shape=(num_samples, arg_size))

    for sample in range(0, num_samples):
        left_word = arg_path_left_list[sample]
        relation = arg_path_rel_list[sample]
        right_word = arg_path_right_list[sample]

        arg = arg_list[sample]
        role = arg_role_list[sample]
        trigger_type = trigger_type_list[sample]

        trigger_type_lower = trigger_type.lower()

        trigger_index = int(type_2_index[trigger_type_lower]) - 1
        limited_roles = trigger_role_matrix[trigger_index, :]
        limited_roles_train_matrix[sample, :] = limited_roles

        # the trigger-specific key wins over the generic role key
        role_lower = role.lower()
        role_index = 0
        key1 = role_lower
        key2 = trigger_type_lower + "##" + role_lower
        if key1 in trigger_role_2_index:
            role_index = trigger_role_2_index[key1]
        if key2 in trigger_role_2_index:
            role_index = trigger_role_2_index[key2]
        role_index_tmp = int(role_index)
        role_index_matrix.append(role_index_tmp)

        # one-hot role vector for this sample
        matrix = numpy.zeros(shape=(arg_size, 1))
        role_index_1 = int(role_index) - 1
        matrix[role_index_1, 0] = 1

        pos_neg_matrix[sample, 0] = roles_flag[role_index_1]

        # each relation is annotated as a binary vector among all types
        while len(left_word) < context_size:
            left_word.append("PADDING")
            relation.append(":other")
            right_word.append("PADDING")

        matrix_word_pair = get_input_context_matrix(left_word, right_word, word_2_vec, vector_size, context_size)
        matrix_word_pair = numpy.reshape(matrix_word_pair, vector_size_1 * context_size)
        input_arg_context_matrix[sample, :] = matrix_word_pair

        matrix_arg = get_input_trigger_matrix(arg, word_2_vec, vector_size)
        matrix_arg = numpy.reshape(matrix_arg, vector_size * 1)
        input_arg_matrix[sample, :] = matrix_arg

        matrix_relation = get_binary_relation_matrix(relation, relation_size, context_size, rel_2_index)
        matrix_relation = numpy.reshape(matrix_relation, relation_size * context_size)
        relation_binary_matrix[sample, :] = matrix_relation

        matrix = numpy.reshape(matrix, arg_size)
        role_vector_matrix[sample, :] = matrix

    return role_index_matrix, role_vector_matrix, input_arg_context_matrix, input_arg_matrix, relation_binary_matrix, \
        pos_neg_matrix, limited_roles_train_matrix


def load_roles(arg_path_file_specific, arg_path_file_generic):
    all_arg_role_list = {}
    index_2_role = {}
    trigger_role_2_index = {}
    index_2_norm_role = {}
    trigger_norm_role_2_index = {}

    with open(arg_path_file_specific, 'r') as f1:
        for line in f1:
            parts = line.strip().split('\t')
            index = parts[0]
            ori_type = parts[1]
            norm_type = parts[2]
            ori_role = parts[3]
            norm_role = parts[4]

            all_arg_role_list[index] = norm_role
            index_2_role[index] = ori_role
            trigger_role = str(ori_type) + "##" + str(ori_role)
            trigger_role_2_index[trigger_role] = index

            index_2_norm_role[index] = norm_role
            trigger_norm_role = str(ori_type) + "##" + str(norm_role)
            trigger_norm_role_2_index[trigger_norm_role] = index

    with open(arg_path_file_generic, 'r') as f2:
        for line in f2:
            parts = line.strip().split('\t')
            index = parts[0]
            ori_role = parts[1]
            norm_role = parts[1]

            all_arg_role_list[index] = norm_role
            index_2_role[index] = ori_role
            trigger_role_2_index[ori_role] = index

            index_2_norm_role[index] = norm_role
            trigger_norm_role_2_index[norm_role] = index

    return all_arg_role_list, index_2_role, trigger_role_2_index, index_2_norm_role, trigger_norm_role_2_index


def load_roles_1(arg_path_file_merge):
    all_arg_role_list = {}
    all_trigger_role_structure = {}
    index_2_role = {}
    trigger_role_2_index = {}
    index_2_norm_role = {}
    trigger_norm_role_2_index = {}

    with open(arg_path_file_merge, 'r') as f1:
        for line in f1:
            parts = line.strip().split('\t')
            index = parts[0]
            ori_type = parts[1]
            norm_type = parts[2]
            ori_role = parts[3]
            norm_role = parts[4]

            all_arg_role_list[index] = norm_role
            index_2_role[index] = ori_role
            trigger_role = str(ori_type) + "##" + str(ori_role)
            trigger_role_2_index[trigger_role] = index

            index_2_norm_role[index] = norm_role
            trigger_norm_role = str(ori_type) + "##" + str(norm_role)
            trigger_norm_role_2_index[trigger_norm_role] = index

            structure = [norm_type + "\t" + norm_role]
            all_trigger_role_structure[index] = structure

    return all_arg_role_list, all_trigger_role_structure, index_2_role, trigger_role_2_index, index_2_norm_role, \
        trigger_norm_role_2_index


def get_trigger_arg_matrix(trigger_role_matrix_file, type_size, role_size):
    trigger_role_matrix = numpy.zeros(shape=(type_size, role_size))
    with open(trigger_role_matrix_file, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            trigger_index = int(parts[0]) - 1

            units = parts[1].strip().split(' ')
            for i in range(0, len(units)):
                arg_index = int(units[i]) - 1
                trigger_role_matrix[trigger_index, arg_index] = 1
    return trigger_role_matrix


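# Compatibility-file sketch (hypothetical line: 1-based trigger index, a tab,
# then space-separated 1-based role indices):
#
#   "14\t3 7 12"
#
# marks roles 3, 7 and 12 as admissible for trigger type 14, i.e. row 13 of
# trigger_role_matrix gets ones in columns 2, 6 and 11.

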
def role_matrix(all_arg_role_list, word_vector_file_type):
    word_vectors, vector_size = load_word_vec(word_vector_file_type)
    num_of_roles = len(all_arg_role_list)
    input_role_matrix = numpy.empty(shape=(num_of_roles, vector_size))
    for i in range(0, num_of_roles):
        role = all_arg_role_list[str(i + 1)]
        role_vec = get_word_vec(role, word_vectors, vector_size)
        for j in range(0, vector_size):
            input_role_matrix[i, j] = role_vec[j]

    return input_role_matrix


def role_matrix_1(all_arg_role_list, all_type_role_structure, word_vector_file_type, role_structure_context_size):
    word_vectors, vector_size = load_word_vec(word_vector_file_type)
    num_of_roles = len(all_arg_role_list)
    rep_size = vector_size * 2
    input_role_structure_matrix = numpy.empty(shape=(num_of_roles, rep_size * role_structure_context_size))
    input_role_matrix = numpy.empty(shape=(num_of_roles, vector_size))
    for i in range(0, num_of_roles):
        role = all_arg_role_list[str(i + 1)]
        role_vec = get_word_vec(role, word_vectors, vector_size)
        for j in range(0, vector_size):
            input_role_matrix[i, j] = role_vec[j]

        role_structures = all_type_role_structure[str(i + 1)]
        # pad the structure list to the fixed context width
        while len(role_structures) < role_structure_context_size:
            role_structures.append("PADDING" + "\t" + "PADDING")

        for j in range(0, role_structure_context_size):
            structure = role_structures[j]
            parts = structure.strip().split("\t")
            vec1 = get_word_vec(parts[0], word_vectors, vector_size)
            vec2 = get_word_vec(parts[1], word_vectors, vector_size)

            for m in range(0, vector_size):
                input_role_structure_matrix[i, j * rep_size + m] = vec1[m]
                input_role_structure_matrix[i, j * rep_size + vector_size + m] = vec2[m]
    return input_role_matrix, input_role_structure_matrix


def get_roles_for_train(train_role_flag, arg_path_file_specific, arg_path_file_generic):
    all_arg_role_list, index_2_role, trigger_role_2_index, index_2_norm_role, trigger_norm_role_2_index = \
        load_roles(arg_path_file_specific, arg_path_file_generic)
    types = []
    for i in range(0, len(index_2_role)):
        types.append(0)

    with open(train_role_flag, 'r') as f:
        for line in f:
            parts = line.strip().split("\t")
            type_name = parts[1].lower()
            for i in range(2, len(parts)):
                role = parts[i]
                trigger_role = type_name + "##" + role
                if role in trigger_role_2_index:
                    index = int(trigger_role_2_index[role]) - 1
                    types[index] = 1
                if trigger_role in trigger_role_2_index:
                    index = int(trigger_role_2_index[trigger_role]) - 1
                    types[index] = 1
    return types


def get_roles_for_train_1(train_role_flag, arg_path_file_merge):
    all_arg_role_list, all_trigger_role_structure, index_2_role, trigger_role_2_index, index_2_norm_role, \
        trigger_norm_role_2_index = load_roles_1(arg_path_file_merge)
    types = []
    for i in range(0, len(index_2_role)):
        types.append(0)

    with open(train_role_flag, 'r') as f:
        for line in f:
            parts = line.strip().split("\t")
            type_name = parts[1].lower()
            for i in range(2, len(parts)):
                role = parts[i]
                trigger_role = type_name + "##" + role
                if role in trigger_role_2_index:
                    index = int(trigger_role_2_index[role]) - 1
                    types[index] = 1
                if trigger_role in trigger_role_2_index:
                    index = int(trigger_role_2_index[trigger_role]) - 1
                    types[index] = 1
    return types