AttentivePathRanking/demo_fb15k237.py

import os
# install pyximport so the project's Cython (.pyx) modules are compiled when imported
import pyximport
pyximport.install()
from main.data.MIDFreebase15kReader import MIDFreebase15kReader
from main.data.TypedRelationInstances import TypedRelationInstances
from main.data.Vocabs import Vocabs
from main.graphs.AdjacencyGraph import AdjacencyGraph
from main.features.PathExtractor import PathExtractor
from main.data.Split import Split
from main.features.PathReader import PathReader
from main.experiments.CVSMDriver import CVSMDriver
from main.experiments.PRADriver import PRADriver
from main.playground.make_data_format import process_paths
from main.playground.model2.CompositionalVectorAlgorithm import CompositionalVectorAlgorithm
# This script runs the FB15k-237 experiments (our method only) with a 1:10 positive-to-negative ratio.
if __name__ == "__main__":
    # location of Das et al.'s repo
    CVSM_RUN_DIR = "/home/weiyu/Research/Path_Baselines/CVSM/ChainsofReasoning"
    # location of Matt's PRA scala repo
    PRA_RUN_DIR = "/home/weiyu/Research/Path_Baselines/SFE/pra_scala"
    DATASET_NAME = "fb15k237"
    DATASET_FOLDER = os.path.join("data", DATASET_NAME)
    PRA_TEMPLATE_DIR = "pra_templates"
    FREEBASE_DIR = DATASET_FOLDER
    DOMAIN_FILENAME = os.path.join(DATASET_FOLDER, "domains.tsv")
    RANGE_FILENAME = os.path.join(DATASET_FOLDER, "ranges.tsv")
    EDGES_FILENAME = os.path.join(DATASET_FOLDER, "edges.txt")
    PRA_DIR = os.path.join(DATASET_FOLDER, "pra")
    SPLIT_DIR = os.path.join(DATASET_FOLDER, "split")
    CPR_PATH_DIR = os.path.join(DATASET_FOLDER, "cpr_paths")
    WORD2VEC_FILENAME = "data/word2vec/knowledge-vectors-skipgram1000.bin"
    ENTITY2VEC_FILENAME = os.path.join(DATASET_FOLDER, "synonym2vec.pkl")
    # TODO: modify code to have cvsm_ret, cvsm_r, cvsm_rt, cvsm_re folders (r: relation, t: type, e: entity)
    CVSM_DATA_DIR = os.path.join(DATASET_FOLDER, "cvsm")
    PRA_PATH_DIR = os.path.join(DATASET_FOLDER, "pra_paths")
    RELATION_PATH_DIR = os.path.join(DATASET_FOLDER, "relation_paths")
    PATH_DIR = os.path.join(DATASET_FOLDER, "paths")
    NEW_PATH_DIR = os.path.join(DATASET_FOLDER, "new_paths")
    AUGMENT_PATH_DIR = os.path.join(DATASET_FOLDER, "paths_augment")
    CVSM_RET_DIR = os.path.join(DATASET_FOLDER, "cvsm_entity")
    ENTITY_TYPE2VEC_FILENAME = os.path.join(DATASET_FOLDER, "entity_type2vec.pkl")
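    # Set run_step to the stage to execute. The stages are intended to be run one at a time and in
    # order, since each stage reads files written by the earlier ones.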
    run_step = 1
    # 1. First run main/data/MIDFreebase15kReader.py to process the data and generate the necessary files.
    if run_step == 1:
        fb = MIDFreebase15kReader(FREEBASE_DIR,
                                  filter=True,
                                  word2vec_filename=WORD2VEC_FILENAME)
        fb.read_data()
        fb.write_relation_domain_and_ranges()
        fb.write_edges()
        # Get the dictionary from Freebase MIDs to names.
        # fb.get_mid_to_name()
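        # Step 1 should leave domains.tsv, ranges.tsv, and edges.txt in DATASET_FOLDER (the files pointed
        # to by DOMAIN_FILENAME, RANGE_FILENAME, and EDGES_FILENAME), which steps 3 and 4 read back.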
    # 2. Use the PRA scala code together with the code here to create the train/test/dev split and negative examples.
    if run_step == 2:
        pra_driver = PRADriver(DATASET_FOLDER, PRA_TEMPLATE_DIR, PRA_RUN_DIR, DATASET_NAME)
        pra_driver.prepare_split()
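        # prepare_split() presumably populates SPLIT_DIR with the train/dev/test files (including the
        # negative examples) that the later steps load; it relies on the PRA scala repo at PRA_RUN_DIR.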
    # 3. Extract paths with entities.
    if run_step == 3:
        typed_relation_instances = TypedRelationInstances()
        typed_relation_instances.read_domains_and_ranges(DOMAIN_FILENAME, RANGE_FILENAME)
        typed_relation_instances.construct_from_labeled_edges(EDGES_FILENAME, entity_name_is_typed=False,
                                                              is_labeled=False)
        vocabs = Vocabs()
        vocabs.build_vocabs(typed_relation_instances)
        split = Split()
        split.read_splits(SPLIT_DIR, vocabs, entity_name_is_typed=True)
        graph = AdjacencyGraph()
        graph.build_graph(typed_relation_instances, vocabs)
        path_extractor = PathExtractor(max_length=4, include_entity=True, save_dir=PATH_DIR, include_path_len1=True,
                                       max_paths_per_pair=200, multiple_instances_per_pair=False,
                                       max_instances_per_pair=None, paths_sample_method="all_lengths")
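        # Based on the parameter names: max_length=4 caps the number of relations per path,
        # include_entity=True keeps the intermediate entities on each path, and max_paths_per_pair=200
        # limits how many paths are kept for each entity pair.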
        path_extractor.extract_paths(graph, split, vocabs)
        path_extractor.write_paths(split)
        # Note: paths up to length 6 can be extracted if nodes with a large fan-out are eliminated first.
    # 4. Extract path statistics.
    if run_step == 4:
        typed_relation_instances = TypedRelationInstances()
        typed_relation_instances.read_domains_and_ranges(DOMAIN_FILENAME, RANGE_FILENAME)
        typed_relation_instances.construct_from_labeled_edges(EDGES_FILENAME, entity_name_is_typed=False,
                                                              is_labeled=False)
        vocabs = Vocabs()
        vocabs.build_vocabs(typed_relation_instances)
        split = Split()
        split.read_splits(SPLIT_DIR, vocabs, entity_name_is_typed=True)
        path_reader = PathReader(save_dir=PATH_DIR)
        path_reader.read_paths(split)
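        # This step rebuilds the same vocab and split as step 3 and then reloads the paths written to
        # PATH_DIR, presumably to compute statistics over the extracted paths.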
    # 5. Process the data for running the model.
    if run_step == 5:
        # First convert the extracted paths to the CVSM format.
        cvsm_driver = CVSMDriver(FREEBASE_DIR, CVSM_RUN_DIR, dataset="freebase", include_entity=True, has_entity=True,
                                 augment_data=False, include_entity_type=True)
        cvsm_driver.setup_cvsm_dir()
        # Then vectorize the CVSM-format data.
        process_paths(input_dir=os.path.join(CVSM_RET_DIR, "data/data_input"),
                      output_dir=os.path.join(CVSM_RET_DIR, "data/data_output"),
                      vocab_dir=os.path.join(CVSM_RET_DIR, "data/vocab"),
                      isOnlyRelation=False,
                      getOnlyRelation=False,
                      MAX_POSSIBLE_LENGTH_PATH=8,  # the maximum number of relations in a path + 1
                      NUM_ENTITY_TYPES_SLOTS=7,  # the number of entity types + 1 (the extra slot is a dummy type shared by all entities)
                      pre_padding=True)
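        # The vectorized data should end up under CVSM_RET_DIR/data/data_output, which steps 6-9 train on.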
    # 6. Test run of the model.
    # Use `tensorboard --logdir runs` to watch the training progress.
    if run_step == 6:
        cvsm = CompositionalVectorAlgorithm("freebase", CVSM_RET_DIR, None, attention_method="sat", early_stopping_metric="map")
        # Uncomment to train and test on all relations:
        # cvsm.train_and_test()
        # Train on only one relation:
        cvsm.train(os.path.join(CVSM_RET_DIR, "data/data_output/|food|food|nutrients.|food|nutrition_fact|nutrient"))
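        # The argument to cvsm.train() is the data_output folder of a single relation; presumably any
        # other relation directory produced in step 5 can be substituted here.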
    # 7. Test different path pooling methods.
    if run_step == 7:
        cvsm = CompositionalVectorAlgorithm("freebase", CVSM_RET_DIR, None,
                                            pooling_method="lse", attention_method="sat",
                                            early_stopping_metric="map")
        cvsm.train_and_test()
        cvsm = CompositionalVectorAlgorithm("freebase", CVSM_RET_DIR, None,
                                            pooling_method="avg", attention_method="sat",
                                            early_stopping_metric="map")
        cvsm.train_and_test()
        cvsm = CompositionalVectorAlgorithm("freebase", CVSM_RET_DIR, None,
                                            pooling_method="max", attention_method="sat",
                                            early_stopping_metric="map")
        cvsm.train_and_test()
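        # The three runs above compare path pooling with "lse" (presumably LogSumExp), "avg", and "max",
        # all with the same "sat" attention method and early stopping on the "map" metric.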
    # 8. Visualize type attention.
    if run_step == 8:
        cvsm = CompositionalVectorAlgorithm("freebase", CVSM_RET_DIR, None,
                                            pooling_method="sat", attention_method="sat", early_stopping_metric="map",
                                            visualize=False, calculate_path_attn_stats=False, calculate_type_attn_stats=True,
                                            mid2name_filename="/home/weiyu/Research/ChainsOfReasoningWithAbstractEntities/data/fb15k237/mid2name.pkl",
                                            best_models={'|people|person|profession': {'epoch': 10},
                                                         '|award|award_winning_work|awards_won.|award|award_honor|award': {'epoch': 16},
                                                         '|tv|tv_program|regular_cast.|tv|regular_tv_appearance|actor': {'epoch': 23},
                                                         '|music|record_label|artist': {'epoch': 27},
                                                         '|award|award_category|nominees.|award|award_nomination|nominated_for': {'epoch': 15},
                                                         '|film|film|production_companies': {'epoch': 24}, '|film|film|genre': {'epoch': 13},
                                                         '|sports|sports_position|players.|sports|sports_team_roster|team': {'epoch': 25},
                                                         '|food|food|nutrients.|food|nutrition_fact|nutrient': {'epoch': 17},
                                                         '|education|educational_institution|students_graduates.|education|education|major_field_of_study': {'epoch': 21}})
        cvsm.train_and_test()
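        # With calculate_type_attn_stats=True and visualize=False, this run collects type-attention
        # statistics from the checkpoints listed in best_models, which presumably maps each relation to
        # the epoch of its best saved model.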
    # 9. Run the ablation on type selection.
    if run_step == 9:
        cvsm = CompositionalVectorAlgorithm("freebase", CVSM_RET_DIR, None,
                                            pooling_method="lse", attention_method="specific",
                                            early_stopping_metric="map")
        cvsm.train_and_test()
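        # The ablation swaps attention_method from "sat" to "specific"; the rest of the configuration
        # matches the "lse" run in step 7.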