diff --git a/src/libnrl/graphsage/__init__.py b/src/libnrl/graphsage/__init__.py
index ab04edc..05044f8 100644
--- a/src/libnrl/graphsage/__init__.py
+++ b/src/libnrl/graphsage/__init__.py
@@ -13,27 +13,65 @@ log_device_placement = False
 # we follow the opt parameters given by papers GCN and graphSAGE
 # note: citeseer+pubmed all follow the same parameters as cora, see their papers)
 # tensorflow + Adam optimizer + Random weight init + row norm of attr
-epochs = 100
 dim_1 = 64 #dim = dim1+dim2 = 128 for sage-mean and sage-gcn
 dim_2 = 64
-learning_rate = 0.001
-dropout = 0.5
-weight_decay = 0.0001
-batch_size = 128 #if run out of memory, try to reduce them, but we use the default e.g. 64, default=512
 samples_1 = 25
 samples_2 = 10
+# key parameters during training
+epochs = 100
+learning_rate = 0.001 #searched over [0.01, 0.001, 0.0001, 0.00001]
+dropout = 0.5
+weight_decay = 5e-4
+batch_size = 512 #if you run out of memory, reduce this; default=512
 
-#other parameters that paper did not mentioned, but we also follow the defaults https://github.com/williamleif/GraphSAGE
-model_size = 'small'
-max_degree = 100
-neg_sample_size = 20
-random_context= True
-validate_batch_size = 64 #if run out of memory, try to reduce them, but we use the default e.g. 64, default=256
+# key parameters during validation
+validate_batch_size = 256 #if you run out of memory, reduce this; default=256
 validate_iter = 5000
 max_total_steps = 10**10
-n2v_test_epochs = 1
+print_every = 50
+
+#other parameters also follow the defaults of https://github.com/williamleif/GraphSAGE
+neg_sample_size = 20
 identity_dim = 0
+n2v_test_epochs = 1
+random_context = False
+model_size = 'small'
+max_degree = 100
 train_prefix = ''
 base_log_dir = ''
-#print_every = 50
\ No newline at end of file
+base_log_dir = ''
+
+
+
+'''
+#core params..
+flags.DEFINE_string('model', 'graphsage', 'model names. See README for possible values.')
+flags.DEFINE_float('learning_rate', 0.00001, 'initial learning rate.')
+flags.DEFINE_string("model_size", "small", "Can be big or small; model specific def'ns")
+flags.DEFINE_string('train_prefix', '', 'name of the object file that stores the training data. must be specified.')
+
+# left to default values in main experiments
+flags.DEFINE_integer('epochs', 1, 'number of epochs to train.')
+flags.DEFINE_float('dropout', 0.0, 'dropout rate (1 - keep probability).')
+flags.DEFINE_float('weight_decay', 0.0, 'weight for l2 loss on embedding matrix.')
+flags.DEFINE_integer('max_degree', 100, 'maximum node degree.')
+flags.DEFINE_integer('samples_1', 25, 'number of samples in layer 1')
+flags.DEFINE_integer('samples_2', 10, 'number of users samples in layer 2')
+flags.DEFINE_integer('dim_1', 128, 'Size of output dim (final is 2x this, if using concat)')
+flags.DEFINE_integer('dim_2', 128, 'Size of output dim (final is 2x this, if using concat)')
+flags.DEFINE_boolean('random_context', True, 'Whether to use random context or direct edges')
+flags.DEFINE_integer('neg_sample_size', 20, 'number of negative samples')
+flags.DEFINE_integer('batch_size', 512, 'minibatch size.')
+flags.DEFINE_integer('n2v_test_epochs', 1, 'Number of new SGD epochs for n2v.')
+flags.DEFINE_integer('identity_dim', 0, 'Set to positive value to use identity embedding features of that dimension. Default 0.')
+
+#logging, saving, validation settings etc.
+flags.DEFINE_boolean('save_embeddings', True, 'whether to save embeddings for all nodes after training')
+flags.DEFINE_string('base_log_dir', '.', 'base directory for logging and saving embeddings')
+flags.DEFINE_integer('validate_iter', 5000, "how often to run a validation minibatch.")
+flags.DEFINE_integer('validate_batch_size', 256, "how many nodes per validation sample.")
+flags.DEFINE_integer('gpu', 1, "which gpu to use.")
+flags.DEFINE_integer('print_every', 50, "How often to print training info.")
+flags.DEFINE_integer('max_total_steps', 10**10, "Maximum total number of iterations")
+'''
\ No newline at end of file
diff --git a/src/libnrl/graphsage/unsupervised_train.py b/src/libnrl/graphsage/unsupervised_train.py
index 831ec7a..24bc65f 100644
--- a/src/libnrl/graphsage/unsupervised_train.py
+++ b/src/libnrl/graphsage/unsupervised_train.py
@@ -1,5 +1,8 @@
-from __future__ import division
-from __future__ import print_function
+''' minor amendments by Chengbin, with comments,
+    so as to fit the OpenANE framework \n
+    the key difference: auto-save the best emb by looking at val loss \n
+    originally from https://github.com/williamleif/GraphSAGE \n
+'''
 
 import os
 import time
@@ -21,7 +24,6 @@ def evaluate(sess, model, minibatch_iter, size=None):
                             feed_dict=feed_dict_val)
     return outs_val[0], outs_val[1], outs_val[2], (time.time() - t_test)
 
-'''
 def incremental_evaluate(sess, model, minibatch_iter, size):
     t_test = time.time()
     finished = False
@@ -36,41 +38,31 @@ def incremental_evaluate(sess, model, minibatch_iter, size):
         val_losses.append(outs_val[0])
         val_mrrs.append(outs_val[2])
     return np.mean(val_losses), np.mean(val_mrrs), (time.time() - t_test)
-'''
 
-def save_val_embeddings(sess, model, minibatch_iter, size, mod=""):
+def save_val_embeddings(sess, model, minibatch_iter, size):
     val_embeddings = []
     finished = False
     seen = set([]) #this as set to store already seen emb-node id!
     nodes = []
     iter_num = 0
-    name = "val"
+    #name = "val"
     while not finished:
         feed_dict_val, finished, edges = minibatch_iter.incremental_embed_feed_dict(size, iter_num)
         iter_num += 1
-        outs_val = sess.run([model.loss, model.mrr, model.outputs1],
-                            feed_dict=feed_dict_val)
+        outs_val = sess.run([model.loss, model.mrr, model.outputs1], feed_dict=feed_dict_val)
         #ONLY SAVE FOR embeds1 because of planetoid
         for i, edge in enumerate(edges):
             if not edge[0] in seen:
                 val_embeddings.append(outs_val[-1][i,:])
                 nodes.append(edge[0]) #nodes: a list; has order
                 seen.add(edge[0]) #seen: a set; NO order!!!
-    #if not os.path.exists(out_dir):
-    #    os.makedirs(out_dir)
 
     val_embeddings = np.vstack(val_embeddings)
-    print(val_embeddings.shape)
+    #print(val_embeddings.shape)
     vectors = {}
     for i, embedding in enumerate(val_embeddings):
         vectors[nodes[i]] = embedding #warning: seen: a set; nodes: a list
-    return vectors
-
-    ''' #if we want to save embs, modify the following code
-    np.save(out_dir + name + mod + ".npy", val_embeddings)
-    with open(out_dir + name + mod + ".txt", "w") as fp:
-        fp.write("\n".join(map(str,nodes)))
-    '''
+    return vectors #return them; the graphsageAPI wrapper saves them
 
 
 def construct_placeholders():
     # Define placeholders
@@ -97,7 +89,6 @@ def train(train_data, test_data, model):
 
     # pad with dummy zero vector
     features = np.vstack([features, np.zeros((features.shape[1],))])
-    random_context = False
     context_pairs = train_data[3] if random_context else None
     placeholders = construct_placeholders()
     minibatch = EdgeMinibatchIterator(G,
@@ -201,7 +192,7 @@ def train(train_data, test_data, model):
     # Initialize session
     sess = tf.Session(config=config)
     merged = tf.summary.merge_all()
-    #summary_writer = tf.summary.FileWriter(log_dir(), sess.graph)
+    #summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) #we do not write TF summary log files
 
     # Init variables
     sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj})
@@ -210,13 +201,15 @@ def train(train_data, test_data, model):
     train_shadow_mrr = None
     shadow_mrr = None
-
     total_steps = 0
-    avg_time = 0.0
     epoch_val_costs = []
-
     train_adj_info = tf.assign(adj_info, minibatch.adj)
     val_adj_info = tf.assign(adj_info, minibatch.test_adj)
+
+    vectors = None #to store the best embs and return them at the end
+    best_result = None
+
+    t1 = time.time()
 
     for epoch in range(epochs):
         minibatch.shuffle()
@@ -228,7 +221,7 @@ def train(train_data, test_data, model):
         val_cost = 0
         val_mrr = 0
         shadow_mrr = 0
-        avg_time = 0
+
         while not minibatch.end():
             # Construct feed dictionary
             feed_dict = minibatch.next_minibatch_feed_dict()
@@ -258,35 +251,39 @@ def train(train_data, test_data, model):
 
             #if total_steps % print_every == 0:
             #summary_writer.add_summary(outs[0], total_steps)
-
-            # Print results
-            avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1)
 
             iter += 1
             total_steps += 1
-
             if total_steps > max_total_steps:
                 break
-            epoch += 1
 
+        epoch += 1
+        t2 = time.time()
+        #only print the last iteration's result at the end of each epoch
         print("Epoch:", '%04d' % epoch,
               "train_loss=", "{:.5f}".format(train_cost),
-              "train_mrr=", "{:.5f}".format(train_mrr),
-              "train_mrr_ema=", "{:.5f}".format(train_shadow_mrr), # exponential moving average
+              #"train_mrr=", "{:.5f}".format(train_mrr),
+              #"train_mrr_ema=", "{:.5f}".format(train_shadow_mrr),
               "val_loss=", "{:.5f}".format(val_cost),
-              "val_mrr=", "{:.5f}".format(val_mrr),
-              "val_mrr_ema=", "{:.5f}".format(shadow_mrr), # exponential moving average
-              "time=", "{:.5f}".format(avg_time))
+              #"val_mrr=", "{:.5f}".format(val_mrr),
+              #"val_mrr_ema=", "{:.5f}".format(shadow_mrr),
+              "time cost", "{:.2f}".format(t2-t1))
 
-        if total_steps > max_total_steps:
-            break
-
-    print("Optimization Finished!")
-
-    sess.run(val_adj_info.op)
-    #save_val_embeddings(sess, model, minibatch, validate_batch_size, log_dir())
-    return save_val_embeddings(sess, model, minibatch, validate_batch_size) #return embs
-
-
-def graphsage_save_embeddings(self, filename): #to do...
-    pass
\ No newline at end of file
+        #no early stopping was used in the original code ---------------- auto-save-best-emb ------------------------------
+        #instead, we choose the best result by looking for the smallest val loss
+        if epoch == 1:
+            best_result = val_cost
+            sess.run(val_adj_info.op) #unclear what this does before getting embs, but skipping it gives worse results...
+            vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size)
+        else:
+            if best_result > val_cost: #val loss decreased
+                best_result = val_cost
+                sess.run(val_adj_info.op) #unclear what this does before getting embs, but skipping it gives worse results...
+                vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size)
+            else:
+                print('val loss increased @ epoch', epoch, 'w.r.t. the last best epoch; keeping the previous best emb...')
+
+    #sess.run(val_adj_info.op) #unclear what this does before getting embs; ignore it?????
+    #vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size)
+    print("Finished!")
+    return vectors
\ No newline at end of file
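
Note on consuming the returned embeddings: save_val_embeddings now hands back a plain {node_id: 1-D numpy array} dict instead of writing .npy/.txt files itself, and train() returns the dict from the best-val-loss epoch, so persisting it is left to the caller (the graphsageAPI wrapper in OpenANE). Below is a minimal sketch of how such a dict could be dumped in word2vec-style text format; the save_emb helper and the output file name are illustrative assumptions, not part of this patch.

import numpy as np

def save_emb(vectors, path):
    #vectors: {node_id: 1-D numpy array}, as returned by train()/save_val_embeddings (hypothetical helper, not in this patch)
    #writes word2vec-style text: header "num_nodes dim", then one "node_id v1 v2 ..." line per node
    dim = len(next(iter(vectors.values())))
    with open(path, 'w') as f:
        f.write("{} {}\n".format(len(vectors), dim))
        for node_id, emb in vectors.items():
            f.write("{} {}\n".format(node_id, ' '.join(map(str, np.asarray(emb)))))

#hypothetical usage after training:
#vectors = train(train_data, test_data, model)
#save_emb(vectors, 'graphsage_cora.emb')

The only contract assumed here is the {node_id: vector} shape of the returned dict; any equivalent writer inside graphsageAPI works just as well.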