graphsage early stopping test

Chengbin Hou 2018-11-22 19:26:38 +00:00
parent fa2fd1f21f
commit 7e1039cc3c
2 changed files with 94 additions and 59 deletions

View File

@@ -13,27 +13,65 @@ log_device_placement = False
 # we follow the opt parameters given by papers GCN and graphSAGE
 # note: citeseer+pubmed all follow the same parameters as cora, see their papers)
 # tensorflow + Adam optimizer + Random weight init + row norm of attr
-epochs = 100
 dim_1 = 64 #dim = dim1+dim2 = 128 for sage-mean and sage-gcn
 dim_2 = 64
-learning_rate = 0.001
-dropout = 0.5
-weight_decay = 0.0001
-batch_size = 128 #if run out of memory, try to reduce them, but we use the default e.g. 64, default=512
 samples_1 = 25
 samples_2 = 10
+# key parameters during training
+epochs = 100
+learning_rate = 0.001 #search [0.01, 0.001, 0.0001, 0.00001]
+dropout = 0.5
+weight_decay = 5e-4
+batch_size = 512 #if run out of memory, try to reduce them, default=512
-#other parameters that paper did not mentioned, but we also follow the defaults https://github.com/williamleif/GraphSAGE
-model_size = 'small'
-max_degree = 100
-neg_sample_size = 20
-random_context= True
-validate_batch_size = 64 #if run out of memory, try to reduce them, but we use the default e.g. 64, default=256
+# key parameters during val
+validate_batch_size = 256 #if run out of memory, try to reduce them, default=256
 validate_iter = 5000
 max_total_steps = 10**10
-n2v_test_epochs = 1
+print_every = 50
+#other parameters also follow the defaults https://github.com/williamleif/GraphSAGE
+neg_sample_size = 20
 identity_dim = 0
+n2v_test_epochs = 1
+random_context= False
+model_size = 'small'
+max_degree = 100
 train_prefix = ''
 base_log_dir = ''
-#print_every = 50
+'''
+#core params..
+flags.DEFINE_string('model', 'graphsage', 'model names. See README for possible values.')
+flags.DEFINE_float('learning_rate', 0.00001, 'initial learning rate.')
+flags.DEFINE_string("model_size", "small", "Can be big or small; model specific def'ns")
+flags.DEFINE_string('train_prefix', '', 'name of the object file that stores the training data. must be specified.')
+# left to default values in main experiments
+flags.DEFINE_integer('epochs', 1, 'number of epochs to train.')
+flags.DEFINE_float('dropout', 0.0, 'dropout rate (1 - keep probability).')
+flags.DEFINE_float('weight_decay', 0.0, 'weight for l2 loss on embedding matrix.')
+flags.DEFINE_integer('max_degree', 100, 'maximum node degree.')
+flags.DEFINE_integer('samples_1', 25, 'number of samples in layer 1')
+flags.DEFINE_integer('samples_2', 10, 'number of users samples in layer 2')
+flags.DEFINE_integer('dim_1', 128, 'Size of output dim (final is 2x this, if using concat)')
+flags.DEFINE_integer('dim_2', 128, 'Size of output dim (final is 2x this, if using concat)')
+flags.DEFINE_boolean('random_context', True, 'Whether to use random context or direct edges')
+flags.DEFINE_integer('neg_sample_size', 20, 'number of negative samples')
+flags.DEFINE_integer('batch_size', 512, 'minibatch size.')
+flags.DEFINE_integer('n2v_test_epochs', 1, 'Number of new SGD epochs for n2v.')
+flags.DEFINE_integer('identity_dim', 0, 'Set to positive value to use identity embedding features of that dimension. Default 0.')
+#logging, saving, validation settings etc.
+flags.DEFINE_boolean('save_embeddings', True, 'whether to save embeddings for all nodes after training')
+flags.DEFINE_string('base_log_dir', '.', 'base directory for logging and saving embeddings')
+flags.DEFINE_integer('validate_iter', 5000, "how often to run a validation minibatch.")
+flags.DEFINE_integer('validate_batch_size', 256, "how many nodes per validation sample.")
+flags.DEFINE_integer('gpu', 1, "which gpu to use.")
+flags.DEFINE_integer('print_every', 50, "How often to print training info.")
+flags.DEFINE_integer('max_total_steps', 10**10, "Maximum total number of iterations")
+'''
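Note on the hunk above: the original GraphSAGE `flags.DEFINE_*` definitions are kept only as a commented-out reference, and the live hyperparameters become plain module-level variables. As a rough, hypothetical sketch of how a caller could gather such module-level settings in place of `FLAGS.*` lookups (the module name `graphsage_conf` and helper `load_hyperparams` are illustrative, not OpenANE's actual API):

import importlib

def load_hyperparams(module_name="graphsage_conf"):
    """Collect plain module-level settings into a dict, replacing FLAGS.* lookups."""
    conf = importlib.import_module(module_name)
    wanted = ["epochs", "learning_rate", "dropout", "weight_decay", "batch_size",
              "validate_batch_size", "samples_1", "samples_2", "dim_1", "dim_2"]
    # names the config module does not define fall back to None
    return {name: getattr(conf, name, None) for name in wanted}

# usage sketch: params = load_hyperparams(); lr = params["learning_rate"]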

View File

@@ -1,5 +1,8 @@
-from __future__ import division
-from __future__ import print_function
+''' minor amended by Chengbin with comments
+so as to fit OpenANE framework \n
+the key difference: auto save the best emb by looking at val loss \n
+originally from https://github.com/williamleif/GraphSAGE \n
+'''
 
 import os
 import time
@@ -21,7 +24,6 @@ def evaluate(sess, model, minibatch_iter, size=None):
                         feed_dict=feed_dict_val)
     return outs_val[0], outs_val[1], outs_val[2], (time.time() - t_test)
-'''
 def incremental_evaluate(sess, model, minibatch_iter, size):
     t_test = time.time()
     finished = False
@@ -36,41 +38,31 @@ def incremental_evaluate(sess, model, minibatch_iter, size):
         val_losses.append(outs_val[0])
         val_mrrs.append(outs_val[2])
     return np.mean(val_losses), np.mean(val_mrrs), (time.time() - t_test)
-'''
-def save_val_embeddings(sess, model, minibatch_iter, size, mod=""):
+def save_val_embeddings(sess, model, minibatch_iter, size):
     val_embeddings = []
     finished = False
     seen = set([]) #this as set to store already seen emb-node id!
     nodes = []
     iter_num = 0
-    name = "val"
+    #name = "val"
     while not finished:
         feed_dict_val, finished, edges = minibatch_iter.incremental_embed_feed_dict(size, iter_num)
         iter_num += 1
-        outs_val = sess.run([model.loss, model.mrr, model.outputs1],
-                            feed_dict=feed_dict_val)
+        outs_val = sess.run([model.loss, model.mrr, model.outputs1], feed_dict=feed_dict_val)
         #ONLY SAVE FOR embeds1 because of planetoid
         for i, edge in enumerate(edges):
             if not edge[0] in seen:
                 val_embeddings.append(outs_val[-1][i,:])
                 nodes.append(edge[0]) #nodes: a list; has order
                 seen.add(edge[0]) #seen: a set; NO order!!!
-    #if not os.path.exists(out_dir):
-    #    os.makedirs(out_dir)
     val_embeddings = np.vstack(val_embeddings)
-    print(val_embeddings.shape)
+    #print(val_embeddings.shape)
     vectors = {}
     for i, embedding in enumerate(val_embeddings):
         vectors[nodes[i]] = embedding #warning: seen: a set; nodes: a list
-    return vectors
-    ''' #if we want to save embs, modify the following code
-    np.save(out_dir + name + mod + ".npy", val_embeddings)
-    with open(out_dir + name + mod + ".txt", "w") as fp:
-        fp.write("\n".join(map(str,nodes)))
-    '''
+    return vectors #return them and use graphsageAPI to save them
 
 def construct_placeholders():
     # Define placeholders
@@ -97,7 +89,6 @@ def train(train_data, test_data, model):
         # pad with dummy zero vector
         features = np.vstack([features, np.zeros((features.shape[1],))])
-    random_context = False
     context_pairs = train_data[3] if random_context else None
     placeholders = construct_placeholders()
     minibatch = EdgeMinibatchIterator(G,
@@ -201,7 +192,7 @@
     # Initialize session
     sess = tf.Session(config=config)
     merged = tf.summary.merge_all()
-    #summary_writer = tf.summary.FileWriter(log_dir(), sess.graph)
+    #summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) #we ignore log file
 
     # Init variables
     sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj})
@@ -210,13 +201,15 @@
     train_shadow_mrr = None
     shadow_mrr = None
 
     total_steps = 0
-    avg_time = 0.0
     epoch_val_costs = []
 
     train_adj_info = tf.assign(adj_info, minibatch.adj)
     val_adj_info = tf.assign(adj_info, minibatch.test_adj)
+    vectors = None #to store best embs and return at the end
+    best_result = None
+    t1 = time.time()
 
     for epoch in range(epochs):
         minibatch.shuffle()
@@ -228,7 +221,7 @@
         val_cost = 0
         val_mrr = 0
         shadow_mrr = 0
-        avg_time = 0
 
         while not minibatch.end():
             # Construct feed dictionary
             feed_dict = minibatch.next_minibatch_feed_dict()
@@ -258,35 +251,39 @@
             #if total_steps % print_every == 0:
                 #summary_writer.add_summary(outs[0], total_steps)
 
-            # Print results
-            avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1)
-
             iter += 1
             total_steps += 1
             if total_steps > max_total_steps:
                 break
 
+        epoch += 1
+        t2 = time.time()
+        #only print the last iter result at the end of each epoch
         print("Epoch:", '%04d' % epoch,
               "train_loss=", "{:.5f}".format(train_cost),
-              "train_mrr=", "{:.5f}".format(train_mrr),
-              "train_mrr_ema=", "{:.5f}".format(train_shadow_mrr), # exponential moving average
+              #"train_mrr=", "{:.5f}".format(train_mrr),
+              #"train_mrr_ema=", "{:.5f}".format(train_shadow_mrr),
               "val_loss=", "{:.5f}".format(val_cost),
-              "val_mrr=", "{:.5f}".format(val_mrr),
-              "val_mrr_ema=", "{:.5f}".format(shadow_mrr), # exponential moving average
-              "time=", "{:.5f}".format(avg_time))
+              #"val_mrr=", "{:.5f}".format(val_mrr),
+              #"val_mrr_ema=", "{:.5f}".format(shadow_mrr),
+              "time cost", "{:.2f}".format(t2-t1))
 
-        if total_steps > max_total_steps:
-            break
-
-    print("Optimization Finished!")
-    sess.run(val_adj_info.op)
-    #save_val_embeddings(sess, model, minibatch, validate_batch_size, log_dir())
-    return save_val_embeddings(sess, model, minibatch, validate_batch_size) #return embs
-
-def graphsage_save_embeddings(self, filename): #to do...
-    pass
+        #no early stopping was used in original code---------------- auto-save-best-emb ------------------------------
+        #instead, we will chose the best result by looking at smallest val loss
+        if epoch == 1:
+            best_result = val_cost
+            sess.run(val_adj_info.op) #what is this for before get emb ? if ignore it, get worse result...
+            vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size)
+        else:
+            if best_result > val_cost: #if val loss decreasing
+                best_result = val_cost
+                sess.run(val_adj_info.op) #what is this for before get emb ? if ignore it, get worse result...
+                vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size)
+            else:
+                print('val loss increasing @ ', epoch, ' w.r.t. last best epoch, and do not cover previous emb...')
+                #sess.run(val_adj_info.op) #what is this for before get emb ? ignore it?????
+                #vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size)
 
+    print("Finished!")
+    return vectors
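To summarize the training-loop change in isolation: instead of saving embeddings once at the end, the script now keeps the embeddings from whichever epoch reached the lowest validation loss so far. A minimal, self-contained sketch of that pattern, assuming hypothetical stand-in callables (`run_epoch`, `get_val_loss`, `get_embeddings`) in place of the real `sess.run(...)` and `save_val_embeddings(...)` calls:

def keep_best_embeddings(run_epoch, get_val_loss, get_embeddings, epochs):
    """After each epoch, snapshot embeddings only when val loss hits a new minimum."""
    best_loss, best_vectors = float("inf"), None
    for epoch in range(1, epochs + 1):
        run_epoch()                # one pass over the training minibatches
        val_loss = get_val_loss()  # validation loss for this epoch
        if val_loss < best_loss:   # val loss decreasing: refresh the snapshot
            best_loss, best_vectors = val_loss, get_embeddings()
        else:                      # val loss increasing: keep the previous best
            print("val loss increasing @", epoch, "- keep previous best emb")
    return best_vectors

# toy usage with canned losses instead of a real TensorFlow session
losses = iter([0.9, 0.7, 0.8, 0.6, 0.65])
best = keep_best_embeddings(run_epoch=lambda: None,
                            get_val_loss=lambda: next(losses),
                            get_embeddings=lambda: {"node_0": [0.1, 0.2]},
                            epochs=5)

The design choice mirrors the commit: no training is ever stopped early; only the returned embedding snapshot is pinned to the best-validation epoch.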