graphsage early stopping test
This commit is contained in:
parent fa2fd1f21f
commit 7e1039cc3c
@@ -13,27 +13,65 @@ log_device_placement = False
# we follow the opt parameters given by papers GCN and graphSAGE
# note: citeseer+pubmed all follow the same parameters as cora, see their papers
# tensorflow + Adam optimizer + Random weight init + row norm of attr
epochs = 100
dim_1 = 64 #dim = dim1+dim2 = 128 for sage-mean and sage-gcn
dim_2 = 64
learning_rate = 0.001
dropout = 0.5
weight_decay = 0.0001
batch_size = 128 #if we run out of memory, try to reduce it, but we use the default e.g. 64, default=512
samples_1 = 25
samples_2 = 10

# key parameters during training
epochs = 100
learning_rate = 0.001 #search [0.01, 0.001, 0.0001, 0.00001]
dropout = 0.5
weight_decay = 5e-4
batch_size = 512 #if we run out of memory, try to reduce it, default=512

#other parameters that the papers did not mention, but we also follow the defaults https://github.com/williamleif/GraphSAGE
model_size = 'small'
max_degree = 100
neg_sample_size = 20
random_context = True
validate_batch_size = 64 #if we run out of memory, try to reduce it, but we use the default e.g. 64, default=256
# key parameters during val
validate_batch_size = 256 #if we run out of memory, try to reduce it, default=256
validate_iter = 5000
max_total_steps = 10**10
n2v_test_epochs = 1
print_every = 50

#other parameters also follow the defaults https://github.com/williamleif/GraphSAGE
neg_sample_size = 20
identity_dim = 0
n2v_test_epochs = 1
random_context = False
model_size = 'small'
max_degree = 100
train_prefix = ''
base_log_dir = ''
#print_every = 50
base_log_dir = ''


'''
#core params..
flags.DEFINE_string('model', 'graphsage', 'model names. See README for possible values.')
flags.DEFINE_float('learning_rate', 0.00001, 'initial learning rate.')
flags.DEFINE_string("model_size", "small", "Can be big or small; model specific def'ns")
flags.DEFINE_string('train_prefix', '', 'name of the object file that stores the training data. must be specified.')

# left to default values in main experiments
flags.DEFINE_integer('epochs', 1, 'number of epochs to train.')
flags.DEFINE_float('dropout', 0.0, 'dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 0.0, 'weight for l2 loss on embedding matrix.')
flags.DEFINE_integer('max_degree', 100, 'maximum node degree.')
flags.DEFINE_integer('samples_1', 25, 'number of samples in layer 1')
flags.DEFINE_integer('samples_2', 10, 'number of users samples in layer 2')
flags.DEFINE_integer('dim_1', 128, 'Size of output dim (final is 2x this, if using concat)')
flags.DEFINE_integer('dim_2', 128, 'Size of output dim (final is 2x this, if using concat)')
flags.DEFINE_boolean('random_context', True, 'Whether to use random context or direct edges')
flags.DEFINE_integer('neg_sample_size', 20, 'number of negative samples')
flags.DEFINE_integer('batch_size', 512, 'minibatch size.')
flags.DEFINE_integer('n2v_test_epochs', 1, 'Number of new SGD epochs for n2v.')
flags.DEFINE_integer('identity_dim', 0, 'Set to positive value to use identity embedding features of that dimension. Default 0.')

#logging, saving, validation settings etc.
flags.DEFINE_boolean('save_embeddings', True, 'whether to save embeddings for all nodes after training')
flags.DEFINE_string('base_log_dir', '.', 'base directory for logging and saving embeddings')
flags.DEFINE_integer('validate_iter', 5000, "how often to run a validation minibatch.")
flags.DEFINE_integer('validate_batch_size', 256, "how many nodes per validation sample.")
flags.DEFINE_integer('gpu', 1, "which gpu to use.")
flags.DEFINE_integer('print_every', 50, "How often to print training info.")
flags.DEFINE_integer('max_total_steps', 10**10, "Maximum total number of iterations")
'''
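A minimal sketch of the "row norm of attr" preprocessing mentioned at the top of this config, assuming a dense numpy feature matrix; the function name is illustrative and not part of the original config or pipeline:

import numpy as np

def row_normalize(features):
    # divide each row of the attribute matrix by its L1 norm; all-zero rows are left unchanged
    row_sums = features.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    return features / row_sums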
@@ -1,5 +1,8 @@
from __future__ import division
from __future__ import print_function
''' minor amendments by Chengbin, with comments,
so as to fit the OpenANE framework \n
the key difference: automatically save the best emb by looking at val loss \n
originally from https://github.com/williamleif/GraphSAGE \n
'''

import os
import time
@@ -21,7 +24,6 @@ def evaluate(sess, model, minibatch_iter, size=None):
feed_dict=feed_dict_val)
return outs_val[0], outs_val[1], outs_val[2], (time.time() - t_test)

'''
def incremental_evaluate(sess, model, minibatch_iter, size):
t_test = time.time()
finished = False
@@ -36,41 +38,31 @@ def incremental_evaluate(sess, model, minibatch_iter, size):
val_losses.append(outs_val[0])
val_mrrs.append(outs_val[2])
return np.mean(val_losses), np.mean(val_mrrs), (time.time() - t_test)
'''

def save_val_embeddings(sess, model, minibatch_iter, size, mod=""):
def save_val_embeddings(sess, model, minibatch_iter, size):
val_embeddings = []
finished = False
seen = set([]) #a set to store already-seen emb node ids
nodes = []
iter_num = 0
name = "val"
#name = "val"
while not finished:
feed_dict_val, finished, edges = minibatch_iter.incremental_embed_feed_dict(size, iter_num)
iter_num += 1
outs_val = sess.run([model.loss, model.mrr, model.outputs1],
feed_dict=feed_dict_val)
outs_val = sess.run([model.loss, model.mrr, model.outputs1], feed_dict=feed_dict_val)
#ONLY SAVE FOR embeds1 because of planetoid
for i, edge in enumerate(edges):
if not edge[0] in seen:
val_embeddings.append(outs_val[-1][i,:])
nodes.append(edge[0]) #nodes: a list; has order
seen.add(edge[0]) #seen: a set; NO order!
#if not os.path.exists(out_dir):
# os.makedirs(out_dir)

val_embeddings = np.vstack(val_embeddings)
print(val_embeddings.shape)
#print(val_embeddings.shape)
vectors = {}
for i, embedding in enumerate(val_embeddings):
vectors[nodes[i]] = embedding #note: seen is a set; nodes is a list
return vectors

''' #if we want to save embs, modify the following code
np.save(out_dir + name + mod + ".npy", val_embeddings)
with open(out_dir + name + mod + ".txt", "w") as fp:
fp.write("\n".join(map(str,nodes)))
'''
return vectors #return them and use graphsageAPI to save them
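# a minimal sketch (not in the original code) of how a caller could persist the returned
# {node: embedding} dict in word2vec-style text format; save_vectors_txt is a hypothetical helper
def save_vectors_txt(vectors, path):
    with open(path, "w") as fp:
        for node, emb in vectors.items():
            fp.write(str(node) + " " + " ".join(map(str, emb)) + "\n")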

def construct_placeholders():
# Define placeholders
@@ -97,7 +89,6 @@ def train(train_data, test_data, model):
# pad with dummy zero vector
features = np.vstack([features, np.zeros((features.shape[1],))])

random_context = False
context_pairs = train_data[3] if random_context else None
placeholders = construct_placeholders()
minibatch = EdgeMinibatchIterator(G,
@@ -201,7 +192,7 @@ def train(train_data, test_data, model):
# Initialize session
sess = tf.Session(config=config)
merged = tf.summary.merge_all()
#summary_writer = tf.summary.FileWriter(log_dir(), sess.graph)
#summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) #we ignore log file

# Init variables
sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj})
@@ -210,13 +201,15 @@ def train(train_data, test_data, model):

train_shadow_mrr = None
shadow_mrr = None

total_steps = 0
avg_time = 0.0
epoch_val_costs = []

train_adj_info = tf.assign(adj_info, minibatch.adj)
val_adj_info = tf.assign(adj_info, minibatch.test_adj)

vectors = None #to store best embs and return at the end
best_result = None

t1 = time.time()
for epoch in range(epochs):
minibatch.shuffle()

@@ -228,7 +221,7 @@ def train(train_data, test_data, model):
val_cost = 0
val_mrr = 0
shadow_mrr = 0
avg_time = 0

while not minibatch.end():
# Construct feed dictionary
feed_dict = minibatch.next_minibatch_feed_dict()
@@ -258,35 +251,39 @@ def train(train_data, test_data, model):

#if total_steps % print_every == 0:
#summary_writer.add_summary(outs[0], total_steps)

# Print results
avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1)

iter += 1
total_steps += 1

if total_steps > max_total_steps:
break

epoch += 1
epoch += 1
t2 = time.time()
#only print the last iter result at the end of each epoch
print("Epoch:", '%04d' % epoch,
"train_loss=", "{:.5f}".format(train_cost),
"train_mrr=", "{:.5f}".format(train_mrr),
"train_mrr_ema=", "{:.5f}".format(train_shadow_mrr), # exponential moving average
#"train_mrr=", "{:.5f}".format(train_mrr),
#"train_mrr_ema=", "{:.5f}".format(train_shadow_mrr),
"val_loss=", "{:.5f}".format(val_cost),
"val_mrr=", "{:.5f}".format(val_mrr),
"val_mrr_ema=", "{:.5f}".format(shadow_mrr), # exponential moving average
"time=", "{:.5f}".format(avg_time))
#"val_mrr=", "{:.5f}".format(val_mrr),
#"val_mrr_ema=", "{:.5f}".format(shadow_mrr),
"time cost", "{:.2f}".format(t2-t1))

if total_steps > max_total_steps:
break

print("Optimization Finished!")

sess.run(val_adj_info.op)
#save_val_embeddings(sess, model, minibatch, validate_batch_size, log_dir())
return save_val_embeddings(sess, model, minibatch, validate_batch_size) #return embs


def graphsage_save_embeddings(self, filename): #to do...
pass
#no early stopping was used in the original code ---------------- auto-save-best-emb ------------------------------
#instead, we choose the best result by looking at the smallest val loss
if epoch == 1:
best_result = val_cost
sess.run(val_adj_info.op) #assign minibatch.test_adj to adj_info (see val_adj_info above) so sampling uses the full adjacency; if ignored, we get worse results
vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size)
else:
if best_result > val_cost: #if val loss is decreasing
best_result = val_cost
sess.run(val_adj_info.op) #assign minibatch.test_adj to adj_info so sampling uses the full adjacency; if ignored, we get worse results
vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size)
else:
print('val loss increasing @ ', epoch, ' w.r.t. last best epoch; do not overwrite previous emb...')

#sess.run(val_adj_info.op) #what is this for before getting emb? ignore it?
#vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size)
print("Finished!")
return vectors