diff --git a/.gitignore b/.gitignore index 134a714..4121346 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ db.ini deploy_key_rsa log bash +emb #zeyu-------------------------------- diff --git a/LICENSE b/LICENSE index dfc6c1c..35cc95e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017 THUNLP +Copyright (c) 2018 Chengbin HOU Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 0970a37..6c4aa47 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,66 @@ -# OpenANE: The first open source toolkit specialized in Attributed Network Embedding (ANE) -authors: Chengbin Hou & Zeyu Dong 2018 -Email Correspondence: chengbin.hou10 AT foxmail.com +# OpenANE: The first open source framework specialized in Attributed Network Embedding (ANE) +We reproduce several ANE (Attributed Network Embedding) and PNE (Pure Network Embedding) methods in one framework, where they all share the same I/O and downstream tasks. We started this project based on the excellent [OpenNE](https://github.com/thunlp/OpenNE) project, which integrates several PNE methods under the same framework. However, OpenANE not only integrates those PNE methods from OpenNE, but also provides state-of-the-art ANE methods that consider both structural and attribute information during embedding. -## Methods -ABRW: -GCN: -GraphSAGE: -ASNE: -TADW: -AANE: -DeepWalk: -Node2Vec: -LINE: -AttrPure: -AttrComb: -etc.... +authors: Chengbin HOU (chengbin.hou10@foxmail.com) & Zeyu DONG 2018 -## Requirements +## Motivation +In many real-world scenarios, a network often comes with node attributes, such as paper titles in a citation network or user profiles in a social network. PNE methods that consider only structural information cannot make use of attribute information, which may further improve the quality of node embeddings. + +From an engineering perspective, by offering more APIs to handle attribute information in graph.py and utils.py, OpenANE makes it easy to embed an attributed network. OpenANE can also deal with a pure network: 1) by calling PNE methods; or 2) by assigning an all-ones vector as the attribute of every node and then calling ANE methods (though some ANE methods may fail). + +## Methods (todo... Chengbin) +[ABRW](https://github.com/houchengbin/ABRW), +[SAGE-GCN](https://github.com/williamleif/GraphSAGE), +[SAGE-Mean](https://github.com/williamleif/GraphSAGE), +[ASNE](https://github.com/lizi-git/ASNE), +[TADW](https://github.com/thunlp/OpenNE), +[AANE](https://github.com/xhuang31/AANE_Python), +[DeepWalk](https://github.com/thunlp/OpenNE), +[Node2Vec](https://github.com/thunlp/OpenNE), +[LINE](https://github.com/thunlp/OpenNE), +[GraRep](https://github.com/thunlp/OpenNE), +AttrPure, +AttrComb, + +## Requirements (todo... Zeyu; double check) pip install -r requirements.txt ## Usages -python src/main.py --method abrw +#### To obtain node embeddings and evaluate them on downstream classification tasks +python src/main.py --method abrw --emb-file cora_abrw_emb --save-emb +#### To get an intuitive feel for the node embeddings (todo... Zeyu if possible; need tf installed) +python src/viz.py --emb-file cora_abrw_emb --label-file data/cora_label -## Datasets -Cora -Refer to xxxxx for other datasets such as Citeseer, PubMed, Facebook_Stanford and Facebook_MIT.
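The `--save-emb` usage above writes node embeddings to a plain word2vec-style text file: a `num_nodes dim` header line, then one `node_id v1 v2 ... vd` row per node (the format produced by each model's `save_embeddings()`; main.py also appends a timestamp to the saved file name). A minimal, hypothetical loader sketch, not part of this repo, assuming only that file format:

```python
# Hypothetical helper: read an embedding file written by save_embeddings().
import numpy as np

def load_embeddings(path):
    vectors = {}
    with open(path) as f:
        num_nodes, dim = map(int, f.readline().split())             # header: "num_nodes dim"
        for line in f:
            parts = line.rstrip().split()
            if not parts:
                continue
            vectors[parts[0]] = np.asarray(parts[1:], dtype=float)  # node_id -> vector
    assert len(vectors) == num_nodes and all(len(v) == dim for v in vectors.values())
    return vectors

embs = load_embeddings('emb/cora_abrw_emb')  # illustrative path/file name only
```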
+## Parameters +#### The meaning of each parameter +Please see main.py +#### Searching for optimal parameter values (todo... Chengbin) +ABRW +SAGE-GCN -## Acknowledgement -We start this project based on https://github.com/thunlp/OpenNE and thanks to their excellent project. +## Testing (todo... Zeyu) +Currently, we use the default parameters.... + +.... summary of parameters .... + +.... table --- results .... + + +## Datasets (todo...Chengbin) +We provide Cora for ...; for other datasets, e.g. Facebook_MIT, refer to [NetEmb-Datasets](https://github.com/houchengbin/NetEmb-datasets) + +### Your own dataset? +#### FILE for structural information (each row): +adjlist: node_id1 node_id2 node_id3 -> (the edges between (id1, id2) and (id1, id3)) + +OR edgelist: node_id1 node_id2 weight(optional) -> one edge (id1, id2) +#### FILE for attribute information (each row): +node_id1 attr1 attr2 ... attrM + +#### FILE for labels (each row): +node_id1 label(s) + +## Want to contribute? +We highly welcome and appreciate your contributions, such as fixing bugs and reproducing new ANE methods. Together, we hope the OpenANE framework will become influential in both academic research and industrial applications. + +## Recommended References (todo... Chengbin) diff --git a/src/libnrl/asne.py b/src/libnrl/asne.py index 7ff19ed..7ce1a92 100644 --- a/src/libnrl/asne.py +++ b/src/libnrl/asne.py @@ -1,86 +1,39 @@ -# -*- coding: utf-8 -*- ''' -Tensorflow implementation of Social Network Embedding framework (SNE) -@author: Lizi Liao (liaolizi.llz@gmail.com) -part of code was originally forked from https://github.com/lizi-git/ASNE +ANE method: Attributed Social Network Embedding (ASNE) modified by Chengbin Hou 2018 1) convert OpenANE data format to ASNE data format -2) compatible with latest tensorflow 1.2 -3) add more comments -4) support eval testing set during each xx epoches -5) as ASNE paper stated, we add two hidden layers with softsign activation func +2) compatible with latest tensorflow 1.10.0 +3) add early stopping +4) as ASNE paper stated, we add two hidden layers with softsign activation func + +part of code was originally forked from https://github.com/lizi-git/ASNE ''' import math import numpy as np import tensorflow as tf from sklearn.base import BaseEstimator, TransformerMixin -from .classify import ncClassifier, lpClassifier, read_node_label -from sklearn.linear_model import LogisticRegression - -def format_data_from_OpenANE_to_ASNE(g, dim): - ''' - convert OpenANE data format to ASNE data format - g: OpenANE graph data structure - dim: final embedding dim - ''' - attr_Matrix = g.getX() - #attr_Matrix = g.preprocessAttrInfo(attr_Matrix, dim=200, method='svd') #similar to aane, the same preprocessing - #print('with this preprocessing, ASNE can get better result, as well as, faster speed----------------') - id_N = attr_Matrix.shape[0] #n nodes - attr_M = attr_Matrix.shape[1] #m features - - edge_num = len(g.G.edges) #total edges for traning - X={} #one-to-one correspondence - X['data_id_list'] = np.zeros(edge_num) #start node list for traning - X['data_label_list'] = np.zeros(edge_num) #end node list for training - X['data_attr_list'] = np.zeros([edge_num, attr_M]) #attr corresponds to start node - edgelist = [edge for edge in g.G.edges] - i = 0 - for edge in edgelist: #traning sample = start node, end node, start node attr - X['data_id_list'][i] = edge[0] - X['data_label_list'][i] = edge[1] - X['data_attr_list'][i] = attr_Matrix[ g.look_up_dict[edge[0]] ][:] - i += 1 - X['data_id_list'] =
X['data_id_list'].reshape(-1).astype(int) - X['data_label_list'] = X['data_label_list'].reshape(-1,1).astype(int) - - nodes={} #one-to-one correspondence - nodes['node_id'] = g.look_back_list #n nodes - nodes['node_attr'] = list(attr_Matrix) #m features -> n*m - - id_embedding_size = int(dim/2) - attr_embedding_size = int(dim/2) - print('id_embedding_size', id_embedding_size, 'attr_embedding_size', attr_embedding_size) - return X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size - - -def add_layer(inputs, in_size, out_size, activation_function=None): - # add one more layer and return the output of this layer - Weights = tf.Variable(tf.random_uniform([in_size, out_size], -1.0, 1.0)) #init as paper stated - biases = tf.Variable(tf.zeros([1, out_size]) + 0.1) - Wx_plus_b = tf.matmul(inputs, Weights) + biases - if activation_function is None: - outputs = Wx_plus_b - else: - outputs = activation_function(Wx_plus_b) - return outputs - +import time +#from .classify import ncClassifier, lpClassifier, read_node_label +#from sklearn.linear_model import LogisticRegression class ASNE(BaseEstimator, TransformerMixin): - def __init__(self, graph, dim, alpha = 1.0, batch_size=128, learning_rate=0.001, - n_neg_samples=10, epoch=100, random_seed=2018, X_test=0, Y_test=0, task='nc', nc_ratio=0.5, lp_ratio=0.9, label_file=''): - # bind params to class + def __init__(self, graph, dim, alpha=1.0, learning_rate=0.0001, batch_size=128, epoch=20, n_neg_samples=10, + early_stopping=2000): #it seems that overfitting can get better result? try other early_stopping... to do... + + t1 = time.time() X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size = format_data_from_OpenANE_to_ASNE(g=graph, dim=dim) + t2 = time.time() + print(f'transform data format from OpenANE to ASNE; time cost: {(t2-t1):.2f}s') + self.node_N = id_N #n self.attr_M = attr_M #m self.X_train = X #{'data_id_list': [], 'data_label_list': [], 'data_attr_list': []} self.nodes = nodes #{'node_id': [], 'node_attr: []'} self.id_embedding_size = id_embedding_size # set to dim/2 self.attr_embedding_size = attr_embedding_size # set to dim/2 - self.vectors = {} - self.dim = dim + self.vectors = {} #final embs self.look_back_list = graph.look_back_list #from OpenANE data stcuture self.alpha = alpha #set to 1.0 by default @@ -88,19 +41,11 @@ class ASNE(BaseEstimator, TransformerMixin): self.batch_size = batch_size #set to 128 by default self.learning_rate = learning_rate self.epoch = epoch #set to 20 by default - self.random_seed = random_seed - self._init_graph() #init all variables in a tensorflow graph - self.task = task - self.nc_ratio = nc_ratio - self.lp_ratio = lp_ratio - if self.task == 'lp': #if not lp task, we do not need to keep testing edges - self.X_test = X_test - self.Y_test = Y_test - self.train() #train our tf asne model----------------- - elif self.task == 'nc' or self.task == 'nclp': - self.X_nc_label, self.Y_nc_label = read_node_label(label_file) - self.train() #train our tf asne model----------------- + self._init_graph() #init all variables in a tensorflow graph + self.early_stopping = early_stopping #early stopping if training loss increased for xx iterations + self.train() + def _init_graph(self): ''' @@ -110,7 +55,7 @@ class ASNE(BaseEstimator, TransformerMixin): #with self.graph.as_default(), tf.device('/gpu:0'): with self.graph.as_default(): # Set graph level random seed - tf.set_random_seed(self.random_seed) + #tf.set_random_seed(2018) # Input data. 
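# (each training example, as built by format_data_from_OpenANE_to_ASNE below, is one directed edge:
#  train_data_id holds the source node ids, train_data_attr the source nodes' attribute rows,
#  and train_labels the target node ids consumed by the sampled softmax loss)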
self.train_data_id = tf.placeholder(tf.int32, shape=[None]) # batch_size * 1 self.train_data_attr = tf.placeholder(tf.float32, shape=[None, self.attr_M]) # batch_size * attr_M @@ -126,25 +71,26 @@ class ASNE(BaseEstimator, TransformerMixin): self.attr_embed = tf.matmul(self.train_data_attr, self.weights['attr_embeddings']) # batch_size * attr_dim self.embed_layer = tf.concat([self.id_embed, self.alpha * self.attr_embed], 1) # batch_size * (id_dim + attr_dim) #an error due to old tf! - - ## can add hidden_layers component here! + ''' + ## can add hidden_layers component here!---------------------------------- #0) no hidden layer #1) 128 - #2) 256+128 ##--------paper stated it used two hidden layers with activation function softsign.... + #2) 256+128 ##--------paper stated it used two hidden layers with softsign #3) 512+256+128 - len_h1_in = self.id_embedding_size+self.attr_embedding_size - len_h1_out = 256 + len_h1_in = self.id_embedding_size + self.attr_embedding_size + len_h1_out = 256 #or self.id_embedding_size + self.attr_embedding_size # if only add h1 len_h2_in = len_h1_out - len_h2_out = 128 + len_h2_out = self.id_embedding_size + self.attr_embedding_size self.h1 = add_layer(inputs=self.embed_layer, in_size=len_h1_in, out_size=len_h1_out, activation_function=tf.nn.softsign) self.h2 = add_layer(inputs=self.h1, in_size=len_h2_in, out_size=len_h2_out, activation_function=tf.nn.softsign) - + ## ------------------------------------------------------------------------- + ''' # Compute the loss, using a sample of the negative labels each time. - self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = self.weights['out_embeddings'], biases = self.weights['biases'], - inputs = self.h2, labels = self.train_labels, num_sampled = self.n_neg_samples, num_classes=self.node_N)) + self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = self.weights['out_embeddings'], biases = self.weights['biases'], #if one needs to change layers + inputs = self.embed_layer, labels = self.train_labels, num_sampled = self.n_neg_samples, num_classes=self.node_N)) #try inputs = self.embed_layer or self.h1 or self.h2 or ... # Optimizer. - self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss) #tune these parameters? + self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss) # print("AdamOptimizer") # init @@ -171,14 +117,18 @@ class ASNE(BaseEstimator, TransformerMixin): start_index = np.random.randint(0, len(data) - batch_size) return data[start_index:(start_index + batch_size)] - def train(self): # fit a dataset - self.Embeddings = [] - print('Using in + out embedding') - for epoch in range( self.epoch ): - total_batch = int( len(self.X_train['data_id_list']) / self.batch_size) #total_batch*batch_size = numOFlinks?? 
- # print('total_batch in 1 epoch: ', total_batch) - # Loop over all batches + def train(self): + self.Embeddings = [] + total_batch = int( len(self.X_train['data_id_list']) / self.batch_size) + iter_count = 0 + train_loss_best = 0 + train_loss_keep_increasing = 0 + early_stopping = self.early_stopping #early stopping if training loss increased + + for epoch in range(self.epoch): + t1 = time.time() + for i in range(total_batch): # generate a batch data batch_xs = {} @@ -188,25 +138,40 @@ class ASNE(BaseEstimator, TransformerMixin): batch_xs['batch_data_label'] = self.X_train['data_label_list'][start_index:(start_index + self.batch_size)] # Fit training using batch data - cost = self.partial_fit(batch_xs) - - # Display logs per epoch - Embeddings_out = self.getEmbedding('out_embedding', self.nodes) - Embeddings_in = self.getEmbedding('embed_layer', self.nodes) - self.Embeddings = Embeddings_out + Embeddings_in #simply mean them and as final embedding; try concat? to do... - #print('training tensorflow asne model, epoc: ', epoch+1 , ' / ', self.epoch) - #to save training time, we delete eval testing data @ each epoch + train_loss = self.partial_fit(batch_xs) + iter_count += 1 + if iter_count == 1: + train_loss_best = train_loss + else: + if train_loss_best > train_loss: # training loss decreasing + train_loss_best = train_loss + train_loss_keep_increasing = 0 # reset + else: # training loss increasing + train_loss_keep_increasing += 1 + if train_loss_keep_increasing > early_stopping: # early stopping + print(f'early stopping @ iter {iter_count}; take out embs and return') + Embeddings_out = self.getEmbedding('out_embedding', self.nodes) + Embeddings_in = self.getEmbedding('embed_layer', self.nodes) + self.Embeddings = Embeddings_out + Embeddings_in # simply mean them and as final embedding; try concat? to do... + ind = 0 + for id in self.nodes['node_id']: #self.nodes['node_id']=self.look_back_list + self.vectors[id] = self.Embeddings[ind] + ind += 1 + return self.vectors + else: + pass + t2 = time.time() + print(f'epoch @ {epoch+1}/{self.epoch}; time cost: {(t2-t1):.2f}s',) - #-----------for each xx epoches; save embeddings {node_id1: [], node_id2: [], ...}---------- - if (epoch+1)%1 == 0 and epoch != 0: #for every xx epoches, try eval - print('@@@ epoch ------- ', epoch+1 , ' / ', self.epoch) - ind = 0 - for id in self.nodes['node_id']: #self.nodes['node_id']=self.look_back_list - self.vectors[id] = self.Embeddings[ind] - ind += 1 - #self.eval(vectors=self.vectors) - print('please note that: the fianl embedding returned and its output file are not the best embedding!') - print('for the best embeddings, please check which epoch got the best eval metric(s)......') + print(f'finish all {self.epoch} epochs; take out embs and return') + Embeddings_out = self.getEmbedding('out_embedding', self.nodes) + Embeddings_in = self.getEmbedding('embed_layer', self.nodes) + self.Embeddings = Embeddings_out + Embeddings_in # simply mean them and as final embedding; try concat? to do... 
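# note: Embeddings_out + Embeddings_in is an element-wise sum (not an average) of the 'out_embedding'
# and 'embed_layer' matrices; their rows follow the order of nodes['node_id'] (= graph.look_back_list),
# so the loop below maps each row index back to its original node id when filling self.vectors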
+ ind = 0 + for id in self.nodes['node_id']: #self.nodes['node_id']=self.look_back_list + self.vectors[id] = self.Embeddings[ind] + ind += 1 + return self.vectors def getEmbedding(self, type, nodes): @@ -224,21 +189,59 @@ class ASNE(BaseEstimator, TransformerMixin): ''' fout = open(filename, 'w') node_num = len(self.vectors.keys()) - fout.write("{} {}\n".format(node_num, self.dim)) + fout.write("{} {}\n".format(node_num, self.id_embedding_size+self.attr_embedding_size)) for node, vec in self.vectors.items(): fout.write("{} {}\n".format(node,' '.join([str(x) for x in vec]))) fout.close() - def eval(self, vectors): - #------nc task - if self.task == 'nc' or self.task == 'nclp': - print("Training nc classifier using {:.2f}% node labels...".format(self.nc_ratio*100)) - clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) #use Logistic Regression as clf; we may choose SVM or more advanced ones - clf.split_train_evaluate(self.X_nc_label, self.Y_nc_label, self.nc_ratio) - #------lp task - if self.task == 'lp': - #X_test, Y_test = read_edge_label(args.label_file) #enable this if you want to load your own lp testing data, see classfiy.py - print("During embedding we have used {:.2f}% links and the remaining will be left for lp evaluation...".format(self.lp_ratio*100)) - clf = lpClassifier(vectors=vectors) #similarity/distance metric as clf; basically, lp is a binary clf probelm - clf.evaluate(self.X_test, self.Y_test) +# ---------------------------------------------- ASNE utils ------------------------------------------------ +def format_data_from_OpenANE_to_ASNE(g, dim): + ''' convert OpenANE data format to ASNE data format ''' + attr_Matrix = g.get_attr_mat(is_sparse=False) + id_N = attr_Matrix.shape[0] #n nodes + attr_M = attr_Matrix.shape[1] #m features + X = {} + X['data_id_list'] = [] + X['data_label_list'] = [] + X['data_attr_list'] = [] + edgelist = [edge for edge in g.G.edges] + print('If an edge only have one direction, double it......') + cnt = 0 + for edge in edgelist: #traning sample = start node, end node, start node attr + X['data_id_list'].append(edge[0]) + X['data_label_list'].append(edge[1]) + X['data_attr_list'].append(attr_Matrix[ g.look_up_dict[edge[0]] ][:]) + cnt += 1 + if (edge[1], edge[0]) not in edgelist: # double! 
as paper said-------------- + X['data_id_list'].append(edge[1]) + X['data_label_list'].append(edge[0]) + X['data_attr_list'].append(attr_Matrix[ g.look_up_dict[edge[1]] ][:]) + cnt += 1 + print(f'edges before doubling: {g.get_num_edges()}') + print(f'edges after doubling: {cnt}') + + X['data_id_list'] = np.array(X['data_id_list']).reshape(-1).astype(int) + X['data_label_list'] = np.array(X['data_label_list']).reshape(-1,1).astype(int) + X['data_attr_list'] = np.array(X['data_attr_list']).reshape(cnt,attr_M) + + nodes={} + nodes['node_id'] = g.look_back_list + nodes['node_attr'] = attr_Matrix + + id_embedding_size = int(dim/2) + attr_embedding_size = int(dim/2) + print('id_embedding_size', id_embedding_size, '\nattr_embedding_size', attr_embedding_size) + return X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size + + +def add_layer(inputs, in_size, out_size, activation_function=None): + # add one more layer and return the output of this layer + Weights = tf.Variable(tf.random_uniform([in_size, out_size], -1.0, 1.0)) #init as paper stated + biases = tf.Variable(tf.zeros([1, out_size]) + 0.1) + Wx_plus_b = tf.matmul(inputs, Weights) + biases + if activation_function is None: + outputs = Wx_plus_b + else: + outputs = activation_function(Wx_plus_b) + return outputs \ No newline at end of file diff --git a/src/libnrl/graphsage/__init__.py b/src/libnrl/graphsage/__init__.py index 59385f9..05044f8 100644 --- a/src/libnrl/graphsage/__init__.py +++ b/src/libnrl/graphsage/__init__.py @@ -1,40 +1,77 @@ -from __future__ import print_function -from __future__ import division -import numpy as np -import tensorflow as tf -#default parameters +''' global parameters for graphsage models + tune these parameters here if needed + if needed use: from libnrl.graphsage.__init__ import * +''' + #seed = 2018 #np.random.seed(seed) #tf.set_random_seed(seed) log_device_placement = False + # follow the orignal code by the paper author https://github.com/williamleif/GraphSAGE # we follow the opt parameters given by papers GCN and graphSAGE # note: citeseer+pubmed all follow the same parameters as cora, see their papers) # tensorflow + Adam optimizer + Random weight init + row norm of attr - -epochs = 100 -dim_1 = 64 #dim = dim1+dim2 = 128 +dim_1 = 64 #dim = dim1+dim2 = 128 for sage-mean and sage-gcn dim_2 = 64 samples_1 = 25 samples_2 = 10 + +# key parameters during training +epochs = 100 +learning_rate = 0.001 #search [0.01, 0.001, 0.0001, 0.00001] dropout = 0.5 -weight_decay = 0.0001 -learning_rate = 0.0001 -batch_size = 128 #if run out of memory, try to reduce them, but we use the default e.g. 64, default=512 -normalize = True #row norm of node attributes/features +weight_decay = 5e-4 +batch_size = 512 #if run out of memory, try to reduce them, default=512 -#other parameters that paper did not mentioned, but we also follow the defaults https://github.com/williamleif/GraphSAGE -model_size = 'small' -max_degree = 100 -neg_sample_size = 20 - -random_context= True -validate_batch_size = 64 #if run out of memory, try to reduce them, but we use the default e.g. 
64, default=256 +# key parameters durning val +validate_batch_size = 256 #if run out of memory, try to reduce them, default=256 validate_iter = 5000 max_total_steps = 10**10 -n2v_test_epochs = 1 +print_every = 50 + +#other parameters also follow the defaults https://github.com/williamleif/GraphSAGE +neg_sample_size = 20 identity_dim = 0 +n2v_test_epochs = 1 +random_context= False +model_size = 'small' +max_degree = 100 train_prefix = '' base_log_dir = '' -#print_every = 50 \ No newline at end of file +base_log_dir = '' + + + +''' +#core params.. +flags.DEFINE_string('model', 'graphsage', 'model names. See README for possible values.') +flags.DEFINE_float('learning_rate', 0.00001, 'initial learning rate.') +flags.DEFINE_string("model_size", "small", "Can be big or small; model specific def'ns") +flags.DEFINE_string('train_prefix', '', 'name of the object file that stores the training data. must be specified.') + +# left to default values in main experiments +flags.DEFINE_integer('epochs', 1, 'number of epochs to train.') +flags.DEFINE_float('dropout', 0.0, 'dropout rate (1 - keep probability).') +flags.DEFINE_float('weight_decay', 0.0, 'weight for l2 loss on embedding matrix.') +flags.DEFINE_integer('max_degree', 100, 'maximum node degree.') +flags.DEFINE_integer('samples_1', 25, 'number of samples in layer 1') +flags.DEFINE_integer('samples_2', 10, 'number of users samples in layer 2') +flags.DEFINE_integer('dim_1', 128, 'Size of output dim (final is 2x this, if using concat)') +flags.DEFINE_integer('dim_2', 128, 'Size of output dim (final is 2x this, if using concat)') +flags.DEFINE_boolean('random_context', True, 'Whether to use random context or direct edges') +flags.DEFINE_integer('neg_sample_size', 20, 'number of negative samples') +flags.DEFINE_integer('batch_size', 512, 'minibatch size.') +flags.DEFINE_integer('n2v_test_epochs', 1, 'Number of new SGD epochs for n2v.') +flags.DEFINE_integer('identity_dim', 0, 'Set to positive value to use identity embedding features of that dimension. Default 0.') + +#logging, saving, validation settings etc. +flags.DEFINE_boolean('save_embeddings', True, 'whether to save embeddings for all nodes after training') +flags.DEFINE_string('base_log_dir', '.', 'base directory for logging and saving embeddings') +flags.DEFINE_integer('validate_iter', 5000, "how often to run a validation minibatch.") +flags.DEFINE_integer('validate_batch_size', 256, "how many nodes per validation sample.") +flags.DEFINE_integer('gpu', 1, "which gpu to use.") +flags.DEFINE_integer('print_every', 50, "How often to print training info.") +flags.DEFINE_integer('max_total_steps', 10**10, "Maximum total number of iterations") +''' \ No newline at end of file diff --git a/src/libnrl/graphsage/graphsageAPI.py b/src/libnrl/graphsage/graphsageAPI.py index c40e127..d29852c 100644 --- a/src/libnrl/graphsage/graphsageAPI.py +++ b/src/libnrl/graphsage/graphsageAPI.py @@ -1,112 +1,120 @@ -# -*- coding: utf-8 -*- +''' +# author: Chengbin Hou @ SUSTech 2018 \n +# to tune parameters, refer to graphsage->__init__.py \n +# we provide utils to transform the orignal data into graphSAGE format \n +# the APIs are designed for unsupervised, \n +# for supervised way, plz refer and complete 'to do...' 
\n +# currently only support 'mean' and 'gcn' model \n ''' -#----------------------------------------------------------------------------- -# author: Chengbin Hou @ SUSTech 2018 -# Email: Chengbin.Hou10@foxmail.com -# we provide utils to transform the orignal data into graphSAGE format -# you may easily use these APIs as what we demostrated in main.py of OpenANE -# the APIs are designed for unsupervised, for supervised way, plz complete 'label' to do codes... -#----------------------------------------------------------------------------- -''' + from networkx.readwrite import json_graph import json import random import networkx as nx import numpy as np -from libnrl.graphsage import unsupervised_train +from libnrl.graphsage.__init__ import * #import default parameters -def add_train_val_test_to_G(graph, test_perc=0.0, val_perc=0.1): #due to unsupervised, we do not need test data - G = graph.G #take out nx G - random.seed(2018) - num_nodes = nx.number_of_nodes(G) - test_ind = random.sample(range(0, num_nodes), int(num_nodes*test_perc)) - val_ind = random.sample(range(0, num_nodes), int(num_nodes*val_perc)) - for ind in range(0, num_nodes): - id = graph.look_back_list[ind] - if ind in test_ind: - G.nodes[id]['test'] = True - G.nodes[id]['val'] = False - elif ind in val_ind: - G.nodes[id]['test'] = False - G.nodes[id]['val'] = True +class graphSAGE(object): + def __init__(self, graph, sage_model='mean', is_supervised=False): + self.graph = graph + self.normalize = True #row normalization of node attributes + self.num_walks = 50 + self.walk_len = 5 + + self.add_train_val_test_to_G(test_perc=0.0, val_perc=0.1) #if unsupervised, no test data + train_data = self.tranform_data_for_graphsage() #obtain graphSAGE required training data + + self.vectors = None + if not is_supervised: + from libnrl.graphsage import unsupervised_train + self.vectors = unsupervised_train.train(train_data=train_data, test_data=None, model=sage_model) else: - G.nodes[id]['test'] = False - G.nodes[id]['val'] = False - - ## Make sure the graph has edge train_removed annotations - ## (some datasets might already have this..) - print("Loaded data.. now preprocessing..") - for edge in G.edges(): - if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or - G.node[edge[0]]['test'] or G.node[edge[1]]['test']): - G[edge[0]][edge[1]]['train_removed'] = True - else: - G[edge[0]][edge[1]]['train_removed'] = False - return G + #to do... + #from libnrl.graphsage import supervised_train + #self.vectors = supervised_train.train() + pass -def run_random_walks(G, num_walks=50, walk_len=5): - nodes = [n for n in G.nodes() if not G.node[n]["val"] and not G.node[n]["test"]] - G = G.subgraph(nodes) - pairs = [] - for count, node in enumerate(nodes): - if G.degree(node) == 0: - continue - for i in range(num_walks): - curr_node = node - for j in range(walk_len): - if len(list(G.neighbors(curr_node))) == 0: #isolated nodes! 
often appeared in real-world - break - next_node = random.choice(list(G.neighbors(curr_node))) #changed due to compatibility - #next_node = random.choice(G.neighbors(curr_node)) - # self co-occurrences are useless - if curr_node != node: - pairs.append((node,curr_node)) - curr_node = next_node - if count % 1000 == 0: - print("Done walks for", count, "nodes") - return pairs -def tranform_data_for_graphsage(graph): - G = add_train_val_test_to_G(graph) #given OpenANE graph --> obtain graphSAGE graph - #G_json = json_graph.node_link_data(G) #train_data[0] in unsupervised_train.py + def add_train_val_test_to_G(self, test_perc=0.0, val_perc=0.1): + ''' add if 'val' and/or 'test' to each node in G ''' + G = self.graph.G + num_nodes = nx.number_of_nodes(G) + test_ind = random.sample(range(0, num_nodes), int(num_nodes*test_perc)) + val_ind = random.sample(range(0, num_nodes), int(num_nodes*val_perc)) + for ind in range(0, num_nodes): + id = self.graph.look_back_list[ind] + if ind in test_ind: + G.nodes[id]['test'] = True + G.nodes[id]['val'] = False + elif ind in val_ind: + G.nodes[id]['test'] = False + G.nodes[id]['val'] = True + else: + G.nodes[id]['test'] = False + G.nodes[id]['val'] = False + ## Make sure the graph has edge train_removed annotations + ## (some datasets might already have this..) + print("Loaded data.. now preprocessing..") + for edge in G.edges(): + if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or + G.node[edge[0]]['test'] or G.node[edge[1]]['test']): + G[edge[0]][edge[1]]['train_removed'] = True + else: + G[edge[0]][edge[1]]['train_removed'] = False + return G - id_map = graph.look_up_dict - #conversion = lambda n : int(n) # compatible with networkx >2.0 - #id_map = {conversion(k):int(v) for k,v in id_map.items()} # due to graphSAGE requirement + def tranform_data_for_graphsage(self): + ''' OpenANE graph -> graphSAGE required format ''' + id_map = self.graph.look_up_dict + G = self.graph.G + feats = np.array([G.nodes[id]['attr'] for id in id_map.keys()]) + normalize = self.normalize + if normalize and not feats is None: + print("------------- row norm of node attributes ------------------", normalize) + from sklearn.preprocessing import StandardScaler + train_inds = [id_map[n] for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']] + train_feats = feats[train_inds] + scaler = StandardScaler() + scaler.fit(train_feats) + feats = scaler.transform(feats) + #feats1 = nx.get_node_attributes(G,'test') + #feats2 = nx.get_node_attributes(G,'val') + walks = [] + walks = self.run_random_walks(num_walks=self.num_walks, walk_len=self.walk_len) + class_map = 0 #to do... use sklearn to make class into binary form, no need for unsupervised... 
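# hedged sketch for the supervised 'to do' above (not needed and not executed in the unsupervised pipeline):
# assuming node labels were attached to G (e.g. loaded from a label file), they could be binarized with sklearn like
#   from sklearn.preprocessing import MultiLabelBinarizer
#   labels = [G.nodes[n].get('label', []) for n in G.nodes()]   # a 'label' node attribute is an assumption here
#   class_map = {n: row.tolist() for n, row in zip(G.nodes(), MultiLabelBinarizer().fit_transform(labels))}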
+ return G, feats, id_map, walks, class_map - feats = np.array([G.nodes[id]['attr'] for id in id_map.keys()]) - normalize = True #have decleared in __init__.py - if normalize and not feats is None: - print("-------------row norm of node attributes/features------------------") - from sklearn.preprocessing import StandardScaler - train_inds = [id_map[n] for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']] - train_feats = feats[train_inds] - scaler = StandardScaler() - scaler.fit(train_feats) - feats = scaler.transform(feats) - #feats1 = nx.get_node_attributes(G,'test') - #feats2 = nx.get_node_attributes(G,'val') + def run_random_walks(self, num_walks=50, walk_len=5): + ''' generate random walks ''' + G = self.graph.G + nodes = [n for n in G.nodes() if not G.node[n]["val"] and not G.node[n]["test"]] + G = G.subgraph(nodes) + pairs = [] + for count, node in enumerate(nodes): + if G.degree(node) == 0: + continue + for i in range(num_walks): + curr_node = node + for j in range(walk_len): + if len(list(G.neighbors(curr_node))) == 0: #isolated nodes! often appeared in real-world + break + next_node = random.choice(list(G.neighbors(curr_node))) #changed due to compatibility + #next_node = random.choice(G.neighbors(curr_node)) + # self co-occurrences are useless + if curr_node != node: + pairs.append((node,curr_node)) + curr_node = next_node + if count % 1000 == 0: + print("Done walks for", count, "nodes") + return pairs - walks = [] - walks = run_random_walks(G, num_walks=50, walk_len=5) #use the defualt parameter in graphSAGE - - class_map = 0 #to do... use sklearn to make class into binary form, no need for unsupervised... - return G, feats, id_map, walks, class_map - -def graphsage_unsupervised_train(graph, graphsage_model = 'graphsage_mean'): - train_data = tranform_data_for_graphsage(graph) - #from unsupervised_train.py - vectors = unsupervised_train.train(train_data, test_data=None, model = graphsage_model) - return vectors - -''' -def save_embeddings(self, filename): - fout = open(filename, 'w') - node_num = len(self.vectors.keys()) - fout.write("{} {}\n".format(node_num, self.size)) - for node, vec in self.vectors.items(): - fout.write("{} {}\n".format(node, - ' '.join([str(x) for x in vec]))) - fout.close() -''' \ No newline at end of file + def save_embeddings(self, filename): + ''' save embeddings to file ''' + fout = open(filename, 'w') + node_num = len(self.vectors.keys()) + emb_dim = len(next(iter(self.vectors.values()))) + fout.write("{} {}\n".format(node_num, emb_dim)) + for node, vec in self.vectors.items(): + fout.write("{} {}\n".format(node,' '.join([str(x) for x in vec]))) + fout.close() \ No newline at end of file diff --git a/src/libnrl/graphsage/unsupervised_train.py b/src/libnrl/graphsage/unsupervised_train.py index 91b9b88..24bc65f 100644 --- a/src/libnrl/graphsage/unsupervised_train.py +++ b/src/libnrl/graphsage/unsupervised_train.py @@ -1,5 +1,8 @@ -from __future__ import division -from __future__ import print_function +''' minor amended by Chengbin with comments + so as to fit OpenANE framework \n + the key difference: auto save the best emb by looking at val loss \n + originally from https://github.com/williamleif/GraphSAGE \n +''' import os import time @@ -21,7 +24,6 @@ def evaluate(sess, model, minibatch_iter, size=None): feed_dict=feed_dict_val) return outs_val[0], outs_val[1], outs_val[2], (time.time() - t_test) -''' def incremental_evaluate(sess, model, minibatch_iter, size): t_test = time.time() finished = False @@ -36,41 +38,31 @@ def 
incremental_evaluate(sess, model, minibatch_iter, size): val_losses.append(outs_val[0]) val_mrrs.append(outs_val[2]) return np.mean(val_losses), np.mean(val_mrrs), (time.time() - t_test) -''' -def save_val_embeddings(sess, model, minibatch_iter, size, mod=""): +def save_val_embeddings(sess, model, minibatch_iter, size): val_embeddings = [] finished = False seen = set([]) #this as set to store already seen emb-node id! nodes = [] iter_num = 0 - name = "val" + #name = "val" while not finished: feed_dict_val, finished, edges = minibatch_iter.incremental_embed_feed_dict(size, iter_num) iter_num += 1 - outs_val = sess.run([model.loss, model.mrr, model.outputs1], - feed_dict=feed_dict_val) + outs_val = sess.run([model.loss, model.mrr, model.outputs1], feed_dict=feed_dict_val) #ONLY SAVE FOR embeds1 because of planetoid for i, edge in enumerate(edges): if not edge[0] in seen: val_embeddings.append(outs_val[-1][i,:]) nodes.append(edge[0]) #nodes: a list; has order seen.add(edge[0]) #seen: a set; NO order!!! - #if not os.path.exists(out_dir): - # os.makedirs(out_dir) val_embeddings = np.vstack(val_embeddings) - print(val_embeddings.shape) + #print(val_embeddings.shape) vectors = {} for i, embedding in enumerate(val_embeddings): vectors[nodes[i]] = embedding #warning: seen: a set; nodes: a list - return vectors - - ''' #if we want to save embs, modify the following code - np.save(out_dir + name + mod + ".npy", val_embeddings) - with open(out_dir + name + mod + ".txt", "w") as fp: - fp.write("\n".join(map(str,nodes))) - ''' + return vectors #return them and use graphsageAPI to save them def construct_placeholders(): # Define placeholders @@ -85,11 +77,10 @@ def construct_placeholders(): } return placeholders - -def train(train_data, test_data=None, model='graphsage_mean'): +def train(train_data, test_data, model): print('---------- the graphsage model we used: ', model) - print('---------- parameters we sued: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size, normalize', - epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size, normalize) + print('---------- parameters we sued: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size', + epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size) G = train_data[0] features = train_data[1] #note: features are in order of graph.look_up_list, since id_map = {k: v for v, k in enumerate(graph.look_back_list)} id_map = train_data[2] @@ -98,7 +89,6 @@ def train(train_data, test_data=None, model='graphsage_mean'): # pad with dummy zero vector features = np.vstack([features, np.zeros((features.shape[1],))]) - random_context = False context_pairs = train_data[3] if random_context else None placeholders = construct_placeholders() minibatch = EdgeMinibatchIterator(G, @@ -110,7 +100,7 @@ def train(train_data, test_data=None, model='graphsage_mean'): adj_info_ph = tf.placeholder(tf.int32, shape=minibatch.adj.shape) adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info") - if model == 'graphsage_mean': + if model == 'mean': # Create model sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, samples_1, dim_1), @@ -141,7 +131,7 @@ def train(train_data, test_data=None, model='graphsage_mean'): concat=False, logging=True) - elif model == 'graphsage_seq': #LSTM as stated in paper? very slow anyway... + elif model == 'seq': #LSTM as stated in paper? very slow anyway... 
sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, samples_1, dim_1), SAGEInfo("node", sampler, samples_2, dim_2)] @@ -156,7 +146,7 @@ def train(train_data, test_data=None, model='graphsage_mean'): model_size=model_size, logging=True) - elif model == 'graphsage_maxpool': + elif model == 'maxpool': sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, samples_1, dim_1), SAGEInfo("node", sampler, samples_2, dim_2)] @@ -170,7 +160,7 @@ def train(train_data, test_data=None, model='graphsage_mean'): model_size=model_size, identity_dim = identity_dim, logging=True) - elif model == 'graphsage_meanpool': + elif model == 'meanpool': sampler = UniformNeighborSampler(adj_info) layer_infos = [SAGEInfo("node", sampler, samples_1, dim_1), SAGEInfo("node", sampler, samples_2, dim_2)] @@ -202,7 +192,7 @@ def train(train_data, test_data=None, model='graphsage_mean'): # Initialize session sess = tf.Session(config=config) merged = tf.summary.merge_all() - #summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) + #summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) #we ignore log file # Init variables sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj}) @@ -211,13 +201,15 @@ def train(train_data, test_data=None, model='graphsage_mean'): train_shadow_mrr = None shadow_mrr = None - total_steps = 0 - avg_time = 0.0 epoch_val_costs = [] - train_adj_info = tf.assign(adj_info, minibatch.adj) val_adj_info = tf.assign(adj_info, minibatch.test_adj) + + vectors = None #to store best embs and return at the end + best_result = None + + t1 = time.time() for epoch in range(epochs): minibatch.shuffle() @@ -229,7 +221,7 @@ def train(train_data, test_data=None, model='graphsage_mean'): val_cost = 0 val_mrr = 0 shadow_mrr = 0 - avg_time = 0 + while not minibatch.end(): # Construct feed dictionary feed_dict = minibatch.next_minibatch_feed_dict() @@ -259,35 +251,39 @@ def train(train_data, test_data=None, model='graphsage_mean'): #if total_steps % print_every == 0: #summary_writer.add_summary(outs[0], total_steps) - - # Print results - avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1) iter += 1 total_steps += 1 - if total_steps > max_total_steps: break - epoch += 1 + epoch += 1 + t2 = time.time() + #only print the last iter result at the end of each epoch print("Epoch:", '%04d' % epoch, "train_loss=", "{:.5f}".format(train_cost), - "train_mrr=", "{:.5f}".format(train_mrr), - "train_mrr_ema=", "{:.5f}".format(train_shadow_mrr), # exponential moving average + #"train_mrr=", "{:.5f}".format(train_mrr), + #"train_mrr_ema=", "{:.5f}".format(train_shadow_mrr), "val_loss=", "{:.5f}".format(val_cost), - "val_mrr=", "{:.5f}".format(val_mrr), - "val_mrr_ema=", "{:.5f}".format(shadow_mrr), # exponential moving average - "time=", "{:.5f}".format(avg_time)) + #"val_mrr=", "{:.5f}".format(val_mrr), + #"val_mrr_ema=", "{:.5f}".format(shadow_mrr), + "time cost", "{:.2f}".format(t2-t1)) - if total_steps > max_total_steps: - break - - print("Optimization Finished!") - - sess.run(val_adj_info.op) - #save_val_embeddings(sess, model, minibatch, validate_batch_size, log_dir()) - return save_val_embeddings(sess, model, minibatch, validate_batch_size) #return embs - - -def graphsage_save_embeddings(self, filename): #to do... 
- pass \ No newline at end of file + #no early stopping was used in original code---------------- auto-save-best-emb ------------------------------ + #instead, we will chose the best result by looking at smallest val loss + if epoch == 1: + best_result = val_cost + sess.run(val_adj_info.op) #what is this for before get emb ? if ignore it, get worse result... + vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size) + else: + if best_result > val_cost: #if val loss decreasing + best_result = val_cost + sess.run(val_adj_info.op) #what is this for before get emb ? if ignore it, get worse result... + vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size) + else: + print('val loss increasing @ ', epoch, ' w.r.t. last best epoch, and do not cover previous emb...') + + #sess.run(val_adj_info.op) #what is this for before get emb ? ignore it????? + #vectors = save_val_embeddings(sess, model, minibatch, validate_batch_size) + print("Finished!") + return vectors \ No newline at end of file diff --git a/src/main.py b/src/main.py index ced48d0..35a30ae 100644 --- a/src/main.py +++ b/src/main.py @@ -5,7 +5,7 @@ STEP2: prepare data --> STEP3: learn node embeddings --> STEP4: downstream evaluations -python src/main.py --method abrw --save-emb False +python src/main.py --method abrw by Chengbin Hou 2018 ''' @@ -65,8 +65,9 @@ def parse_args(): parser.add_argument('--emb-file', default='emb/unnamed_node_embs.txt', help='node embeddings file; suggest: data_method_dim_embs.txt') #-------------------------------------------------method settings----------------------------------------------------------- - parser.add_argument('--method', default='abrw', choices=['node2vec', 'deepwalk', 'line', 'gcn', 'grarep', 'tadw', - 'abrw', 'asne', 'aane', 'attrpure', 'attrcomb', 'graphsage'], + parser.add_argument('--method', default='abrw', choices=['deepwalk', 'node2vec', 'line', 'grarep', + 'abrw', 'attrpure', 'attrcomb', 'tadw', 'aane', + 'sagemean','sagegcn', 'gcn', 'asne'], help='choices of Network Embedding methods') parser.add_argument('--ABRW-topk', default=30, type=int, help='select the most attr similar top k nodes of a node; ranging [0, # of nodes]') @@ -134,8 +135,8 @@ def main(args): elif args.graph_format == 'edgelist': g.read_edgelist(path=args.graph_file, weighted=args.weighted, directed=args.directed) #load node attribute info------ - is_ane = (args.method == 'abrw' or args.method == 'tadw' or args.method == 'gcn' or args.method == 'graphsage' or - args.method == 'attrpure' or args.method == 'attrcomb' or args.method == 'asne' or args.method == 'aane') + is_ane = (args.method == 'abrw' or args.method == 'tadw' or args.method == 'gcn' or args.method == 'sagemean' or args.method == 'sagegcn' or + args.method == 'attrpure' or args.method == 'attrcomb' or args.method == 'asne' or args.method == 'aane') if is_ane: assert args.attribute_file != '' g.read_node_attr(args.attribute_file) @@ -188,60 +189,53 @@ def main(args): elif args.method == 'line': #if auto_save, use label to justifiy the best embeddings by looking at micro / macro-F1 score model = line.LINE(graph=g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order, batch_size=args.batch_size, negative_ratio=args.LINE_negative_ratio, label_file=args.label_file, clf_ratio=args.label_reserved, auto_save=True, best='micro') + + elif args.method == 'sagemean': #other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v + model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False) + 
elif args.method == 'sagegcn': #parameters for graphsage models are in 'graphsage' -> '__init__.py' + model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False) elif args.method == 'asne': - if args.task == 'nc': - model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, - X_test=None, Y_test=None, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) - else: - model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, - X_test=test_node_pairs, Y_test=test_edge_labels, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) - elif args.method == 'graphsage': #we follow the default parameters, see __inti__.py in graphsage file - model = graphsageAPI.graphsage_unsupervised_train(graph=g, graphsage_model = 'graphsage_mean') - elif args.method == 'gcn': - model = graphsageAPI.graphsage_unsupervised_train(graph=g, graphsage_model = 'gcn') #graphsage-gcn + model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10) else: - print('no method was found...') + print('method not found...') exit(0) - ''' - elif args.method == 'gcn': #OR use graphsage-gcn as in graphsage method... - assert args.label_file != '' #must have node label - assert args.feature_file != '' #different from previous ANE methods - g.read_node_label(args.label_file) #gcn is an end-to-end supervised ANE methoed - model = gcnAPI.GCN(graph=g, dropout=args.dropout, - weight_decay=args.weight_decay, hidden1=args.hidden, - epochs=args.epochs, clf_ratio=args.label_reserved) - #gcn does not have model.save_embeddings() func - ''' + t2 = time.time() + print(f'STEP3: end learning embeddings; time cost: {(t2-t1):.2f}s') + if args.save_emb: model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime())) print(f'Save node embeddings in file: {args.emb_file}') - t2 = time.time() - print(f'STEP3: end learning embeddings; time cost: {(t2-t1):.2f}s') + + ''' + #to do.... semi-supervised methods: gcn, graphsage, etc... + if args.method == 'gcn': #semi-supervised gcn + assert args.label_file != '' + assert args.feature_file != '' + g.read_node_label(args.label_file) + model = gcnAPI.GCN(graph=g, dropout=args.dropout, weight_decay=args.weight_decay, hidden1=args.hidden, epochs=args.epochs, clf_ratio=args.label_reserved) + print('semi-supervsied method, no embs, exit the program...') #semi-supervised gcn do not produce embs + exit(0) + ''' #---------------------------------------STEP4: downstream task----------------------------------------------- print('\nSTEP4: start evaluating ......: ') t1 = time.time() - if args.method != 'semi_supervised_gcn': #except semi-supervised methods, we will get emb first, and then eval emb - vectors = 0 - if args.method == 'graphsage' or args.method == 'gcn': #to do... run without this 'if' - vectors = model - else: - vectors = model.vectors #for other methods.... 
- del model, g - #------lp task - if args.task == 'lp' or args.task == 'lp_and_nc': - #X_test_lp, Y_test_lp = read_edge_label(args.label_file) #if you want to load your own lp testing data - print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' - + ' (by default, also generate equal negative links for testing)') - clf = lpClassifier(vectors=vectors) #similarity/distance metric as clf; basically, lp is a binary clf probelm - clf.evaluate(test_node_pairs, test_edge_labels) - #------nc task - if args.task == 'nc' or args.task == 'lp_and_nc': - X, Y = read_node_label(args.label_file) - print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%') - clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) #use Logistic Regression as clf; we may choose SVM or more advanced ones - clf.split_train_evaluate(X, Y, args.label_reserved) + vectors = model.vectors + del model, g + #------lp task + if args.task == 'lp' or args.task == 'lp_and_nc': + #X_test_lp, Y_test_lp = read_edge_label(args.label_file) #if you want to load your own lp testing data + print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + + ' (by default, also generate equal negative links for testing)') + clf = lpClassifier(vectors=vectors) #similarity/distance metric as clf; basically, lp is a binary clf probelm + clf.evaluate(test_node_pairs, test_edge_labels) + #------nc task + if args.task == 'nc' or args.task == 'lp_and_nc': + X, Y = read_node_label(args.label_file) + print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%') + clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) #use Logistic Regression as clf; we may choose SVM or more advanced ones + clf.split_train_evaluate(X, Y, args.label_reserved) t2 = time.time() print(f'STEP4: end evaluating; time cost: {(t2-t1):.2f}s')
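For reference, the node-classification evaluation in STEP4 amounts to training a logistic-regression classifier on the reserved fraction of labeled nodes and reporting F1 scores on the rest. A minimal standalone sketch with plain scikit-learn (independent of the ncClassifier/lpClassifier classes in this repo; `embs` and `labels` are hypothetical inputs, and the repo's own classifier additionally handles multi-label data):

```python
# Minimal sketch of STEP4-style node classification on learned embeddings.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def evaluate_nc(embs, labels, label_reserved=0.7, seed=2018):
    ids = [n for n in embs if n in labels]            # nodes that have both an embedding and a label
    X = np.array([embs[n] for n in ids])
    y = np.array([labels[n] for n in ids])
    # keep `label_reserved` of the labeled nodes for training, evaluate on the remainder
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=label_reserved, random_state=seed)
    clf = LogisticRegression().fit(X_tr, y_tr)        # same classifier family as ncClassifier uses
    y_pred = clf.predict(X_te)
    return f1_score(y_te, y_pred, average='micro'), f1_score(y_te, y_pred, average='macro')
```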