diff --git a/src/libnrl/asne.py b/src/libnrl/asne.py
index 7ff19ed..7ce1a92 100644
--- a/src/libnrl/asne.py
+++ b/src/libnrl/asne.py
@@ -1,86 +1,39 @@
-# -*- coding: utf-8 -*-
 '''
-Tensorflow implementation of Social Network Embedding framework (SNE)
-@author: Lizi Liao (liaolizi.llz@gmail.com)
-part of code was originally forked from https://github.com/lizi-git/ASNE
+ANE method: Attributed Social Network Embedding (ASNE)
 
 modified by Chengbin Hou 2018
 1) convert OpenANE data format to ASNE data format
-2) compatible with latest tensorflow 1.2
-3) add more comments
-4) support eval testing set during each xx epoches
-5) as ASNE paper stated, we add two hidden layers with softsign activation func
+2) compatible with the latest tensorflow 1.10.0
+3) add early stopping
+4) as the ASNE paper stated, we add two hidden layers with softsign activation func
+
+part of the code was originally forked from https://github.com/lizi-git/ASNE
 '''
 
 import math
 import numpy as np
 import tensorflow as tf
 from sklearn.base import BaseEstimator, TransformerMixin
-from .classify import ncClassifier, lpClassifier, read_node_label
-from sklearn.linear_model import LogisticRegression
-
-def format_data_from_OpenANE_to_ASNE(g, dim):
-    '''
-    convert OpenANE data format to ASNE data format
-    g: OpenANE graph data structure
-    dim: final embedding dim
-    '''
-    attr_Matrix = g.getX()
-    #attr_Matrix = g.preprocessAttrInfo(attr_Matrix, dim=200, method='svd') #similar to aane, the same preprocessing
-    #print('with this preprocessing, ASNE can get better result, as well as, faster speed----------------')
-    id_N = attr_Matrix.shape[0]   #n nodes
-    attr_M = attr_Matrix.shape[1] #m features
-
-    edge_num = len(g.G.edges)  #total edges for traning
-    X={}  #one-to-one correspondence
-    X['data_id_list'] = np.zeros(edge_num)     #start node list for traning
-    X['data_label_list'] = np.zeros(edge_num)  #end node list for training
-    X['data_attr_list'] = np.zeros([edge_num, attr_M])  #attr corresponds to start node
-    edgelist = [edge for edge in g.G.edges]
-    i = 0
-    for edge in edgelist:  #traning sample = start node, end node, start node attr
-        X['data_id_list'][i] = edge[0]
-        X['data_label_list'][i] = edge[1]
-        X['data_attr_list'][i] = attr_Matrix[ g.look_up_dict[edge[0]] ][:]
-        i += 1
-    X['data_id_list'] = X['data_id_list'].reshape(-1).astype(int)
-    X['data_label_list'] = X['data_label_list'].reshape(-1,1).astype(int)
-
-    nodes={}  #one-to-one correspondence
-    nodes['node_id'] = g.look_back_list     #n nodes
-    nodes['node_attr'] = list(attr_Matrix)  #m features -> n*m
-
-    id_embedding_size = int(dim/2)
-    attr_embedding_size = int(dim/2)
-    print('id_embedding_size', id_embedding_size, 'attr_embedding_size', attr_embedding_size)
-    return X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size
-
-
-def add_layer(inputs, in_size, out_size, activation_function=None):
-    # add one more layer and return the output of this layer
-    Weights = tf.Variable(tf.random_uniform([in_size, out_size], -1.0, 1.0)) #init as paper stated
-    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
-    Wx_plus_b = tf.matmul(inputs, Weights) + biases
-    if activation_function is None:
-        outputs = Wx_plus_b
-    else:
-        outputs = activation_function(Wx_plus_b)
-    return outputs
-
+import time
+#from .classify import ncClassifier, lpClassifier, read_node_label
+#from sklearn.linear_model import LogisticRegression
 
 class ASNE(BaseEstimator, TransformerMixin):
-    def __init__(self, graph, dim, alpha = 1.0, batch_size=128, learning_rate=0.001,
-                    n_neg_samples=10, epoch=100, random_seed=2018, X_test=0, Y_test=0, task='nc', nc_ratio=0.5, lp_ratio=0.9, label_file=''):
-        # bind params to class
+    def __init__(self, graph, dim, alpha=1.0, learning_rate=0.0001, batch_size=128, epoch=20, n_neg_samples=10,
+                 early_stopping=2000):  #it seems that overfitting can give a better result? try other early_stopping values... to do...
+
+        t1 = time.time()
         X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size = format_data_from_OpenANE_to_ASNE(g=graph, dim=dim)
+        t2 = time.time()
+        print(f'transform data format from OpenANE to ASNE; time cost: {(t2-t1):.2f}s')
+
         self.node_N = id_N    #n
         self.attr_M = attr_M  #m
         self.X_train = X      #{'data_id_list': [], 'data_label_list': [], 'data_attr_list': []}
         self.nodes = nodes    #{'node_id': [], 'node_attr: []'}
         self.id_embedding_size = id_embedding_size      # set to dim/2
         self.attr_embedding_size = attr_embedding_size  # set to dim/2
-        self.vectors = {}
-        self.dim = dim
+        self.vectors = {}  #final embs
         self.look_back_list = graph.look_back_list  #from OpenANE data stcuture
         self.alpha = alpha  #set to 1.0 by default
@@ -88,19 +41,11 @@ class ASNE(BaseEstimator, TransformerMixin):
         self.batch_size = batch_size  #set to 128 by default
         self.learning_rate = learning_rate
         self.epoch = epoch  #set to 20 by default
-        self.random_seed = random_seed
-        self._init_graph() #init all variables in a tensorflow graph
-        self.task = task
-        self.nc_ratio = nc_ratio
-        self.lp_ratio = lp_ratio
-        if self.task == 'lp': #if not lp task, we do not need to keep testing edges
-            self.X_test = X_test
-            self.Y_test = Y_test
-            self.train() #train our tf asne model-----------------
-        elif self.task == 'nc' or self.task == 'nclp':
-            self.X_nc_label, self.Y_nc_label = read_node_label(label_file)
-            self.train() #train our tf asne model-----------------
+        self._init_graph()  #init all variables in a tensorflow graph
+        self.early_stopping = early_stopping  #early stopping if training loss keeps increasing for xx iterations
+        self.train()
+
 
     def _init_graph(self):
         '''
@@ -110,7 +55,7 @@
         #with self.graph.as_default(), tf.device('/gpu:0'):
         with self.graph.as_default():
             # Set graph level random seed
-            tf.set_random_seed(self.random_seed)
+            #tf.set_random_seed(2018)
             # Input data.
             self.train_data_id = tf.placeholder(tf.int32, shape=[None])                    # batch_size * 1
             self.train_data_attr = tf.placeholder(tf.float32, shape=[None, self.attr_M])   # batch_size * attr_M
@@ -126,25 +71,26 @@
             self.attr_embed = tf.matmul(self.train_data_attr, self.weights['attr_embeddings'])  # batch_size * attr_dim
             self.embed_layer = tf.concat([self.id_embed, self.alpha * self.attr_embed], 1)  # batch_size * (id_dim + attr_dim) #an error due to old tf!
-
-            ## can add hidden_layers component here!
+            '''
+            ## can add hidden_layers component here!----------------------------------
             #0) no hidden layer
             #1) 128
-            #2) 256+128 ##--------paper stated it used two hidden layers with activation function softsign....
+            #2) 256+128 ##--------paper stated it used two hidden layers with softsign
             #3) 512+256+128
-            len_h1_in = self.id_embedding_size+self.attr_embedding_size
-            len_h1_out = 256
+            len_h1_in = self.id_embedding_size + self.attr_embedding_size
+            len_h1_out = 256  #or self.id_embedding_size + self.attr_embedding_size, if we only add h1
             len_h2_in = len_h1_out
-            len_h2_out = 128
+            len_h2_out = self.id_embedding_size + self.attr_embedding_size
             self.h1 = add_layer(inputs=self.embed_layer, in_size=len_h1_in, out_size=len_h1_out, activation_function=tf.nn.softsign)
             self.h2 = add_layer(inputs=self.h1, in_size=len_h2_in, out_size=len_h2_out, activation_function=tf.nn.softsign)
-
+            ## -------------------------------------------------------------------------
+            '''
 
             # Compute the loss, using a sample of the negative labels each time.
-            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = self.weights['out_embeddings'], biases = self.weights['biases'],
-                                inputs = self.h2, labels = self.train_labels, num_sampled = self.n_neg_samples, num_classes=self.node_N))
+            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = self.weights['out_embeddings'], biases = self.weights['biases'],  #if one needs to change layers,
+                                inputs = self.embed_layer, labels = self.train_labels, num_sampled = self.n_neg_samples, num_classes=self.node_N))  #try inputs = self.embed_layer or self.h1 or self.h2 or ...
             # Optimizer.
-            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss) #tune these parameters?
+            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)
             # print("AdamOptimizer")
 
             # init
@@ -171,14 +117,18 @@
         start_index = np.random.randint(0, len(data) - batch_size)
         return data[start_index:(start_index + batch_size)]
 
-    def train(self): # fit a dataset
-        self.Embeddings = []
-        print('Using in + out embedding')
-        for epoch in range( self.epoch ):
-            total_batch = int( len(self.X_train['data_id_list']) / self.batch_size) #total_batch*batch_size = numOFlinks??
-            # print('total_batch in 1 epoch: ', total_batch)
-            # Loop over all batches
+    def train(self):
+        self.Embeddings = []
+        total_batch = int( len(self.X_train['data_id_list']) / self.batch_size)
+        iter_count = 0
+        train_loss_best = 0
+        train_loss_keep_increasing = 0
+        early_stopping = self.early_stopping  #early stopping if training loss keeps increasing
+
+        for epoch in range(self.epoch):
+            t1 = time.time()
+            for i in range(total_batch):
                 # generate a batch data
                 batch_xs = {}
@@ -188,25 +138,40 @@
                 batch_xs['batch_data_label'] = self.X_train['data_label_list'][start_index:(start_index + self.batch_size)]
                 # Fit training using batch data
-                cost = self.partial_fit(batch_xs)
-
-            # Display logs per epoch
-            Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
-            Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
-            self.Embeddings = Embeddings_out + Embeddings_in #simply mean them and as final embedding; try concat? to do...
-            #print('training tensorflow asne model, epoc: ', epoch+1 , ' / ', self.epoch)
-            #to save training time, we delete eval testing data @ each epoch
+                train_loss = self.partial_fit(batch_xs)
+                iter_count += 1
+                if iter_count == 1:
+                    train_loss_best = train_loss
+                else:
+                    if train_loss_best > train_loss:  # training loss decreasing
+                        train_loss_best = train_loss
+                        train_loss_keep_increasing = 0  # reset counter
+                    else:  # training loss not decreasing
+                        train_loss_keep_increasing += 1
+                        if train_loss_keep_increasing > early_stopping:  # early stopping
+                            print(f'early stopping @ iter {iter_count}; take out embs and return')
+                            Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
+                            Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
+                            self.Embeddings = Embeddings_out + Embeddings_in  # simply sum them up as the final embedding; try concat? to do...
+                            ind = 0
+                            for id in self.nodes['node_id']:  #self.nodes['node_id']=self.look_back_list
+                                self.vectors[id] = self.Embeddings[ind]
+                                ind += 1
+                            return self.vectors
+                        else:
+                            pass
+            t2 = time.time()
+            print(f'epoch @ {epoch+1}/{self.epoch}; time cost: {(t2-t1):.2f}s')
-            #-----------for each xx epoches; save embeddings {node_id1: [], node_id2: [], ...}----------
-            if (epoch+1)%1 == 0 and epoch != 0: #for every xx epoches, try eval
-                print('@@@ epoch ------- ', epoch+1 , ' / ', self.epoch)
-                ind = 0
-                for id in self.nodes['node_id']: #self.nodes['node_id']=self.look_back_list
-                    self.vectors[id] = self.Embeddings[ind]
-                    ind += 1
-                #self.eval(vectors=self.vectors)
-        print('please note that: the fianl embedding returned and its output file are not the best embedding!')
-        print('for the best embeddings, please check which epoch got the best eval metric(s)......')
+        print(f'finished all {self.epoch} epochs; take out embs and return')
+        Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
+        Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
+        self.Embeddings = Embeddings_out + Embeddings_in  # simply sum them up as the final embedding; try concat? to do...
+        ind = 0
+        for id in self.nodes['node_id']:  #self.nodes['node_id']=self.look_back_list
+            self.vectors[id] = self.Embeddings[ind]
+            ind += 1
+        return self.vectors
 
     def getEmbedding(self, type, nodes):
@@ -224,21 +189,59 @@
         '''
         fout = open(filename, 'w')
         node_num = len(self.vectors.keys())
-        fout.write("{} {}\n".format(node_num, self.dim))
+        fout.write("{} {}\n".format(node_num, self.id_embedding_size+self.attr_embedding_size))
         for node, vec in self.vectors.items():
             fout.write("{} {}\n".format(node,' '.join([str(x) for x in vec])))
         fout.close()
 
-    def eval(self, vectors):
-        #------nc task
-        if self.task == 'nc' or self.task == 'nclp':
-            print("Training nc classifier using {:.2f}% node labels...".format(self.nc_ratio*100))
-            clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) #use Logistic Regression as clf; we may choose SVM or more advanced ones
-            clf.split_train_evaluate(self.X_nc_label, self.Y_nc_label, self.nc_ratio)
-        #------lp task
-        if self.task == 'lp':
-            #X_test, Y_test = read_edge_label(args.label_file) #enable this if you want to load your own lp testing data, see classfiy.py
-            print("During embedding we have used {:.2f}% links and the remaining will be left for lp evaluation...".format(self.lp_ratio*100))
-            clf = lpClassifier(vectors=vectors) #similarity/distance metric as clf; basically, lp is a binary clf probelm
-            clf.evaluate(self.X_test, self.Y_test)
+
+# ---------------------------------------------- ASNE utils ------------------------------------------------
+def format_data_from_OpenANE_to_ASNE(g, dim):
+    ''' convert OpenANE data format to ASNE data format '''
+    attr_Matrix = g.get_attr_mat(is_sparse=False)
+    id_N = attr_Matrix.shape[0]   #n nodes
+    attr_M = attr_Matrix.shape[1] #m features
+    X = {}
+    X['data_id_list'] = []
+    X['data_label_list'] = []
+    X['data_attr_list'] = []
+    edgelist = [edge for edge in g.G.edges]
+    print('If an edge has only one direction, double it......')
+    cnt = 0
+    for edge in edgelist:  #training sample = start node, end node, start node attr
+        X['data_id_list'].append(edge[0])
+        X['data_label_list'].append(edge[1])
+        X['data_attr_list'].append(attr_Matrix[ g.look_up_dict[edge[0]] ][:])
+        cnt += 1
+        if (edge[1], edge[0]) not in edgelist:  # double it, as the paper said--------------
+            X['data_id_list'].append(edge[1])
+            X['data_label_list'].append(edge[0])
+            X['data_attr_list'].append(attr_Matrix[ g.look_up_dict[edge[1]] ][:])
+            cnt += 1
+    print(f'edges before doubling: {g.get_num_edges()}')
+    print(f'edges after doubling: {cnt}')
+
+    X['data_id_list'] = np.array(X['data_id_list']).reshape(-1).astype(int)
+    X['data_label_list'] = np.array(X['data_label_list']).reshape(-1,1).astype(int)
+    X['data_attr_list'] = np.array(X['data_attr_list']).reshape(cnt,attr_M)
+
+    nodes={}
+    nodes['node_id'] = g.look_back_list
+    nodes['node_attr'] = attr_Matrix
+
+    id_embedding_size = int(dim/2)
+    attr_embedding_size = int(dim/2)
+    print('id_embedding_size', id_embedding_size, '\nattr_embedding_size', attr_embedding_size)
+    return X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size
+
+
+def add_layer(inputs, in_size, out_size, activation_function=None):
+    # add one more layer and return the output of this layer
+    Weights = tf.Variable(tf.random_uniform([in_size, out_size], -1.0, 1.0)) #init as paper stated
+    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
+    Wx_plus_b = tf.matmul(inputs, Weights) + biases
+    if activation_function is None:
+        outputs = Wx_plus_b
+    else:
+        outputs = activation_function(Wx_plus_b)
+    return outputs
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
index f066d04..35a30ae 100644
--- a/src/main.py
+++ b/src/main.py
@@ -194,14 +194,8 @@ def main(args):
         model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
     elif args.method == 'sagegcn': #parameters for graphsage models are in 'graphsage' -> '__init__.py'
         model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)
-    elif args.method == 'asne':
-        if args.task == 'nc':
-            model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size,
-                              X_test=None, Y_test=None, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file)
-        else:
-            model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size,
-                              X_test=test_node_pairs, Y_test=test_edge_labels, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file)
+    elif args.method == 'asne':
+        model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
     else:
         print('method not found...')
         exit(0)
@@ -215,9 +209,9 @@ def main(args):
     '''
     #to do.... semi-supervised methods: gcn, graphsage, etc...
     if args.method == 'gcn': #semi-supervised gcn
-        assert args.label_file != ''    #must have node label
-        assert args.feature_file != ''  #different from previous ANE methods
-        g.read_node_label(args.label_file)  #gcn is an end-to-end supervised ANE methoed
+        assert args.label_file != ''
+        assert args.feature_file != ''
+        g.read_node_label(args.label_file)
         model = gcnAPI.GCN(graph=g, dropout=args.dropout, weight_decay=args.weight_decay, hidden1=args.hidden, epochs=args.epochs, clf_ratio=args.label_reserved)
         print('semi-supervsied method, no embs, exit the program...') #semi-supervised gcn do not produce embs
         exit(0)
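
A note on the early-stopping rule added in train() above: it is a patience counter on the raw training-batch loss rather than on a validation metric, and the counter resets whenever a new best loss appears, so early_stopping=2000 effectively means "give up after 2000 consecutive batches without improvement". A minimal, framework-independent sketch of the same rule (train_with_patience, fit_batch, and batches are illustrative names, not part of the repo):

    # Patience-based early stopping on training loss, mirroring ASNE.train() above.
    # fit_batch is a stand-in for ASNE.partial_fit: any callable returning a loss works.
    def train_with_patience(fit_batch, batches, patience=2000):
        best_loss = float('inf')  # lowest training loss seen so far
        bad_iters = 0             # consecutive iterations without improvement
        it = 0
        for it, batch in enumerate(batches, start=1):
            loss = fit_batch(batch)
            if loss < best_loss:  # improved: remember it and reset the counter
                best_loss = loss
                bad_iters = 0
            else:                 # no improvement: count it
                bad_iters += 1
                if bad_iters > patience:
                    print(f'early stopping @ iter {it}')
                    break
        return it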
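
format_data_from_OpenANE_to_ASNE emits one (start id, end id, start-node attributes) triple per directed edge and doubles any edge whose reverse is missing, as the paper prescribes. One caveat: `(edge[1], edge[0]) not in edgelist` scans a Python list, so the doubling pass costs O(E^2) overall; hashing the edge list first makes it linear. A small self-contained sketch of that variant on toy data (attrs and edges are made up; the real code reads them from the OpenANE graph object):

    import numpy as np

    attrs = np.eye(4, 3)                      # toy attributes: 4 nodes, 3 features
    edges = [(0, 1), (1, 0), (1, 2), (2, 3)]  # toy directed edge list

    edge_set = set(edges)                     # O(1) membership test instead of a list scan
    ids, labels, feats = [], [], []
    for u, v in edges:
        ids.append(u); labels.append(v); feats.append(attrs[u])
        if (v, u) not in edge_set:            # only double edges lacking a reverse
            ids.append(v); labels.append(u); feats.append(attrs[v])

    X = {'data_id_list': np.array(ids, dtype=int),                       # (cnt,)
         'data_label_list': np.array(labels, dtype=int).reshape(-1, 1),  # (cnt, 1)
         'data_attr_list': np.vstack(feats)}                             # (cnt, attr_M)
    print(len(edges), '->', len(ids))         # 4 -> 6: (1,2) and (2,3) get doubled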
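
The network itself is compact: look up an id embedding, project the attribute vector with a learned linear map, concatenate the two with the attribute half scaled by alpha, and train against a sampled softmax over all node ids. The following condensed TF 1.x sketch mirrors _init_graph above with hypothetical sizes; it illustrates the objective and is not a drop-in replacement for the class:

    import tensorflow as tf

    node_N, attr_M, id_dim, attr_dim = 1000, 50, 64, 64  # hypothetical sizes
    alpha, n_neg = 1.0, 10

    train_id = tf.placeholder(tf.int32, shape=[None])              # start-node ids
    train_attr = tf.placeholder(tf.float32, shape=[None, attr_M])  # their attribute rows
    train_label = tf.placeholder(tf.int32, shape=[None, 1])        # end-node ids

    id_table = tf.Variable(tf.random_uniform([node_N, id_dim], -1.0, 1.0))
    attr_proj = tf.Variable(tf.random_normal([attr_M, attr_dim]))
    out_table = tf.Variable(tf.random_uniform([node_N, id_dim + attr_dim], -1.0, 1.0))
    out_bias = tf.Variable(tf.zeros([node_N]))

    id_embed = tf.nn.embedding_lookup(id_table, train_id)      # batch * id_dim
    attr_embed = tf.matmul(train_attr, attr_proj)              # batch * attr_dim
    embed_layer = tf.concat([id_embed, alpha * attr_embed], 1)  # batch * (id_dim + attr_dim)

    # each step contrasts the true end node against n_neg sampled negative nodes
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
        weights=out_table, biases=out_bias, labels=train_label,
        inputs=embed_layer, num_sampled=n_neg, num_classes=node_N))
    train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(loss)

A node's final embedding is then the element-wise sum of its row in out_table (the "out" embedding) and its concatenated "in" embedding, exactly as train() above assembles self.Embeddings.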