diff --git a/requirements.txt b/requirements.txt index 1c5e87b..8c4f577 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,13 @@ # tested in python==3.6.6 numpy==1.14.5 +scipy==1.1.0 tensorflow==1.10.0 # to do... compatible with latest tf and tf-gpu tensorboard==1.10.0 -networkx==2.2 -gensim==3.0.1 +networkx==2.3 +gensim==3.7.3 scikit-learn==0.19.0 # to do... compatible with >0.20 pandas==0.23.0 +psutil==5.6.3 # Enable GPU: # If using anaconda, run `conda install tensorflow-gpu==1.10.0` @@ -15,3 +17,47 @@ pandas==0.23.0 # Or simply build from docker image: docker pull tensorflow/tensorflow:1.10.0-gpu-py3 # ref: https://www.tensorflow.org/install/docker#gpu_support + + +''' +Package Version +--------------- -------- +absl-py 0.7.1 +astor 0.8.0 +boto 2.49.0 +boto3 1.9.160 +botocore 1.12.160 +certifi 2019.3.9 +chardet 3.0.4 +decorator 4.4.0 +docutils 0.14 +gast 0.2.2 +gensim 3.7.3 +grpcio 1.21.1 +idna 2.8 +jmespath 0.9.4 +Markdown 3.1.1 +mkl-fft 1.0.12 +mkl-random 1.0.2 +networkx 2.3 +numpy 1.14.5 +pandas 0.23.0 +pip 19.1.1 +protobuf 3.8.0 +psutil 5.6.3 +python-dateutil 2.8.0 +pytz 2019.1 +requests 2.22.0 +s3transfer 0.2.0 +scikit-learn 0.19.0 +scipy 1.1.0 +setuptools 39.1.0 +six 1.12.0 +smart-open 1.8.4 +tensorboard 1.10.0 +tensorflow 1.10.0 +termcolor 1.1.0 +urllib3 1.25.3 +Werkzeug 0.15.4 +wheel 0.33.4 +''' \ No newline at end of file diff --git a/src/libnrl/aane.py b/src/libnrl/aane.py index 9d53697..fc949cf 100644 --- a/src/libnrl/aane.py +++ b/src/libnrl/aane.py @@ -3,6 +3,17 @@ ANE method: Accelerated Attributed Network Embedding (AANE) modified by Chengbin Hou 2018 +note: We tried this method in a HPC via pbs, + however, we don't know why it is particularly slow, even we observed multiple cores were used... + We then tried this method in a small individual linux server. It works well. + If you find the same problem, just try this method in other computers. + Usually, Cora dataset only requires 20s/iter using my PC with 4 cores. + + However, when we run AANE for the large-scale dataset e.g. dblp (~60k nodes) in a Linux server with 40 cores, + it cost over 8000 seconds for each iteration... + For the reason, please see author's comments in https://github.com/xhuang31/AANE_Python/issues/5 + + originally from https://github.com/xhuang31/AANE_Python """ @@ -33,7 +44,7 @@ class AANE: $Revision: 1.0.2 $ $Date: 2018/02/19 00:00:00 $ """ - def __init__(self, graph, dim, lambd=0.05, rho=5, maxiter=5, mode='comb', *varargs): + def __init__(self, graph, dim, lambd=0.05, rho=5, maxiter=2, mode='comb', *varargs): self.dim = dim self.look_back_list = graph.look_back_list # look back node id for Net and Attr self.lambd = lambd # Initial regularization parameter diff --git a/src/libnrl/abrw.py b/src/libnrl/abrw.py index 225b9e5..a15cd19 100644 --- a/src/libnrl/abrw.py +++ b/src/libnrl/abrw.py @@ -1,5 +1,6 @@ """ -ANE method: Attributed Biased Random Walks; +ANE method: Adap-ANE: Adaptive Attributed Network Embedding + based on previous Attributed Biased Random Walks https://arxiv.org/abs/1811.11728v2 by Chengbin Hou & Zeyu Dong 2018 """ @@ -10,24 +11,46 @@ import warnings import numpy as np from gensim.models import Word2Vec from scipy import sparse +from sklearn.preprocessing import normalize +from sklearn.neighbors import NearestNeighbors +import psutil from . 
import walker from .utils import pairwise_similarity, row_as_probdist + warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim') +def deg2beta_mapping(deg_list, alpha=np.e): + ''' + ***adaptive beta*** + based on node degree for balancing structure info and attribute info + map node degree [0:+inf) -> beta [1:0] + input: deg_list: a scalar or list + alpha: default e; [0, +inf) but we suggest trying 0.5, 1, e, 10, 100, ... + output: beta_list: a scalar or list + ''' + base_list = (1.0 + np.power(deg_list, alpha)) + beta_list = np.power(base_list, -1/alpha) # characteristic curve of adaptive beta + # print('deg_list', deg_list[:50]) + # print('beta_list', np.around(beta_list, decimals=3)[:50]) + return beta_list + + class ABRW(object): - def __init__(self, graph, dim, alpha, topk, number_walks, walk_length, **kwargs): + def __init__(self, graph, dim, topk, beta, beta_mode, alpha, number_walks, walk_length, **kwargs): self.g = graph self.dim = dim - self.alpha = float(alpha) self.topk = int(topk) + self.beta = float(beta) + self.beta_mode = int(beta_mode) + self.alpha = float(alpha) self.number_walks = number_walks self.walk_length = walk_length # obtain biased transition mat ----------- - self.T = self.get_biased_transition_mat(A=self.g.get_adj_mat(), X=self.g.get_attr_mat()) + self.T = self.get_biased_transition_mat(A=self.g.get_adj_mat(dense_output=False), X=self.g.get_attr_mat(dense_output=False)) # aim to generate a sequences of walks/sentences # apply weighted random walks on the reconstructed network based on biased transition mat @@ -54,38 +77,127 @@ class ABRW(object): ''' given: A and X --> T_A and T_X research question: how to combine A and X in a more principled way - genral idea: Attribute Biased Random Walk - i.e. a walker based on a mixed transition matrix by P=alpha*T_A + (1-alpha)*T_X - result: ABRW-trainsition matrix; T + our idea: T = (1-beta)*T_A + beta*T_X + mode 1: fixed beta + mode 2: adaptive beta baed on average degree + mode 3: adaptive beta based on each node degree ''' print("obtaining biased transition matrix where each row sums up to 1.0...") - preserve_zeros = False - T_A = row_as_probdist(A, preserve_zeros) # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row - print('Preserve zero rows of the adj matrix: ', preserve_zeros) - - t1 = time.time() - X_sim = pairwise_similarity(X) # attr similarity mat; X_sim is a square mat, but X is not - - t2 = time.time() - print(f'keep the top {self.topk} attribute similar nodes w.r.t. 
a node') - cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1) - X_sim[(X_sim < cutoff)] = 0 # improve both accuracy and efficiency - X_sim = sparse.csr_matrix(X_sim) - - t3 = time.time() - T_X = row_as_probdist(X_sim) - - t4 = time.time() - print(f'attr sim cal time: {(t2-t1):.2f}s; topk sparse ops time: {(t3-t2):.2f}s; row norm time: {(t4-t3):.2f}s') - del A, X, X_sim - - # =====================================information fusion via transition matrices======================================== - print('------alpha for P = alpha * T_A + (1-alpha) * T_X------: ', self.alpha) + # norm adj mat; For isolated node, return all-zeros row, so that T_A is not a strict transition matrix + # preserve_all_zero_row=False gives similar result, but is less efficient + t0 = time.time() + T_A = row_as_probdist(A, dense_output=False, preserve_all_zero_row=True) # **sparse mat** + T_X = None + n = self.g.get_num_nodes() - alp = np.array(n * [self.alpha]) # for vectorized computation - alp[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 0 - T = sparse.diags(alp).dot(T_A) + sparse.diags(1 - alp).dot(T_X) # sparse version + free_memory = psutil.virtual_memory().available + print('free_memory ', free_memory) + # n*n*8 is the bytes required by pairwise similarity matrix; 2e9 = 2GB ROM remained for safety reason + # if your computer have 200G memory, there should be no problem for graph with 100k nodes + # this naive implementation is **faster** than BallTree implementation, thanks to numpy + #if n*n*8 + n*n*8 + n*5000*8 + 2e9 < free_memory and n < 1e5: # X_sim[n,n] dense + A[n,n] if dense + X[n,5000] if dense with max 5000 feats + 2e9 for safety + if False: + print('naive implementation + intro-select ') + t1 = time.time() + X_sim = pairwise_similarity(X.todense()) + # sparse operator; reduce time and space complexity & remove less useful dissimilar nodes + t2 = time.time() + print(f'keep the top {self.topk} attribute similar nodes w.r.t. a node') + cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1).reshape(-1,1) # introselect average speed O(1); see link below + X_sim[(X_sim < cutoff)] = 0 # https://docs.scipy.org/doc/numpy/reference/generated/numpy.partition.html + X_sim = sparse.csr_matrix(X_sim) + X_sim.setdiag(0) + # norm attr mat; note: T_X mush be a strict transition matrix, thanks to the strict transition matrix of X_sim + t3 = time.time() + T_X = row_as_probdist(X_sim, dense_output=False, preserve_all_zero_row=False) # **sparse mat** + t4 = time.time() + print(f'attr sim cal time: {(t2-t1):.2f}s; topk sparse ops time: {(t3-t2):.2f}s') + print(f'adj row norm time: {(t1-t0):.2f}s; attr row norm time: {(t4-t3):.2f}s') + print('all naive implementation time: ', t4-t1) + del A, X, X_sim, cutoff + # a scalable w.r.t. both time and space + # but might be slightly slower when n is small e.g. 
n<100k + # BallTree time complexity O( nlong(n) ) + else: + print('BallTree implementation + multiprocessor query') + t1 = time.time() + X = normalize(X.todense(), norm='l2', axis=1) + t2 = time.time() + print('normalize time: ',t2-t1) + # after normalization -> Euclidean distance = cosine distance (inverse of cosine similarity) + neigh = NearestNeighbors(n_neighbors=self.topk, algorithm='ball_tree', leaf_size=40, metric='minkowski', p=2, n_jobs=-1) + neigh.fit(X) + t3 = time.time() + print('BallTree time: ',t3-t2) + dist, ind = neigh.kneighbors(X[:]) # Euclidean dist, indices + # print('dist',dist) + # print('ind',ind) + t4 = time.time() + print('query time: ',t4-t3) + sim = 1-np.multiply(dist, dist)/2 # cosine distance -> cosine similarity + # print('sim: ',sim) + t5 = time.time() + print('cosine distance -> cosine similarity time: ',t5-t4) + row = [] + col = [] + data = [] + for i in range(n): + row.extend( [i]* self.topk ) + col.extend( ind[i] ) + data.extend( sim[i] ) + t6 = time.time() + print('sparse matrix data & ind construction for loop time: ',t6-t5) + zero_row_ind = np.where(~X.any(axis=1))[0] + # print('zero_row_ind',zero_row_ind) + X_sim = sparse.csc_matrix((data, (row, col)), shape=(n, n)) + for col in zero_row_ind: + X_sim.data[X_sim.indptr[col]:X_sim.indptr[col+1]] = 0 + X_sim = sparse.csr_matrix(X_sim) + for row in zero_row_ind: + X_sim.data[X_sim.indptr[row]:X_sim.indptr[row+1]] = 0 + X_sim.setdiag(0) + X_sim.eliminate_zeros() + t7 = time.time() + # print(X_sim.todense()) + print('sparse.csr_matrix time:',t7-t6) + T_X = row_as_probdist(X_sim, dense_output=False, preserve_all_zero_row=False) # **sparse mat** + t8 = time.time() + print('BallTree implementation ALL time',t8-t1) + del A, X, X_sim, data, row, col, neigh, sim + + + # ============================================== information fusion via transition matrices ======================================================= + print('about beta, beta_mode, alpha: ', self.beta, self.beta_mode, self.alpha) + b = None + + # mode 1: fixed beta, except if T_A has any zero rows, set beta=1.0 + if self.beta_mode == 1: + print('====== fixed beta: T = (1-beta)*T_A + beta*T_X where beta= ', self.beta) + b = np.array(n * [self.beta]) # vectored computing + b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0 # if T_A has any zero rows, set beta=0 + + # mode 2: adaptive beta baed on average degree which reflects the richness of structural info + if self.beta_mode == 2: + print('====== adaptive beta: T = (1-beta)*T_A + beta*T_X, where adaptive beta=(1.0+ave_deg^alpha)^(-1.0/alpha) and alpha= ', self.alpha) + if self.g.G.is_directed(): + print('directed graph, TODO...') + exit(0) + ave_deg = len(self.g.G.edges()) * 2.0 / len(self.g.G.nodes()) # see def http://konect.uni-koblenz.de/statistics/avgdegree + b = deg2beta_mapping(ave_deg, alpha=self.alpha) # mapping by the characteristic curve of adaptive beta + b = np.array(n * [b]) + b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0 + + # mode 3: adaptive beta based on each node degree + if self.beta_mode == 3: + print('====== adaptive beta: T = (1-beta)*T_A + beta*T_X, where adaptive beta=(1.0+node_deg^alpha)^(-1.0/alpha) and alpha= ', self.alpha) + if self.g.G.is_directed(): + print('directed graph, TODO...') + exit(0) + node_deg_list = [deg*2 for (node, deg) in self.g.G.degree()] # *2 due to undirected graph; in consistant with ave_deg after mapping + b = deg2beta_mapping(node_deg_list, alpha=self.alpha) # mapping by the characteristic curve of adaptive beta + + T = 
sparse.diags(1.0-b).dot(T_A) + sparse.diags(b).dot(T_X) t5 = time.time() print(f'ABRW biased transition matrix processing time: {(t5-t4):.2f}s') return T @@ -97,3 +209,19 @@ class ABRW(object): for node, vec in self.vectors.items(): fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) fout.close() + + +# ------------------------ utils draw_characteristic_curve --------------------------- +def draw_characteristic_curve(): + import matplotlib.pyplot as plt + deg_list = np.arange(0, 100, 0.01) + beta_list_1 = deg2beta_mapping(deg_list, alpha=0.5) + beta_list_2 = deg2beta_mapping(deg_list, alpha=1) + beta_list_3 = deg2beta_mapping(deg_list, alpha=np.e) + beta_list_4 = deg2beta_mapping(deg_list, alpha=10) + plt.plot(deg_list, beta_list_1, label='alpha=0.5') + plt.plot(deg_list, beta_list_2, label='alpha=1') + plt.plot(deg_list, beta_list_3, label='alpha=np.e') + plt.plot(deg_list, beta_list_4, label='alpha=10') + plt.legend() + plt.show() \ No newline at end of file diff --git a/src/libnrl/asne.py b/src/libnrl/asne.py index 2b1211d..13afaae 100644 --- a/src/libnrl/asne.py +++ b/src/libnrl/asne.py @@ -20,7 +20,7 @@ from sklearn.base import BaseEstimator, TransformerMixin class ASNE(BaseEstimator, TransformerMixin): def __init__(self, graph, dim, alpha=1.0, learning_rate=0.0001, batch_size=128, epoch=20, n_neg_samples=10, - early_stopping=2000): # it seems that overfitting can get better result? try other early_stopping... to do... + early_stopping=2000): t1 = time.time() X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size = format_data_from_OpenANE_to_ASNE(g=graph, dim=dim) @@ -121,7 +121,7 @@ class ASNE(BaseEstimator, TransformerMixin): iter_count = 0 train_loss_best = 0 train_loss_keep_increasing = 0 - early_stopping = self.early_stopping # early stopping if training loss increased + early_stopping = self.early_stopping # early stopping if training loss increased for early_stopping times for epoch in range(self.epoch): t1 = time.time() @@ -136,6 +136,13 @@ class ASNE(BaseEstimator, TransformerMixin): # Fit training using batch data train_loss = self.partial_fit(batch_xs) + ''' + # no early stopping; + # as the original code did https://github.com/lizi-git/ASNE/blob/master/SNE.py + # it seems that the more epochs are, the better results are... + # e.g. 10 epochs are obviously better than 1 epoch, + # but after approximately 50 epochs, there is no much gain or loss... 
+ # so we run for all 100 epochs to ensure the best performance iter_count += 1 if iter_count == 1: train_loss_best = train_loss @@ -157,6 +164,7 @@ class ASNE(BaseEstimator, TransformerMixin): return self.vectors else: pass + ''' t2 = time.time() print(f'epoch @ {epoch+1}/{self.epoch}; time cost: {(t2-t1):.2f}s',) @@ -195,7 +203,7 @@ class ASNE(BaseEstimator, TransformerMixin): def format_data_from_OpenANE_to_ASNE(g, dim): ''' convert OpenANE data format to ASNE data format ''' - attr_Matrix = g.get_attr_mat(is_sparse=False) + attr_Matrix = g.get_attr_mat(dense_output=True) id_N = attr_Matrix.shape[0] # n nodes attr_M = attr_Matrix.shape[1] # m features diff --git a/src/libnrl/attrpure.py b/src/libnrl/attrpure.py index 1848e08..a18633f 100644 --- a/src/libnrl/attrpure.py +++ b/src/libnrl/attrpure.py @@ -21,7 +21,7 @@ class ATTRPURE(object): self.vectors[key] = embeddings[ind] def train(self): - X = self.g.get_attr_mat().todense() + X = self.g.get_attr_mat() X_compressed = None if self.mode == 'pca': X_compressed = dim_reduction(X, dim=self.dim, method='pca') diff --git a/src/libnrl/downstream.py b/src/libnrl/downstream.py index 9199fff..fc6ffef 100644 --- a/src/libnrl/downstream.py +++ b/src/libnrl/downstream.py @@ -100,6 +100,21 @@ class lpClassifier(object): roc = 1.0 - roc # since lp is binary clf task, just predict the opposite if<0.5 print("roc=", "{:.9f}".format(roc)) +def cosine_similarity(a, b): + from numpy import dot + from numpy.linalg import norm + ''' cosine similarity; can be used as score function; vector by vector; + If consider similarity for all pairs, + pairwise_similarity() implementation may be more efficient + ''' + a = np.reshape(a,-1) + b = np.reshape(b,-1) + if norm(a)*norm(b) == 0: + return 0.0 + else: + return dot(a, b)/(norm(a)*norm(b)) + +''' def norm(a): sum = 0.0 for i in range(len(a)): @@ -111,6 +126,7 @@ def cosine_similarity(a, b): for i in range(len(a)): sum = sum + a[i] * b[i] return sum / (norm(a) * norm(b) + 1e-100) +''' ''' def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0): diff --git a/src/libnrl/graph.py b/src/libnrl/graph.py index af57000..2218b96 100644 --- a/src/libnrl/graph.py +++ b/src/libnrl/graph.py @@ -92,24 +92,24 @@ class Graph(object): # ------------------------------------------------------------------------------------------ # --------------------commonly used APIs that will not modify graph------------------------- # ------------------------------------------------------------------------------------------ - def get_adj_mat(self, is_sparse=True): + def get_adj_mat(self, dense_output=True): """ return adjacency matrix; \n use 'csr' format for sparse matrix \n """ - if is_sparse: - return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64') - else: + if dense_output: return nx.to_numpy_matrix(self.G, nodelist=self.look_back_list, dtype='float64') + else: + return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64') - def get_attr_mat(self, is_sparse=True): + def get_attr_mat(self, dense_output=True): """ return attribute matrix; \n use 'csr' format for sparse matrix \n """ attr_dense_narray = np.vstack([self.G.nodes[self.look_back_list[i]]['attr'] for i in range(self.get_num_nodes())]) - if is_sparse: - return sp.csr_matrix(attr_dense_narray, dtype='float64') - else: + if dense_output: return np.matrix(attr_dense_narray, dtype='float64') + else: + return sp.csr_matrix(attr_dense_narray, dtype='float64') def get_num_nodes(self): """ return 
the number of nodes """ diff --git a/src/libnrl/graphsage/__init__.py b/src/libnrl/graphsage/__init__.py index afb5be0..f6c1c13 100644 --- a/src/libnrl/graphsage/__init__.py +++ b/src/libnrl/graphsage/__init__.py @@ -1,6 +1,10 @@ -''' global parameters for graphsage models - tune these parameters here if needed - if needed use: from libnrl.graphsage.__init__ import * +''' +global parameters for graphsage models +tune these parameters here if needed +if needed use: from libnrl.graphsage.__init__ import * +we mostly follow the original code: +https://github.com/williamleif/GraphSAGE/blob/master/graphsage/unsupervised_train.py +and https://github.com/tkipf/gcn/blob/master/gcn/train.py ''' # seed = 2018 @@ -8,10 +12,7 @@ # tf.set_random_seed(seed) log_device_placement = False - -# follow the original code by the paper author https://github.com/williamleif/GraphSAGE # we follow the opt parameters given by papers GCN and graphSAGE -# note: citeseer+pubmed all follow the same parameters as cora, see their papers) # tensorflow + Adam optimizer + Random weight init + row norm of attr dim_1 = 64 # dim = dim1+dim2 = 128 for sage-mean and sage-gcn dim_2 = 64 @@ -19,19 +20,20 @@ samples_1 = 25 samples_2 = 10 # key parameters during training -epochs = 100 -learning_rate = 0.001 # search [0.01, 0.001, 0.0001, 0.00001] -dropout = 0.5 -weight_decay = 5e-4 -batch_size = 512 # if run out of memory, try to reduce them, default=512 +epochs = 50 # max epoch, we found it converges in a few epochs, and the more links are, the less epochs are required + # so we set run for all 50 epochs and take out the embeddings with the best val loss +learning_rate = 0.0001 # search [0.01, 0.001, 0.0001] +dropout = 0.5 # dropout rate (1 - keep probability) +batch_size = 128 # if run out of memory, try to reduce them, default=512 +weight_decay = 1e-6 # weight for L2 loss on embedding matrix # key parameters durning val -validate_batch_size = 256 # if run out of memory, try to reduce them, default=256 +validate_batch_size = 128 # if run out of memory, try to reduce them, default=256 validate_iter = 5000 max_total_steps = 10**10 print_every = 50 -# other parameters also follow the defaults https://github.com/williamleif/GraphSAGE +# other parameters: also follow the defaults https://github.com/williamleif/GraphSAGE neg_sample_size = 20 identity_dim = 0 n2v_test_epochs = 1 @@ -44,6 +46,7 @@ base_log_dir = '' ''' +https://github.com/williamleif/GraphSAGE/blob/master/graphsage/unsupervised_train.py #core params.. flags.DEFINE_string('model', 'graphsage', 'model names. 
See README for possible values.') flags.DEFINE_float('learning_rate', 0.00001, 'initial learning rate.') @@ -73,4 +76,19 @@ flags.DEFINE_integer('validate_batch_size', 256, "how many nodes per validation flags.DEFINE_integer('gpu', 1, "which gpu to use.") flags.DEFINE_integer('print_every', 50, "How often to print training info.") flags.DEFINE_integer('max_total_steps', 10**10, "Maximum total number of iterations") + +---------------------------------------------------------------------------------------------------------- + +https://github.com/tkipf/gcn/blob/master/gcn/train.py +flags = tf.app.flags +FLAGS = flags.FLAGS +flags.DEFINE_string('dataset', 'cora', 'Dataset string.') # 'cora', 'citeseer', 'pubmed' +flags.DEFINE_string('model', 'gcn', 'Model string.') # 'gcn', 'gcn_cheby', 'dense' +flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.') +flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.') +flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.') +flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).') +flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.') +flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).') +flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.') ''' diff --git a/src/libnrl/graphsage/graphsageAPI.py b/src/libnrl/graphsage/graphsageAPI.py index 2753263..0be813e 100644 --- a/src/libnrl/graphsage/graphsageAPI.py +++ b/src/libnrl/graphsage/graphsageAPI.py @@ -9,6 +9,7 @@ ''' import random +import time import networkx as nx import numpy as np @@ -24,7 +25,11 @@ class graphSAGE(object): self.walk_len = 5 self.add_train_val_test_to_G(test_perc=0.0, val_perc=0.1) # if unsupervised, no test data + + t1 = time.time() train_data = self.tranform_data_for_graphsage() # obtain graphSAGE required training data + t2 = time.time() + print(f'transform data format from OpenANE to SAGE; time cost: {(t2-t1):.2f}s') self.vectors = None if not is_supervised: diff --git a/src/libnrl/graphsage/unsupervised_train.py b/src/libnrl/graphsage/unsupervised_train.py index a496941..598d9ee 100644 --- a/src/libnrl/graphsage/unsupervised_train.py +++ b/src/libnrl/graphsage/unsupervised_train.py @@ -81,7 +81,7 @@ def construct_placeholders(): def train(train_data, test_data, model): print('---------- the graphsage model we used: ', model) - print('---------- parameters we sued: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size', + print('---------- parameters we used: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size', epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size) G = train_data[0] features = train_data[1] # note: features are in order of graph.look_up_list, since id_map = {k: v for v, k in enumerate(graph.look_back_list)} diff --git a/src/libnrl/tadw.py b/src/libnrl/tadw.py index 55521e1..6d8a73d 100644 --- a/src/libnrl/tadw.py +++ b/src/libnrl/tadw.py @@ -29,7 +29,7 @@ class TADW(object): def getAdj(self): A = self.g.get_adj_mat() # by default, return a sparse matrix - return np.array(row_as_probdist(A, dense_output=True, preserve_zeros=True)) # only support np.array, otherwise dim error... + return np.array(row_as_probdist(A, dense_output=True, preserve_all_zero_row=True)) # only support np.array, otherwise dim error... 
def getT(self): g = self.g.G diff --git a/src/libnrl/utils.py b/src/libnrl/utils.py index 106b433..614cb5b 100644 --- a/src/libnrl/utils.py +++ b/src/libnrl/utils.py @@ -1,5 +1,5 @@ """ -commonly used ulits +commonly used utils by Chengbin Hou & Zeyu Dong """ @@ -11,7 +11,7 @@ from scipy import sparse # ---------------------------------ulits for calculation-------------------------------- -def row_as_probdist(mat, dense_output=False, preserve_zeros=False): +def row_as_probdist(mat, dense_output=True, preserve_all_zero_row=False): """Make each row of matrix sums up to 1.0, i.e., a probability distribution. Support both dense and sparse matrix. @@ -36,7 +36,8 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False): row_sum[zero_rows] = 1 diag = sparse.dia_matrix((1 / row_sum, 0), (mat.shape[0], mat.shape[0])) mat = diag.dot(mat) - if not preserve_zeros: + if not preserve_all_zero_row: + print('For all-zero row, replace each 0 with value 1/dim(row)... not preserving zero i.e. a strict transition matrix') mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1]))) if dense_output and sparse.issparse(mat): @@ -44,7 +45,7 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False): return mat -def pairwise_similarity(mat, type='cosine'): +def pairwise_similarity(mat, type='cosine'): # for efficiency, plz given dense mat as the input if type == 'cosine': # support sprase and dense mat from sklearn.metrics.pairwise import cosine_similarity result = cosine_similarity(mat, dense_output=True) diff --git a/src/libnrl/walker.py b/src/libnrl/walker.py index a092cd6..b810b40 100644 --- a/src/libnrl/walker.py +++ b/src/libnrl/walker.py @@ -20,10 +20,11 @@ class WeightedWalker: ''' Weighted Walker for Attributed Biased Randomw Walks (ABRW) method ''' - def __init__(self, node_id_map, transition_mat, workers): - self.look_back_list = node_id_map + def __init__(self, node_id_map, transition_mat, workers, parallel_walks=10): # recommend: parallel_walks = number_walks + self.look_back_list = node_id_map # if memory error due to python multiprocessor module, plz reduce parallel_walks self.T = transition_mat self.workers = workers + self.parallel_walks = parallel_walks self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph()) # reconstructed "directed" "weighted" graph based on transition matrix # alias sampling for ABRW------------------------- @@ -38,8 +39,10 @@ class WeightedWalker: t2 = time.time() print(f'Time for construct alias table: {(t2-t1):.2f}') - pool = multiprocessing.Pool(processes=self.workers) + pool = multiprocessing.Pool(processes=self.parallel_walks) all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks)) + pool.close() # Waiting for all subprocesses done.. + pool.join() all_walks = list(chain(*all_walks)) t3 = time.time() print(f'Time for all random walks: {(t3-t2):.2f}') # use multiple cores, total time < sum(time@itr) @@ -51,8 +54,12 @@ class WeightedWalker: def mp_rw_wrapper(self, walk_iter): walks = [] + random.seed() # *** for multiprocessor version + np.random.seed() # *** do NOT remove these 'random' operation + nodes = list(self.nodes.copy()) # *** otherwise, each number_walks may give the same node sequences... 
+ random.shuffle(nodes) # *** which hence decrease performance t1 = time.time() - for node in self.nodes: + for node in nodes: walks.append(self.weighted_walk(start_node=node)) t2 = time.time() print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}') @@ -87,11 +94,12 @@ def deepwalk_walk_wrapper(class_instance, walk_length, start_node): class_instance.deepwalk_walk(walk_length, start_node) class BasicWalker: - def __init__(self, g, workers): + def __init__(self, g, workers, parallel_walks=10): # recommend: parallel_walks = number_walks; if memory error, plz reduce it self.g = g self.node_size = g.get_num_nodes() self.look_up_dict = g.look_up_dict self.workers = workers + self.parallel_walks = parallel_walks def deepwalk_walk(self, start_node): ''' @@ -112,8 +120,12 @@ class BasicWalker: def mp_rw_wrapper(self, walk_iter): walks = [] + random.seed() # *** for multiprocessor version + np.random.seed() # *** do NOT remove these 'random' operation + nodes = list(self.nodes.copy()) # *** otherwise, each number_walks may give the same node sequences... + random.shuffle(nodes) # *** which hence decrease performance t1 = time.time() - for node in self.nodes: + for node in nodes: walks.append(self.deepwalk_walk(start_node=node)) t2 = time.time() print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}') @@ -130,8 +142,10 @@ class BasicWalker: all_walks = None t1 = time.time() - pool = multiprocessing.Pool(processes=self.workers) + pool = multiprocessing.Pool(processes=self.parallel_walks) all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks)) + pool.close() # Waiting for all subprocesses done.. + pool.join() all_walks = list(chain(*all_walks)) t2 = time.time() print(f'Time for all random walks: {(t2-t1):.2f}') # use multiple cores, total time < sum(time@itr) @@ -140,11 +154,12 @@ class BasicWalker: # ===========================================node2vec-walker============================================ class Walker: - def __init__(self, g, p, q, workers): + def __init__(self, g, p, q, workers, parallel_walks=10): # recommend: parallel_walks = number_walks; if memory error, plz reduce it self.g = g self.p = p self.q = q self.workers = workers + self.parallel_walks = parallel_walks if self.g.get_isweighted(): # print('is weighted graph: ', self.g.get_isweighted()) @@ -180,8 +195,12 @@ class Walker: def mp_rw_wrapper(self, walk_iter): walks = [] + random.seed() # *** for multiprocessor version + np.random.seed() # *** do NOT remove these 'random' operation + nodes = list(self.nodes.copy()) # *** otherwise, each number_walks may give the same node sequences... + random.shuffle(nodes) # *** which hence decrease performance t1 = time.time() - for node in self.nodes: + for node in nodes: walks.append(self.node2vec_walk(start_node=node)) t2 = time.time() print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}') @@ -197,8 +216,10 @@ class Walker: all_walks = None t1 = time.time() - pool = multiprocessing.Pool(processes=self.workers) + pool = multiprocessing.Pool(processes=self.parallel_walks) all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks)) + pool.close() # Waiting for all subprocesses done.. 
+ pool.join() all_walks = list(chain(*all_walks)) t2 = time.time() print(f'Time for all random walks: {(t2-t1):.2f}') # use multiple cores, total time < sum(time@itr) diff --git a/src/main.py b/src/main.py index ce8ea03..c71a95e 100644 --- a/src/main.py +++ b/src/main.py @@ -11,6 +11,7 @@ by Chengbin HOU 2018 ''' import time +import random from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser from sklearn.linear_model import LogisticRegression # to do... try SVM... @@ -35,9 +36,9 @@ def parse_args(): help='node embeddings dimensions') parser.add_argument('--task', default='lp_and_nc', choices=['none', 'lp', 'nc', 'lp_and_nc'], help='choices of downstream tasks: none, lp, nc, lp_and_nc') - parser.add_argument('--link-remove', default=0.1, type=float, + parser.add_argument('--link-remove', default=0.2, type=float, help='simulate randomly missing links if necessary; a ratio ranging [0.0, 1.0]') - parser.add_argument('--label-reserved', default=0.7, type=float, + parser.add_argument('--label-reserved', default=0.5, type=float, help='for nc task, train/test split, a ratio ranging [0.0, 1.0]') parser.add_argument('--directed', default=False, action='store_true', help='directed or undirected graph') @@ -54,8 +55,12 @@ def parse_args(): help='choices of Network Embedding methods') parser.add_argument('--ABRW-topk', default=30, type=int, help='select the most attr similar top k nodes of a node; ranging [0, # of nodes]') - parser.add_argument('--ABRW-alpha', default=0.8, type=float, - help='balance struc and attr info; ranging [0, 1]') + parser.add_argument('--ABRW-alpha', default=2.71828, type=float, + help='control the shape of characteristic curve of adaptive beta, ranging [0, inf]') + parser.add_argument('--ABRW-beta-mode', default=1, type=int, + help='1: fixed; 2: adaptive based on average degree; 3: adaptive based on each node degree') + parser.add_argument('--ABRW-beta', default=0.2, type=float, + help='balance struc and attr info; ranging [0, 1]; disabled if beta-mode 2 or 3') parser.add_argument('--AANE-lamb', default=0.05, type=float, help='balance struc and attr info; ranging [0, inf]') parser.add_argument('--AANE-rho', default=5, type=float, @@ -64,16 +69,16 @@ def parse_args(): help='max iter') parser.add_argument('--TADW-lamb', default=0.2, type=float, help='balance struc and attr info; ranging [0, inf]') - parser.add_argument('--TADW-maxiter', default=10, type=int, + parser.add_argument('--TADW-maxiter', default=20, type=int, help='max iter') parser.add_argument('--ASNE-lamb', default=1.0, type=float, help='balance struc and attr info; ranging [0, inf]') parser.add_argument('--AttrComb-mode', default='concat', type=str, help='choices of mode: concat, elementwise-mean, elementwise-max') parser.add_argument('--Node2Vec-p', default=0.5, type=float, # if p=q=1.0 node2vec = deepwalk - help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]') + help='trade-off BFS and DFS; grid search [0.25; 0.50; 1; 2; 4]') parser.add_argument('--Node2Vec-q', default=0.5, type=float, - help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]') + help='trade-off BFS and DFS; grid search [0.25; 0.50; 1; 2; 4]') parser.add_argument('--GraRep-kstep', default=4, type=int, help='use k-step transition probability matrix, error if dim%Kstep!=0') parser.add_argument('--LINE-order', default=3, type=int, @@ -87,10 +92,10 @@ def parse_args(): help='length of each random walk') parser.add_argument('--window-size', default=10, type=int, help='window size of skipgram model') - 
parser.add_argument('--workers', default=24, type=int, + parser.add_argument('--workers', default=36, type=int, help='# of parallel processes.') # for deep learning based methods; parameters about layers and neurons used are not specified here - parser.add_argument('--learning-rate', default=0.001, type=float, + parser.add_argument('--learning-rate', default=0.0001, type=float, help='learning rate') parser.add_argument('--batch-size', default=128, type=int, help='batch size') @@ -98,8 +103,6 @@ def parse_args(): help='epochs') parser.add_argument('--dropout', default=0.5, type=float, help='dropout rate (1 - keep probability)') - parser.add_argument('--weight-decay', type=float, default=0.0001, - help='weight for L2 loss on embedding matrix') args = parser.parse_args() return args @@ -111,12 +114,12 @@ def main(args): # ---------------------------------------STEP1: load data----------------------------------------------------- print('\nSTEP1: start loading data......') t1 = time.time() - # load graph structure info------ + # load graph structure info; by defalt, treat as undirected and unweighted graph ------ if args.graph_format == 'adjlist': g.read_adjlist(path=args.graph_file, directed=args.directed) elif args.graph_format == 'edgelist': g.read_edgelist(path=args.graph_file, weighted=args.weighted, directed=args.directed) - # load node attribute info------ + # load node attribute info ------ is_ane = (args.method == 'abrw' or args.method == 'tadw' or args.method == 'gcn' or args.method == 'sagemean' or args.method == 'sagegcn' or args.method == 'attrpure' or args.method == 'attrcomb' or args.method == 'asne' or args.method == 'aane') if is_ane: @@ -133,6 +136,10 @@ def main(args): test_edge_labels = [] if args.task == 'lp' or args.task == 'lp_and_nc': edges_removed = g.remove_edge(ratio=args.link_remove) + num_test_links = 0 + limit_percentage = 0.2 # at most, use 0.2 randomly removed links for testing + num_test_links = int( min(len(edges_removed), len(edges_removed)/args.link_remove*limit_percentage) ) + edges_removed = random.sample(edges_removed, num_test_links) test_node_pairs, test_edge_labels = generate_edges_for_linkpred(graph=g, edges_removed=edges_removed, balance_ratio=1.0) t2 = time.time() print(f'STEP2: end preparing data; time cost: {(t2-t1):.2f}s') @@ -145,9 +152,9 @@ def main(args): t1 = time.time() model = None if args.method == 'abrw': - from libnrl import abrw # ANE method; Attributed Biased Random Walk - model = abrw.ABRW(graph=g, dim=args.dim, alpha=args.ABRW_alpha, topk=args.ABRW_topk, number_walks=args.number_walks, - walk_length=args.walk_length, window=args.window_size, workers=args.workers) + from libnrl import abrw # ANE method; (Adaptive) Attributed Biased Random Walk + model = abrw.ABRW(graph=g, dim=args.dim, topk=args.ABRW_topk, beta=args.ABRW_beta, beta_mode=args.ABRW_beta_mode, alpha=args.ABRW_alpha, + number_walks=args.number_walks, walk_length=args.walk_length, window=args.window_size, workers=args.workers) elif args.method == 'aane': from libnrl import aane # ANE method model = aane.AANE(graph=g, dim=args.dim, lambd=args.AANE_lamb, rho=args.AANE_rho, maxiter=args.AANE_maxiter, @@ -180,10 +187,10 @@ def main(args): elif args.method == 'asne': from libnrl import asne # ANE method model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10) - elif args.method == 'sagemean': # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v + elif 
args.method == 'sagemean': # parameters for graphsage models are in 'graphsage' -> '__init__.py' from libnrl.graphsage import graphsageAPI # ANE method model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False) - elif args.method == 'sagegcn': # parameters for graphsage models are in 'graphsage' -> '__init__.py' + elif args.method == 'sagegcn': # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v from libnrl.graphsage import graphsageAPI # ANE method model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False) else: @@ -204,7 +211,7 @@ def main(args): del model, g # ------lp task if args.task == 'lp' or args.task == 'lp_and_nc': - print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)') + print(f'Link Prediction task; the number of testing links: {len(test_edge_labels)} (i.e., at most 2*0.2*all_positive_links)') ds_task = lpClassifier(vectors=vectors) # similarity/distance metric as clf; basically, lp is a binary clf problem ds_task.evaluate(test_node_pairs, test_edge_labels) # ------nc task
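
The adaptive beta introduced above (deg2beta_mapping in abrw.py) maps node degree to a mixing weight via beta = (1 + deg**alpha)**(-1/alpha): degree 0 gives beta = 1 (rely entirely on attribute info), and beta decays towards 0 roughly like 1/deg as degree grows, with alpha controlling the shape of the curve. A minimal standalone sketch of that mapping, for illustration only (the printed values are approximate):

# --------- sketch: adaptive beta characteristic curve (editorial illustration, not repo code) ---------
import numpy as np

def deg2beta_mapping(deg_list, alpha=np.e):
    # beta = (1 + deg**alpha) ** (-1/alpha), same formula as in abrw.py
    base_list = 1.0 + np.power(deg_list, alpha)
    return np.power(base_list, -1.0 / alpha)

for deg in [0, 1, 2, 8, 64]:
    print(deg, round(float(deg2beta_mapping(deg)), 3))
# with alpha=e: deg=0 -> 1.0, deg=1 -> ~0.78, deg=2 -> ~0.47, deg=8 -> ~0.13, deg=64 -> ~0.016 (close to 1/deg)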
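
In the "naive implementation + intro-select" branch above, the dense attribute-similarity matrix is thinned by keeping only the topk largest entries per row before converting it to CSR; np.partition (introselect) gives the per-row threshold without a full sort. A toy, self-contained version of just that step, with made-up similarity values:

# --------- sketch: top-k row sparsification via np.partition (editorial illustration, not repo code) ---------
import numpy as np
from scipy import sparse

topk = 2
X_sim = np.array([[1.0, 0.9, 0.1, 0.3],
                  [0.9, 1.0, 0.2, 0.8],
                  [0.1, 0.2, 1.0, 0.4],
                  [0.3, 0.8, 0.4, 1.0]])

# per-row threshold = smallest of the topk largest entries in that row
cutoff = np.partition(X_sim, -topk, axis=1)[:, -topk:].min(axis=1).reshape(-1, 1)
X_sim[X_sim < cutoff] = 0                # drop the less similar (less useful) entries
X_sim_sp = sparse.csr_matrix(X_sim)
X_sim_sp.setdiag(0)                      # no self-transition through the attribute channel
X_sim_sp.eliminate_zeros()
print(X_sim_sp.toarray())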
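
The BallTree branch relies on the identity that, for L2-normalised rows, squared Euclidean distance and cosine similarity are linked by ||a - b||^2 = 2 - 2*cos(a, b), which is exactly the sim = 1 - dist^2 / 2 conversion applied after the kneighbors query. A small numerical check of that identity on random toy data (sketch only):

# --------- sketch: cosine similarity from Euclidean k-NN on normalised rows (editorial illustration) ---------
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

X = normalize(np.random.RandomState(0).rand(10, 5), norm='l2', axis=1)   # unit-length rows
neigh = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(X)
dist, ind = neigh.kneighbors(X)                     # Euclidean distances and neighbour indices
sim_from_dist = 1.0 - dist ** 2 / 2.0               # convert distance back to cosine similarity
sim_direct = cosine_similarity(X)[np.arange(10)[:, None], ind]
print(np.allclose(sim_from_dist, sim_direct))       # True (up to floating-point error)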
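
Whichever beta mode is selected, the fusion step builds one per-node vector b and mixes the two transition matrices row by row as T = diag(1-b)*T_A + diag(b)*T_X, forcing b = 1 for isolated nodes so that their rows come entirely from the attribute channel and every row of T still sums to 1. A toy 3-node sketch of that mixing (matrix values are made up):

# --------- sketch: per-node fusion of structural and attribute transition matrices (editorial illustration) ---------
import numpy as np
from scipy import sparse

T_A = sparse.csr_matrix(np.array([[0.0, 0.5, 0.5],
                                  [1.0, 0.0, 0.0],
                                  [0.0, 0.0, 0.0]]))   # node 2 is isolated: all-zero row
T_X = sparse.csr_matrix(np.array([[0.0, 0.8, 0.2],
                                  [0.7, 0.0, 0.3],
                                  [0.5, 0.5, 0.0]]))

b = np.array([0.2, 0.2, 0.2])                          # e.g. fixed beta mode with beta = 0.2
b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0     # isolated node -> walk purely by attributes

T = sparse.diags(1.0 - b).dot(T_A) + sparse.diags(b).dot(T_X)
print(T.toarray())
print(T.sum(axis=1))                                   # every row still sums to 1.0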
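
row_as_probdist in utils.py normalises every row to sum to 1.0 and, unless preserve_all_zero_row=True, replaces an all-zero row with the uniform distribution 1/n so the result is a strict transition matrix. A dense-only re-implementation for illustration (the repo version also accepts sparse input and has a dense_output switch):

# --------- sketch: row normalisation with all-zero-row handling (editorial illustration, not repo code) ---------
import numpy as np

def row_as_probdist_dense(mat, preserve_all_zero_row=False):
    mat = np.asarray(mat, dtype=float)
    row_sum = mat.sum(axis=1)
    zero_rows = row_sum == 0
    row_sum[zero_rows] = 1.0                       # avoid division by zero
    mat = mat / row_sum[:, None]
    if not preserve_all_zero_row:
        mat[zero_rows] = 1.0 / mat.shape[1]        # all-zero row -> uniform 1/n
    return mat

A = np.array([[2.0, 2.0, 0.0],
              [0.0, 0.0, 0.0]])
print(row_as_probdist_dense(A))                              # second row becomes [1/3, 1/3, 1/3]
print(row_as_probdist_dense(A, preserve_all_zero_row=True))  # second row stays all zeros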
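
The walker changes above re-seed random and numpy.random inside every pool task and shuffle a private copy of the node list: with a fork-started multiprocessing.Pool, worker processes otherwise inherit the parent's RNG state, so different walk iterations can come back with identical "random" sequences. A minimal demonstration of that effect, unrelated to the repo's walker classes (behaviour as described holds on fork platforms such as Linux):

# --------- sketch: why each pool task re-seeds its RNG (editorial illustration, not repo code) ---------
import multiprocessing
import numpy as np

def draw_without_reseed(_):
    return np.random.randint(0, 1000)

def draw_with_reseed(_):
    np.random.seed()                      # re-seed from OS entropy inside the task
    return np.random.randint(0, 1000)

if __name__ == '__main__':
    np.random.seed(0)                     # this state is copied into every forked worker
    with multiprocessing.Pool(4) as pool:
        print(pool.map(draw_without_reseed, range(4)))   # often shows repeated values
    with multiprocessing.Pool(4) as pool:
        print(pool.map(draw_with_reseed, range(4)))      # independent values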
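
The new test-link cap in main.py bounds the number of positive links used for link prediction at limit_percentage (0.2) of all positive links, regardless of how many links were removed: len(edges_removed) is roughly link_remove * |E|, so len(edges_removed) / link_remove * 0.2 is roughly 0.2 * |E|. A tiny sketch of that arithmetic with a hypothetical edge count:

# --------- sketch: how many removed links end up in the test set (editorial illustration, hypothetical numbers) ---------
total_edges = 10000                      # hypothetical |E|
limit_percentage = 0.2                   # at most 20% of all positive links used for testing
for link_remove in [0.1, 0.2, 0.5, 0.8]:
    removed = int(total_edges * link_remove)
    num_test_links = int(min(removed, removed / link_remove * limit_percentage))
    print(link_remove, removed, num_test_links)
# 0.1 -> 1000 of 1000 kept; 0.2 -> 2000 of 2000; 0.5 -> 2000 of 5000; 0.8 -> 2000 of 8000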