This commit is contained in:
houchengbin 2019-07-16 14:03:43 +01:00
parent d08aab00d0
commit c7f06f54e5
14 changed files with 356 additions and 95 deletions

View File

@ -1,11 +1,13 @@
# tested in python==3.6.6
numpy==1.14.5
scipy==1.1.0
tensorflow==1.10.0 # to do... compatible with latest tf and tf-gpu
tensorboard==1.10.0
networkx==2.2
gensim==3.0.1
networkx==2.3
gensim==3.7.3
scikit-learn==0.19.0 # to do... compatible with >0.20
pandas==0.23.0
psutil==5.6.3
# Enable GPU:
# If using anaconda, run `conda install tensorflow-gpu==1.10.0`
@ -15,3 +17,47 @@ pandas==0.23.0
# Or simply build from docker image: docker pull tensorflow/tensorflow:1.10.0-gpu-py3
# ref: https://www.tensorflow.org/install/docker#gpu_support
'''
Package Version
--------------- --------
absl-py 0.7.1
astor 0.8.0
boto 2.49.0
boto3 1.9.160
botocore 1.12.160
certifi 2019.3.9
chardet 3.0.4
decorator 4.4.0
docutils 0.14
gast 0.2.2
gensim 3.7.3
grpcio 1.21.1
idna 2.8
jmespath 0.9.4
Markdown 3.1.1
mkl-fft 1.0.12
mkl-random 1.0.2
networkx 2.3
numpy 1.14.5
pandas 0.23.0
pip 19.1.1
protobuf 3.8.0
psutil 5.6.3
python-dateutil 2.8.0
pytz 2019.1
requests 2.22.0
s3transfer 0.2.0
scikit-learn 0.19.0
scipy 1.1.0
setuptools 39.1.0
six 1.12.0
smart-open 1.8.4
tensorboard 1.10.0
tensorflow 1.10.0
termcolor 1.1.0
urllib3 1.25.3
Werkzeug 0.15.4
wheel 0.33.4
'''

View File

@ -3,6 +3,17 @@ ANE method: Accelerated Attributed Network Embedding (AANE)
modified by Chengbin Hou 2018
note: We tried this method in a HPC via pbs,
however, we don't know why it is particularly slow, even though we observed that multiple cores were used...
We then tried this method in a small individual linux server. It works well.
If you find the same problem, just try this method in other computers.
Usually, Cora dataset only requires 20s/iter using my PC with 4 cores.
However, when we ran AANE on a large-scale dataset, e.g. dblp (~60k nodes), on a Linux server with 40 cores,
it took over 8000 seconds per iteration...
For the reason, please see author's comments in https://github.com/xhuang31/AANE_Python/issues/5
originally from https://github.com/xhuang31/AANE_Python
"""
@ -33,7 +44,7 @@ class AANE:
$Revision: 1.0.2 $ $Date: 2018/02/19 00:00:00 $
"""
def __init__(self, graph, dim, lambd=0.05, rho=5, maxiter=5, mode='comb', *varargs):
def __init__(self, graph, dim, lambd=0.05, rho=5, maxiter=2, mode='comb', *varargs):
self.dim = dim
self.look_back_list = graph.look_back_list # look back node id for Net and Attr
self.lambd = lambd # Initial regularization parameter

View File

@ -1,5 +1,6 @@
"""
ANE method: Attributed Biased Random Walks;
ANE method: Adap-ANE: Adaptive Attributed Network Embedding
based on previous Attributed Biased Random Walks https://arxiv.org/abs/1811.11728v2
by Chengbin Hou & Zeyu Dong 2018
"""
@ -10,24 +11,46 @@ import warnings
import numpy as np
from gensim.models import Word2Vec
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
import psutil
from . import walker
from .utils import pairwise_similarity, row_as_probdist
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
def deg2beta_mapping(deg_list, alpha=np.e):
    '''
    ***adaptive beta***
    based on node degree for balancing structure info and attribute info
    map node degree [0:+inf) -> beta [1:0] via the characteristic curve
        beta = (1 + deg^alpha)^(-1/alpha)
    input:  deg_list: a scalar or a list/array of degrees
            alpha: default e; must be non-zero since the exponent is -1/alpha;
                   we suggest trying 0.5, 1, e, 10, 100, ...
    output: beta_list: a numpy scalar for scalar input, otherwise a numpy array
    '''
    # Cast to float before exponentiation: integer degrees raised to an
    # integer alpha would otherwise be computed in integer arithmetic and
    # silently overflow (e.g. 100**10 does not fit in int64).
    deg_arr = np.asarray(deg_list, dtype=np.float64)
    base_list = 1.0 + np.power(deg_arr, alpha)
    beta_list = np.power(base_list, -1.0 / alpha)  # characteristic curve of adaptive beta
    return beta_list
class ABRW(object):
def __init__(self, graph, dim, alpha, topk, number_walks, walk_length, **kwargs):
def __init__(self, graph, dim, topk, beta, beta_mode, alpha, number_walks, walk_length, **kwargs):
self.g = graph
self.dim = dim
self.alpha = float(alpha)
self.topk = int(topk)
self.beta = float(beta)
self.beta_mode = int(beta_mode)
self.alpha = float(alpha)
self.number_walks = number_walks
self.walk_length = walk_length
# obtain biased transition mat -----------
self.T = self.get_biased_transition_mat(A=self.g.get_adj_mat(), X=self.g.get_attr_mat())
self.T = self.get_biased_transition_mat(A=self.g.get_adj_mat(dense_output=False), X=self.g.get_attr_mat(dense_output=False))
# aim to generate a sequences of walks/sentences
# apply weighted random walks on the reconstructed network based on biased transition mat
@ -54,38 +77,127 @@ class ABRW(object):
'''
given: A and X --> T_A and T_X
research question: how to combine A and X in a more principled way
genral idea: Attribute Biased Random Walk
i.e. a walker based on a mixed transition matrix by P=alpha*T_A + (1-alpha)*T_X
result: ABRW-trainsition matrix; T
our idea: T = (1-beta)*T_A + beta*T_X
mode 1: fixed beta
mode 2: adaptive beta based on average degree
mode 3: adaptive beta based on each node degree
'''
print("obtaining biased transition matrix where each row sums up to 1.0...")
preserve_zeros = False
T_A = row_as_probdist(A, preserve_zeros) # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row
print('Preserve zero rows of the adj matrix: ', preserve_zeros)
# norm adj mat; For isolated node, return all-zeros row, so that T_A is not a strict transition matrix
# preserve_all_zero_row=False gives similar result, but is less efficient
t0 = time.time()
T_A = row_as_probdist(A, dense_output=False, preserve_all_zero_row=True) # **sparse mat**
T_X = None
t1 = time.time()
X_sim = pairwise_similarity(X) # attr similarity mat; X_sim is a square mat, but X is not
t2 = time.time()
print(f'keep the top {self.topk} attribute similar nodes w.r.t. a node')
cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1)
X_sim[(X_sim < cutoff)] = 0 # improve both accuracy and efficiency
X_sim = sparse.csr_matrix(X_sim)
t3 = time.time()
T_X = row_as_probdist(X_sim)
t4 = time.time()
print(f'attr sim cal time: {(t2-t1):.2f}s; topk sparse ops time: {(t3-t2):.2f}s; row norm time: {(t4-t3):.2f}s')
del A, X, X_sim
# =====================================information fusion via transition matrices========================================
print('------alpha for P = alpha * T_A + (1-alpha) * T_X------: ', self.alpha)
n = self.g.get_num_nodes()
alp = np.array(n * [self.alpha]) # for vectorized computation
alp[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 0
T = sparse.diags(alp).dot(T_A) + sparse.diags(1 - alp).dot(T_X) # sparse version
free_memory = psutil.virtual_memory().available
print('free_memory ', free_memory)
# n*n*8 is the number of bytes required by the pairwise similarity matrix; 2e9 = 2GB RAM reserved for safety
# if your computer has 200G memory, there should be no problem for a graph with 100k nodes
# this naive implementation is **faster** than BallTree implementation, thanks to numpy
#if n*n*8 + n*n*8 + n*5000*8 + 2e9 < free_memory and n < 1e5: # X_sim[n,n] dense + A[n,n] if dense + X[n,5000] if dense with max 5000 feats + 2e9 for safety
if False:
print('naive implementation + intro-select ')
t1 = time.time()
X_sim = pairwise_similarity(X.todense())
# sparse operator; reduce time and space complexity & remove less useful dissimilar nodes
t2 = time.time()
print(f'keep the top {self.topk} attribute similar nodes w.r.t. a node')
cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1).reshape(-1,1) # introselect average speed O(1); see link below
X_sim[(X_sim < cutoff)] = 0 # https://docs.scipy.org/doc/numpy/reference/generated/numpy.partition.html
X_sim = sparse.csr_matrix(X_sim)
X_sim.setdiag(0)
# norm attr mat; note: T_X must be a strict transition matrix, thanks to the strict transition matrix of X_sim
t3 = time.time()
T_X = row_as_probdist(X_sim, dense_output=False, preserve_all_zero_row=False) # **sparse mat**
t4 = time.time()
print(f'attr sim cal time: {(t2-t1):.2f}s; topk sparse ops time: {(t3-t2):.2f}s')
print(f'adj row norm time: {(t1-t0):.2f}s; attr row norm time: {(t4-t3):.2f}s')
print('all naive implementation time: ', t4-t1)
del A, X, X_sim, cutoff
# a scalable w.r.t. both time and space
# but might be slightly slower when n is small e.g. n<100k
# BallTree time complexity O( n log(n) )
else:
print('BallTree implementation + multiprocessor query')
t1 = time.time()
X = normalize(X.todense(), norm='l2', axis=1)
t2 = time.time()
print('normalize time: ',t2-t1)
# after normalization -> Euclidean distance = cosine distance (inverse of cosine similarity)
neigh = NearestNeighbors(n_neighbors=self.topk, algorithm='ball_tree', leaf_size=40, metric='minkowski', p=2, n_jobs=-1)
neigh.fit(X)
t3 = time.time()
print('BallTree time: ',t3-t2)
dist, ind = neigh.kneighbors(X[:]) # Euclidean dist, indices
# print('dist',dist)
# print('ind',ind)
t4 = time.time()
print('query time: ',t4-t3)
sim = 1-np.multiply(dist, dist)/2 # cosine distance -> cosine similarity
# print('sim: ',sim)
t5 = time.time()
print('cosine distance -> cosine similarity time: ',t5-t4)
row = []
col = []
data = []
for i in range(n):
row.extend( [i]* self.topk )
col.extend( ind[i] )
data.extend( sim[i] )
t6 = time.time()
print('sparse matrix data & ind construction for loop time: ',t6-t5)
zero_row_ind = np.where(~X.any(axis=1))[0]
# print('zero_row_ind',zero_row_ind)
X_sim = sparse.csc_matrix((data, (row, col)), shape=(n, n))
for col in zero_row_ind:
X_sim.data[X_sim.indptr[col]:X_sim.indptr[col+1]] = 0
X_sim = sparse.csr_matrix(X_sim)
for row in zero_row_ind:
X_sim.data[X_sim.indptr[row]:X_sim.indptr[row+1]] = 0
X_sim.setdiag(0)
X_sim.eliminate_zeros()
t7 = time.time()
# print(X_sim.todense())
print('sparse.csr_matrix time:',t7-t6)
T_X = row_as_probdist(X_sim, dense_output=False, preserve_all_zero_row=False) # **sparse mat**
t8 = time.time()
print('BallTree implementation ALL time',t8-t1)
del A, X, X_sim, data, row, col, neigh, sim
# ============================================== information fusion via transition matrices =======================================================
print('about beta, beta_mode, alpha: ', self.beta, self.beta_mode, self.alpha)
b = None
# mode 1: fixed beta, except if T_A has any zero rows, set beta=1.0
if self.beta_mode == 1:
print('====== fixed beta: T = (1-beta)*T_A + beta*T_X where beta= ', self.beta)
b = np.array(n * [self.beta]) # vectored computing
b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0 # if T_A has any zero rows, set beta=0
# mode 2: adaptive beta based on average degree which reflects the richness of structural info
if self.beta_mode == 2:
print('====== adaptive beta: T = (1-beta)*T_A + beta*T_X, where adaptive beta=(1.0+ave_deg^alpha)^(-1.0/alpha) and alpha= ', self.alpha)
if self.g.G.is_directed():
print('directed graph, TODO...')
exit(0)
ave_deg = len(self.g.G.edges()) * 2.0 / len(self.g.G.nodes()) # see def http://konect.uni-koblenz.de/statistics/avgdegree
b = deg2beta_mapping(ave_deg, alpha=self.alpha) # mapping by the characteristic curve of adaptive beta
b = np.array(n * [b])
b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0
# mode 3: adaptive beta based on each node degree
if self.beta_mode == 3:
print('====== adaptive beta: T = (1-beta)*T_A + beta*T_X, where adaptive beta=(1.0+node_deg^alpha)^(-1.0/alpha) and alpha= ', self.alpha)
if self.g.G.is_directed():
print('directed graph, TODO...')
exit(0)
node_deg_list = [deg*2 for (node, deg) in self.g.G.degree()] # *2 due to undirected graph; in consistant with ave_deg after mapping
b = deg2beta_mapping(node_deg_list, alpha=self.alpha) # mapping by the characteristic curve of adaptive beta
T = sparse.diags(1.0-b).dot(T_A) + sparse.diags(b).dot(T_X)
t5 = time.time()
print(f'ABRW biased transition matrix processing time: {(t5-t4):.2f}s')
return T
@ -97,3 +209,19 @@ class ABRW(object):
for node, vec in self.vectors.items():
fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
fout.close()
# ------------------------ utils draw_characteristic_curve ---------------------------
def draw_characteristic_curve():
    ''' plot the degree -> beta characteristic curve of deg2beta_mapping for several alpha values '''
    import matplotlib.pyplot as plt

    degrees = np.arange(0, 100, 0.01)
    # (alpha value, legend label) for every curve, in plotting order
    curve_specs = (
        (0.5, 'alpha=0.5'),
        (1, 'alpha=1'),
        (np.e, 'alpha=np.e'),
        (10, 'alpha=10'),
    )
    # evaluate all curves first, then draw them one by one
    betas = [deg2beta_mapping(degrees, alpha=alpha_value) for alpha_value, _ in curve_specs]
    for (_, legend_label), beta_values in zip(curve_specs, betas):
        plt.plot(degrees, beta_values, label=legend_label)
    plt.legend()
    plt.show()

View File

@ -20,7 +20,7 @@ from sklearn.base import BaseEstimator, TransformerMixin
class ASNE(BaseEstimator, TransformerMixin):
def __init__(self, graph, dim, alpha=1.0, learning_rate=0.0001, batch_size=128, epoch=20, n_neg_samples=10,
early_stopping=2000): # it seems that overfitting can get better result? try other early_stopping... to do...
early_stopping=2000):
t1 = time.time()
X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size = format_data_from_OpenANE_to_ASNE(g=graph, dim=dim)
@ -121,7 +121,7 @@ class ASNE(BaseEstimator, TransformerMixin):
iter_count = 0
train_loss_best = 0
train_loss_keep_increasing = 0
early_stopping = self.early_stopping # early stopping if training loss increased
early_stopping = self.early_stopping # early stopping if training loss increased for early_stopping times
for epoch in range(self.epoch):
t1 = time.time()
@ -136,6 +136,13 @@ class ASNE(BaseEstimator, TransformerMixin):
# Fit training using batch data
train_loss = self.partial_fit(batch_xs)
'''
# no early stopping;
# as the original code did https://github.com/lizi-git/ASNE/blob/master/SNE.py
# it seems that the more epochs are, the better results are...
# e.g. 10 epochs are obviously better than 1 epoch,
# but after approximately 50 epochs, there is no much gain or loss...
# so we run for all 100 epochs to ensure the best performance
iter_count += 1
if iter_count == 1:
train_loss_best = train_loss
@ -157,6 +164,7 @@ class ASNE(BaseEstimator, TransformerMixin):
return self.vectors
else:
pass
'''
t2 = time.time()
print(f'epoch @ {epoch+1}/{self.epoch}; time cost: {(t2-t1):.2f}s',)
@ -195,7 +203,7 @@ class ASNE(BaseEstimator, TransformerMixin):
def format_data_from_OpenANE_to_ASNE(g, dim):
''' convert OpenANE data format to ASNE data format '''
attr_Matrix = g.get_attr_mat(is_sparse=False)
attr_Matrix = g.get_attr_mat(dense_output=True)
id_N = attr_Matrix.shape[0] # n nodes
attr_M = attr_Matrix.shape[1] # m features

View File

@ -21,7 +21,7 @@ class ATTRPURE(object):
self.vectors[key] = embeddings[ind]
def train(self):
X = self.g.get_attr_mat().todense()
X = self.g.get_attr_mat()
X_compressed = None
if self.mode == 'pca':
X_compressed = dim_reduction(X, dim=self.dim, method='pca')

View File

@ -100,6 +100,21 @@ class lpClassifier(object):
roc = 1.0 - roc # since lp is binary clf task, just predict the opposite if<0.5
print("roc=", "{:.9f}".format(roc))
def cosine_similarity(a, b):
    ''' cosine similarity; can be used as score function; vector by vector;
        If consider similarity for all pairs,
        pairwise_similarity() implementation may be more efficient

        input:  a, b: array-likes holding the same number of elements (any shape; flattened to 1-D)
        output: dot(a, b) / (|a| * |b|), or 0.0 if either vector has zero norm
    '''
    # np.asarray first, so that np.matrix inputs are flattened to true 1-D arrays
    a = np.asarray(a).reshape(-1)
    b = np.asarray(b).reshape(-1)
    # compute the norm product once and reuse it for both the zero check and the division
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product
'''
def norm(a):
sum = 0.0
for i in range(len(a)):
@ -111,6 +126,7 @@ def cosine_similarity(a, b):
for i in range(len(a)):
sum = sum + a[i] * b[i]
return sum / (norm(a) * norm(b) + 1e-100)
'''
'''
def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):

View File

@ -92,24 +92,24 @@ class Graph(object):
# ------------------------------------------------------------------------------------------
# --------------------commonly used APIs that will not modify graph-------------------------
# ------------------------------------------------------------------------------------------
def get_adj_mat(self, is_sparse=True):
def get_adj_mat(self, dense_output=True):
""" return adjacency matrix; \n
use 'csr' format for sparse matrix \n
"""
if is_sparse:
return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64')
else:
if dense_output:
return nx.to_numpy_matrix(self.G, nodelist=self.look_back_list, dtype='float64')
else:
return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64')
def get_attr_mat(self, is_sparse=True):
def get_attr_mat(self, dense_output=True):
""" return attribute matrix; \n
use 'csr' format for sparse matrix \n
"""
attr_dense_narray = np.vstack([self.G.nodes[self.look_back_list[i]]['attr'] for i in range(self.get_num_nodes())])
if is_sparse:
return sp.csr_matrix(attr_dense_narray, dtype='float64')
else:
if dense_output:
return np.matrix(attr_dense_narray, dtype='float64')
else:
return sp.csr_matrix(attr_dense_narray, dtype='float64')
def get_num_nodes(self):
""" return the number of nodes """

View File

@ -1,6 +1,10 @@
''' global parameters for graphsage models
tune these parameters here if needed
if needed use: from libnrl.graphsage.__init__ import *
'''
global parameters for graphsage models
tune these parameters here if needed
if needed use: from libnrl.graphsage.__init__ import *
we mostly follow the original code:
https://github.com/williamleif/GraphSAGE/blob/master/graphsage/unsupervised_train.py
and https://github.com/tkipf/gcn/blob/master/gcn/train.py
'''
# seed = 2018
@ -8,10 +12,7 @@
# tf.set_random_seed(seed)
log_device_placement = False
# follow the original code by the paper author https://github.com/williamleif/GraphSAGE
# we follow the opt parameters given by papers GCN and graphSAGE
# note: citeseer+pubmed all follow the same parameters as cora, see their papers)
# tensorflow + Adam optimizer + Random weight init + row norm of attr
dim_1 = 64 # dim = dim1+dim2 = 128 for sage-mean and sage-gcn
dim_2 = 64
@ -19,19 +20,20 @@ samples_1 = 25
samples_2 = 10
# key parameters during training
epochs = 100
learning_rate = 0.001 # search [0.01, 0.001, 0.0001, 0.00001]
dropout = 0.5
weight_decay = 5e-4
batch_size = 512 # if run out of memory, try to reduce them, default=512
epochs = 50 # max epoch, we found it converges in a few epochs, and the more links are, the less epochs are required
# so we set run for all 50 epochs and take out the embeddings with the best val loss
learning_rate = 0.0001 # search [0.01, 0.001, 0.0001]
dropout = 0.5 # dropout rate (1 - keep probability)
batch_size = 128 # if run out of memory, try to reduce them, default=512
weight_decay = 1e-6 # weight for L2 loss on embedding matrix
# key parameters during val
validate_batch_size = 256 # if run out of memory, try to reduce them, default=256
validate_batch_size = 128 # if run out of memory, try to reduce them, default=256
validate_iter = 5000
max_total_steps = 10**10
print_every = 50
# other parameters also follow the defaults https://github.com/williamleif/GraphSAGE
# other parameters: also follow the defaults https://github.com/williamleif/GraphSAGE
neg_sample_size = 20
identity_dim = 0
n2v_test_epochs = 1
@ -44,6 +46,7 @@ base_log_dir = ''
'''
https://github.com/williamleif/GraphSAGE/blob/master/graphsage/unsupervised_train.py
#core params..
flags.DEFINE_string('model', 'graphsage', 'model names. See README for possible values.')
flags.DEFINE_float('learning_rate', 0.00001, 'initial learning rate.')
@ -73,4 +76,19 @@ flags.DEFINE_integer('validate_batch_size', 256, "how many nodes per validation
flags.DEFINE_integer('gpu', 1, "which gpu to use.")
flags.DEFINE_integer('print_every', 50, "How often to print training info.")
flags.DEFINE_integer('max_total_steps', 10**10, "Maximum total number of iterations")
----------------------------------------------------------------------------------------------------------
https://github.com/tkipf/gcn/blob/master/gcn/train.py
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'cora', 'Dataset string.') # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn', 'Model string.') # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')
'''

View File

@ -9,6 +9,7 @@
'''
import random
import time
import networkx as nx
import numpy as np
@ -24,7 +25,11 @@ class graphSAGE(object):
self.walk_len = 5
self.add_train_val_test_to_G(test_perc=0.0, val_perc=0.1) # if unsupervised, no test data
t1 = time.time()
train_data = self.tranform_data_for_graphsage() # obtain graphSAGE required training data
t2 = time.time()
print(f'transform data format from OpenANE to SAGE; time cost: {(t2-t1):.2f}s')
self.vectors = None
if not is_supervised:

View File

@ -81,7 +81,7 @@ def construct_placeholders():
def train(train_data, test_data, model):
print('---------- the graphsage model we used: ', model)
print('---------- parameters we sued: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size',
print('---------- parameters we used: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size',
epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size)
G = train_data[0]
features = train_data[1] # note: features are in order of graph.look_up_list, since id_map = {k: v for v, k in enumerate(graph.look_back_list)}

View File

@ -29,7 +29,7 @@ class TADW(object):
def getAdj(self):
A = self.g.get_adj_mat() # by default, return a sparse matrix
return np.array(row_as_probdist(A, dense_output=True, preserve_zeros=True)) # only support np.array, otherwise dim error...
return np.array(row_as_probdist(A, dense_output=True, preserve_all_zero_row=True)) # only support np.array, otherwise dim error...
def getT(self):
g = self.g.G

View File

@ -1,5 +1,5 @@
"""
commonly used ulits
commonly used utils
by Chengbin Hou & Zeyu Dong
"""
@ -11,7 +11,7 @@ from scipy import sparse
# ---------------------------------utils for calculation--------------------------------
def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
def row_as_probdist(mat, dense_output=True, preserve_all_zero_row=False):
"""Make each row of matrix sums up to 1.0, i.e., a probability distribution.
Support both dense and sparse matrix.
@ -36,7 +36,8 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
row_sum[zero_rows] = 1
diag = sparse.dia_matrix((1 / row_sum, 0), (mat.shape[0], mat.shape[0]))
mat = diag.dot(mat)
if not preserve_zeros:
if not preserve_all_zero_row:
print('For all-zero row, replace each 0 with value 1/dim(row)... not preserving zero i.e. a strict transition matrix')
mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))
if dense_output and sparse.issparse(mat):
@ -44,7 +45,7 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
return mat
def pairwise_similarity(mat, type='cosine'):
def pairwise_similarity(mat, type='cosine'): # for efficiency, plz given dense mat as the input
if type == 'cosine': # support sprase and dense mat
from sklearn.metrics.pairwise import cosine_similarity
result = cosine_similarity(mat, dense_output=True)

View File

@ -20,10 +20,11 @@ class WeightedWalker:
''' Weighted Walker for Attributed Biased Random Walks (ABRW) method
'''
def __init__(self, node_id_map, transition_mat, workers):
self.look_back_list = node_id_map
def __init__(self, node_id_map, transition_mat, workers, parallel_walks=10): # recommend: parallel_walks = number_walks
self.look_back_list = node_id_map # if memory error due to python multiprocessor module, plz reduce parallel_walks
self.T = transition_mat
self.workers = workers
self.parallel_walks = parallel_walks
self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph()) # reconstructed "directed" "weighted" graph based on transition matrix
# alias sampling for ABRW-------------------------
@ -38,8 +39,10 @@ class WeightedWalker:
t2 = time.time()
print(f'Time for construct alias table: {(t2-t1):.2f}')
pool = multiprocessing.Pool(processes=self.workers)
pool = multiprocessing.Pool(processes=self.parallel_walks)
all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks))
pool.close() # Waiting for all subprocesses done..
pool.join()
all_walks = list(chain(*all_walks))
t3 = time.time()
print(f'Time for all random walks: {(t3-t2):.2f}') # use multiple cores, total time < sum(time@itr)
@ -51,8 +54,12 @@ class WeightedWalker:
def mp_rw_wrapper(self, walk_iter):
walks = []
random.seed() # *** for multiprocessor version
np.random.seed() # *** do NOT remove these 'random' operation
nodes = list(self.nodes.copy()) # *** otherwise, each number_walks may give the same node sequences...
random.shuffle(nodes) # *** which hence decrease performance
t1 = time.time()
for node in self.nodes:
for node in nodes:
walks.append(self.weighted_walk(start_node=node))
t2 = time.time()
print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}')
@ -87,11 +94,12 @@ def deepwalk_walk_wrapper(class_instance, walk_length, start_node):
class_instance.deepwalk_walk(walk_length, start_node)
class BasicWalker:
def __init__(self, g, workers):
def __init__(self, g, workers, parallel_walks=10): # recommend: parallel_walks = number_walks; if memory error, plz reduce it
self.g = g
self.node_size = g.get_num_nodes()
self.look_up_dict = g.look_up_dict
self.workers = workers
self.parallel_walks = parallel_walks
def deepwalk_walk(self, start_node):
'''
@ -112,8 +120,12 @@ class BasicWalker:
def mp_rw_wrapper(self, walk_iter):
walks = []
random.seed() # *** for multiprocessor version
np.random.seed() # *** do NOT remove these 'random' operation
nodes = list(self.nodes.copy()) # *** otherwise, each number_walks may give the same node sequences...
random.shuffle(nodes) # *** which hence decrease performance
t1 = time.time()
for node in self.nodes:
for node in nodes:
walks.append(self.deepwalk_walk(start_node=node))
t2 = time.time()
print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}')
@ -130,8 +142,10 @@ class BasicWalker:
all_walks = None
t1 = time.time()
pool = multiprocessing.Pool(processes=self.workers)
pool = multiprocessing.Pool(processes=self.parallel_walks)
all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks))
pool.close() # Waiting for all subprocesses done..
pool.join()
all_walks = list(chain(*all_walks))
t2 = time.time()
print(f'Time for all random walks: {(t2-t1):.2f}') # use multiple cores, total time < sum(time@itr)
@ -140,11 +154,12 @@ class BasicWalker:
# ===========================================node2vec-walker============================================
class Walker:
def __init__(self, g, p, q, workers):
def __init__(self, g, p, q, workers, parallel_walks=10): # recommend: parallel_walks = number_walks; if memory error, plz reduce it
self.g = g
self.p = p
self.q = q
self.workers = workers
self.parallel_walks = parallel_walks
if self.g.get_isweighted():
# print('is weighted graph: ', self.g.get_isweighted())
@ -180,8 +195,12 @@ class Walker:
def mp_rw_wrapper(self, walk_iter):
walks = []
random.seed() # *** for multiprocessor version
np.random.seed() # *** do NOT remove these 'random' operation
nodes = list(self.nodes.copy()) # *** otherwise, each number_walks may give the same node sequences...
random.shuffle(nodes) # *** which hence decrease performance
t1 = time.time()
for node in self.nodes:
for node in nodes:
walks.append(self.node2vec_walk(start_node=node))
t2 = time.time()
print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}')
@ -197,8 +216,10 @@ class Walker:
all_walks = None
t1 = time.time()
pool = multiprocessing.Pool(processes=self.workers)
pool = multiprocessing.Pool(processes=self.parallel_walks)
all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks))
pool.close() # Waiting for all subprocesses done..
pool.join()
all_walks = list(chain(*all_walks))
t2 = time.time()
print(f'Time for all random walks: {(t2-t1):.2f}') # use multiple cores, total time < sum(time@itr)

View File

@ -11,6 +11,7 @@ by Chengbin HOU 2018 <chengbin.hou10@foxmail.com>
'''
import time
import random
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from sklearn.linear_model import LogisticRegression # to do... try SVM...
@ -35,9 +36,9 @@ def parse_args():
help='node embeddings dimensions')
parser.add_argument('--task', default='lp_and_nc', choices=['none', 'lp', 'nc', 'lp_and_nc'],
help='choices of downstream tasks: none, lp, nc, lp_and_nc')
parser.add_argument('--link-remove', default=0.1, type=float,
parser.add_argument('--link-remove', default=0.2, type=float,
help='simulate randomly missing links if necessary; a ratio ranging [0.0, 1.0]')
parser.add_argument('--label-reserved', default=0.7, type=float,
parser.add_argument('--label-reserved', default=0.5, type=float,
help='for nc task, train/test split, a ratio ranging [0.0, 1.0]')
parser.add_argument('--directed', default=False, action='store_true',
help='directed or undirected graph')
@ -54,8 +55,12 @@ def parse_args():
help='choices of Network Embedding methods')
parser.add_argument('--ABRW-topk', default=30, type=int,
help='select the most attr similar top k nodes of a node; ranging [0, # of nodes]')
parser.add_argument('--ABRW-alpha', default=0.8, type=float,
help='balance struc and attr info; ranging [0, 1]')
parser.add_argument('--ABRW-alpha', default=2.71828, type=float,
help='control the shape of characteristic curve of adaptive beta, ranging [0, inf]')
parser.add_argument('--ABRW-beta-mode', default=1, type=int,
help='1: fixed; 2: adaptive based on average degree; 3: adaptive based on each node degree')
parser.add_argument('--ABRW-beta', default=0.2, type=float,
help='balance struc and attr info; ranging [0, 1]; disabled if beta-mode 2 or 3')
parser.add_argument('--AANE-lamb', default=0.05, type=float,
help='balance struc and attr info; ranging [0, inf]')
parser.add_argument('--AANE-rho', default=5, type=float,
@ -64,16 +69,16 @@ def parse_args():
help='max iter')
parser.add_argument('--TADW-lamb', default=0.2, type=float,
help='balance struc and attr info; ranging [0, inf]')
parser.add_argument('--TADW-maxiter', default=10, type=int,
parser.add_argument('--TADW-maxiter', default=20, type=int,
help='max iter')
parser.add_argument('--ASNE-lamb', default=1.0, type=float,
help='balance struc and attr info; ranging [0, inf]')
parser.add_argument('--AttrComb-mode', default='concat', type=str,
help='choices of mode: concat, elementwise-mean, elementwise-max')
parser.add_argument('--Node2Vec-p', default=0.5, type=float, # if p=q=1.0 node2vec = deepwalk
help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]')
help='trade-off BFS and DFS; grid search [0.25; 0.50; 1; 2; 4]')
parser.add_argument('--Node2Vec-q', default=0.5, type=float,
help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]')
help='trade-off BFS and DFS; grid search [0.25; 0.50; 1; 2; 4]')
parser.add_argument('--GraRep-kstep', default=4, type=int,
help='use k-step transition probability matrix, error if dim%Kstep!=0')
parser.add_argument('--LINE-order', default=3, type=int,
@ -87,10 +92,10 @@ def parse_args():
help='length of each random walk')
parser.add_argument('--window-size', default=10, type=int,
help='window size of skipgram model')
parser.add_argument('--workers', default=24, type=int,
parser.add_argument('--workers', default=36, type=int,
help='# of parallel processes.')
# for deep learning based methods; parameters about layers and neurons used are not specified here
parser.add_argument('--learning-rate', default=0.001, type=float,
parser.add_argument('--learning-rate', default=0.0001, type=float,
help='learning rate')
parser.add_argument('--batch-size', default=128, type=int,
help='batch size')
@ -98,8 +103,6 @@ def parse_args():
help='epochs')
parser.add_argument('--dropout', default=0.5, type=float,
help='dropout rate (1 - keep probability)')
parser.add_argument('--weight-decay', type=float, default=0.0001,
help='weight for L2 loss on embedding matrix')
args = parser.parse_args()
return args
@ -111,12 +114,12 @@ def main(args):
# ---------------------------------------STEP1: load data-----------------------------------------------------
print('\nSTEP1: start loading data......')
t1 = time.time()
# load graph structure info------
# load graph structure info; by default, treat as undirected and unweighted graph ------
if args.graph_format == 'adjlist':
g.read_adjlist(path=args.graph_file, directed=args.directed)
elif args.graph_format == 'edgelist':
g.read_edgelist(path=args.graph_file, weighted=args.weighted, directed=args.directed)
# load node attribute info------
# load node attribute info ------
is_ane = (args.method == 'abrw' or args.method == 'tadw' or args.method == 'gcn' or args.method == 'sagemean' or args.method == 'sagegcn' or
args.method == 'attrpure' or args.method == 'attrcomb' or args.method == 'asne' or args.method == 'aane')
if is_ane:
@ -133,6 +136,10 @@ def main(args):
test_edge_labels = []
if args.task == 'lp' or args.task == 'lp_and_nc':
edges_removed = g.remove_edge(ratio=args.link_remove)
num_test_links = 0
limit_percentage = 0.2 # at most, use 0.2 randomly removed links for testing
num_test_links = int( min(len(edges_removed), len(edges_removed)/args.link_remove*limit_percentage) )
edges_removed = random.sample(edges_removed, num_test_links)
test_node_pairs, test_edge_labels = generate_edges_for_linkpred(graph=g, edges_removed=edges_removed, balance_ratio=1.0)
t2 = time.time()
print(f'STEP2: end preparing data; time cost: {(t2-t1):.2f}s')
@ -145,9 +152,9 @@ def main(args):
t1 = time.time()
model = None
if args.method == 'abrw':
from libnrl import abrw # ANE method; Attributed Biased Random Walk
model = abrw.ABRW(graph=g, dim=args.dim, alpha=args.ABRW_alpha, topk=args.ABRW_topk, number_walks=args.number_walks,
walk_length=args.walk_length, window=args.window_size, workers=args.workers)
from libnrl import abrw # ANE method; (Adaptive) Attributed Biased Random Walk
model = abrw.ABRW(graph=g, dim=args.dim, topk=args.ABRW_topk, beta=args.ABRW_beta, beta_mode=args.ABRW_beta_mode, alpha=args.ABRW_alpha,
number_walks=args.number_walks, walk_length=args.walk_length, window=args.window_size, workers=args.workers)
elif args.method == 'aane':
from libnrl import aane # ANE method
model = aane.AANE(graph=g, dim=args.dim, lambd=args.AANE_lamb, rho=args.AANE_rho, maxiter=args.AANE_maxiter,
@ -180,10 +187,10 @@ def main(args):
elif args.method == 'asne':
from libnrl import asne # ANE method
model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
elif args.method == 'sagemean': # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v
elif args.method == 'sagemean': # parameters for graphsage models are in 'graphsage' -> '__init__.py'
from libnrl.graphsage import graphsageAPI # ANE method
model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
elif args.method == 'sagegcn': # parameters for graphsage models are in 'graphsage' -> '__init__.py'
elif args.method == 'sagegcn': # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v
from libnrl.graphsage import graphsageAPI # ANE method
model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)
else:
@ -204,7 +211,7 @@ def main(args):
del model, g
# ------lp task
if args.task == 'lp' or args.task == 'lp_and_nc':
print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)')
print(f'Link Prediction task; the number of testing links {len(test_edge_labels)} i.e. at most 2*0.2*all_positive_links)')
ds_task = lpClassifier(vectors=vectors) # similarity/distance metric as clf; basically, lp is a binary clf problem
ds_task.evaluate(test_node_pairs, test_edge_labels)
# ------nc task