update
This commit is contained in:
parent d08aab00d0
commit c7f06f54e5
@@ -1,11 +1,13 @@
 # tested in python==3.6.6
 numpy==1.14.5
+scipy==1.1.0
 tensorflow==1.10.0 # to do... compatible with latest tf and tf-gpu
 tensorboard==1.10.0
-networkx==2.2
+networkx==2.3
-gensim==3.0.1
+gensim==3.7.3
 scikit-learn==0.19.0 # to do... compatible with >0.20
 pandas==0.23.0
+psutil==5.6.3
 
 # Enable GPU:
 # If using anaconda, run `conda install tensorflow-gpu==1.10.0`
@@ -15,3 +17,47 @@ pandas==0.23.0
 
 # Or simply build from docker image: docker pull tensorflow/tensorflow:1.10.0-gpu-py3
 # ref: https://www.tensorflow.org/install/docker#gpu_support
+
+
+'''
+Package         Version
+--------------- --------
+absl-py         0.7.1
+astor           0.8.0
+boto            2.49.0
+boto3           1.9.160
+botocore        1.12.160
+certifi         2019.3.9
+chardet         3.0.4
+decorator       4.4.0
+docutils        0.14
+gast            0.2.2
+gensim          3.7.3
+grpcio          1.21.1
+idna            2.8
+jmespath        0.9.4
+Markdown        3.1.1
+mkl-fft         1.0.12
+mkl-random      1.0.2
+networkx        2.3
+numpy           1.14.5
+pandas          0.23.0
+pip             19.1.1
+protobuf        3.8.0
+psutil          5.6.3
+python-dateutil 2.8.0
+pytz            2019.1
+requests        2.22.0
+s3transfer      0.2.0
+scikit-learn    0.19.0
+scipy           1.1.0
+setuptools      39.1.0
+six             1.12.0
+smart-open      1.8.4
+tensorboard     1.10.0
+tensorflow      1.10.0
+termcolor       1.1.0
+urllib3         1.25.3
+Werkzeug        0.15.4
+wheel           0.33.4
+'''
@@ -3,6 +3,17 @@ ANE method: Accelerated Attributed Network Embedding (AANE)
 
 modified by Chengbin Hou 2018
 
+note: we tried this method on an HPC cluster via PBS;
+however, we don't know why it is particularly slow there, even though we observed multiple cores being used...
+We then tried this method on a small standalone Linux server and it works well.
+If you run into the same problem, just try this method on another machine.
+Usually, the Cora dataset only requires ~20s/iter on my PC with 4 cores.
+
+However, when we ran AANE on a large-scale dataset, e.g. dblp (~60k nodes), on a Linux server with 40 cores,
+it took over 8000 seconds per iteration...
+For the reason, please see the author's comments in https://github.com/xhuang31/AANE_Python/issues/5
+
+
 originally from https://github.com/xhuang31/AANE_Python
 """
 
@@ -33,7 +44,7 @@ class AANE:
        $Revision: 1.0.2 $  $Date: 2018/02/19 00:00:00 $
    """
 
-   def __init__(self, graph, dim, lambd=0.05, rho=5, maxiter=5, mode='comb', *varargs):
+   def __init__(self, graph, dim, lambd=0.05, rho=5, maxiter=2, mode='comb', *varargs):
       self.dim = dim
      self.look_back_list = graph.look_back_list  # look back node id for Net and Attr
      self.lambd = lambd  # Initial regularization parameter
@@ -1,5 +1,6 @@
 """
-ANE method: Attributed Biased Random Walks;
+ANE method: Adap-ANE: Adaptive Attributed Network Embedding
+based on the previous Attributed Biased Random Walks https://arxiv.org/abs/1811.11728v2
 
 by Chengbin Hou & Zeyu Dong 2018
 """
@@ -10,24 +11,46 @@ import warnings
 import numpy as np
 from gensim.models import Word2Vec
 from scipy import sparse
+from sklearn.preprocessing import normalize
+from sklearn.neighbors import NearestNeighbors
+import psutil
+
 from . import walker
 from .utils import pairwise_similarity, row_as_probdist
 
 
 warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
 
 
+def deg2beta_mapping(deg_list, alpha=np.e):
+    '''
+    ***adaptive beta***
+    based on node degree, for balancing structure info and attribute info
+    map node degree [0:+inf) -> beta [1:0]
+    input:  deg_list: a scalar or list
+            alpha: default e; [0, +inf) but we suggest trying 0.5, 1, e, 10, 100, ...
+    output: beta_list: a scalar or list
+    '''
+    base_list = (1.0 + np.power(deg_list, alpha))
+    beta_list = np.power(base_list, -1/alpha)  # characteristic curve of adaptive beta
+    # print('deg_list', deg_list[:50])
+    # print('beta_list', np.around(beta_list, decimals=3)[:50])
+    return beta_list
+
+
 class ABRW(object):
-    def __init__(self, graph, dim, alpha, topk, number_walks, walk_length, **kwargs):
+    def __init__(self, graph, dim, topk, beta, beta_mode, alpha, number_walks, walk_length, **kwargs):
        self.g = graph
        self.dim = dim
-       self.alpha = float(alpha)
        self.topk = int(topk)
+       self.beta = float(beta)
+       self.beta_mode = int(beta_mode)
+       self.alpha = float(alpha)
        self.number_walks = number_walks
        self.walk_length = walk_length
 
        # obtain biased transition mat -----------
-       self.T = self.get_biased_transition_mat(A=self.g.get_adj_mat(), X=self.g.get_attr_mat())
+       self.T = self.get_biased_transition_mat(A=self.g.get_adj_mat(dense_output=False), X=self.g.get_attr_mat(dense_output=False))
 
        # aim to generate a sequences of walks/sentences
        # apply weighted random walks on the reconstructed network based on biased transition mat
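A quick way to see what the deg2beta_mapping curve above does is to evaluate it on a few degrees by hand. A standalone sketch (the degree values are made up for illustration; only numpy is assumed):

import numpy as np

def deg2beta_mapping(deg_list, alpha=np.e):
    # same curve as above: beta = (1 + deg**alpha) ** (-1/alpha)
    return np.power(1.0 + np.power(deg_list, alpha), -1.0 / alpha)

degs = np.array([0, 1, 2, 5, 10, 50, 100])
print(np.around(deg2beta_mapping(degs), 3))   # 1.0 at degree 0, decaying towards 0
# An isolated node gets beta close to 1 (its walk is guided by attribute similarity T_X),
# while a well-connected node gets beta close to 0 (its walk is guided by structure T_A).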
@@ -54,38 +77,127 @@ class ABRW(object):
        '''
        given: A and X --> T_A and T_X
        research question: how to combine A and X in a more principled way
-       genral idea: Attribute Biased Random Walk
-       i.e. a walker based on a mixed transition matrix by P=alpha*T_A + (1-alpha)*T_X
-       result: ABRW-trainsition matrix; T
+       our idea: T = (1-beta)*T_A + beta*T_X
+       mode 1: fixed beta
+       mode 2: adaptive beta based on average degree
+       mode 3: adaptive beta based on each node degree
        '''
        print("obtaining biased transition matrix where each row sums up to 1.0...")
 
-       preserve_zeros = False
-       T_A = row_as_probdist(A, preserve_zeros)  # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row
-       print('Preserve zero rows of the adj matrix: ', preserve_zeros)
-       t1 = time.time()
-       X_sim = pairwise_similarity(X)  # attr similarity mat; X_sim is a square mat, but X is not
-
-       t2 = time.time()
-       print(f'keep the top {self.topk} attribute similar nodes w.r.t. a node')
-       cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1)
-       X_sim[(X_sim < cutoff)] = 0  # improve both accuracy and efficiency
-       X_sim = sparse.csr_matrix(X_sim)
-
-       t3 = time.time()
-       T_X = row_as_probdist(X_sim)
-
-       t4 = time.time()
-       print(f'attr sim cal time: {(t2-t1):.2f}s; topk sparse ops time: {(t3-t2):.2f}s; row norm time: {(t4-t3):.2f}s')
-       del A, X, X_sim
-
-       # =====================================information fusion via transition matrices========================================
-       print('------alpha for P = alpha * T_A + (1-alpha) * T_X------: ', self.alpha)
+       # norm adj mat; for an isolated node, return an all-zeros row, so that T_A is not a strict transition matrix
+       # preserve_all_zero_row=False gives a similar result, but is less efficient
+       t0 = time.time()
+       T_A = row_as_probdist(A, dense_output=False, preserve_all_zero_row=True)  # **sparse mat**
+       T_X = None
 
        n = self.g.get_num_nodes()
-       alp = np.array(n * [self.alpha])  # for vectorized computation
-       alp[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 0
-       T = sparse.diags(alp).dot(T_A) + sparse.diags(1 - alp).dot(T_X)  # sparse version
+       free_memory = psutil.virtual_memory().available
+       print('free_memory ', free_memory)
+       # n*n*8 is the number of bytes required by the pairwise similarity matrix; 2e9 = 2GB of RAM reserved for safety
+       # if your computer has 200G of memory, there should be no problem for a graph with 100k nodes
+       # this naive implementation is **faster** than the BallTree implementation, thanks to numpy
+       # if n*n*8 + n*n*8 + n*5000*8 + 2e9 < free_memory and n < 1e5:  # X_sim[n,n] dense + A[n,n] if dense + X[n,5000] if dense with max 5000 feats + 2e9 for safety
+       if False:
+           print('naive implementation + intro-select ')
+           t1 = time.time()
+           X_sim = pairwise_similarity(X.todense())
+           # sparse operator; reduce time and space complexity & remove less useful dissimilar nodes
+           t2 = time.time()
+           print(f'keep the top {self.topk} attribute similar nodes w.r.t. a node')
+           cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1).reshape(-1, 1)  # introselect, average speed O(1); see the link below
+           X_sim[(X_sim < cutoff)] = 0  # https://docs.scipy.org/doc/numpy/reference/generated/numpy.partition.html
+           X_sim = sparse.csr_matrix(X_sim)
+           X_sim.setdiag(0)
+           # norm attr mat; note: T_X must be a strict transition matrix, thanks to the strict transition matrix of X_sim
+           t3 = time.time()
+           T_X = row_as_probdist(X_sim, dense_output=False, preserve_all_zero_row=False)  # **sparse mat**
+           t4 = time.time()
+           print(f'attr sim cal time: {(t2-t1):.2f}s; topk sparse ops time: {(t3-t2):.2f}s')
+           print(f'adj row norm time: {(t1-t0):.2f}s; attr row norm time: {(t4-t3):.2f}s')
+           print('all naive implementation time: ', t4-t1)
+           del A, X, X_sim, cutoff
+       # a scalable alternative w.r.t. both time and space,
+       # but might be slightly slower when n is small, e.g. n < 100k
+       # BallTree time complexity O(n log(n))
+       else:
+           print('BallTree implementation + multiprocessor query')
+           t1 = time.time()
+           X = normalize(X.todense(), norm='l2', axis=1)
+           t2 = time.time()
+           print('normalize time: ', t2-t1)
+           # after normalization -> Euclidean distance = cosine distance (the inverse of cosine similarity)
+           neigh = NearestNeighbors(n_neighbors=self.topk, algorithm='ball_tree', leaf_size=40, metric='minkowski', p=2, n_jobs=-1)
+           neigh.fit(X)
+           t3 = time.time()
+           print('BallTree time: ', t3-t2)
+           dist, ind = neigh.kneighbors(X[:])  # Euclidean dist, indices
+           # print('dist', dist)
+           # print('ind', ind)
+           t4 = time.time()
+           print('query time: ', t4-t3)
+           sim = 1 - np.multiply(dist, dist)/2  # cosine distance -> cosine similarity
+           # print('sim: ', sim)
+           t5 = time.time()
+           print('cosine distance -> cosine similarity time: ', t5-t4)
+           row = []
+           col = []
+           data = []
+           for i in range(n):
+               row.extend([i] * self.topk)
+               col.extend(ind[i])
+               data.extend(sim[i])
+           t6 = time.time()
+           print('sparse matrix data & ind construction for loop time: ', t6-t5)
+           zero_row_ind = np.where(~X.any(axis=1))[0]
+           # print('zero_row_ind', zero_row_ind)
+           X_sim = sparse.csc_matrix((data, (row, col)), shape=(n, n))
+           for col in zero_row_ind:
+               X_sim.data[X_sim.indptr[col]:X_sim.indptr[col+1]] = 0
+           X_sim = sparse.csr_matrix(X_sim)
+           for row in zero_row_ind:
+               X_sim.data[X_sim.indptr[row]:X_sim.indptr[row+1]] = 0
+           X_sim.setdiag(0)
+           X_sim.eliminate_zeros()
+           t7 = time.time()
+           # print(X_sim.todense())
+           print('sparse.csr_matrix time:', t7-t6)
+           T_X = row_as_probdist(X_sim, dense_output=False, preserve_all_zero_row=False)  # **sparse mat**
+           t8 = time.time()
+           print('BallTree implementation ALL time', t8-t1)
+           del A, X, X_sim, data, row, col, neigh, sim
+
+       # ============================================== information fusion via transition matrices ==============================================
+       print('about beta, beta_mode, alpha: ', self.beta, self.beta_mode, self.alpha)
+       b = None
+
+       # mode 1: fixed beta, except that if T_A has any all-zero row, beta is set to 1.0 for that row
+       if self.beta_mode == 1:
+           print('====== fixed beta: T = (1-beta)*T_A + beta*T_X where beta= ', self.beta)
+           b = np.array(n * [self.beta])  # vectorized computing
+           b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0  # for any all-zero row of T_A, set beta=1.0
+
+       # mode 2: adaptive beta based on the average degree, which reflects the richness of structural info
+       if self.beta_mode == 2:
+           print('====== adaptive beta: T = (1-beta)*T_A + beta*T_X, where adaptive beta=(1.0+ave_deg^alpha)^(-1.0/alpha) and alpha= ', self.alpha)
+           if self.g.G.is_directed():
+               print('directed graph, TODO...')
+               exit(0)
+           ave_deg = len(self.g.G.edges()) * 2.0 / len(self.g.G.nodes())  # see def http://konect.uni-koblenz.de/statistics/avgdegree
+           b = deg2beta_mapping(ave_deg, alpha=self.alpha)  # mapping by the characteristic curve of adaptive beta
+           b = np.array(n * [b])
+           b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0
+
+       # mode 3: adaptive beta based on each node's degree
+       if self.beta_mode == 3:
+           print('====== adaptive beta: T = (1-beta)*T_A + beta*T_X, where adaptive beta=(1.0+node_deg^alpha)^(-1.0/alpha) and alpha= ', self.alpha)
+           if self.g.G.is_directed():
+               print('directed graph, TODO...')
+               exit(0)
+           node_deg_list = [deg*2 for (node, deg) in self.g.G.degree()]  # *2 due to undirected graph; consistent with ave_deg after mapping
+           b = deg2beta_mapping(node_deg_list, alpha=self.alpha)  # mapping by the characteristic curve of adaptive beta
+
+       T = sparse.diags(1.0-b).dot(T_A) + sparse.diags(b).dot(T_X)
        t5 = time.time()
        print(f'ABRW biased transition matrix processing time: {(t5-t4):.2f}s')
        return T
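The BallTree branch above works because, after L2 row normalization, the Euclidean distance d and cosine similarity s between two rows satisfy s = 1 - d^2/2, so a Euclidean nearest-neighbour query returns the top-k most attribute-similar nodes. A minimal standalone check on made-up data (assumes only numpy and scikit-learn):

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = normalize(rng.rand(6, 4), norm='l2', axis=1)           # 6 nodes, 4 attributes, unit-length rows
neigh = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(X)
dist, ind = neigh.kneighbors(X)                            # Euclidean distances/indices of top-3 neighbours
sim_from_dist = 1.0 - dist**2 / 2.0                        # convert distance to cosine similarity
sim_direct = np.array([[X[i] @ X[j] for j in ind[i]] for i in range(X.shape[0])])
print(np.allclose(sim_from_dist, sim_direct))              # True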
@@ -97,3 +209,19 @@ class ABRW(object):
        for node, vec in self.vectors.items():
            fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
        fout.close()
+
+
+# ------------------------ utils: draw_characteristic_curve ---------------------------
+def draw_characteristic_curve():
+    import matplotlib.pyplot as plt
+    deg_list = np.arange(0, 100, 0.01)
+    beta_list_1 = deg2beta_mapping(deg_list, alpha=0.5)
+    beta_list_2 = deg2beta_mapping(deg_list, alpha=1)
+    beta_list_3 = deg2beta_mapping(deg_list, alpha=np.e)
+    beta_list_4 = deg2beta_mapping(deg_list, alpha=10)
+    plt.plot(deg_list, beta_list_1, label='alpha=0.5')
+    plt.plot(deg_list, beta_list_2, label='alpha=1')
+    plt.plot(deg_list, beta_list_3, label='alpha=np.e')
+    plt.plot(deg_list, beta_list_4, label='alpha=10')
+    plt.legend()
+    plt.show()
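The curves drawn by draw_characteristic_curve feed the per-node weight vector b; the fusion itself is just a diagonal scaling of the two row-stochastic matrices. A toy, self-contained illustration of T = diag(1-b)·T_A + diag(b)·T_X (all numbers are hypothetical; assumes only numpy/scipy):

import numpy as np
from scipy import sparse

T_A = sparse.csr_matrix([[0.0, 1.0, 0.0],
                         [0.5, 0.0, 0.5],
                         [0.0, 0.0, 0.0]])      # structural transitions; node 2 is isolated (all-zero row)
T_X = sparse.csr_matrix([[0.0, 0.2, 0.8],
                         [0.3, 0.0, 0.7],
                         [0.9, 0.1, 0.0]])      # attribute-similarity transitions

b = np.array([0.2, 0.2, 0.2])                                # fixed beta (beta_mode 1)
b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0           # the isolated node falls back to T_X entirely
T = sparse.diags(1.0 - b).dot(T_A) + sparse.diags(b).dot(T_X)
print(T.toarray())
print(T.sum(axis=1))                                         # each row still sums to 1.0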
@@ -20,7 +20,7 @@ from sklearn.base import BaseEstimator, TransformerMixin
 
 class ASNE(BaseEstimator, TransformerMixin):
    def __init__(self, graph, dim, alpha=1.0, learning_rate=0.0001, batch_size=128, epoch=20, n_neg_samples=10,
-                early_stopping=2000):  # it seems that overfitting can get better result? try other early_stopping... to do...
+                early_stopping=2000):
 
        t1 = time.time()
        X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size = format_data_from_OpenANE_to_ASNE(g=graph, dim=dim)
@@ -121,7 +121,7 @@ class ASNE(BaseEstimator, TransformerMixin):
        iter_count = 0
        train_loss_best = 0
        train_loss_keep_increasing = 0
-       early_stopping = self.early_stopping  # early stopping if training loss increased
+       early_stopping = self.early_stopping  # early stopping if training loss increased for early_stopping times
 
        for epoch in range(self.epoch):
            t1 = time.time()
@@ -136,6 +136,13 @@ class ASNE(BaseEstimator, TransformerMixin):
 
                # Fit training using batch data
                train_loss = self.partial_fit(batch_xs)
+               '''
+               # no early stopping;
+               # as the original code did https://github.com/lizi-git/ASNE/blob/master/SNE.py
+               # it seems that the more epochs, the better the results...
+               # e.g. 10 epochs are obviously better than 1 epoch,
+               # but after approximately 50 epochs there is not much gain or loss...
+               # so we run for all 100 epochs to ensure the best performance
                iter_count += 1
                if iter_count == 1:
                    train_loss_best = train_loss
@@ -157,6 +164,7 @@ class ASNE(BaseEstimator, TransformerMixin):
                            return self.vectors
                        else:
                            pass
+               '''
            t2 = time.time()
            print(f'epoch @ {epoch+1}/{self.epoch}; time cost: {(t2-t1):.2f}s',)
 
@@ -195,7 +203,7 @@ class ASNE(BaseEstimator, TransformerMixin):
 
 def format_data_from_OpenANE_to_ASNE(g, dim):
    ''' convert OpenANE data format to ASNE data format '''
-   attr_Matrix = g.get_attr_mat(is_sparse=False)
+   attr_Matrix = g.get_attr_mat(dense_output=True)
    id_N = attr_Matrix.shape[0]  # n nodes
    attr_M = attr_Matrix.shape[1]  # m features
 
@@ -21,7 +21,7 @@ class ATTRPURE(object):
            self.vectors[key] = embeddings[ind]
 
    def train(self):
-       X = self.g.get_attr_mat().todense()
+       X = self.g.get_attr_mat()
        X_compressed = None
        if self.mode == 'pca':
            X_compressed = dim_reduction(X, dim=self.dim, method='pca')
@@ -100,6 +100,21 @@ class lpClassifier(object):
        roc = 1.0 - roc  # since lp is binary clf task, just predict the opposite if<0.5
        print("roc=", "{:.9f}".format(roc))
 
+def cosine_similarity(a, b):
+    from numpy import dot
+    from numpy.linalg import norm
+    ''' cosine similarity; can be used as a score function; computed vector by vector;
+        if similarity for all pairs is needed,
+        the pairwise_similarity() implementation may be more efficient
+    '''
+    a = np.reshape(a, -1)
+    b = np.reshape(b, -1)
+    if norm(a) * norm(b) == 0:
+        return 0.0
+    else:
+        return dot(a, b) / (norm(a) * norm(b))
+
+'''
 def norm(a):
    sum = 0.0
    for i in range(len(a)):
@@ -111,6 +126,7 @@ def cosine_similarity(a, b):
    for i in range(len(a)):
        sum = sum + a[i] * b[i]
    return sum / (norm(a) * norm(b) + 1e-100)
+'''
 
 '''
 def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):
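A small usage example of the cosine_similarity score function added above, on toy embedding vectors (the numbers are made up; assumes only numpy):

import numpy as np
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    a, b = np.reshape(a, -1), np.reshape(b, -1)
    if norm(a) * norm(b) == 0:
        return 0.0                    # convention for all-zero embeddings
    return dot(a, b) / (norm(a) * norm(b))

emb_u = np.array([0.1, 0.8, 0.3])
emb_v = np.array([0.2, 0.7, 0.1])
print(round(cosine_similarity(emb_u, emb_v), 3))   # ~0.96, a high score, i.e. likely a link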
@@ -92,24 +92,24 @@ class Graph(object):
    # ------------------------------------------------------------------------------------------
    # --------------------commonly used APIs that will not modify graph-------------------------
    # ------------------------------------------------------------------------------------------
-   def get_adj_mat(self, is_sparse=True):
+   def get_adj_mat(self, dense_output=True):
        """ return adjacency matrix; \n
            use 'csr' format for sparse matrix \n
        """
-       if is_sparse:
-           return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64')
-       else:
+       if dense_output:
            return nx.to_numpy_matrix(self.G, nodelist=self.look_back_list, dtype='float64')
+       else:
+           return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64')
 
-   def get_attr_mat(self, is_sparse=True):
+   def get_attr_mat(self, dense_output=True):
        """ return attribute matrix; \n
            use 'csr' format for sparse matrix \n
        """
        attr_dense_narray = np.vstack([self.G.nodes[self.look_back_list[i]]['attr'] for i in range(self.get_num_nodes())])
-       if is_sparse:
-           return sp.csr_matrix(attr_dense_narray, dtype='float64')
-       else:
+       if dense_output:
            return np.matrix(attr_dense_narray, dtype='float64')
+       else:
+           return sp.csr_matrix(attr_dense_narray, dtype='float64')
 
    def get_num_nodes(self):
        """ return the number of nodes """
@@ -1,6 +1,10 @@
-''' global parameters for graphsage models
-tune these parameters here if needed
-if needed use: from libnrl.graphsage.__init__ import *
+'''
+global parameters for graphsage models
+tune these parameters here if needed
+if needed use: from libnrl.graphsage.__init__ import *
+we mostly follow the original code:
+https://github.com/williamleif/GraphSAGE/blob/master/graphsage/unsupervised_train.py
+and https://github.com/tkipf/gcn/blob/master/gcn/train.py
 '''
 
 # seed = 2018
@@ -8,10 +12,7 @@
 # tf.set_random_seed(seed)
 log_device_placement = False
 
-
-# follow the original code by the paper author https://github.com/williamleif/GraphSAGE
 # we follow the opt parameters given by papers GCN and graphSAGE
-# note: citeseer+pubmed all follow the same parameters as cora, see their papers)
 # tensorflow + Adam optimizer + Random weight init + row norm of attr
 dim_1 = 64  # dim = dim1+dim2 = 128 for sage-mean and sage-gcn
 dim_2 = 64
@@ -19,19 +20,20 @@ samples_1 = 25
 samples_2 = 10
 
 # key parameters during training
-epochs = 100
-learning_rate = 0.001  # search [0.01, 0.001, 0.0001, 0.00001]
-dropout = 0.5
-weight_decay = 5e-4
-batch_size = 512  # if run out of memory, try to reduce them, default=512
+epochs = 50  # max epochs; we found it converges within a few epochs, and the more links there are, the fewer epochs are needed
+# so we run for all 50 epochs and take out the embeddings with the best val loss
+learning_rate = 0.0001  # search [0.01, 0.001, 0.0001]
+dropout = 0.5  # dropout rate (1 - keep probability)
+batch_size = 128  # if run out of memory, try to reduce them, default=512
+weight_decay = 1e-6  # weight for L2 loss on embedding matrix
 
 # key parameters durning val
-validate_batch_size = 256  # if run out of memory, try to reduce them, default=256
+validate_batch_size = 128  # if run out of memory, try to reduce them, default=256
 validate_iter = 5000
 max_total_steps = 10**10
 print_every = 50
 
-# other parameters also follow the defaults https://github.com/williamleif/GraphSAGE
+# other parameters: also follow the defaults https://github.com/williamleif/GraphSAGE
 neg_sample_size = 20
 identity_dim = 0
 n2v_test_epochs = 1
@@ -44,6 +46,7 @@ base_log_dir = ''
 
 
 '''
+https://github.com/williamleif/GraphSAGE/blob/master/graphsage/unsupervised_train.py
 #core params..
 flags.DEFINE_string('model', 'graphsage', 'model names. See README for possible values.')
 flags.DEFINE_float('learning_rate', 0.00001, 'initial learning rate.')
@@ -73,4 +76,19 @@ flags.DEFINE_integer('validate_batch_size', 256, "how many nodes per validation
 flags.DEFINE_integer('gpu', 1, "which gpu to use.")
 flags.DEFINE_integer('print_every', 50, "How often to print training info.")
 flags.DEFINE_integer('max_total_steps', 10**10, "Maximum total number of iterations")
+
+----------------------------------------------------------------------------------------------------------
+
+https://github.com/tkipf/gcn/blob/master/gcn/train.py
+flags = tf.app.flags
+FLAGS = flags.FLAGS
+flags.DEFINE_string('dataset', 'cora', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
+flags.DEFINE_string('model', 'gcn', 'Model string.')  # 'gcn', 'gcn_cheby', 'dense'
+flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
+flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
+flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
+flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
+flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
+flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
+flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')
 '''
@@ -9,6 +9,7 @@
 '''
 
 import random
+import time
 
 import networkx as nx
 import numpy as np
@@ -24,7 +25,11 @@ class graphSAGE(object):
        self.walk_len = 5
 
        self.add_train_val_test_to_G(test_perc=0.0, val_perc=0.1)  # if unsupervised, no test data
+
+       t1 = time.time()
        train_data = self.tranform_data_for_graphsage()  # obtain graphSAGE required training data
+       t2 = time.time()
+       print(f'transform data format from OpenANE to SAGE; time cost: {(t2-t1):.2f}s')
 
        self.vectors = None
        if not is_supervised:
@@ -81,7 +81,7 @@ def construct_placeholders():
 
 def train(train_data, test_data, model):
    print('---------- the graphsage model we used: ', model)
-   print('---------- parameters we sued: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size',
+   print('---------- parameters we used: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size',
          epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size)
    G = train_data[0]
    features = train_data[1]  # note: features are in order of graph.look_up_list, since id_map = {k: v for v, k in enumerate(graph.look_back_list)}
@@ -29,7 +29,7 @@ class TADW(object):
 
    def getAdj(self):
        A = self.g.get_adj_mat()  # by default, return a sparse matrix
-       return np.array(row_as_probdist(A, dense_output=True, preserve_zeros=True))  # only support np.array, otherwise dim error...
+       return np.array(row_as_probdist(A, dense_output=True, preserve_all_zero_row=True))  # only support np.array, otherwise dim error...
 
    def getT(self):
        g = self.g.G
@@ -1,5 +1,5 @@
 """
-commonly used ulits
+commonly used utils
 by Chengbin Hou & Zeyu Dong
 """
 
@@ -11,7 +11,7 @@ from scipy import sparse
 
 # ---------------------------------ulits for calculation--------------------------------
 
-def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
+def row_as_probdist(mat, dense_output=True, preserve_all_zero_row=False):
    """Make each row of matrix sums up to 1.0, i.e., a probability distribution.
    Support both dense and sparse matrix.
 
@@ -36,7 +36,8 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
    row_sum[zero_rows] = 1
    diag = sparse.dia_matrix((1 / row_sum, 0), (mat.shape[0], mat.shape[0]))
    mat = diag.dot(mat)
-   if not preserve_zeros:
+   if not preserve_all_zero_row:
+       print('For an all-zero row, replace each 0 with 1/dim(row)... not preserving zeros, i.e. a strict transition matrix')
        mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))
 
    if dense_output and sparse.issparse(mat):
@@ -44,7 +45,7 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
    return mat
 
 
-def pairwise_similarity(mat, type='cosine'):
+def pairwise_similarity(mat, type='cosine'):  # for efficiency, please pass a dense mat as the input
    if type == 'cosine':  # support sprase and dense mat
        from sklearn.metrics.pairwise import cosine_similarity
        result = cosine_similarity(mat, dense_output=True)
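To make the preserve_all_zero_row switch concrete, here is a simplified standalone re-implementation run on a toy matrix (for illustration only; assumes numpy/scipy):

import numpy as np
from scipy import sparse

def row_as_probdist_demo(mat, preserve_all_zero_row=False):
    mat = sparse.csr_matrix(mat, dtype=float)
    row_sum = np.asarray(mat.sum(axis=1)).ravel()
    zero_rows = row_sum == 0
    row_sum[zero_rows] = 1                                    # avoid division by zero
    mat = sparse.diags(1.0 / row_sum).dot(mat)
    if not preserve_all_zero_row:                             # spread an all-zero row uniformly
        mat += sparse.csr_matrix(zero_rows.astype(float)).T.dot(
            sparse.csr_matrix(np.repeat(1.0 / mat.shape[1], mat.shape[1])))
    return mat

A = [[2.0, 2.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 3.0]]
print(row_as_probdist_demo(A, preserve_all_zero_row=True).toarray())    # the zero row stays all-zero
print(row_as_probdist_demo(A, preserve_all_zero_row=False).toarray())   # the zero row becomes 1/3 each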
@@ -20,10 +20,11 @@ class WeightedWalker:
    ''' Weighted Walker for Attributed Biased Randomw Walks (ABRW) method
    '''
 
-   def __init__(self, node_id_map, transition_mat, workers):
-       self.look_back_list = node_id_map
+   def __init__(self, node_id_map, transition_mat, workers, parallel_walks=10):  # recommend: parallel_walks = number_walks
+       self.look_back_list = node_id_map  # if you hit a memory error due to the Python multiprocessing module, please reduce parallel_walks
        self.T = transition_mat
        self.workers = workers
+       self.parallel_walks = parallel_walks
        self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph())  # reconstructed "directed" "weighted" graph based on transition matrix
 
    # alias sampling for ABRW-------------------------
@@ -38,8 +39,10 @@ class WeightedWalker:
        t2 = time.time()
        print(f'Time for construct alias table: {(t2-t1):.2f}')
 
-       pool = multiprocessing.Pool(processes=self.workers)
+       pool = multiprocessing.Pool(processes=self.parallel_walks)
        all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks))
+       pool.close()  # wait for all subprocesses to finish...
+       pool.join()
        all_walks = list(chain(*all_walks))
        t3 = time.time()
        print(f'Time for all random walks: {(t3-t2):.2f}')  # use multiple cores, total time < sum(time@itr)
@@ -51,8 +54,12 @@ class WeightedWalker:
 
    def mp_rw_wrapper(self, walk_iter):
        walks = []
+       random.seed()  # *** for the multiprocessing version
+       np.random.seed()  # *** do NOT remove these 'random' operations
+       nodes = list(self.nodes.copy())  # *** otherwise, each of the number_walks runs may give the same node sequences...
+       random.shuffle(nodes)  # *** which hence decreases performance
        t1 = time.time()
-       for node in self.nodes:
+       for node in nodes:
            walks.append(self.weighted_walk(start_node=node))
        t2 = time.time()
        print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}')
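Why the added random.seed()/np.random.seed() calls matter: with fork-based multiprocessing, every worker inherits the parent's NumPy RNG state, so all workers can produce identical "random" walks unless each one reseeds itself. A minimal demonstration (assumes a fork-capable platform such as Linux; output varies by run):

import multiprocessing
import numpy as np

def draw_without_reseed(_):
    return np.random.randint(0, 1000)        # same inherited RNG state in every worker

def draw_with_reseed(_):
    np.random.seed()                          # reseed from OS entropy inside the worker
    return np.random.randint(0, 1000)

if __name__ == '__main__':
    np.random.seed(2018)                      # parent process seeds once
    with multiprocessing.Pool(4) as pool:
        print(pool.map(draw_without_reseed, range(4)))   # often four identical numbers
        print(pool.map(draw_with_reseed, range(4)))      # numbers differ across workers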
@@ -87,11 +94,12 @@ def deepwalk_walk_wrapper(class_instance, walk_length, start_node):
    class_instance.deepwalk_walk(walk_length, start_node)
 
 class BasicWalker:
-   def __init__(self, g, workers):
+   def __init__(self, g, workers, parallel_walks=10):  # recommend: parallel_walks = number_walks; if memory error, please reduce it
        self.g = g
        self.node_size = g.get_num_nodes()
        self.look_up_dict = g.look_up_dict
        self.workers = workers
+       self.parallel_walks = parallel_walks
 
    def deepwalk_walk(self, start_node):
        '''
@@ -112,8 +120,12 @@ class BasicWalker:
 
    def mp_rw_wrapper(self, walk_iter):
        walks = []
+       random.seed()  # *** for the multiprocessing version
+       np.random.seed()  # *** do NOT remove these 'random' operations
+       nodes = list(self.nodes.copy())  # *** otherwise, each of the number_walks runs may give the same node sequences...
+       random.shuffle(nodes)  # *** which hence decreases performance
        t1 = time.time()
-       for node in self.nodes:
+       for node in nodes:
            walks.append(self.deepwalk_walk(start_node=node))
        t2 = time.time()
        print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}')
@@ -130,8 +142,10 @@ class BasicWalker:
        all_walks = None
 
        t1 = time.time()
-       pool = multiprocessing.Pool(processes=self.workers)
+       pool = multiprocessing.Pool(processes=self.parallel_walks)
        all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks))
+       pool.close()  # wait for all subprocesses to finish...
+       pool.join()
        all_walks = list(chain(*all_walks))
        t2 = time.time()
        print(f'Time for all random walks: {(t2-t1):.2f}')  # use multiple cores, total time < sum(time@itr)
@@ -140,11 +154,12 @@ class BasicWalker:
 
 # ===========================================node2vec-walker============================================
 class Walker:
-   def __init__(self, g, p, q, workers):
+   def __init__(self, g, p, q, workers, parallel_walks=10):  # recommend: parallel_walks = number_walks; if memory error, please reduce it
        self.g = g
        self.p = p
        self.q = q
        self.workers = workers
+       self.parallel_walks = parallel_walks
 
        if self.g.get_isweighted():
            # print('is weighted graph: ', self.g.get_isweighted())
@@ -180,8 +195,12 @@ class Walker:
 
    def mp_rw_wrapper(self, walk_iter):
        walks = []
+       random.seed()  # *** for the multiprocessing version
+       np.random.seed()  # *** do NOT remove these 'random' operations
+       nodes = list(self.nodes.copy())  # *** otherwise, each of the number_walks runs may give the same node sequences...
+       random.shuffle(nodes)  # *** which hence decreases performance
        t1 = time.time()
-       for node in self.nodes:
+       for node in nodes:
            walks.append(self.node2vec_walk(start_node=node))
        t2 = time.time()
        print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}')
@@ -197,8 +216,10 @@ class Walker:
        all_walks = None
 
        t1 = time.time()
-       pool = multiprocessing.Pool(processes=self.workers)
+       pool = multiprocessing.Pool(processes=self.parallel_walks)
        all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks))
+       pool.close()  # wait for all subprocesses to finish...
+       pool.join()
        all_walks = list(chain(*all_walks))
        t2 = time.time()
        print(f'Time for all random walks: {(t2-t1):.2f}')  # use multiple cores, total time < sum(time@itr)
src/main.py
@@ -11,6 +11,7 @@ by Chengbin HOU 2018 <chengbin.hou10@foxmail.com>
 '''
 
 import time
+import random
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
 
 from sklearn.linear_model import LogisticRegression  # to do... try SVM...
@@ -35,9 +36,9 @@ def parse_args():
                        help='node embeddings dimensions')
    parser.add_argument('--task', default='lp_and_nc', choices=['none', 'lp', 'nc', 'lp_and_nc'],
                        help='choices of downstream tasks: none, lp, nc, lp_and_nc')
-   parser.add_argument('--link-remove', default=0.1, type=float,
+   parser.add_argument('--link-remove', default=0.2, type=float,
                        help='simulate randomly missing links if necessary; a ratio ranging [0.0, 1.0]')
-   parser.add_argument('--label-reserved', default=0.7, type=float,
+   parser.add_argument('--label-reserved', default=0.5, type=float,
                        help='for nc task, train/test split, a ratio ranging [0.0, 1.0]')
    parser.add_argument('--directed', default=False, action='store_true',
                        help='directed or undirected graph')
@@ -54,8 +55,12 @@ def parse_args():
                        help='choices of Network Embedding methods')
    parser.add_argument('--ABRW-topk', default=30, type=int,
                        help='select the most attr similar top k nodes of a node; ranging [0, # of nodes]')
-   parser.add_argument('--ABRW-alpha', default=0.8, type=float,
-                       help='balance struc and attr info; ranging [0, 1]')
+   parser.add_argument('--ABRW-alpha', default=2.71828, type=float,
+                       help='control the shape of the characteristic curve of adaptive beta; ranging [0, inf]')
+   parser.add_argument('--ABRW-beta-mode', default=1, type=int,
+                       help='1: fixed; 2: adaptive based on average degree; 3: adaptive based on each node degree')
+   parser.add_argument('--ABRW-beta', default=0.2, type=float,
+                       help='balance struc and attr info; ranging [0, 1]; disabled if beta-mode 2 or 3')
    parser.add_argument('--AANE-lamb', default=0.05, type=float,
                        help='balance struc and attr info; ranging [0, inf]')
    parser.add_argument('--AANE-rho', default=5, type=float,
@@ -64,16 +69,16 @@ def parse_args():
                        help='max iter')
    parser.add_argument('--TADW-lamb', default=0.2, type=float,
                        help='balance struc and attr info; ranging [0, inf]')
-   parser.add_argument('--TADW-maxiter', default=10, type=int,
+   parser.add_argument('--TADW-maxiter', default=20, type=int,
                        help='max iter')
    parser.add_argument('--ASNE-lamb', default=1.0, type=float,
                        help='balance struc and attr info; ranging [0, inf]')
    parser.add_argument('--AttrComb-mode', default='concat', type=str,
                        help='choices of mode: concat, elementwise-mean, elementwise-max')
    parser.add_argument('--Node2Vec-p', default=0.5, type=float,  # if p=q=1.0 node2vec = deepwalk
-                       help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]')
+                       help='trade-off BFS and DFS; grid search [0.25; 0.50; 1; 2; 4]')
    parser.add_argument('--Node2Vec-q', default=0.5, type=float,
-                       help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]')
+                       help='trade-off BFS and DFS; grid search [0.25; 0.50; 1; 2; 4]')
    parser.add_argument('--GraRep-kstep', default=4, type=int,
                        help='use k-step transition probability matrix, error if dim%Kstep!=0')
    parser.add_argument('--LINE-order', default=3, type=int,
@@ -87,10 +92,10 @@ def parse_args():
                        help='length of each random walk')
    parser.add_argument('--window-size', default=10, type=int,
                        help='window size of skipgram model')
-   parser.add_argument('--workers', default=24, type=int,
+   parser.add_argument('--workers', default=36, type=int,
                        help='# of parallel processes.')
    # for deep learning based methods; parameters about layers and neurons used are not specified here
-   parser.add_argument('--learning-rate', default=0.001, type=float,
+   parser.add_argument('--learning-rate', default=0.0001, type=float,
                        help='learning rate')
    parser.add_argument('--batch-size', default=128, type=int,
                        help='batch size')
@@ -98,8 +103,6 @@ def parse_args():
                        help='epochs')
    parser.add_argument('--dropout', default=0.5, type=float,
                        help='dropout rate (1 - keep probability)')
-   parser.add_argument('--weight-decay', type=float, default=0.0001,
-                       help='weight for L2 loss on embedding matrix')
    args = parser.parse_args()
    return args
 
@@ -111,12 +114,12 @@ def main(args):
    # ---------------------------------------STEP1: load data-----------------------------------------------------
    print('\nSTEP1: start loading data......')
    t1 = time.time()
-   # load graph structure info------
+   # load graph structure info; by default, treat as an undirected and unweighted graph ------
    if args.graph_format == 'adjlist':
        g.read_adjlist(path=args.graph_file, directed=args.directed)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(path=args.graph_file, weighted=args.weighted, directed=args.directed)
-   # load node attribute info------
+   # load node attribute info ------
    is_ane = (args.method == 'abrw' or args.method == 'tadw' or args.method == 'gcn' or args.method == 'sagemean' or args.method == 'sagegcn' or
              args.method == 'attrpure' or args.method == 'attrcomb' or args.method == 'asne' or args.method == 'aane')
    if is_ane:
@@ -133,6 +136,10 @@ def main(args):
    test_edge_labels = []
    if args.task == 'lp' or args.task == 'lp_and_nc':
        edges_removed = g.remove_edge(ratio=args.link_remove)
+       num_test_links = 0
+       limit_percentage = 0.2  # at most, use 0.2 * all positive links as randomly-removed test links
+       num_test_links = int(min(len(edges_removed), len(edges_removed)/args.link_remove*limit_percentage))
+       edges_removed = random.sample(edges_removed, num_test_links)
        test_node_pairs, test_edge_labels = generate_edges_for_linkpred(graph=g, edges_removed=edges_removed, balance_ratio=1.0)
    t2 = time.time()
    print(f'STEP2: end preparing data; time cost: {(t2-t1):.2f}s')
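The capping logic added in STEP2, in isolation: regardless of how many links were removed, at most 20% of the (estimated) total positive links are kept as test links. Toy numbers, purely for illustration:

link_remove = 0.3                    # 30% of links were removed from the training graph
edges_removed = list(range(3000))    # pretend 3000 removed links came back from remove_edge()
limit_percentage = 0.2
num_test_links = int(min(len(edges_removed),
                         len(edges_removed) / link_remove * limit_percentage))
print(num_test_links)                # 2000 = 0.2 * (3000 / 0.3) estimated total positive links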
@@ -145,9 +152,9 @@ def main(args):
    t1 = time.time()
    model = None
    if args.method == 'abrw':
-       from libnrl import abrw  # ANE method; Attributed Biased Random Walk
-       model = abrw.ABRW(graph=g, dim=args.dim, alpha=args.ABRW_alpha, topk=args.ABRW_topk, number_walks=args.number_walks,
-                         walk_length=args.walk_length, window=args.window_size, workers=args.workers)
+       from libnrl import abrw  # ANE method; (Adaptive) Attributed Biased Random Walk
+       model = abrw.ABRW(graph=g, dim=args.dim, topk=args.ABRW_topk, beta=args.ABRW_beta, beta_mode=args.ABRW_beta_mode, alpha=args.ABRW_alpha,
+                         number_walks=args.number_walks, walk_length=args.walk_length, window=args.window_size, workers=args.workers)
    elif args.method == 'aane':
        from libnrl import aane  # ANE method
        model = aane.AANE(graph=g, dim=args.dim, lambd=args.AANE_lamb, rho=args.AANE_rho, maxiter=args.AANE_maxiter,
@@ -180,10 +187,10 @@ def main(args):
    elif args.method == 'asne':
        from libnrl import asne  # ANE method
        model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
-   elif args.method == 'sagemean':  # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v
+   elif args.method == 'sagemean':  # parameters for graphsage models are in 'graphsage' -> '__init__.py'
        from libnrl.graphsage import graphsageAPI  # ANE method
        model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
-   elif args.method == 'sagegcn':  # parameters for graphsage models are in 'graphsage' -> '__init__.py'
+   elif args.method == 'sagegcn':  # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v
        from libnrl.graphsage import graphsageAPI  # ANE method
        model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)
    else:
@@ -204,7 +211,7 @@ def main(args):
    del model, g
    # ------lp task
    if args.task == 'lp' or args.task == 'lp_and_nc':
-       print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)')
+       print(f'Link Prediction task; the number of testing links: {len(test_edge_labels)} (i.e. at most 2*0.2*all_positive_links)')
        ds_task = lpClassifier(vectors=vectors)  # similarity/distance metric as clf; basically, lp is a binary clf probelm
        ds_task.evaluate(test_node_pairs, test_edge_labels)
    # ------nc task