diff --git a/requirements.txt b/requirements.txt index 1c5e87b..8c4f577 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,13 @@ # tested in python==3.6.6 numpy==1.14.5 +scipy==1.1.0 tensorflow==1.10.0 # to do... compatible with latest tf and tf-gpu tensorboard==1.10.0 -networkx==2.2 -gensim==3.0.1 +networkx==2.3 +gensim==3.7.3 scikit-learn==0.19.0 # to do... compatible with >0.20 pandas==0.23.0 +psutil==5.6.3 # Enable GPU: # If using anaconda, run `conda install tensorflow-gpu==1.10.0` @@ -15,3 +17,47 @@ pandas==0.23.0 # Or simply build from docker image: docker pull tensorflow/tensorflow:1.10.0-gpu-py3 # ref: https://www.tensorflow.org/install/docker#gpu_support + + +''' +Package Version +--------------- -------- +absl-py 0.7.1 +astor 0.8.0 +boto 2.49.0 +boto3 1.9.160 +botocore 1.12.160 +certifi 2019.3.9 +chardet 3.0.4 +decorator 4.4.0 +docutils 0.14 +gast 0.2.2 +gensim 3.7.3 +grpcio 1.21.1 +idna 2.8 +jmespath 0.9.4 +Markdown 3.1.1 +mkl-fft 1.0.12 +mkl-random 1.0.2 +networkx 2.3 +numpy 1.14.5 +pandas 0.23.0 +pip 19.1.1 +protobuf 3.8.0 +psutil 5.6.3 +python-dateutil 2.8.0 +pytz 2019.1 +requests 2.22.0 +s3transfer 0.2.0 +scikit-learn 0.19.0 +scipy 1.1.0 +setuptools 39.1.0 +six 1.12.0 +smart-open 1.8.4 +tensorboard 1.10.0 +tensorflow 1.10.0 +termcolor 1.1.0 +urllib3 1.25.3 +Werkzeug 0.15.4 +wheel 0.33.4 +''' \ No newline at end of file diff --git a/src/libnrl/aane.py b/src/libnrl/aane.py index 9d53697..fc949cf 100644 --- a/src/libnrl/aane.py +++ b/src/libnrl/aane.py @@ -3,6 +3,17 @@ ANE method: Accelerated Attributed Network Embedding (AANE) modified by Chengbin Hou 2018 +note: We tried this method in a HPC via pbs, + however, we don't know why it is particularly slow, even we observed multiple cores were used... + We then tried this method in a small individual linux server. It works well. + If you find the same problem, just try this method in other computers. + Usually, Cora dataset only requires 20s/iter using my PC with 4 cores. + + However, when we run AANE for the large-scale dataset e.g. dblp (~60k nodes) in a Linux server with 40 cores, + it cost over 8000 seconds for each iteration... + For the reason, please see author's comments in https://github.com/xhuang31/AANE_Python/issues/5 + + originally from https://github.com/xhuang31/AANE_Python """ @@ -33,7 +44,7 @@ class AANE: $Revision: 1.0.2 $ $Date: 2018/02/19 00:00:00 $ """ - def __init__(self, graph, dim, lambd=0.05, rho=5, maxiter=5, mode='comb', *varargs): + def __init__(self, graph, dim, lambd=0.05, rho=5, maxiter=2, mode='comb', *varargs): self.dim = dim self.look_back_list = graph.look_back_list # look back node id for Net and Attr self.lambd = lambd # Initial regularization parameter diff --git a/src/libnrl/abrw.py b/src/libnrl/abrw.py index 225b9e5..a15cd19 100644 --- a/src/libnrl/abrw.py +++ b/src/libnrl/abrw.py @@ -1,5 +1,6 @@ """ -ANE method: Attributed Biased Random Walks; +ANE method: Adap-ANE: Adaptive Attributed Network Embedding + based on previous Attributed Biased Random Walks https://arxiv.org/abs/1811.11728v2 by Chengbin Hou & Zeyu Dong 2018 """ @@ -10,24 +11,46 @@ import warnings import numpy as np from gensim.models import Word2Vec from scipy import sparse +from sklearn.preprocessing import normalize +from sklearn.neighbors import NearestNeighbors +import psutil from . 
import walker from .utils import pairwise_similarity, row_as_probdist + warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim') +def deg2beta_mapping(deg_list, alpha=np.e): + ''' + ***adaptive beta*** + based on node degree for balancing structure info and attribute info + map node degree [0:+inf) -> beta [1:0] + input: deg_list: a scalar or list + alpha: default e; [0, +inf) but we suggest trying 0.5, 1, e, 10, 100, ... + output: beta_list: a scalar or list + ''' + base_list = (1.0 + np.power(deg_list, alpha)) + beta_list = np.power(base_list, -1/alpha) # characteristic curve of adaptive beta + # print('deg_list', deg_list[:50]) + # print('beta_list', np.around(beta_list, decimals=3)[:50]) + return beta_list + + class ABRW(object): - def __init__(self, graph, dim, alpha, topk, number_walks, walk_length, **kwargs): + def __init__(self, graph, dim, topk, beta, beta_mode, alpha, number_walks, walk_length, **kwargs): self.g = graph self.dim = dim - self.alpha = float(alpha) self.topk = int(topk) + self.beta = float(beta) + self.beta_mode = int(beta_mode) + self.alpha = float(alpha) self.number_walks = number_walks self.walk_length = walk_length # obtain biased transition mat ----------- - self.T = self.get_biased_transition_mat(A=self.g.get_adj_mat(), X=self.g.get_attr_mat()) + self.T = self.get_biased_transition_mat(A=self.g.get_adj_mat(dense_output=False), X=self.g.get_attr_mat(dense_output=False)) # aim to generate a sequences of walks/sentences # apply weighted random walks on the reconstructed network based on biased transition mat @@ -54,38 +77,127 @@ class ABRW(object): ''' given: A and X --> T_A and T_X research question: how to combine A and X in a more principled way - genral idea: Attribute Biased Random Walk - i.e. a walker based on a mixed transition matrix by P=alpha*T_A + (1-alpha)*T_X - result: ABRW-trainsition matrix; T + our idea: T = (1-beta)*T_A + beta*T_X + mode 1: fixed beta + mode 2: adaptive beta baed on average degree + mode 3: adaptive beta based on each node degree ''' print("obtaining biased transition matrix where each row sums up to 1.0...") - preserve_zeros = False - T_A = row_as_probdist(A, preserve_zeros) # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row - print('Preserve zero rows of the adj matrix: ', preserve_zeros) - - t1 = time.time() - X_sim = pairwise_similarity(X) # attr similarity mat; X_sim is a square mat, but X is not - - t2 = time.time() - print(f'keep the top {self.topk} attribute similar nodes w.r.t. 
a node') - cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1) - X_sim[(X_sim < cutoff)] = 0 # improve both accuracy and efficiency - X_sim = sparse.csr_matrix(X_sim) - - t3 = time.time() - T_X = row_as_probdist(X_sim) - - t4 = time.time() - print(f'attr sim cal time: {(t2-t1):.2f}s; topk sparse ops time: {(t3-t2):.2f}s; row norm time: {(t4-t3):.2f}s') - del A, X, X_sim - - # =====================================information fusion via transition matrices======================================== - print('------alpha for P = alpha * T_A + (1-alpha) * T_X------: ', self.alpha) + # norm adj mat; For isolated node, return all-zeros row, so that T_A is not a strict transition matrix + # preserve_all_zero_row=False gives similar result, but is less efficient + t0 = time.time() + T_A = row_as_probdist(A, dense_output=False, preserve_all_zero_row=True) # **sparse mat** + T_X = None + n = self.g.get_num_nodes() - alp = np.array(n * [self.alpha]) # for vectorized computation - alp[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 0 - T = sparse.diags(alp).dot(T_A) + sparse.diags(1 - alp).dot(T_X) # sparse version + free_memory = psutil.virtual_memory().available + print('free_memory ', free_memory) + # n*n*8 is the bytes required by pairwise similarity matrix; 2e9 = 2GB ROM remained for safety reason + # if your computer have 200G memory, there should be no problem for graph with 100k nodes + # this naive implementation is **faster** than BallTree implementation, thanks to numpy + #if n*n*8 + n*n*8 + n*5000*8 + 2e9 < free_memory and n < 1e5: # X_sim[n,n] dense + A[n,n] if dense + X[n,5000] if dense with max 5000 feats + 2e9 for safety + if False: + print('naive implementation + intro-select ') + t1 = time.time() + X_sim = pairwise_similarity(X.todense()) + # sparse operator; reduce time and space complexity & remove less useful dissimilar nodes + t2 = time.time() + print(f'keep the top {self.topk} attribute similar nodes w.r.t. a node') + cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1).reshape(-1,1) # introselect average speed O(1); see link below + X_sim[(X_sim < cutoff)] = 0 # https://docs.scipy.org/doc/numpy/reference/generated/numpy.partition.html + X_sim = sparse.csr_matrix(X_sim) + X_sim.setdiag(0) + # norm attr mat; note: T_X mush be a strict transition matrix, thanks to the strict transition matrix of X_sim + t3 = time.time() + T_X = row_as_probdist(X_sim, dense_output=False, preserve_all_zero_row=False) # **sparse mat** + t4 = time.time() + print(f'attr sim cal time: {(t2-t1):.2f}s; topk sparse ops time: {(t3-t2):.2f}s') + print(f'adj row norm time: {(t1-t0):.2f}s; attr row norm time: {(t4-t3):.2f}s') + print('all naive implementation time: ', t4-t1) + del A, X, X_sim, cutoff + # a scalable w.r.t. both time and space + # but might be slightly slower when n is small e.g. 
n<100k + # BallTree time complexity O( nlong(n) ) + else: + print('BallTree implementation + multiprocessor query') + t1 = time.time() + X = normalize(X.todense(), norm='l2', axis=1) + t2 = time.time() + print('normalize time: ',t2-t1) + # after normalization -> Euclidean distance = cosine distance (inverse of cosine similarity) + neigh = NearestNeighbors(n_neighbors=self.topk, algorithm='ball_tree', leaf_size=40, metric='minkowski', p=2, n_jobs=-1) + neigh.fit(X) + t3 = time.time() + print('BallTree time: ',t3-t2) + dist, ind = neigh.kneighbors(X[:]) # Euclidean dist, indices + # print('dist',dist) + # print('ind',ind) + t4 = time.time() + print('query time: ',t4-t3) + sim = 1-np.multiply(dist, dist)/2 # cosine distance -> cosine similarity + # print('sim: ',sim) + t5 = time.time() + print('cosine distance -> cosine similarity time: ',t5-t4) + row = [] + col = [] + data = [] + for i in range(n): + row.extend( [i]* self.topk ) + col.extend( ind[i] ) + data.extend( sim[i] ) + t6 = time.time() + print('sparse matrix data & ind construction for loop time: ',t6-t5) + zero_row_ind = np.where(~X.any(axis=1))[0] + # print('zero_row_ind',zero_row_ind) + X_sim = sparse.csc_matrix((data, (row, col)), shape=(n, n)) + for col in zero_row_ind: + X_sim.data[X_sim.indptr[col]:X_sim.indptr[col+1]] = 0 + X_sim = sparse.csr_matrix(X_sim) + for row in zero_row_ind: + X_sim.data[X_sim.indptr[row]:X_sim.indptr[row+1]] = 0 + X_sim.setdiag(0) + X_sim.eliminate_zeros() + t7 = time.time() + # print(X_sim.todense()) + print('sparse.csr_matrix time:',t7-t6) + T_X = row_as_probdist(X_sim, dense_output=False, preserve_all_zero_row=False) # **sparse mat** + t8 = time.time() + print('BallTree implementation ALL time',t8-t1) + del A, X, X_sim, data, row, col, neigh, sim + + + # ============================================== information fusion via transition matrices ======================================================= + print('about beta, beta_mode, alpha: ', self.beta, self.beta_mode, self.alpha) + b = None + + # mode 1: fixed beta, except if T_A has any zero rows, set beta=1.0 + if self.beta_mode == 1: + print('====== fixed beta: T = (1-beta)*T_A + beta*T_X where beta= ', self.beta) + b = np.array(n * [self.beta]) # vectored computing + b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0 # if T_A has any zero rows, set beta=0 + + # mode 2: adaptive beta baed on average degree which reflects the richness of structural info + if self.beta_mode == 2: + print('====== adaptive beta: T = (1-beta)*T_A + beta*T_X, where adaptive beta=(1.0+ave_deg^alpha)^(-1.0/alpha) and alpha= ', self.alpha) + if self.g.G.is_directed(): + print('directed graph, TODO...') + exit(0) + ave_deg = len(self.g.G.edges()) * 2.0 / len(self.g.G.nodes()) # see def http://konect.uni-koblenz.de/statistics/avgdegree + b = deg2beta_mapping(ave_deg, alpha=self.alpha) # mapping by the characteristic curve of adaptive beta + b = np.array(n * [b]) + b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0 + + # mode 3: adaptive beta based on each node degree + if self.beta_mode == 3: + print('====== adaptive beta: T = (1-beta)*T_A + beta*T_X, where adaptive beta=(1.0+node_deg^alpha)^(-1.0/alpha) and alpha= ', self.alpha) + if self.g.G.is_directed(): + print('directed graph, TODO...') + exit(0) + node_deg_list = [deg*2 for (node, deg) in self.g.G.degree()] # *2 due to undirected graph; in consistant with ave_deg after mapping + b = deg2beta_mapping(node_deg_list, alpha=self.alpha) # mapping by the characteristic curve of adaptive beta + + T = 
sparse.diags(1.0-b).dot(T_A) + sparse.diags(b).dot(T_X) t5 = time.time() print(f'ABRW biased transition matrix processing time: {(t5-t4):.2f}s') return T @@ -97,3 +209,19 @@ class ABRW(object): for node, vec in self.vectors.items(): fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) fout.close() + + +# ------------------------ utils draw_characteristic_curve --------------------------- +def draw_characteristic_curve(): + import matplotlib.pyplot as plt + deg_list = np.arange(0, 100, 0.01) + beta_list_1 = deg2beta_mapping(deg_list, alpha=0.5) + beta_list_2 = deg2beta_mapping(deg_list, alpha=1) + beta_list_3 = deg2beta_mapping(deg_list, alpha=np.e) + beta_list_4 = deg2beta_mapping(deg_list, alpha=10) + plt.plot(deg_list, beta_list_1, label='alpha=0.5') + plt.plot(deg_list, beta_list_2, label='alpha=1') + plt.plot(deg_list, beta_list_3, label='alpha=np.e') + plt.plot(deg_list, beta_list_4, label='alpha=10') + plt.legend() + plt.show() \ No newline at end of file diff --git a/src/libnrl/asne.py b/src/libnrl/asne.py index 2b1211d..13afaae 100644 --- a/src/libnrl/asne.py +++ b/src/libnrl/asne.py @@ -20,7 +20,7 @@ from sklearn.base import BaseEstimator, TransformerMixin class ASNE(BaseEstimator, TransformerMixin): def __init__(self, graph, dim, alpha=1.0, learning_rate=0.0001, batch_size=128, epoch=20, n_neg_samples=10, - early_stopping=2000): # it seems that overfitting can get better result? try other early_stopping... to do... + early_stopping=2000): t1 = time.time() X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size = format_data_from_OpenANE_to_ASNE(g=graph, dim=dim) @@ -121,7 +121,7 @@ class ASNE(BaseEstimator, TransformerMixin): iter_count = 0 train_loss_best = 0 train_loss_keep_increasing = 0 - early_stopping = self.early_stopping # early stopping if training loss increased + early_stopping = self.early_stopping # early stopping if training loss increased for early_stopping times for epoch in range(self.epoch): t1 = time.time() @@ -136,6 +136,13 @@ class ASNE(BaseEstimator, TransformerMixin): # Fit training using batch data train_loss = self.partial_fit(batch_xs) + ''' + # no early stopping; + # as the original code did https://github.com/lizi-git/ASNE/blob/master/SNE.py + # it seems that the more epochs are, the better results are... + # e.g. 10 epochs are obviously better than 1 epoch, + # but after approximately 50 epochs, there is no much gain or loss... 
+ # so we run for all 100 epochs to ensure the best performance iter_count += 1 if iter_count == 1: train_loss_best = train_loss @@ -157,6 +164,7 @@ class ASNE(BaseEstimator, TransformerMixin): return self.vectors else: pass + ''' t2 = time.time() print(f'epoch @ {epoch+1}/{self.epoch}; time cost: {(t2-t1):.2f}s',) @@ -195,7 +203,7 @@ class ASNE(BaseEstimator, TransformerMixin): def format_data_from_OpenANE_to_ASNE(g, dim): ''' convert OpenANE data format to ASNE data format ''' - attr_Matrix = g.get_attr_mat(is_sparse=False) + attr_Matrix = g.get_attr_mat(dense_output=True) id_N = attr_Matrix.shape[0] # n nodes attr_M = attr_Matrix.shape[1] # m features diff --git a/src/libnrl/attrpure.py b/src/libnrl/attrpure.py index 1848e08..a18633f 100644 --- a/src/libnrl/attrpure.py +++ b/src/libnrl/attrpure.py @@ -21,7 +21,7 @@ class ATTRPURE(object): self.vectors[key] = embeddings[ind] def train(self): - X = self.g.get_attr_mat().todense() + X = self.g.get_attr_mat() X_compressed = None if self.mode == 'pca': X_compressed = dim_reduction(X, dim=self.dim, method='pca') diff --git a/src/libnrl/downstream.py b/src/libnrl/downstream.py index 9199fff..fc6ffef 100644 --- a/src/libnrl/downstream.py +++ b/src/libnrl/downstream.py @@ -100,6 +100,21 @@ class lpClassifier(object): roc = 1.0 - roc # since lp is binary clf task, just predict the opposite if<0.5 print("roc=", "{:.9f}".format(roc)) +def cosine_similarity(a, b): + from numpy import dot + from numpy.linalg import norm + ''' cosine similarity; can be used as score function; vector by vector; + If consider similarity for all pairs, + pairwise_similarity() implementation may be more efficient + ''' + a = np.reshape(a,-1) + b = np.reshape(b,-1) + if norm(a)*norm(b) == 0: + return 0.0 + else: + return dot(a, b)/(norm(a)*norm(b)) + +''' def norm(a): sum = 0.0 for i in range(len(a)): @@ -111,6 +126,7 @@ def cosine_similarity(a, b): for i in range(len(a)): sum = sum + a[i] * b[i] return sum / (norm(a) * norm(b) + 1e-100) +''' ''' def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0): diff --git a/src/libnrl/graph.py b/src/libnrl/graph.py index af57000..2218b96 100644 --- a/src/libnrl/graph.py +++ b/src/libnrl/graph.py @@ -92,24 +92,24 @@ class Graph(object): # ------------------------------------------------------------------------------------------ # --------------------commonly used APIs that will not modify graph------------------------- # ------------------------------------------------------------------------------------------ - def get_adj_mat(self, is_sparse=True): + def get_adj_mat(self, dense_output=True): """ return adjacency matrix; \n use 'csr' format for sparse matrix \n """ - if is_sparse: - return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64') - else: + if dense_output: return nx.to_numpy_matrix(self.G, nodelist=self.look_back_list, dtype='float64') + else: + return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64') - def get_attr_mat(self, is_sparse=True): + def get_attr_mat(self, dense_output=True): """ return attribute matrix; \n use 'csr' format for sparse matrix \n """ attr_dense_narray = np.vstack([self.G.nodes[self.look_back_list[i]]['attr'] for i in range(self.get_num_nodes())]) - if is_sparse: - return sp.csr_matrix(attr_dense_narray, dtype='float64') - else: + if dense_output: return np.matrix(attr_dense_narray, dtype='float64') + else: + return sp.csr_matrix(attr_dense_narray, dtype='float64') def get_num_nodes(self): """ return 
the number of nodes """ diff --git a/src/libnrl/graphsage/__init__.py b/src/libnrl/graphsage/__init__.py index afb5be0..f6c1c13 100644 --- a/src/libnrl/graphsage/__init__.py +++ b/src/libnrl/graphsage/__init__.py @@ -1,6 +1,10 @@ -''' global parameters for graphsage models - tune these parameters here if needed - if needed use: from libnrl.graphsage.__init__ import * +''' +global parameters for graphsage models +tune these parameters here if needed +if needed use: from libnrl.graphsage.__init__ import * +we mostly follow the original code: +https://github.com/williamleif/GraphSAGE/blob/master/graphsage/unsupervised_train.py +and https://github.com/tkipf/gcn/blob/master/gcn/train.py ''' # seed = 2018 @@ -8,10 +12,7 @@ # tf.set_random_seed(seed) log_device_placement = False - -# follow the original code by the paper author https://github.com/williamleif/GraphSAGE # we follow the opt parameters given by papers GCN and graphSAGE -# note: citeseer+pubmed all follow the same parameters as cora, see their papers) # tensorflow + Adam optimizer + Random weight init + row norm of attr dim_1 = 64 # dim = dim1+dim2 = 128 for sage-mean and sage-gcn dim_2 = 64 @@ -19,19 +20,20 @@ samples_1 = 25 samples_2 = 10 # key parameters during training -epochs = 100 -learning_rate = 0.001 # search [0.01, 0.001, 0.0001, 0.00001] -dropout = 0.5 -weight_decay = 5e-4 -batch_size = 512 # if run out of memory, try to reduce them, default=512 +epochs = 50 # max epoch, we found it converges in a few epochs, and the more links are, the less epochs are required + # so we set run for all 50 epochs and take out the embeddings with the best val loss +learning_rate = 0.0001 # search [0.01, 0.001, 0.0001] +dropout = 0.5 # dropout rate (1 - keep probability) +batch_size = 128 # if run out of memory, try to reduce them, default=512 +weight_decay = 1e-6 # weight for L2 loss on embedding matrix # key parameters durning val -validate_batch_size = 256 # if run out of memory, try to reduce them, default=256 +validate_batch_size = 128 # if run out of memory, try to reduce them, default=256 validate_iter = 5000 max_total_steps = 10**10 print_every = 50 -# other parameters also follow the defaults https://github.com/williamleif/GraphSAGE +# other parameters: also follow the defaults https://github.com/williamleif/GraphSAGE neg_sample_size = 20 identity_dim = 0 n2v_test_epochs = 1 @@ -44,6 +46,7 @@ base_log_dir = '' ''' +https://github.com/williamleif/GraphSAGE/blob/master/graphsage/unsupervised_train.py #core params.. flags.DEFINE_string('model', 'graphsage', 'model names. 
See README for possible values.') flags.DEFINE_float('learning_rate', 0.00001, 'initial learning rate.') @@ -73,4 +76,19 @@ flags.DEFINE_integer('validate_batch_size', 256, "how many nodes per validation flags.DEFINE_integer('gpu', 1, "which gpu to use.") flags.DEFINE_integer('print_every', 50, "How often to print training info.") flags.DEFINE_integer('max_total_steps', 10**10, "Maximum total number of iterations") + +---------------------------------------------------------------------------------------------------------- + +https://github.com/tkipf/gcn/blob/master/gcn/train.py +flags = tf.app.flags +FLAGS = flags.FLAGS +flags.DEFINE_string('dataset', 'cora', 'Dataset string.') # 'cora', 'citeseer', 'pubmed' +flags.DEFINE_string('model', 'gcn', 'Model string.') # 'gcn', 'gcn_cheby', 'dense' +flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.') +flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.') +flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.') +flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).') +flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.') +flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).') +flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.') ''' diff --git a/src/libnrl/graphsage/graphsageAPI.py b/src/libnrl/graphsage/graphsageAPI.py index 2753263..0be813e 100644 --- a/src/libnrl/graphsage/graphsageAPI.py +++ b/src/libnrl/graphsage/graphsageAPI.py @@ -9,6 +9,7 @@ ''' import random +import time import networkx as nx import numpy as np @@ -24,7 +25,11 @@ class graphSAGE(object): self.walk_len = 5 self.add_train_val_test_to_G(test_perc=0.0, val_perc=0.1) # if unsupervised, no test data + + t1 = time.time() train_data = self.tranform_data_for_graphsage() # obtain graphSAGE required training data + t2 = time.time() + print(f'transform data format from OpenANE to SAGE; time cost: {(t2-t1):.2f}s') self.vectors = None if not is_supervised: diff --git a/src/libnrl/graphsage/unsupervised_train.py b/src/libnrl/graphsage/unsupervised_train.py index a496941..598d9ee 100644 --- a/src/libnrl/graphsage/unsupervised_train.py +++ b/src/libnrl/graphsage/unsupervised_train.py @@ -81,7 +81,7 @@ def construct_placeholders(): def train(train_data, test_data, model): print('---------- the graphsage model we used: ', model) - print('---------- parameters we sued: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size', + print('---------- parameters we used: epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size', epochs, dim_1+dim_2, samples_1, samples_2, dropout, weight_decay, learning_rate, batch_size) G = train_data[0] features = train_data[1] # note: features are in order of graph.look_up_list, since id_map = {k: v for v, k in enumerate(graph.look_back_list)} diff --git a/src/libnrl/tadw.py b/src/libnrl/tadw.py index 55521e1..6d8a73d 100644 --- a/src/libnrl/tadw.py +++ b/src/libnrl/tadw.py @@ -29,7 +29,7 @@ class TADW(object): def getAdj(self): A = self.g.get_adj_mat() # by default, return a sparse matrix - return np.array(row_as_probdist(A, dense_output=True, preserve_zeros=True)) # only support np.array, otherwise dim error... + return np.array(row_as_probdist(A, dense_output=True, preserve_all_zero_row=True)) # only support np.array, otherwise dim error... 
def getT(self): g = self.g.G diff --git a/src/libnrl/utils.py b/src/libnrl/utils.py index 106b433..614cb5b 100644 --- a/src/libnrl/utils.py +++ b/src/libnrl/utils.py @@ -1,5 +1,5 @@ """ -commonly used ulits +commonly used utils by Chengbin Hou & Zeyu Dong """ @@ -11,7 +11,7 @@ from scipy import sparse # ---------------------------------ulits for calculation-------------------------------- -def row_as_probdist(mat, dense_output=False, preserve_zeros=False): +def row_as_probdist(mat, dense_output=True, preserve_all_zero_row=False): """Make each row of matrix sums up to 1.0, i.e., a probability distribution. Support both dense and sparse matrix. @@ -36,7 +36,8 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False): row_sum[zero_rows] = 1 diag = sparse.dia_matrix((1 / row_sum, 0), (mat.shape[0], mat.shape[0])) mat = diag.dot(mat) - if not preserve_zeros: + if not preserve_all_zero_row: + print('For all-zero row, replace each 0 with value 1/dim(row)... not preserving zero i.e. a strict transition matrix') mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1]))) if dense_output and sparse.issparse(mat): @@ -44,7 +45,7 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False): return mat -def pairwise_similarity(mat, type='cosine'): +def pairwise_similarity(mat, type='cosine'): # for efficiency, plz given dense mat as the input if type == 'cosine': # support sprase and dense mat from sklearn.metrics.pairwise import cosine_similarity result = cosine_similarity(mat, dense_output=True) diff --git a/src/libnrl/walker.py b/src/libnrl/walker.py index a092cd6..b810b40 100644 --- a/src/libnrl/walker.py +++ b/src/libnrl/walker.py @@ -20,10 +20,11 @@ class WeightedWalker: ''' Weighted Walker for Attributed Biased Randomw Walks (ABRW) method ''' - def __init__(self, node_id_map, transition_mat, workers): - self.look_back_list = node_id_map + def __init__(self, node_id_map, transition_mat, workers, parallel_walks=10): # recommend: parallel_walks = number_walks + self.look_back_list = node_id_map # if memory error due to python multiprocessor module, plz reduce parallel_walks self.T = transition_mat self.workers = workers + self.parallel_walks = parallel_walks self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph()) # reconstructed "directed" "weighted" graph based on transition matrix # alias sampling for ABRW------------------------- @@ -38,8 +39,10 @@ class WeightedWalker: t2 = time.time() print(f'Time for construct alias table: {(t2-t1):.2f}') - pool = multiprocessing.Pool(processes=self.workers) + pool = multiprocessing.Pool(processes=self.parallel_walks) all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks)) + pool.close() # Waiting for all subprocesses done.. + pool.join() all_walks = list(chain(*all_walks)) t3 = time.time() print(f'Time for all random walks: {(t3-t2):.2f}') # use multiple cores, total time < sum(time@itr) @@ -51,8 +54,12 @@ class WeightedWalker: def mp_rw_wrapper(self, walk_iter): walks = [] + random.seed() # *** for multiprocessor version + np.random.seed() # *** do NOT remove these 'random' operation + nodes = list(self.nodes.copy()) # *** otherwise, each number_walks may give the same node sequences... 
+ random.shuffle(nodes) # *** which hence decrease performance t1 = time.time() - for node in self.nodes: + for node in nodes: walks.append(self.weighted_walk(start_node=node)) t2 = time.time() print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}') @@ -87,11 +94,12 @@ def deepwalk_walk_wrapper(class_instance, walk_length, start_node): class_instance.deepwalk_walk(walk_length, start_node) class BasicWalker: - def __init__(self, g, workers): + def __init__(self, g, workers, parallel_walks=10): # recommend: parallel_walks = number_walks; if memory error, plz reduce it self.g = g self.node_size = g.get_num_nodes() self.look_up_dict = g.look_up_dict self.workers = workers + self.parallel_walks = parallel_walks def deepwalk_walk(self, start_node): ''' @@ -112,8 +120,12 @@ class BasicWalker: def mp_rw_wrapper(self, walk_iter): walks = [] + random.seed() # *** for multiprocessor version + np.random.seed() # *** do NOT remove these 'random' operation + nodes = list(self.nodes.copy()) # *** otherwise, each number_walks may give the same node sequences... + random.shuffle(nodes) # *** which hence decrease performance t1 = time.time() - for node in self.nodes: + for node in nodes: walks.append(self.deepwalk_walk(start_node=node)) t2 = time.time() print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}') @@ -130,8 +142,10 @@ class BasicWalker: all_walks = None t1 = time.time() - pool = multiprocessing.Pool(processes=self.workers) + pool = multiprocessing.Pool(processes=self.parallel_walks) all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks)) + pool.close() # Waiting for all subprocesses done.. + pool.join() all_walks = list(chain(*all_walks)) t2 = time.time() print(f'Time for all random walks: {(t2-t1):.2f}') # use multiple cores, total time < sum(time@itr) @@ -140,11 +154,12 @@ class BasicWalker: # ===========================================node2vec-walker============================================ class Walker: - def __init__(self, g, p, q, workers): + def __init__(self, g, p, q, workers, parallel_walks=10): # recommend: parallel_walks = number_walks; if memory error, plz reduce it self.g = g self.p = p self.q = q self.workers = workers + self.parallel_walks = parallel_walks if self.g.get_isweighted(): # print('is weighted graph: ', self.g.get_isweighted()) @@ -180,8 +195,12 @@ class Walker: def mp_rw_wrapper(self, walk_iter): walks = [] + random.seed() # *** for multiprocessor version + np.random.seed() # *** do NOT remove these 'random' operation + nodes = list(self.nodes.copy()) # *** otherwise, each number_walks may give the same node sequences... + random.shuffle(nodes) # *** which hence decrease performance t1 = time.time() - for node in self.nodes: + for node in nodes: walks.append(self.node2vec_walk(start_node=node)) t2 = time.time() print(f'Walk iteration: {walk_iter+1}/{self.num_walks}; time cost: {(t2-t1):.2f}') @@ -197,8 +216,10 @@ class Walker: all_walks = None t1 = time.time() - pool = multiprocessing.Pool(processes=self.workers) + pool = multiprocessing.Pool(processes=self.parallel_walks) all_walks = pool.map(self.mp_rw_wrapper, range(self.num_walks)) + pool.close() # Waiting for all subprocesses done.. 
+ pool.join() all_walks = list(chain(*all_walks)) t2 = time.time() print(f'Time for all random walks: {(t2-t1):.2f}') # use multiple cores, total time < sum(time@itr) diff --git a/src/main.py b/src/main.py index ce8ea03..c71a95e 100644 --- a/src/main.py +++ b/src/main.py @@ -11,6 +11,7 @@ by Chengbin HOU 2018 ''' import time +import random from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser from sklearn.linear_model import LogisticRegression # to do... try SVM... @@ -35,9 +36,9 @@ def parse_args(): help='node embeddings dimensions') parser.add_argument('--task', default='lp_and_nc', choices=['none', 'lp', 'nc', 'lp_and_nc'], help='choices of downstream tasks: none, lp, nc, lp_and_nc') - parser.add_argument('--link-remove', default=0.1, type=float, + parser.add_argument('--link-remove', default=0.2, type=float, help='simulate randomly missing links if necessary; a ratio ranging [0.0, 1.0]') - parser.add_argument('--label-reserved', default=0.7, type=float, + parser.add_argument('--label-reserved', default=0.5, type=float, help='for nc task, train/test split, a ratio ranging [0.0, 1.0]') parser.add_argument('--directed', default=False, action='store_true', help='directed or undirected graph') @@ -54,8 +55,12 @@ def parse_args(): help='choices of Network Embedding methods') parser.add_argument('--ABRW-topk', default=30, type=int, help='select the most attr similar top k nodes of a node; ranging [0, # of nodes]') - parser.add_argument('--ABRW-alpha', default=0.8, type=float, - help='balance struc and attr info; ranging [0, 1]') + parser.add_argument('--ABRW-alpha', default=2.71828, type=float, + help='control the shape of characteristic curve of adaptive beta, ranging [0, inf]') + parser.add_argument('--ABRW-beta-mode', default=1, type=int, + help='1: fixed; 2: adaptive based on average degree; 3: adaptive based on each node degree') + parser.add_argument('--ABRW-beta', default=0.2, type=float, + help='balance struc and attr info; ranging [0, 1]; disabled if beta-mode 2 or 3') parser.add_argument('--AANE-lamb', default=0.05, type=float, help='balance struc and attr info; ranging [0, inf]') parser.add_argument('--AANE-rho', default=5, type=float, @@ -64,16 +69,16 @@ def parse_args(): help='max iter') parser.add_argument('--TADW-lamb', default=0.2, type=float, help='balance struc and attr info; ranging [0, inf]') - parser.add_argument('--TADW-maxiter', default=10, type=int, + parser.add_argument('--TADW-maxiter', default=20, type=int, help='max iter') parser.add_argument('--ASNE-lamb', default=1.0, type=float, help='balance struc and attr info; ranging [0, inf]') parser.add_argument('--AttrComb-mode', default='concat', type=str, help='choices of mode: concat, elementwise-mean, elementwise-max') parser.add_argument('--Node2Vec-p', default=0.5, type=float, # if p=q=1.0 node2vec = deepwalk - help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]') + help='trade-off BFS and DFS; grid search [0.25; 0.50; 1; 2; 4]') parser.add_argument('--Node2Vec-q', default=0.5, type=float, - help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]') + help='trade-off BFS and DFS; grid search [0.25; 0.50; 1; 2; 4]') parser.add_argument('--GraRep-kstep', default=4, type=int, help='use k-step transition probability matrix, error if dim%Kstep!=0') parser.add_argument('--LINE-order', default=3, type=int, @@ -87,10 +92,10 @@ def parse_args(): help='length of each random walk') parser.add_argument('--window-size', default=10, type=int, help='window size of skipgram model') - 
parser.add_argument('--workers', default=24, type=int, + parser.add_argument('--workers', default=36, type=int, help='# of parallel processes.') # for deep learning based methods; parameters about layers and neurons used are not specified here - parser.add_argument('--learning-rate', default=0.001, type=float, + parser.add_argument('--learning-rate', default=0.0001, type=float, help='learning rate') parser.add_argument('--batch-size', default=128, type=int, help='batch size') @@ -98,8 +103,6 @@ def parse_args(): help='epochs') parser.add_argument('--dropout', default=0.5, type=float, help='dropout rate (1 - keep probability)') - parser.add_argument('--weight-decay', type=float, default=0.0001, - help='weight for L2 loss on embedding matrix') args = parser.parse_args() return args @@ -111,12 +114,12 @@ def main(args): # ---------------------------------------STEP1: load data----------------------------------------------------- print('\nSTEP1: start loading data......') t1 = time.time() - # load graph structure info------ + # load graph structure info; by defalt, treat as undirected and unweighted graph ------ if args.graph_format == 'adjlist': g.read_adjlist(path=args.graph_file, directed=args.directed) elif args.graph_format == 'edgelist': g.read_edgelist(path=args.graph_file, weighted=args.weighted, directed=args.directed) - # load node attribute info------ + # load node attribute info ------ is_ane = (args.method == 'abrw' or args.method == 'tadw' or args.method == 'gcn' or args.method == 'sagemean' or args.method == 'sagegcn' or args.method == 'attrpure' or args.method == 'attrcomb' or args.method == 'asne' or args.method == 'aane') if is_ane: @@ -133,6 +136,10 @@ def main(args): test_edge_labels = [] if args.task == 'lp' or args.task == 'lp_and_nc': edges_removed = g.remove_edge(ratio=args.link_remove) + num_test_links = 0 + limit_percentage = 0.2 # at most, use 0.2 randomly removed links for testing + num_test_links = int( min(len(edges_removed), len(edges_removed)/args.link_remove*limit_percentage) ) + edges_removed = random.sample(edges_removed, num_test_links) test_node_pairs, test_edge_labels = generate_edges_for_linkpred(graph=g, edges_removed=edges_removed, balance_ratio=1.0) t2 = time.time() print(f'STEP2: end preparing data; time cost: {(t2-t1):.2f}s') @@ -145,9 +152,9 @@ def main(args): t1 = time.time() model = None if args.method == 'abrw': - from libnrl import abrw # ANE method; Attributed Biased Random Walk - model = abrw.ABRW(graph=g, dim=args.dim, alpha=args.ABRW_alpha, topk=args.ABRW_topk, number_walks=args.number_walks, - walk_length=args.walk_length, window=args.window_size, workers=args.workers) + from libnrl import abrw # ANE method; (Adaptive) Attributed Biased Random Walk + model = abrw.ABRW(graph=g, dim=args.dim, topk=args.ABRW_topk, beta=args.ABRW_beta, beta_mode=args.ABRW_beta_mode, alpha=args.ABRW_alpha, + number_walks=args.number_walks, walk_length=args.walk_length, window=args.window_size, workers=args.workers) elif args.method == 'aane': from libnrl import aane # ANE method model = aane.AANE(graph=g, dim=args.dim, lambd=args.AANE_lamb, rho=args.AANE_rho, maxiter=args.AANE_maxiter, @@ -180,10 +187,10 @@ def main(args): elif args.method == 'asne': from libnrl import asne # ANE method model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10) - elif args.method == 'sagemean': # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v + elif 
args.method == 'sagemean': # parameters for graphsage models are in 'graphsage' -> '__init__.py' from libnrl.graphsage import graphsageAPI # ANE method model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False) - elif args.method == 'sagegcn': # parameters for graphsage models are in 'graphsage' -> '__init__.py' + elif args.method == 'sagegcn': # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v from libnrl.graphsage import graphsageAPI # ANE method model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False) else: @@ -204,7 +211,7 @@ def main(args): del model, g # ------lp task if args.task == 'lp' or args.task == 'lp_and_nc': - print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)') + print(f'Link Prediction task; the number of testing links: {len(test_edge_labels)} (i.e., at most 2*0.2*all_positive_links)') ds_task = lpClassifier(vectors=vectors) # similarity/distance metric as clf; basically, lp is a binary clf problem ds_task.evaluate(test_node_pairs, test_edge_labels) # ------nc task
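
The adaptive beta introduced above (deg2beta_mapping in abrw.py) maps node degree to a mixing weight via beta = (1 + deg**alpha)**(-1/alpha): degree 0 gives beta = 1 (rely entirely on attribute info), and beta decays towards 0 roughly like 1/deg as degree grows, with alpha controlling the shape of the curve. A minimal standalone sketch of that mapping, for illustration only (the printed values are approximate):

# --------- sketch: adaptive beta characteristic curve (editorial illustration, not repo code) ---------
import numpy as np

def deg2beta_mapping(deg_list, alpha=np.e):
    # beta = (1 + deg**alpha) ** (-1/alpha), same formula as in abrw.py
    base_list = 1.0 + np.power(deg_list, alpha)
    return np.power(base_list, -1.0 / alpha)

for deg in [0, 1, 2, 8, 64]:
    print(deg, round(float(deg2beta_mapping(deg)), 3))
# with alpha=e: deg=0 -> 1.0, deg=1 -> ~0.78, deg=2 -> ~0.47, deg=8 -> ~0.13, deg=64 -> ~0.016 (close to 1/deg)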
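
In the "naive implementation + intro-select" branch above, the dense attribute-similarity matrix is thinned by keeping only the topk largest entries per row before converting it to CSR; np.partition (introselect) gives the per-row threshold without a full sort. A toy, self-contained version of just that step, with made-up similarity values:

# --------- sketch: top-k row sparsification via np.partition (editorial illustration, not repo code) ---------
import numpy as np
from scipy import sparse

topk = 2
X_sim = np.array([[1.0, 0.9, 0.1, 0.3],
                  [0.9, 1.0, 0.2, 0.8],
                  [0.1, 0.2, 1.0, 0.4],
                  [0.3, 0.8, 0.4, 1.0]])

# per-row threshold = smallest of the topk largest entries in that row
cutoff = np.partition(X_sim, -topk, axis=1)[:, -topk:].min(axis=1).reshape(-1, 1)
X_sim[X_sim < cutoff] = 0                # drop the less similar (less useful) entries
X_sim_sp = sparse.csr_matrix(X_sim)
X_sim_sp.setdiag(0)                      # no self-transition through the attribute channel
X_sim_sp.eliminate_zeros()
print(X_sim_sp.toarray())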
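
The BallTree branch relies on the identity that, for L2-normalised rows, squared Euclidean distance and cosine similarity are linked by ||a - b||^2 = 2 - 2*cos(a, b), which is exactly the sim = 1 - dist^2 / 2 conversion applied after the kneighbors query. A small numerical check of that identity on random toy data (sketch only):

# --------- sketch: cosine similarity from Euclidean k-NN on normalised rows (editorial illustration) ---------
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

X = normalize(np.random.RandomState(0).rand(10, 5), norm='l2', axis=1)   # unit-length rows
neigh = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(X)
dist, ind = neigh.kneighbors(X)                     # Euclidean distances and neighbour indices
sim_from_dist = 1.0 - dist ** 2 / 2.0               # convert distance back to cosine similarity
sim_direct = cosine_similarity(X)[np.arange(10)[:, None], ind]
print(np.allclose(sim_from_dist, sim_direct))       # True (up to floating-point error)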
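
Whichever beta mode is selected, the fusion step builds one per-node vector b and mixes the two transition matrices row by row as T = diag(1-b)*T_A + diag(b)*T_X, forcing b = 1 for isolated nodes so that their rows come entirely from the attribute channel and every row of T still sums to 1. A toy 3-node sketch of that mixing (matrix values are made up):

# --------- sketch: per-node fusion of structural and attribute transition matrices (editorial illustration) ---------
import numpy as np
from scipy import sparse

T_A = sparse.csr_matrix(np.array([[0.0, 0.5, 0.5],
                                  [1.0, 0.0, 0.0],
                                  [0.0, 0.0, 0.0]]))   # node 2 is isolated: all-zero row
T_X = sparse.csr_matrix(np.array([[0.0, 0.8, 0.2],
                                  [0.7, 0.0, 0.3],
                                  [0.5, 0.5, 0.0]]))

b = np.array([0.2, 0.2, 0.2])                          # e.g. fixed beta mode with beta = 0.2
b[~np.asarray(T_A.sum(axis=1) != 0).ravel()] = 1.0     # isolated node -> walk purely by attributes

T = sparse.diags(1.0 - b).dot(T_A) + sparse.diags(b).dot(T_X)
print(T.toarray())
print(T.sum(axis=1))                                   # every row still sums to 1.0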
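
row_as_probdist in utils.py normalises every row to sum to 1.0 and, unless preserve_all_zero_row=True, replaces an all-zero row with the uniform distribution 1/n so the result is a strict transition matrix. A dense-only re-implementation for illustration (the repo version also accepts sparse input and has a dense_output switch):

# --------- sketch: row normalisation with all-zero-row handling (editorial illustration, not repo code) ---------
import numpy as np

def row_as_probdist_dense(mat, preserve_all_zero_row=False):
    mat = np.asarray(mat, dtype=float)
    row_sum = mat.sum(axis=1)
    zero_rows = row_sum == 0
    row_sum[zero_rows] = 1.0                       # avoid division by zero
    mat = mat / row_sum[:, None]
    if not preserve_all_zero_row:
        mat[zero_rows] = 1.0 / mat.shape[1]        # all-zero row -> uniform 1/n
    return mat

A = np.array([[2.0, 2.0, 0.0],
              [0.0, 0.0, 0.0]])
print(row_as_probdist_dense(A))                              # second row becomes [1/3, 1/3, 1/3]
print(row_as_probdist_dense(A, preserve_all_zero_row=True))  # second row stays all zeros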
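
The walker changes above re-seed random and numpy.random inside every pool task and shuffle a private copy of the node list: with a fork-started multiprocessing.Pool, worker processes otherwise inherit the parent's RNG state, so different walk iterations can come back with identical "random" sequences. A minimal demonstration of that effect, unrelated to the repo's walker classes (behaviour as described holds on fork platforms such as Linux):

# --------- sketch: why each pool task re-seeds its RNG (editorial illustration, not repo code) ---------
import multiprocessing
import numpy as np

def draw_without_reseed(_):
    return np.random.randint(0, 1000)

def draw_with_reseed(_):
    np.random.seed()                      # re-seed from OS entropy inside the task
    return np.random.randint(0, 1000)

if __name__ == '__main__':
    np.random.seed(0)                     # this state is copied into every forked worker
    with multiprocessing.Pool(4) as pool:
        print(pool.map(draw_without_reseed, range(4)))   # often shows repeated values
    with multiprocessing.Pool(4) as pool:
        print(pool.map(draw_with_reseed, range(4)))      # independent values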
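
The new test-link cap in main.py bounds the number of positive links used for link prediction at limit_percentage (0.2) of all positive links, regardless of how many links were removed: len(edges_removed) is roughly link_remove * |E|, so len(edges_removed) / link_remove * 0.2 is roughly 0.2 * |E|. A tiny sketch of that arithmetic with a hypothetical edge count:

# --------- sketch: how many removed links end up in the test set (editorial illustration, hypothetical numbers) ---------
total_edges = 10000                      # hypothetical |E|
limit_percentage = 0.2                   # at most 20% of all positive links used for testing
for link_remove in [0.1, 0.2, 0.5, 0.8]:
    removed = int(total_edges * link_remove)
    num_test_links = int(min(removed, removed / link_remove * limit_percentage))
    print(link_remove, removed, num_test_links)
# 0.1 -> 1000 of 1000 kept; 0.2 -> 2000 of 2000; 0.5 -> 2000 of 5000; 0.8 -> 2000 of 8000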