clean up for release

Dongzy 2018-11-30 21:41:31 +08:00
parent f4a765efc5
commit b9f4edbebd
21 changed files with 9 additions and 1068 deletions

View File

@ -57,15 +57,10 @@ class ABRW(object):
general idea: Attribute Biased Random Walk
i.e. a walker based on a mixed transition matrix P = alpha*T_A + (1-alpha)*T_X
result: ABRW transition matrix T
*** open questions: 1) what if there are isolated nodes, i.e. some rows of T_A are all zeros
2) which similarity/distance metric to use to obtain T_X
3) alias sampling as used in node2vec for speed-up, but that only pays off
if each row of P contains many 0s
--> how to make each row of P a pdf while keeping it sparse
'''
print("obtaining biased transition matrix where each row sums up to 1.0...")
preserve_zeros = False # compare them: 1) accuracy; 2) efficiency
preserve_zeros = False
T_A = row_as_probdist(A, preserve_zeros) # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row
print('Preserve zero rows of the adj matrix: ', preserve_zeros)
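For reference, a minimal dense-NumPy sketch of the mixing step described in the docstring above; the alpha default here is illustrative, and the real implementation works on sparse matrices and handles isolated rows via row_as_probdist:

import numpy as np

def mix_transition_matrices(T_A, T_X, alpha=0.8):  # alpha value is illustrative, not the project default
    # ABRW mixing: P = alpha * T_A + (1 - alpha) * T_X
    P = alpha * T_A + (1.0 - alpha) * T_X
    # both inputs are assumed row-normalized already; renormalize defensively
    row_sums = P.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0.0] = 1.0  # leave all-zero rows untouched
    return P / row_sums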
@ -95,8 +90,8 @@ class ABRW(object):
print(f'ABRW biased transition matrix processing time: {(t5-t4):.2f}s')
return T
def save_embeddings(self, filename): # to do... move it to utils;
fout = open(filename, 'w') # call it from __init__ (abrw class) with flag --save-emb=True (from main.py)
def save_embeddings(self, filename):
fout = open(filename, 'w')
node_num = len(self.vectors.keys())
fout.write("{} {}\n".format(node_num, self.dim))
for node, vec in self.vectors.items():

View File

@ -73,7 +73,7 @@ class ATTRCOMB(object):
nrl_embeddings.append(model.vectors[key])
return np.array(nrl_embeddings)
elif comb_with == 'node2vec': # to do... the parameters
elif comb_with == 'node2vec':
model = node2vec.Node2vec(graph=self.g, path_length=80, num_paths=self.number_walks,
dim=dim, workers=4, p=0.8, q=0.8, window=10)
nrl_embeddings = []

View File

@ -39,7 +39,7 @@ class ncClassifier(object):
Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]
self.train(X_train, Y_train, Y)
np.random.set_state(state) # why??? for binarizer.transform??
np.random.set_state(state)
return self.evaluate(X_test, Y_test)
def train(self, X, Y, Y_all):
@ -66,7 +66,6 @@ class ncClassifier(object):
results = {}
for average in averages:
results[average] = f1_score(Y, Y_, average=average)
# print('Results, using embeddings of dimensionality', len(self.embeddings[X[0]]))
print(results)
return results
@ -94,10 +93,6 @@ class lpClassifier(object):
# clf here is simply a similarity/distance metric
def evaluate(self, X_test, Y_test, seed=0):
test_size = len(X_test)
# shuffle_indices = np.random.permutation(np.arange(test_size))
# X_test = [X_test[shuffle_indices[i]] for i in range(test_size)]
# Y_test = [Y_test[shuffle_indices[i]] for i in range(test_size)]
Y_true = [int(i) for i in Y_test]
Y_probs = []
for i in range(test_size):
@ -114,7 +109,6 @@ class lpClassifier(object):
if roc < 0.5:
roc = 1.0 - roc # since lp is a binary clf task, just predict the opposite if roc < 0.5
print("roc=", "{:.9f}".format(roc))
# plt_roc(Y_true, Y_probs) # enable this to plot the roc curve and return the auc value
def norm(a):
@ -128,19 +122,9 @@ def cosine_similarity(a, b):
sum = 0.0
for i in range(len(a)):
sum = sum + a[i] * b[i]
# return sum/(norm(a) * norm(b))
# fix numerical issue: add 1e-100 (effectively 0) to avoid division by zero
return sum / (norm(a) * norm(b) + 1e-100)
'''
#cosine_similarity realized by use...
#or try sklearn....
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances # we may try diff metrics
#ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
'''
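As the removed comment above suggests, the hand-rolled loop can also be swapped for sklearn's vectorized pairwise metrics; a small sketch under that assumption (emb_a/emb_b are placeholder 1-D embedding vectors):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine

emb_a = np.random.rand(128)  # placeholder embeddings, not real model output
emb_b = np.random.rand(128)
# sklearn expects 2-D arrays of shape (n_samples, n_features)
score = sk_cosine(emb_a.reshape(1, -1), emb_b.reshape(1, -1))[0, 0]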
def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):
# randomly split links/edges into training set and testing set
# *** note: we do not assume every node must be connected after removing links
@ -160,8 +144,6 @@ def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):
# generate testing set that contains both pos and neg samples
test_pos_sample = random.sample(g.G.edges(), int(test_size))
# test_neg_sample = random.sample(list(nx.classes.function.non_edges(g.G)), int(test_size * neg_pos_link_ratio)) #using nx built-in func, not efficient, to do...
# more efficient way:
test_neg_sample = []
num_neg_sample = int(test_size * neg_pos_link_ratio)
num = 0
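A hedged sketch of the "more efficient way" mentioned above: rejection-sample node pairs instead of materializing all non-edges; G is assumed to be the NetworkX graph behind g.G, and the helper name is illustrative:

import random

def sample_negative_edges(G, num_neg_sample):
    nodes = list(G.nodes())
    neg_samples = []
    while len(neg_samples) < num_neg_sample:
        u, v = random.sample(nodes, 2)  # two distinct nodes
        if not G.has_edge(u, v):        # reject existing (positive) links
            neg_samples.append((u, v))  # note: duplicates are possible; dedupe if needed
    return neg_samples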

View File

@ -1,2 +0,0 @@
from __future__ import print_function
from __future__ import division

View File

@ -1,162 +0,0 @@
import time
import numpy as np
import tensorflow as tf
from . import models
from .utils import *
class GCN(object):
def __init__(self, graph, learning_rate=0.01, epochs=200,
hidden1=16, dropout=0.5, weight_decay=5e-4, early_stopping=10,
max_degree=3, clf_ratio=0.1):
"""
learning_rate: Initial learning rate
epochs: Number of epochs to train
hidden1: Number of units in hidden layer 1
dropout: Dropout rate (1 - keep probability)
weight_decay: Weight for L2 loss on embedding matrix
early_stopping: Tolerance for early stopping (# of epochs)
max_degree: Maximum Chebyshev polynomial degree
"""
self.graph = graph
self.clf_ratio = clf_ratio
self.learning_rate = learning_rate
self.epochs = epochs
self.hidden1 = hidden1
self.dropout = dropout
self.weight_decay = weight_decay
self.early_stopping = early_stopping
self.max_degree = max_degree
self.preprocess_data()
self.build_placeholders()
# Create model
self.model = models.GCN(self.placeholders, input_dim=self.features[2][1], hidden1=self.hidden1, weight_decay=self.weight_decay, logging=True)
# Initialize session
self.sess = tf.Session()
# Init variables
self.sess.run(tf.global_variables_initializer())
cost_val = []
# Train model
for epoch in range(self.epochs):
t = time.time()
# Construct feed dictionary
feed_dict = self.construct_feed_dict(self.train_mask)
feed_dict.update({self.placeholders['dropout']: self.dropout})
# Training step
outs = self.sess.run([self.model.opt_op, self.model.loss, self.model.accuracy], feed_dict=feed_dict)
# Validation
cost, acc, duration = self.evaluate(self.val_mask)
cost_val.append(cost)
# Print results
print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
"train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
"val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t))
''' # something wrong with early stopping?? to do...
if epoch > self.early_stopping and cost_val[-1] > np.mean(cost_val[-(self.early_stopping+1):-1]):
print("Early stopping...")
break
'''
print("Optimization Finished!")
# Testing
test_cost, test_acc, test_duration = self.evaluate(self.test_mask)
print("Test set results:", "cost=", "{:.5f}".format(test_cost),
"accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))
# Define model evaluation function
def evaluate(self, mask):
t_test = time.time()
feed_dict_val = self.construct_feed_dict(mask)
outs_val = self.sess.run([self.model.loss, self.model.accuracy], feed_dict=feed_dict_val)
return outs_val[0], outs_val[1], (time.time() - t_test)
def build_placeholders(self):
num_supports = 1
self.placeholders = {
'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(self.features[2], dtype=tf.int64)),
'labels': tf.placeholder(tf.float32, shape=(None, self.labels.shape[1])),
'labels_mask': tf.placeholder(tf.int32),
'dropout': tf.placeholder_with_default(0., shape=()),
# helper variable for sparse dropout
'num_features_nonzero': tf.placeholder(tf.int32)
}
def build_label(self):
g = self.graph.G
look_up = self.graph.look_up_dict
labels = []
label_dict = {}
label_id = 0
for node in g.nodes():
labels.append((node, g.nodes[node]['label']))
for l in g.nodes[node]['label']:
if l not in label_dict:
label_dict[l] = label_id
label_id += 1
self.labels = np.zeros((len(labels), label_id))
self.label_dict = label_dict
for node, l in labels:
node_id = look_up[node]
for ll in l:
l_id = label_dict[ll]
self.labels[node_id][l_id] = 1
def build_train_val_test(self):
"""
build train_mask test_mask val_mask
"""
train_percent = self.clf_ratio
training_size = int(train_percent * self.graph.G.number_of_nodes())
state = np.random.get_state()
np.random.seed(0)
shuffle_indices = np.random.permutation(np.arange(self.graph.G.number_of_nodes()))
np.random.set_state(state)
look_up = self.graph.look_up_dict
g = self.graph.G
def sample_mask(begin, end):
mask = np.zeros(g.number_of_nodes())
for i in range(begin, end):
mask[shuffle_indices[i]] = 1
return mask
self.train_mask = sample_mask(0, training_size-100)
self.val_mask = sample_mask(training_size-100, training_size)
self.test_mask = sample_mask(training_size, g.number_of_nodes())
def preprocess_data(self):
"""
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
y_train, y_val, y_test can merge to y
"""
g = self.graph.G
look_back = self.graph.look_back_list
self.features = np.vstack([g.nodes[look_back[i]]['feature']
for i in range(g.number_of_nodes())])
self.features = preprocess_features(self.features)
self.build_label()
self.build_train_val_test()
adj = nx.adjacency_matrix(g) # the type of graph
self.support = [preprocess_adj(adj)]
def construct_feed_dict(self, labels_mask):
"""Construct feed dictionary."""
feed_dict = dict()
feed_dict.update({self.placeholders['labels']: self.labels})
feed_dict.update({self.placeholders['labels_mask']: labels_mask})
feed_dict.update({self.placeholders['features']: self.features})
feed_dict.update({self.placeholders['support'][i]: self.support[i] for i in range(len(self.support))})
feed_dict.update({self.placeholders['num_features_nonzero']: self.features[1].shape})
return feed_dict

View File

@ -1,27 +0,0 @@
import numpy as np
import tensorflow as tf
def uniform(shape, scale=0.05, name=None):
"""Uniform init."""
initial = tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32)
return tf.Variable(initial, name=name)
def glorot(shape, name=None):
"""Glorot & Bengio (AISTATS 2010) init."""
init_range = np.sqrt(6.0/(shape[0]+shape[1]))
initial = tf.random_uniform(shape, minval=-init_range, maxval=init_range, dtype=tf.float32)
return tf.Variable(initial, name=name)
def zeros(shape, name=None):
"""All zeros."""
initial = tf.zeros(shape, dtype=tf.float32)
return tf.Variable(initial, name=name)
def ones(shape, name=None):
"""All ones."""
initial = tf.ones(shape, dtype=tf.float32)
return tf.Variable(initial, name=name)

View File

@ -1,191 +0,0 @@
import tensorflow as tf
from .inits import *
flags = tf.app.flags
FLAGS = flags.FLAGS
# global unique layer ID dictionary for layer name assignment
_LAYER_UIDS = {}
def get_layer_uid(layer_name=''):
"""Helper function, assigns unique layer IDs."""
if layer_name not in _LAYER_UIDS:
_LAYER_UIDS[layer_name] = 1
return 1
else:
_LAYER_UIDS[layer_name] += 1
return _LAYER_UIDS[layer_name]
def sparse_dropout(x, keep_prob, noise_shape):
"""Dropout for sparse tensors."""
random_tensor = keep_prob
random_tensor += tf.random_uniform(noise_shape)
dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool)
pre_out = tf.sparse_retain(x, dropout_mask)
return pre_out * (1./keep_prob)
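The keep_prob + uniform-noise + floor pattern above is just a Bernoulli(keep_prob) mask; a quick NumPy illustration of the equivalence:

import numpy as np

keep_prob = 0.5
noise = keep_prob + np.random.uniform(size=10000)
mask = np.floor(noise).astype(bool)  # True with probability keep_prob
print(mask.mean())                   # ~0.5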
def dot(x, y, sparse=False):
"""Wrapper for tf.matmul (sparse vs dense)."""
if sparse:
res = tf.sparse_tensor_dense_matmul(x, y)
else:
res = tf.matmul(x, y)
return res
class Layer(object):
"""Base layer class. Defines basic API for all layer objects.
Implementation inspired by keras (http://keras.io).
# Properties
name: String, defines the variable scope of the layer.
logging: Boolean, switches Tensorflow histogram logging on/off
# Methods
_call(inputs): Defines computation graph of layer
(i.e. takes input, returns output)
__call__(inputs): Wrapper for _call()
_log_vars(): Log all variables
"""
def __init__(self, **kwargs):
allowed_kwargs = {'name', 'logging'}
for kwarg in kwargs.keys():
assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg
name = kwargs.get('name')
if not name:
layer = self.__class__.__name__.lower()
name = layer + '_' + str(get_layer_uid(layer))
self.name = name
self.vars = {}
logging = kwargs.get('logging', False)
self.logging = logging
self.sparse_inputs = False
def _call(self, inputs):
return inputs
def __call__(self, inputs):
with tf.name_scope(self.name):
if self.logging and not self.sparse_inputs:
tf.summary.histogram(self.name + '/inputs', inputs)
outputs = self._call(inputs)
if self.logging:
tf.summary.histogram(self.name + '/outputs', outputs)
return outputs
def _log_vars(self):
for var in self.vars:
tf.summary.histogram(self.name + '/vars/' + var, self.vars[var])
class Dense(Layer):
"""Dense layer."""
def __init__(self, input_dim, output_dim, placeholders, dropout=0., sparse_inputs=False,
act=tf.nn.relu, bias=False, featureless=False, **kwargs):
super(Dense, self).__init__(**kwargs)
if dropout:
self.dropout = placeholders['dropout']
else:
self.dropout = 0.
self.act = act
self.sparse_inputs = sparse_inputs
self.featureless = featureless
self.bias = bias
# helper variable for sparse dropout
self.num_features_nonzero = placeholders['num_features_nonzero']
with tf.variable_scope(self.name + '_vars'):
self.vars['weights'] = glorot([input_dim, output_dim],
name='weights')
if self.bias:
self.vars['bias'] = zeros([output_dim], name='bias')
if self.logging:
self._log_vars()
def _call(self, inputs):
x = inputs
# dropout
if self.sparse_inputs:
x = sparse_dropout(x, 1-self.dropout, self.num_features_nonzero)
else:
x = tf.nn.dropout(x, 1-self.dropout)
# transform
output = dot(x, self.vars['weights'], sparse=self.sparse_inputs)
# bias
if self.bias:
output += self.vars['bias']
return self.act(output)
class GraphConvolution(Layer):
"""Graph convolution layer."""
def __init__(self, input_dim, output_dim, placeholders, dropout=0.,
sparse_inputs=False, act=tf.nn.relu, bias=False,
featureless=False, **kwargs):
super(GraphConvolution, self).__init__(**kwargs)
if dropout:
self.dropout = placeholders['dropout']
else:
self.dropout = 0.
self.act = act
self.support = placeholders['support']
self.sparse_inputs = sparse_inputs
self.featureless = featureless
self.bias = bias
# helper variable for sparse dropout
self.num_features_nonzero = placeholders['num_features_nonzero']
with tf.variable_scope(self.name + '_vars'):
for i in range(len(self.support)):
self.vars['weights_' + str(i)] = glorot([input_dim, output_dim],
name='weights_' + str(i))
if self.bias:
self.vars['bias'] = zeros([output_dim], name='bias')
if self.logging:
self._log_vars()
def _call(self, inputs):
x = inputs
# dropout
if self.sparse_inputs:
x = sparse_dropout(x, 1-self.dropout, self.num_features_nonzero)
else:
x = tf.nn.dropout(x, 1-self.dropout)
# convolve
supports = list()
for i in range(len(self.support)):
if not self.featureless:
pre_sup = dot(x, self.vars['weights_' + str(i)],
sparse=self.sparse_inputs)
else:
pre_sup = self.vars['weights_' + str(i)]
support = dot(self.support[i], pre_sup, sparse=True)
supports.append(support)
output = tf.add_n(supports)
# bias
if self.bias:
output += self.vars['bias']
return self.act(output)

View File

@ -1,20 +0,0 @@
import tensorflow as tf
def masked_softmax_cross_entropy(preds, labels, mask):
"""Softmax cross-entropy loss with masking."""
loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
mask = tf.cast(mask, dtype=tf.float32)
mask /= tf.reduce_mean(mask)
loss *= mask
return tf.reduce_mean(loss)
def masked_accuracy(preds, labels, mask):
"""Accuracy with masking."""
correct_prediction = tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1))
accuracy_all = tf.cast(correct_prediction, tf.float32)
mask = tf.cast(mask, dtype=tf.float32)
mask /= tf.reduce_mean(mask)
accuracy_all *= mask
return tf.reduce_mean(accuracy_all)
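The mask /= tf.reduce_mean(mask) rescaling above makes the final reduce_mean equal to an average over the masked entries only; a small NumPy check of that identity:

import numpy as np

loss = np.array([0.3, 0.7, 0.2, 0.9])
mask = np.array([1.0, 0.0, 1.0, 0.0])   # only entries 0 and 2 count
scaled = loss * (mask / np.mean(mask))  # mask / mean(mask) sums to len(loss)
assert np.isclose(np.mean(scaled), loss[mask == 1.0].mean())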

View File

@ -1,179 +0,0 @@
from .layers import *
from .metrics import *
flags = tf.app.flags
FLAGS = flags.FLAGS
class Model(object):
def __init__(self, **kwargs):
allowed_kwargs = {'name', 'logging'}
for kwarg in kwargs.keys():
assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg
name = kwargs.get('name')
if not name:
name = self.__class__.__name__.lower()
self.name = name
logging = kwargs.get('logging', False)
self.logging = logging
self.vars = {}
self.placeholders = {}
self.layers = []
self.activations = []
self.inputs = None
self.outputs = None
self.loss = 0
self.accuracy = 0
self.optimizer = None
self.opt_op = None
def _build(self):
raise NotImplementedError
def build(self):
""" Wrapper for _build() """
with tf.variable_scope(self.name):
self._build()
# Build sequential layer model
self.activations.append(self.inputs)
for layer in self.layers:
hidden = layer(self.activations[-1])
self.activations.append(hidden)
self.outputs = self.activations[-1]
# Store model variables for easy access
variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
self.vars = {var.name: var for var in variables}
# Build metrics
self._loss()
self._accuracy()
self.opt_op = self.optimizer.minimize(self.loss)
def predict(self):
pass
def _loss(self):
raise NotImplementedError
def _accuracy(self):
raise NotImplementedError
def save(self, sess=None):
if not sess:
raise AttributeError("TensorFlow session not provided.")
saver = tf.train.Saver(self.vars)
save_path = saver.save(sess, "tmp/%s.ckpt" % self.name)
print("Model saved in file: %s" % save_path)
def load(self, sess=None):
if not sess:
raise AttributeError("TensorFlow session not provided.")
saver = tf.train.Saver(self.vars)
save_path = "tmp/%s.ckpt" % self.name
saver.restore(sess, save_path)
print("Model restored from file: %s" % save_path)
class MLP(Model):
def __init__(self, placeholders, input_dim, **kwargs):
super(MLP, self).__init__(**kwargs)
self.inputs = placeholders['features']
self.input_dim = input_dim
# self.input_dim = self.inputs.get_shape().as_list()[1] # To be supported in future Tensorflow versions
self.output_dim = placeholders['labels'].get_shape().as_list()[1]
self.placeholders = placeholders
self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
self.build()
def _loss(self):
# Weight decay loss
for var in self.layers[0].vars.values():
self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)
# Cross entropy error
self.loss += masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'],
self.placeholders['labels_mask'])
def _accuracy(self):
self.accuracy = masked_accuracy(self.outputs, self.placeholders['labels'],
self.placeholders['labels_mask'])
def _build(self):
self.layers.append(Dense(input_dim=self.input_dim,
output_dim=FLAGS.hidden1,
placeholders=self.placeholders,
act=tf.nn.relu,
dropout=True,
sparse_inputs=True,
logging=self.logging))
self.layers.append(Dense(input_dim=FLAGS.hidden1,
output_dim=self.output_dim,
placeholders=self.placeholders,
act=lambda x: x,
dropout=True,
logging=self.logging))
def predict(self):
return tf.nn.softmax(self.outputs)
class GCN(Model):
def __init__(self, placeholders, input_dim, hidden1, weight_decay, **kwargs):
super(GCN, self).__init__(**kwargs)
self.inputs = placeholders['features']
self.hidden1 = hidden1
self.weight_decay = weight_decay
self.input_dim = input_dim
# self.input_dim = self.inputs.get_shape().as_list()[1] # To be supported in future Tensorflow versions
self.output_dim = placeholders['labels'].get_shape().as_list()[1]
self.placeholders = placeholders
self.optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
self.build()
def _loss(self):
# Weight decay loss
for var in self.layers[0].vars.values():
self.loss += self.weight_decay * tf.nn.l2_loss(var)
# Cross entropy error
self.loss += masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'],
self.placeholders['labels_mask'])
def _accuracy(self):
self.accuracy = masked_accuracy(self.outputs, self.placeholders['labels'],
self.placeholders['labels_mask'])
def _build(self):
self.layers.append(GraphConvolution(input_dim=self.input_dim,
output_dim=self.hidden1,
placeholders=self.placeholders,
act=tf.nn.relu,
dropout=True,
sparse_inputs=True,
logging=self.logging))
self.layers.append(GraphConvolution(input_dim=self.hidden1,
output_dim=self.output_dim,
placeholders=self.placeholders,
act=lambda x: x,
dropout=True,
logging=self.logging))
def predict(self):
return tf.nn.softmax(self.outputs)

View File

@ -1,107 +0,0 @@
from __future__ import division, print_function
import time
import tensorflow as tf
from gcn.models import GCN, MLP
from gcn.utils import *
# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)
# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'cora', 'Dataset string.') # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn', 'Model string.') # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')
# Load data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)
# Some preprocessing
features = preprocess_features(features)
if FLAGS.model == 'gcn':
support = [preprocess_adj(adj)]
num_supports = 1
model_func = GCN
elif FLAGS.model == 'gcn_cheby':
support = chebyshev_polynomials(adj, FLAGS.max_degree)
num_supports = 1 + FLAGS.max_degree
model_func = GCN
elif FLAGS.model == 'dense':
support = [preprocess_adj(adj)] # Not used
num_supports = 1
model_func = MLP
else:
raise ValueError('Invalid argument for model: ' + str(FLAGS.model))
# Define placeholders
placeholders = {
'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(features[2], dtype=tf.int64)),
'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
'labels_mask': tf.placeholder(tf.int32),
'dropout': tf.placeholder_with_default(0., shape=()),
'num_features_nonzero': tf.placeholder(tf.int32) # helper variable for sparse dropout
}
# Create model
model = model_func(placeholders, input_dim=features[2][1], logging=True)
# Initialize session
sess = tf.Session()
# Define model evaluation function
def evaluate(features, support, labels, mask, placeholders):
t_test = time.time()
feed_dict_val = construct_feed_dict(features, support, labels, mask, placeholders)
outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
return outs_val[0], outs_val[1], (time.time() - t_test)
# Init variables
sess.run(tf.global_variables_initializer())
cost_val = []
# Train model
for epoch in range(FLAGS.epochs):
t = time.time()
# Construct feed dictionary
feed_dict = construct_feed_dict(features, support, y_train, train_mask, placeholders)
feed_dict.update({placeholders['dropout']: FLAGS.dropout})
# Training step
outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
# Validation
cost, acc, duration = evaluate(features, support, y_val, val_mask, placeholders)
cost_val.append(cost)
# Print results
print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
"train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
"val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t))
if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping+1):-1]):
print("Early stopping...")
break
print("Optimization Finished!")
# Testing
test_cost, test_acc, test_duration = evaluate(features, support, y_test, test_mask, placeholders)
print("Test set results:", "cost=", "{:.5f}".format(test_cost),
"accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))

View File

@ -1,153 +0,0 @@
import pickle as pkl
import sys
import networkx as nx
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
def parse_index_file(filename):
"""Parse index file."""
index = []
for line in open(filename):
index.append(int(line.strip()))
return index
def sample_mask(idx, l):
"""Create mask."""
mask = np.zeros(l)
mask[idx] = 1
return np.array(mask, dtype=np.bool)
def load_data(dataset_str):
"""Load data."""
names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
objects = []
for i in range(len(names)):
with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
if sys.version_info > (3, 0):
objects.append(pkl.load(f, encoding='latin1'))
else:
objects.append(pkl.load(f))
x, y, tx, ty, allx, ally, graph = tuple(objects)
test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
test_idx_range = np.sort(test_idx_reorder)
if dataset_str == 'citeseer':
# Fix citeseer dataset (there are some isolated nodes in the graph)
# Find isolated nodes, add them as zero-vecs into the right position
test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
tx_extended[test_idx_range-min(test_idx_range), :] = tx
tx = tx_extended
ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
ty_extended[test_idx_range-min(test_idx_range), :] = ty
ty = ty_extended
features = sp.vstack((allx, tx)).tolil()
features[test_idx_reorder, :] = features[test_idx_range, :]
adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
labels = np.vstack((ally, ty))
labels[test_idx_reorder, :] = labels[test_idx_range, :]
idx_test = test_idx_range.tolist()
idx_train = range(len(y))
idx_val = range(len(y), len(y)+500)
train_mask = sample_mask(idx_train, labels.shape[0])
val_mask = sample_mask(idx_val, labels.shape[0])
test_mask = sample_mask(idx_test, labels.shape[0])
y_train = np.zeros(labels.shape)
y_val = np.zeros(labels.shape)
y_test = np.zeros(labels.shape)
y_train[train_mask, :] = labels[train_mask, :]
y_val[val_mask, :] = labels[val_mask, :]
y_test[test_mask, :] = labels[test_mask, :]
return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
def sparse_to_tuple(sparse_mx):
"""Convert sparse matrix to tuple representation."""
def to_tuple(mx):
if not sp.isspmatrix_coo(mx):
mx = mx.tocoo()
coords = np.vstack((mx.row, mx.col)).transpose()
values = mx.data
shape = mx.shape
return coords, values, shape
if isinstance(sparse_mx, list):
for i in range(len(sparse_mx)):
sparse_mx[i] = to_tuple(sparse_mx[i])
else:
sparse_mx = to_tuple(sparse_mx)
return sparse_mx
def preprocess_features(features):
"""Row-normalize feature matrix and convert to tuple representation"""
rowsum = np.array(features.sum(1))
r_inv = np.power(rowsum, -1).flatten()
r_inv[np.isinf(r_inv)] = 0.
r_mat_inv = sp.diags(r_inv)
features = sp.coo_matrix(features)
features = r_mat_inv.dot(features)
return sparse_to_tuple(features)
def normalize_adj(adj):
"""Symmetrically normalize adjacency matrix."""
adj = sp.coo_matrix(adj)
rowsum = np.array(adj.sum(1))
d_inv_sqrt = np.power(rowsum, -0.5).flatten()
d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()
def preprocess_adj(adj):
"""Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""
adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
return sparse_to_tuple(adj_normalized)
def construct_feed_dict(features, support, labels, labels_mask, placeholders):
"""Construct feed dictionary."""
feed_dict = dict()
feed_dict.update({placeholders['labels']: labels})
feed_dict.update({placeholders['labels_mask']: labels_mask})
feed_dict.update({placeholders['features']: features})
feed_dict.update({placeholders['support'][i]: support[i] for i in range(len(support))})
feed_dict.update({placeholders['num_features_nonzero']: features[1].shape})
return feed_dict
def chebyshev_polynomials(adj, k):
"""Calculate Chebyshev polynomials up to order k. Return a list of sparse matrices (tuple representation)."""
print("Calculating Chebyshev polynomials up to order {}...".format(k))
adj_normalized = normalize_adj(adj)
laplacian = sp.eye(adj.shape[0]) - adj_normalized
largest_eigval, _ = eigsh(laplacian, 1, which='LM')
scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(adj.shape[0])
t_k = list()
t_k.append(sp.eye(adj.shape[0]))
t_k.append(scaled_laplacian)
def chebyshev_recurrence(t_k_minus_one, t_k_minus_two, scaled_lap):
s_lap = sp.csr_matrix(scaled_lap, copy=True)
return 2 * s_lap.dot(t_k_minus_one) - t_k_minus_two
for i in range(2, k+1):
t_k.append(chebyshev_recurrence(t_k[-1], t_k[-2], scaled_laplacian))
return sparse_to_tuple(t_k)
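A tiny usage sketch of the renormalization helpers defined above (D^-1/2 (A + I) D^-1/2 in tuple form), on a toy 3-node path graph; it assumes this (now removed) module is importable:

import networkx as nx

g = nx.path_graph(3)                         # 0-1-2
adj = nx.adjacency_matrix(g)
coords, values, shape = preprocess_adj(adj)  # tuple format fed to tf.sparse_placeholder
print(shape, values.round(3))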

View File

@ -77,17 +77,6 @@ class Graph(object):
vec = l.split()
self.G.nodes[vec[0]]['attr'] = np.array([float(x) for x in vec[1:]])
def read_node_label(self, path):
""" todo... read node labels and store as NetworkX graph {'node_id': {'label': values}} \n
input file format: node_id1 labels \n
node_id2 labels \n
with open(path, 'r') as fin: \n
for l in fin.readlines(): \n
vec = l.split() \n
self.G.nodes[vec[0]]['label'] = np.array([float(x) for x in vec[1:]]) \n
"""
pass # to do...
def remove_edge(self, ratio=0.0):
""" randomly remove edges/links \n
ratio: the percentage of edges to be removed \n
@ -100,17 +89,6 @@ class Graph(object):
print('after removing, the # of edges: ', self.G.number_of_edges())
return edges_removed
def remove_node_attr(self, ratio):
""" todo... randomly remove node attributes; \n
"""
pass # to do...
def remove_node(self, ratio):
""" todo... randomly remove nodes; \n
#self.node_mapping() #update node id index mapping is needed \n
"""
pass # to do...
# ------------------------------------------------------------------------------------------
# --------------------commonly used APIs that will not modify graph-------------------------
# ------------------------------------------------------------------------------------------
@ -164,9 +142,3 @@ class Graph(object):
def get_common_neighbors(self, node1, node2):
""" return common neighbors of two nodes """
return list(nx.common_neighbors(self.G, node1, node2))
def get_centrality(self, centrality_type='degree'):
""" todo... return specified type of centrality \n
see https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html \n
"""
pass # to do...

View File

@ -30,11 +30,6 @@ class graphSAGE(object):
if not is_supervised:
from libnrl.graphsage import unsupervised_train
self.vectors = unsupervised_train.train(train_data=train_data, test_data=None, model=sage_model)
else:
# to do...
# from libnrl.graphsage import supervised_train
# self.vectors = supervised_train.train()
pass
def add_train_val_test_to_G(self, test_perc=0.0, val_perc=0.1):
''' add 'val' and/or 'test' flags to each node in G '''
@ -54,7 +49,7 @@ class graphSAGE(object):
G.nodes[id]['test'] = False
G.nodes[id]['val'] = False
# Make sure the graph has edge train_removed annotations
# (some datasets might already have this..)
# some datasets might already have this
print("Loaded data.. now preprocessing..")
for edge in G.edges():
if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or

View File

@ -11,11 +11,6 @@ from libnrl.graphsage.aggregators import (GCNAggregator, MaxPoolingAggregator,
MeanPoolingAggregator, SeqAggregator)
from libnrl.graphsage.prediction import BipartiteEdgePredLayer
'''
flags = tf.app.flags
FLAGS = FLAGS
'''
# DISCLAIMER:
# Boilerplate parts of this code file were originally forked from
# https://github.com/tkipf/gcn

View File

@ -258,11 +258,7 @@ def train(train_data, test_data, model):
# only print the last iter result at the end of each epoch
print("Epoch:", '%04d' % epoch,
"train_loss=", "{:.5f}".format(train_cost),
# "train_mrr=", "{:.5f}".format(train_mrr),
# "train_mrr_ema=", "{:.5f}".format(train_shadow_mrr),
"val_loss=", "{:.5f}".format(val_cost),
# "val_mrr=", "{:.5f}".format(val_mrr),
# "val_mrr_ema=", "{:.5f}".format(shadow_mrr),
"time cost", "{:.2f}".format(t2-t1))
# no early stopping was used in original code---------------- auto-save-best-emb ------------------------------

View File

@ -18,7 +18,6 @@ from networkx.readwrite import json_graph
version_info = list(map(int, nx.__version__.split('.')))
major = version_info[0]
minor = version_info[1]
#assert (major <= 1) and (minor <= 11), "networkx major version > 1.11"
WALK_LEN = 5
N_WALKS = 50
@ -27,12 +26,6 @@ N_WALKS = 50
def load_data(prefix, normalize=True, load_walks=False):
G_data = json.load(open(prefix + "-G.json"))
G = json_graph.node_link_graph(G_data)
'''
if isinstance(G.nodes()[0], int):
conversion = lambda n : int(n)
else:
conversion = lambda n : n
'''
def conversion(n): return int(n) # compatible with networkx >2.0
if os.path.exists(prefix + "-feats.npy"):
@ -61,7 +54,7 @@ def load_data(prefix, normalize=True, load_walks=False):
print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(broken_count))
# Make sure the graph has edge train_removed annotations
# (some datasets might already have this..)
# some datasets might already have this
print("Loaded data.. now preprocessing..")
for edge in G.edges():
if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or
@ -104,7 +97,7 @@ def run_random_walks(G, nodes, num_walks=N_WALKS):
return pairs
if __name__ == "__main__": # this part needs rewriting so it can be run once per execution
if __name__ == "__main__":
""" Run random walks """
graph_file = sys.argv[1]
out_file = sys.argv[2]

View File

@ -16,8 +16,7 @@ import numpy as np
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from .downstream import \
ncClassifier # to do... try using lpClassifier to choose the best embeddings?
from .downstream import ncClassifier
from .utils import read_node_label_downstream
@ -48,9 +47,6 @@ class _LINE(object):
cur_seed = random.getrandbits(32)
self.embeddings = tf.get_variable(name="embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer=tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed))
self.context_embeddings = tf.get_variable(name="context_embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer=tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed))
# self.h_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.h), 1)
# self.t_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.t), 1)
# self.t_e_context = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.context_embeddings, self.t), 1)
self.h_e = tf.nn.embedding_lookup(self.embeddings, self.h)
self.t_e = tf.nn.embedding_lookup(self.embeddings, self.t)
self.t_e_context = tf.nn.embedding_lookup(self.context_embeddings, self.t)
@ -88,7 +84,6 @@ class _LINE(object):
edges = [(look_up[x[0]], look_up[x[1]]) for x in self.g.G.edges()]
data_size = self.g.G.number_of_edges()
# edge_set = set([x[0]*numNodes+x[1] for x in edges])
shuffle_indices = np.random.permutation(np.arange(data_size))
# positive or negative mode
@ -193,7 +188,6 @@ class _LINE(object):
def get_embeddings(self):
vectors = {}
embeddings = self.embeddings.eval(session=self.sess)
# embeddings = self.sess.run(tf.nn.l2_normalize(self.embeddings.eval(session=self.sess), 1))
look_back = self.g.look_back_list
for i, embedding in enumerate(embeddings):
vectors[look_back[i]] = embedding

View File

@ -36,7 +36,6 @@ class TADW(object):
look_back = self.g.look_back_list
self.features = np.vstack([g.nodes[look_back[i]]['attr']
for i in range(g.number_of_nodes())])
# self.features = self.g.get_attr_mat().todense()
self.preprocessFeature()
return self.features.T
@ -46,8 +45,6 @@ class TADW(object):
Ud = U[:, 0:200]
Sd = S[0:200]
self.features = np.array(Ud)*Sd.reshape(200)
# from .utils import dim_reduction
# self.features = dim_reduction(self.features, dim=200, method='svd')
def train(self):
self.adj = self.getAdj()

View File

@ -49,8 +49,6 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
def pairwise_similarity(mat, type='cosine'):
# XXX: possible to integrate pairwise_similarity with top_k to enhance performance?
# we'll use it elsewhere. if really needed, write a new method for this purpose
if type == 'cosine': # supports sparse and dense mat
from sklearn.metrics.pairwise import cosine_similarity
result = cosine_similarity(mat, dense_output=True)
@ -62,50 +60,19 @@ def pairwise_similarity(mat, type='cosine'):
elif type == 'euclidean':
from sklearn.metrics.pairwise import euclidean_distances
# note: similarity = - distance
# other version: similarity = 1 - 2 / pi * arctan(distance)
result = euclidean_distances(mat)
result = -result
# result = 1 - 2 / np.pi * np.arctan(result)
elif type == 'manhattan':
from sklearn.metrics.pairwise import manhattan_distances
# note: similarity = - distance
# other version: similarity = 1 - 2 / pi * arctan(distance)
result = manhattan_distances(mat)
result = -result
# result = 1 - 2 / np.pi * np.arctan(result)
else:
print('Please choose from: cosine, jaccard, euclidean or manhattan')
return 'Not found!'
return result
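The "other version" noted in the comments maps any non-negative distance into a bounded similarity in (0, 1]; a short sketch of that transform using sklearn's euclidean_distances (the random matrix is a placeholder for the attribute matrix):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X = np.random.rand(5, 16)                  # 5 nodes, 16-dim attributes (placeholder)
dist = euclidean_distances(X)
sim = 1.0 - 2.0 / np.pi * np.arctan(dist)  # distance 0 -> similarity 1; large distance -> ~0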
# ---------------------------------utils for preprocessing--------------------------------
def node_auxi_to_attr(fin, fout):
""" TODO...
-> read auxi info associated with each node;
-> preprocess auxi via:
1) NLP for sentences; or 2) one-hot encoding for discrete features;
-> the result becomes an m-dim node attribute and is stored into the attr file
"""
# https://radimrehurek.com/gensim/apiref.html
# word2vec, doc2vec: turn sentences into vectors
# text2vec, tf-idf: turn discrete features into vectors
pass
def simulate_incomplete_stru():
pass
def simulate_incomplete_attr():
pass
def simulate_noisy_world():
pass
# ---------------------------------utils for downstream tasks--------------------------------
# XXX: read and save using pandas or numpy
def read_edge_label_downstream(filename):
@ -143,37 +110,6 @@ def read_node_label_downstream(filename):
return X, Y
def store_embedddings(vectors, filename, dim):
""" store embeddings to file
"""
fout = open(filename, 'w')
num_nodes = len(vectors.keys())
fout.write("{} {}\n".format(num_nodes, dim))
for node, vec in vectors.items():
fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
fout.close()
print('store the resulting embeddings in file: ', filename)
def load_embeddings(filename):
""" load embeddings from file
"""
fin = open(filename, 'r')
num_nodes, size = [int(x) for x in fin.readline().strip().split()]
vectors = {}
while 1:
line = fin.readline()
if line == '':
break
vec = line.strip().split(' ')
assert len(vec) == size + 1
vectors[vec[0]] = [float(x) for x in vec[1:]]
fin.close()
assert len(vectors) == num_nodes
return vectors
# ----------------- for the parts below that you moved into utils, I have noted the problems; anything without a note has no issues for now and can be left alone -----------------------
def generate_edges_for_linkpred(graph, edges_removed, balance_ratio=1.0):
''' given a graph and edges_removed;
generate non_edges not in [both graph and edges_removed];
@ -216,52 +152,3 @@ def dim_reduction(mat, dim=128, method='pca'):
t2 = time.time()
print('END dimensionality reduction: {:.2f}s'.format(t2-t1))
return mat_reduced
def row_normalized(mat, is_transition_matrix=False):
''' to do...
known issues: 1) sparse matrices are slower than dense in this scenario (at least in my implementation);
2) testing shows the elements of a dense row do not always add up exactly (rows do not sum to exactly 1), so my old naive fix is apparently still needed;
3) when is_transition_matrix, all-zero rows need to be assigned a value; with sparse matrices there is a small issue: mat[i, :] = p cannot be assigned directly
'''
p = 1.0/mat.shape[0] # probability = 1/num of rows
norms = np.asarray(mat.sum(axis=1)).ravel()
for i, norm in enumerate(norms):
if norm != 0:
mat[i, :] /= norm
else:
if is_transition_matrix:
mat[i, :] = p # every row of transition matrix should sum up to 1
else:
pass # do nothing; keep all-zero row
return mat
''' the naive method is as follows '''
def rowAsPDF(mat): # make each row sum up to 1, i.e. a probability distribution
mat = np.array(mat)
for i in range(mat.shape[0]):
sum_row = mat[i, :].sum()
if sum_row != 0:
mat[i, :] = mat[i, :]/sum_row # e.g. a row [0, 1, 1, 1] -> [0, 1/3, 1/3, 1/3]; may leave a small rounding error...
else:
# to do...
# for a node without any link: keep the row as [0, 0, 0, 0] OR set it to [1/n, 1/n, 1/n, ...]??
pass
if mat[i, :].sum() != 1.00: # naive trick to make sure each row is a pdf...
error = 1.00 - mat[i, :].sum()
mat[i, -1] += error
return mat
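For reference, the loops above can be vectorized; a minimal dense-NumPy sketch (illustrative helper name) that row-normalizes and sends all-zero rows to the uniform distribution when the matrix is a transition matrix:

import numpy as np

def row_normalized_dense(mat, is_transition_matrix=False):
    mat = np.asarray(mat, dtype=np.float64)
    norms = mat.sum(axis=1, keepdims=True)
    zero_rows = (norms.ravel() == 0.0)
    norms[zero_rows] = 1.0                      # avoid division by zero
    mat = mat / norms
    if is_transition_matrix:
        mat[zero_rows, :] = 1.0 / mat.shape[1]  # uniform row so it still sums to 1
    return mat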
def sparse_to_dense():
''' to dense np.matrix format; remember to use dtype float64 '''
pass
def dense_to_sparse():
''' to sparse CSR format; remember to use dtype float64 '''
pass

View File

@ -23,9 +23,6 @@ class WeightedWalker:
self.T = transition_mat
self.workers = workers
self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph()) # reconstructed "directed" "weighted" graph based on transition matrix
# print(nx.adjacency_matrix(self.rec_G).todense()[0:6, 0:6])
# print(transition_mat[0:6, 0:6])
# print(nx.adjacency_matrix(self.rec_G).todense()==transition_mat)
# alias sampling for ABRW-------------------------
def simulate_walks(self, num_walks, walk_length):
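For reference, a sketch of the standard alias-table construction used by node2vec-style walkers (O(n) setup per row, O(1) per draw); the function names are illustrative, not the ones in this module:

import numpy as np

def alias_setup(probs):
    # probs: one row of the transition matrix (a probability distribution)
    n = len(probs)
    q = np.asarray(probs, dtype=np.float64) * n
    J = np.zeros(n, dtype=np.int64)
    smaller = [i for i, v in enumerate(q) if v < 1.0]
    larger = [i for i, v in enumerate(q) if v >= 1.0]
    while smaller and larger:
        small, large = smaller.pop(), larger.pop()
        J[small] = large
        q[large] = q[large] - (1.0 - q[small])
        (smaller if q[large] < 1.0 else larger).append(large)
    return J, q

def alias_draw(J, q):
    i = np.random.randint(len(J))  # pick a column uniformly
    return i if np.random.rand() < q[i] else J[i]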

View File

@ -22,7 +22,6 @@ from libnrl import line # PNE method
from libnrl import tadw # ANE method
from libnrl.downstream import lpClassifier, ncClassifier
from libnrl.graph import Graph
# from libnrl.gcn import gcnAPI # ANE method
from libnrl.graphsage import graphsageAPI # ANE method
from libnrl.grarep import GraRep # PNE method
from libnrl.utils import generate_edges_for_linkpred, read_node_label_downstream
@ -30,8 +29,6 @@ from libnrl.utils import generate_edges_for_linkpred, read_node_label_downstream
from sklearn.linear_model import LogisticRegression # to do... 1) put it in downstream.py; and 2) try SVM...
from libnrl import abrw # ANE method; Attributed Biased Random Walk
from libnrl import node2vec # PNE method; including deepwalk and node2vec
# from libnrl import TriDNR #to do... ANE method
# https://github.com/dfdazac/dgi #to do... ANE method
def parse_args():
@ -51,10 +48,6 @@ def parse_args():
help='choices of downstream tasks: none, lp, nc, lp_and_nc')
parser.add_argument('--link-remove', default=0.1, type=float,
help='simulate randomly missing links if necessary; a ratio ranging [0.0, 1.0]')
# parser.add_argument('--attr-remove', default=0.0, type=float,
# help='simulate randomly missing attributes if necessary; a ratio ranging [0.0, 1.0]')
# parser.add_argument('--link-reserved', default=0.7, type=float,
# help='for lp task, train/test split, a ratio ranging [0.0, 1.0]')
parser.add_argument('--label-reserved', default=0.7, type=float,
help='for nc task, train/test split, a ratio ranging [0.0, 1.0]')
parser.add_argument('--directed', default=False, action='store_true',
@ -141,7 +134,6 @@ def main(args):
assert args.attribute_file != ''
g.read_node_attr(args.attribute_file)
# load node label info------
# to do... similar to attribute {'key_attribute': value}, label also loaded as {'key_label': value}
t2 = time.time()
print(f'STEP1: end loading data; time cost: {(t2-t1):.2f}s')
@ -204,16 +196,6 @@ def main(args):
model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime()))
print(f'Save node embeddings in file: {args.emb_file}')
'''
#to do.... semi-supervised methods: gcn, graphsage, etc...
if args.method == 'gcn': #semi-supervised gcn
assert args.label_file != ''
assert args.feature_file != ''
g.read_node_label(args.label_file)
model = gcnAPI.GCN(graph=g, dropout=args.dropout, weight_decay=args.weight_decay, hidden1=args.hidden, epochs=args.epochs, clf_ratio=args.label_reserved)
print('semi-supervised method, no embs, exit the program...') # semi-supervised gcn does not produce embs
exit(0)
'''
# ---------------------------------------STEP4: downstream task-----------------------------------------------
print('\nSTEP4: start evaluating ......: ')
@ -222,7 +204,6 @@ def main(args):
del model, g
# ------lp task
if args.task == 'lp' or args.task == 'lp_and_nc':
# X_test_lp, Y_test_lp = read_edge_label_downstream(args.label_file) # if you want to load your own lp testing data
print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)')
clf = lpClassifier(vectors=vectors) # similarity/distance metric as clf; basically, lp is a binary clf problem
clf.evaluate(test_node_pairs, test_edge_labels)
@ -238,7 +219,5 @@ def main(args):
if __name__ == '__main__':
print(f'------ START @ {time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())} ------')
# random.seed(2018)
# np.random.seed(2018)
main(parse_args())
print(f'------ END @ {time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())} ------')