From b9f4edbebd647a5756b62c875aaba9260bf02c41 Mon Sep 17 00:00:00 2001 From: Dongzy Date: Fri, 30 Nov 2018 21:41:31 +0800 Subject: [PATCH] clean up for relaese --- src/libnrl/abrw.py | 11 +- src/libnrl/attrcomb.py | 2 +- src/libnrl/downstream.py | 20 +-- src/libnrl/gcn/__init__.py | 2 - src/libnrl/gcn/gcnAPI.py | 162 ----------------- src/libnrl/gcn/inits.py | 27 --- src/libnrl/gcn/layers.py | 191 --------------------- src/libnrl/gcn/metrics.py | 20 --- src/libnrl/gcn/models.py | 179 ------------------- src/libnrl/gcn/train.py | 107 ------------ src/libnrl/gcn/utils.py | 153 ----------------- src/libnrl/graph.py | 28 --- src/libnrl/graphsage/graphsageAPI.py | 7 +- src/libnrl/graphsage/models.py | 5 - src/libnrl/graphsage/unsupervised_train.py | 4 - src/libnrl/graphsage/utils.py | 11 +- src/libnrl/line.py | 8 +- src/libnrl/tadw.py | 3 - src/libnrl/utils.py | 113 ------------ src/libnrl/walker.py | 3 - src/main.py | 21 --- 21 files changed, 9 insertions(+), 1068 deletions(-) delete mode 100644 src/libnrl/gcn/__init__.py delete mode 100644 src/libnrl/gcn/gcnAPI.py delete mode 100644 src/libnrl/gcn/inits.py delete mode 100644 src/libnrl/gcn/layers.py delete mode 100644 src/libnrl/gcn/metrics.py delete mode 100644 src/libnrl/gcn/models.py delete mode 100644 src/libnrl/gcn/train.py delete mode 100644 src/libnrl/gcn/utils.py diff --git a/src/libnrl/abrw.py b/src/libnrl/abrw.py index 6908096..225b9e5 100644 --- a/src/libnrl/abrw.py +++ b/src/libnrl/abrw.py @@ -57,15 +57,10 @@ class ABRW(object): genral idea: Attribute Biased Random Walk i.e. a walker based on a mixed transition matrix by P=alpha*T_A + (1-alpha)*T_X result: ABRW-trainsition matrix; T - *** questions: 1) what about if we have some single nodes i.e. some rows of T_A gives 0s - 2) the similarity/distance metric to obtain T_X - 3) alias sampling as used in node2vec for speeding up, but this is the case - if each row of P gives many 0s - --> how to make each row of P is a pdf and meanwhile is sparse ''' print("obtaining biased transition matrix where each row sums up to 1.0...") - preserve_zeros = False # compare them: 1) accuracy; 2) efficiency + preserve_zeros = False T_A = row_as_probdist(A, preserve_zeros) # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row print('Preserve zero rows of the adj matrix: ', preserve_zeros) @@ -95,8 +90,8 @@ class ABRW(object): print(f'ABRW biased transition matrix processing time: {(t5-t4):.2f}s') return T - def save_embeddings(self, filename): # to do... put it to utils; - fout = open(filename, 'w') # call it while __init__ (abrw calss) with flag --save-emb=True (from main.py) + def save_embeddings(self, filename): + fout = open(filename, 'w') node_num = len(self.vectors.keys()) fout.write("{} {}\n".format(node_num, self.dim)) for node, vec in self.vectors.items(): diff --git a/src/libnrl/attrcomb.py b/src/libnrl/attrcomb.py index 5df27fe..954306f 100644 --- a/src/libnrl/attrcomb.py +++ b/src/libnrl/attrcomb.py @@ -73,7 +73,7 @@ class ATTRCOMB(object): nrl_embeddings.append(model.vectors[key]) return np.array(nrl_embeddings) - elif comb_with == 'node2vec': # to do... 
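[Note] To illustrate the mixing step described in the abrw.py docstring above, here is a minimal NumPy sketch of P = alpha*T_A + (1-alpha)*T_X with per-row normalization. The names row_normalize and abrw_transition, the dense-matrix setting, and the alpha=0.8 default are illustrative stand-ins only, not the library's actual row_as_probdist/sparse pipeline; giving isolated nodes a uniform row is one of the two options the code comment mentions.

    import numpy as np

    def row_normalize(mat):
        # make every row a probability distribution; all-zero rows (isolated nodes) get a uniform 1/n row
        mat = np.asarray(mat, dtype=float)
        sums = mat.sum(axis=1, keepdims=True)
        uniform = np.full_like(mat, 1.0 / mat.shape[1])
        return np.divide(mat, sums, out=uniform, where=(sums != 0))

    def abrw_transition(adj, attr_sim, alpha=0.8):
        # biased transition matrix P = alpha * T_A + (1 - alpha) * T_X,
        # where T_A normalizes the adjacency/structure matrix and T_X an attribute-similarity matrix;
        # each row of P sums to 1.0, so P can drive the biased random walk
        T_A = row_normalize(adj)
        T_X = row_normalize(attr_sim)
        return alpha * T_A + (1.0 - alpha) * T_X

[End of note]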
the parameters + elif comb_with == 'node2vec': model = node2vec.Node2vec(graph=self.g, path_length=80, num_paths=self.number_walks, dim=dim, workers=4, p=0.8, q=0.8, window=10) nrl_embeddings = [] diff --git a/src/libnrl/downstream.py b/src/libnrl/downstream.py index 7e60509..7a035e4 100644 --- a/src/libnrl/downstream.py +++ b/src/libnrl/downstream.py @@ -39,7 +39,7 @@ class ncClassifier(object): Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))] self.train(X_train, Y_train, Y) - np.random.set_state(state) # why??? for binarizer.transform?? + np.random.set_state(state) return self.evaluate(X_test, Y_test) def train(self, X, Y, Y_all): @@ -66,7 +66,6 @@ class ncClassifier(object): results = {} for average in averages: results[average] = f1_score(Y, Y_, average=average) - # print('Results, using embeddings of dimensionality', len(self.embeddings[X[0]])) print(results) return results @@ -94,10 +93,6 @@ class lpClassifier(object): # clf here is simply a similarity/distance metric def evaluate(self, X_test, Y_test, seed=0): test_size = len(X_test) - # shuffle_indices = np.random.permutation(np.arange(test_size)) - # X_test = [X_test[shuffle_indices[i]] for i in range(test_size)] - # Y_test = [Y_test[shuffle_indices[i]] for i in range(test_size)] - Y_true = [int(i) for i in Y_test] Y_probs = [] for i in range(test_size): @@ -114,7 +109,6 @@ class lpClassifier(object): if roc < 0.5: roc = 1.0 - roc # since lp is binary clf task, just predict the opposite if<0.5 print("roc=", "{:.9f}".format(roc)) - # plt_roc(Y_true, Y_probs) #enable to plot roc curve and return auc value def norm(a): @@ -128,19 +122,9 @@ def cosine_similarity(a, b): sum = 0.0 for i in range(len(a)): sum = sum + a[i] * b[i] - # return sum/(norm(a) * norm(b)) - # fix numerical issue 1e-100 almost = 0! return sum / (norm(a) * norm(b) + 1e-100) -''' -#cosine_similarity realized by use... -#or try sklearn.... - from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances # we may try diff metrics - #ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise -''' - - def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0): # randomly split links/edges into training set and testing set # *** note: we do not assume every node must be connected after removing links @@ -160,8 +144,6 @@ def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0): # generate testing set that contains both pos and neg samples test_pos_sample = random.sample(g.G.edges(), int(test_size)) - # test_neg_sample = random.sample(list(nx.classes.function.non_edges(g.G)), int(test_size * neg_pos_link_ratio)) #using nx build-in func, not efficient, to do... - # more efficient way: test_neg_sample = [] num_neg_sample = int(test_size * neg_pos_link_ratio) num = 0 diff --git a/src/libnrl/gcn/__init__.py b/src/libnrl/gcn/__init__.py deleted file mode 100644 index bfa83a0..0000000 --- a/src/libnrl/gcn/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from __future__ import print_function -from __future__ import division diff --git a/src/libnrl/gcn/gcnAPI.py b/src/libnrl/gcn/gcnAPI.py deleted file mode 100644 index d24a7b3..0000000 --- a/src/libnrl/gcn/gcnAPI.py +++ /dev/null @@ -1,162 +0,0 @@ -import time - -import numpy as np -import tensorflow as tf - -from . 
import models -from .utils import * - - -class GCN(object): - - def __init__(self, graph, learning_rate=0.01, epochs=200, - hidden1=16, dropout=0.5, weight_decay=5e-4, early_stopping=10, - max_degree=3, clf_ratio=0.1): - """ - learning_rate: Initial learning rate - epochs: Number of epochs to train - hidden1: Number of units in hidden layer 1 - dropout: Dropout rate (1 - keep probability) - weight_decay: Weight for L2 loss on embedding matrix - early_stopping: Tolerance for early stopping (# of epochs) - max_degree: Maximum Chebyshev polynomial degree - """ - self.graph = graph - self.clf_ratio = clf_ratio - self.learning_rate = learning_rate - self.epochs = epochs - self.hidden1 = hidden1 - self.dropout = dropout - self.weight_decay = weight_decay - self.early_stopping = early_stopping - self.max_degree = max_degree - - self.preprocess_data() - self.build_placeholders() - # Create model - self.model = models.GCN(self.placeholders, input_dim=self.features[2][1], hidden1=self.hidden1, weight_decay=self.weight_decay, logging=True) - # Initialize session - self.sess = tf.Session() - # Init variables - self.sess.run(tf.global_variables_initializer()) - - cost_val = [] - - # Train model - for epoch in range(self.epochs): - - t = time.time() - # Construct feed dictionary - feed_dict = self.construct_feed_dict(self.train_mask) - feed_dict.update({self.placeholders['dropout']: self.dropout}) - - # Training step - outs = self.sess.run([self.model.opt_op, self.model.loss, self.model.accuracy], feed_dict=feed_dict) - - # Validation - cost, acc, duration = self.evaluate(self.val_mask) - cost_val.append(cost) - - # Print results - print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]), - "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost), - "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t)) - ''' #something wrong for early stoppting?? to do... 
- if epoch > self.early_stopping and cost_val[-1] > np.mean(cost_val[-(self.early_stopping+1):-1]): - print("Early stopping...") - break - ''' - print("Optimization Finished!") - - # Testing - test_cost, test_acc, test_duration = self.evaluate(self.test_mask) - print("Test set results:", "cost=", "{:.5f}".format(test_cost), - "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration)) - - # Define model evaluation function - def evaluate(self, mask): - t_test = time.time() - feed_dict_val = self.construct_feed_dict(mask) - outs_val = self.sess.run([self.model.loss, self.model.accuracy], feed_dict=feed_dict_val) - return outs_val[0], outs_val[1], (time.time() - t_test) - - def build_placeholders(self): - num_supports = 1 - self.placeholders = { - 'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)], - 'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(self.features[2], dtype=tf.int64)), - 'labels': tf.placeholder(tf.float32, shape=(None, self.labels.shape[1])), - 'labels_mask': tf.placeholder(tf.int32), - 'dropout': tf.placeholder_with_default(0., shape=()), - # helper variable for sparse dropout - 'num_features_nonzero': tf.placeholder(tf.int32) - } - - def build_label(self): - g = self.graph.G - look_up = self.graph.look_up_dict - labels = [] - label_dict = {} - label_id = 0 - for node in g.nodes(): - labels.append((node, g.nodes[node]['label'])) - for l in g.nodes[node]['label']: - if l not in label_dict: - label_dict[l] = label_id - label_id += 1 - self.labels = np.zeros((len(labels), label_id)) - self.label_dict = label_dict - for node, l in labels: - node_id = look_up[node] - for ll in l: - l_id = label_dict[ll] - self.labels[node_id][l_id] = 1 - - def build_train_val_test(self): - """ - build train_mask test_mask val_mask - """ - train_precent = self.clf_ratio - training_size = int(train_precent * self.graph.G.number_of_nodes()) - state = np.random.get_state() - np.random.seed(0) - shuffle_indices = np.random.permutation(np.arange(self.graph.G.number_of_nodes())) - np.random.set_state(state) - - look_up = self.graph.look_up_dict - g = self.graph.G - - def sample_mask(begin, end): - mask = np.zeros(g.number_of_nodes()) - for i in range(begin, end): - mask[shuffle_indices[i]] = 1 - return mask - - self.train_mask = sample_mask(0, training_size-100) - self.val_mask = sample_mask(training_size-100, training_size) - self.test_mask = sample_mask(training_size, g.number_of_nodes()) - - def preprocess_data(self): - """ - adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask - y_train, y_val, y_test can merge to y - """ - g = self.graph.G - look_back = self.graph.look_back_list - self.features = np.vstack([g.nodes[look_back[i]]['feature'] - for i in range(g.number_of_nodes())]) - self.features = preprocess_features(self.features) - self.build_label() - self.build_train_val_test() - adj = nx.adjacency_matrix(g) # the type of graph - self.support = [preprocess_adj(adj)] - - def construct_feed_dict(self, labels_mask): - """Construct feed dictionary.""" - feed_dict = dict() - feed_dict.update({self.placeholders['labels']: self.labels}) - feed_dict.update({self.placeholders['labels_mask']: labels_mask}) - feed_dict.update({self.placeholders['features']: self.features}) - feed_dict.update({self.placeholders['support'][i]: self.support[i] for i in range(len(self.support))}) - feed_dict.update({self.placeholders['num_features_nonzero']: self.features[1].shape}) - return feed_dict diff --git a/src/libnrl/gcn/inits.py 
b/src/libnrl/gcn/inits.py deleted file mode 100644 index 7851774..0000000 --- a/src/libnrl/gcn/inits.py +++ /dev/null @@ -1,27 +0,0 @@ -import numpy as np -import tensorflow as tf - - -def uniform(shape, scale=0.05, name=None): - """Uniform init.""" - initial = tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32) - return tf.Variable(initial, name=name) - - -def glorot(shape, name=None): - """Glorot & Bengio (AISTATS 2010) init.""" - init_range = np.sqrt(6.0/(shape[0]+shape[1])) - initial = tf.random_uniform(shape, minval=-init_range, maxval=init_range, dtype=tf.float32) - return tf.Variable(initial, name=name) - - -def zeros(shape, name=None): - """All zeros.""" - initial = tf.zeros(shape, dtype=tf.float32) - return tf.Variable(initial, name=name) - - -def ones(shape, name=None): - """All ones.""" - initial = tf.ones(shape, dtype=tf.float32) - return tf.Variable(initial, name=name) diff --git a/src/libnrl/gcn/layers.py b/src/libnrl/gcn/layers.py deleted file mode 100644 index 89c35b5..0000000 --- a/src/libnrl/gcn/layers.py +++ /dev/null @@ -1,191 +0,0 @@ -import tensorflow as tf - -from .inits import * - -flags = tf.app.flags -FLAGS = flags.FLAGS - -# global unique layer ID dictionary for layer name assignment -_LAYER_UIDS = {} - - -def get_layer_uid(layer_name=''): - """Helper function, assigns unique layer IDs.""" - if layer_name not in _LAYER_UIDS: - _LAYER_UIDS[layer_name] = 1 - return 1 - else: - _LAYER_UIDS[layer_name] += 1 - return _LAYER_UIDS[layer_name] - - -def sparse_dropout(x, keep_prob, noise_shape): - """Dropout for sparse tensors.""" - random_tensor = keep_prob - random_tensor += tf.random_uniform(noise_shape) - dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool) - pre_out = tf.sparse_retain(x, dropout_mask) - return pre_out * (1./keep_prob) - - -def dot(x, y, sparse=False): - """Wrapper for tf.matmul (sparse vs dense).""" - if sparse: - res = tf.sparse_tensor_dense_matmul(x, y) - else: - res = tf.matmul(x, y) - return res - - -class Layer(object): - """Base layer class. Defines basic API for all layer objects. - Implementation inspired by keras (http://keras.io). - - # Properties - name: String, defines the variable scope of the layer. - logging: Boolean, switches Tensorflow histogram logging on/off - - # Methods - _call(inputs): Defines computation graph of layer - (i.e. 
takes input, returns output) - __call__(inputs): Wrapper for _call() - _log_vars(): Log all variables - """ - - def __init__(self, **kwargs): - allowed_kwargs = {'name', 'logging'} - for kwarg in kwargs.keys(): - assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg - name = kwargs.get('name') - if not name: - layer = self.__class__.__name__.lower() - name = layer + '_' + str(get_layer_uid(layer)) - self.name = name - self.vars = {} - logging = kwargs.get('logging', False) - self.logging = logging - self.sparse_inputs = False - - def _call(self, inputs): - return inputs - - def __call__(self, inputs): - with tf.name_scope(self.name): - if self.logging and not self.sparse_inputs: - tf.summary.histogram(self.name + '/inputs', inputs) - outputs = self._call(inputs) - if self.logging: - tf.summary.histogram(self.name + '/outputs', outputs) - return outputs - - def _log_vars(self): - for var in self.vars: - tf.summary.histogram(self.name + '/vars/' + var, self.vars[var]) - - -class Dense(Layer): - """Dense layer.""" - - def __init__(self, input_dim, output_dim, placeholders, dropout=0., sparse_inputs=False, - act=tf.nn.relu, bias=False, featureless=False, **kwargs): - super(Dense, self).__init__(**kwargs) - - if dropout: - self.dropout = placeholders['dropout'] - else: - self.dropout = 0. - - self.act = act - self.sparse_inputs = sparse_inputs - self.featureless = featureless - self.bias = bias - - # helper variable for sparse dropout - self.num_features_nonzero = placeholders['num_features_nonzero'] - - with tf.variable_scope(self.name + '_vars'): - self.vars['weights'] = glorot([input_dim, output_dim], - name='weights') - if self.bias: - self.vars['bias'] = zeros([output_dim], name='bias') - - if self.logging: - self._log_vars() - - def _call(self, inputs): - x = inputs - - # dropout - if self.sparse_inputs: - x = sparse_dropout(x, 1-self.dropout, self.num_features_nonzero) - else: - x = tf.nn.dropout(x, 1-self.dropout) - - # transform - output = dot(x, self.vars['weights'], sparse=self.sparse_inputs) - - # bias - if self.bias: - output += self.vars['bias'] - - return self.act(output) - - -class GraphConvolution(Layer): - """Graph convolution layer.""" - - def __init__(self, input_dim, output_dim, placeholders, dropout=0., - sparse_inputs=False, act=tf.nn.relu, bias=False, - featureless=False, **kwargs): - super(GraphConvolution, self).__init__(**kwargs) - - if dropout: - self.dropout = placeholders['dropout'] - else: - self.dropout = 0. 
- - self.act = act - self.support = placeholders['support'] - self.sparse_inputs = sparse_inputs - self.featureless = featureless - self.bias = bias - - # helper variable for sparse dropout - self.num_features_nonzero = placeholders['num_features_nonzero'] - - with tf.variable_scope(self.name + '_vars'): - for i in range(len(self.support)): - self.vars['weights_' + str(i)] = glorot([input_dim, output_dim], - name='weights_' + str(i)) - if self.bias: - self.vars['bias'] = zeros([output_dim], name='bias') - - if self.logging: - self._log_vars() - - def _call(self, inputs): - x = inputs - - # dropout - if self.sparse_inputs: - x = sparse_dropout(x, 1-self.dropout, self.num_features_nonzero) - else: - x = tf.nn.dropout(x, 1-self.dropout) - - # convolve - supports = list() - for i in range(len(self.support)): - if not self.featureless: - pre_sup = dot(x, self.vars['weights_' + str(i)], - sparse=self.sparse_inputs) - else: - pre_sup = self.vars['weights_' + str(i)] - support = dot(self.support[i], pre_sup, sparse=True) - supports.append(support) - output = tf.add_n(supports) - - # bias - if self.bias: - output += self.vars['bias'] - - return self.act(output) diff --git a/src/libnrl/gcn/metrics.py b/src/libnrl/gcn/metrics.py deleted file mode 100644 index 0a60edc..0000000 --- a/src/libnrl/gcn/metrics.py +++ /dev/null @@ -1,20 +0,0 @@ -import tensorflow as tf - - -def masked_softmax_cross_entropy(preds, labels, mask): - """Softmax cross-entropy loss with masking.""" - loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels) - mask = tf.cast(mask, dtype=tf.float32) - mask /= tf.reduce_mean(mask) - loss *= mask - return tf.reduce_mean(loss) - - -def masked_accuracy(preds, labels, mask): - """Accuracy with masking.""" - correct_prediction = tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1)) - accuracy_all = tf.cast(correct_prediction, tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - mask /= tf.reduce_mean(mask) - accuracy_all *= mask - return tf.reduce_mean(accuracy_all) diff --git a/src/libnrl/gcn/models.py b/src/libnrl/gcn/models.py deleted file mode 100644 index d5eebdf..0000000 --- a/src/libnrl/gcn/models.py +++ /dev/null @@ -1,179 +0,0 @@ -from .layers import * -from .metrics import * - -flags = tf.app.flags -FLAGS = flags.FLAGS - - -class Model(object): - def __init__(self, **kwargs): - allowed_kwargs = {'name', 'logging'} - for kwarg in kwargs.keys(): - assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg - name = kwargs.get('name') - if not name: - name = self.__class__.__name__.lower() - self.name = name - - logging = kwargs.get('logging', False) - self.logging = logging - - self.vars = {} - self.placeholders = {} - - self.layers = [] - self.activations = [] - - self.inputs = None - self.outputs = None - - self.loss = 0 - self.accuracy = 0 - self.optimizer = None - self.opt_op = None - - def _build(self): - raise NotImplementedError - - def build(self): - """ Wrapper for _build() """ - with tf.variable_scope(self.name): - self._build() - - # Build sequential layer model - self.activations.append(self.inputs) - for layer in self.layers: - hidden = layer(self.activations[-1]) - self.activations.append(hidden) - self.outputs = self.activations[-1] - - # Store model variables for easy access - variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) - self.vars = {var.name: var for var in variables} - - # Build metrics - self._loss() - self._accuracy() - - self.opt_op = self.optimizer.minimize(self.loss) - - def predict(self): - pass 
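[Note] For reference, the propagation rule implemented by the GraphConvolution layer being removed here reduces to output = act(A_hat X W). A minimal NumPy sketch follows; norm_adj, features and weights are illustrative placeholder arrays, not the removed TensorFlow variables.

    import numpy as np

    def graph_convolution(norm_adj, features, weights, act=lambda z: np.maximum(z, 0.0)):
        # one forward pass of the removed layer's rule: output = act(A_hat @ X @ W);
        # act is ReLU for the hidden layer and identity for the output layer
        pre_sup = features @ weights      # X W
        support = norm_adj @ pre_sup      # A_hat (X W)
        return act(support)

    # e.g. hidden = graph_convolution(A_hat, X, W1)
    #      logits = graph_convolution(A_hat, hidden, W2, act=lambda z: z)

[End of note]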
- - def _loss(self): - raise NotImplementedError - - def _accuracy(self): - raise NotImplementedError - - def save(self, sess=None): - if not sess: - raise AttributeError("TensorFlow session not provided.") - saver = tf.train.Saver(self.vars) - save_path = saver.save(sess, "tmp/%s.ckpt" % self.name) - print("Model saved in file: %s" % save_path) - - def load(self, sess=None): - if not sess: - raise AttributeError("TensorFlow session not provided.") - saver = tf.train.Saver(self.vars) - save_path = "tmp/%s.ckpt" % self.name - saver.restore(sess, save_path) - print("Model restored from file: %s" % save_path) - - -class MLP(Model): - def __init__(self, placeholders, input_dim, **kwargs): - super(MLP, self).__init__(**kwargs) - - self.inputs = placeholders['features'] - self.input_dim = input_dim - # self.input_dim = self.inputs.get_shape().as_list()[1] # To be supported in future Tensorflow versions - self.output_dim = placeholders['labels'].get_shape().as_list()[1] - self.placeholders = placeholders - - self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) - - self.build() - - def _loss(self): - # Weight decay loss - for var in self.layers[0].vars.values(): - self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) - - # Cross entropy error - self.loss += masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'], - self.placeholders['labels_mask']) - - def _accuracy(self): - self.accuracy = masked_accuracy(self.outputs, self.placeholders['labels'], - self.placeholders['labels_mask']) - - def _build(self): - self.layers.append(Dense(input_dim=self.input_dim, - output_dim=FLAGS.hidden1, - placeholders=self.placeholders, - act=tf.nn.relu, - dropout=True, - sparse_inputs=True, - logging=self.logging)) - - self.layers.append(Dense(input_dim=FLAGS.hidden1, - output_dim=self.output_dim, - placeholders=self.placeholders, - act=lambda x: x, - dropout=True, - logging=self.logging)) - - def predict(self): - return tf.nn.softmax(self.outputs) - - -class GCN(Model): - def __init__(self, placeholders, input_dim, hidden1, weight_decay, **kwargs): - super(GCN, self).__init__(**kwargs) - - self.inputs = placeholders['features'] - self.hidden1 = hidden1 - self.weight_decay = weight_decay - self.input_dim = input_dim - # self.input_dim = self.inputs.get_shape().as_list()[1] # To be supported in future Tensorflow versions - self.output_dim = placeholders['labels'].get_shape().as_list()[1] - self.placeholders = placeholders - - self.optimizer = tf.train.AdamOptimizer(learning_rate=0.01) - - self.build() - - def _loss(self): - # Weight decay loss - for var in self.layers[0].vars.values(): - self.loss += self.weight_decay * tf.nn.l2_loss(var) - - # Cross entropy error - self.loss += masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'], - self.placeholders['labels_mask']) - - def _accuracy(self): - self.accuracy = masked_accuracy(self.outputs, self.placeholders['labels'], - self.placeholders['labels_mask']) - - def _build(self): - - self.layers.append(GraphConvolution(input_dim=self.input_dim, - output_dim=self.hidden1, - placeholders=self.placeholders, - act=tf.nn.relu, - dropout=True, - sparse_inputs=True, - logging=self.logging)) - - self.layers.append(GraphConvolution(input_dim=self.hidden1, - output_dim=self.output_dim, - placeholders=self.placeholders, - act=lambda x: x, - dropout=True, - logging=self.logging)) - - def predict(self): - return tf.nn.softmax(self.outputs) diff --git a/src/libnrl/gcn/train.py b/src/libnrl/gcn/train.py deleted file mode 100644 index 
3d36bea..0000000 --- a/src/libnrl/gcn/train.py +++ /dev/null @@ -1,107 +0,0 @@ -from __future__ import division, print_function - -import time - -import tensorflow as tf - -from gcn.models import GCN, MLP -from gcn.utils import * - -# Set random seed -seed = 123 -np.random.seed(seed) -tf.set_random_seed(seed) - -# Settings -flags = tf.app.flags -FLAGS = flags.FLAGS -flags.DEFINE_string('dataset', 'cora', 'Dataset string.') # 'cora', 'citeseer', 'pubmed' -flags.DEFINE_string('model', 'gcn', 'Model string.') # 'gcn', 'gcn_cheby', 'dense' -flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.') -flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.') -flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.') -flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).') -flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.') -flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).') -flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.') - -# Load data -adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset) - -# Some preprocessing -features = preprocess_features(features) -if FLAGS.model == 'gcn': - support = [preprocess_adj(adj)] - num_supports = 1 - model_func = GCN -elif FLAGS.model == 'gcn_cheby': - support = chebyshev_polynomials(adj, FLAGS.max_degree) - num_supports = 1 + FLAGS.max_degree - model_func = GCN -elif FLAGS.model == 'dense': - support = [preprocess_adj(adj)] # Not used - num_supports = 1 - model_func = MLP -else: - raise ValueError('Invalid argument for model: ' + str(FLAGS.model)) - -# Define placeholders -placeholders = { - 'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)], - 'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(features[2], dtype=tf.int64)), - 'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])), - 'labels_mask': tf.placeholder(tf.int32), - 'dropout': tf.placeholder_with_default(0., shape=()), - 'num_features_nonzero': tf.placeholder(tf.int32) # helper variable for sparse dropout -} - -# Create model -model = model_func(placeholders, input_dim=features[2][1], logging=True) - -# Initialize session -sess = tf.Session() - - -# Define model evaluation function -def evaluate(features, support, labels, mask, placeholders): - t_test = time.time() - feed_dict_val = construct_feed_dict(features, support, labels, mask, placeholders) - outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val) - return outs_val[0], outs_val[1], (time.time() - t_test) - - -# Init variables -sess.run(tf.global_variables_initializer()) - -cost_val = [] - -# Train model -for epoch in range(FLAGS.epochs): - - t = time.time() - # Construct feed dictionary - feed_dict = construct_feed_dict(features, support, y_train, train_mask, placeholders) - feed_dict.update({placeholders['dropout']: FLAGS.dropout}) - - # Training step - outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict) - - # Validation - cost, acc, duration = evaluate(features, support, y_val, val_mask, placeholders) - cost_val.append(cost) - - # Print results - print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]), - "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost), - "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t)) - - if epoch > FLAGS.early_stopping and cost_val[-1] > 
np.mean(cost_val[-(FLAGS.early_stopping+1):-1]): - print("Early stopping...") - break - -print("Optimization Finished!") - -# Testing -test_cost, test_acc, test_duration = evaluate(features, support, y_test, test_mask, placeholders) -print("Test set results:", "cost=", "{:.5f}".format(test_cost), - "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration)) diff --git a/src/libnrl/gcn/utils.py b/src/libnrl/gcn/utils.py deleted file mode 100644 index 0200e2b..0000000 --- a/src/libnrl/gcn/utils.py +++ /dev/null @@ -1,153 +0,0 @@ -import pickle as pkl -import sys - -import networkx as nx -import numpy as np -import scipy.sparse as sp -from scipy.sparse.linalg.eigen.arpack import eigsh - - -def parse_index_file(filename): - """Parse index file.""" - index = [] - for line in open(filename): - index.append(int(line.strip())) - return index - - -def sample_mask(idx, l): - """Create mask.""" - mask = np.zeros(l) - mask[idx] = 1 - return np.array(mask, dtype=np.bool) - - -def load_data(dataset_str): - """Load data.""" - names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] - objects = [] - for i in range(len(names)): - with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: - if sys.version_info > (3, 0): - objects.append(pkl.load(f, encoding='latin1')) - else: - objects.append(pkl.load(f)) - - x, y, tx, ty, allx, ally, graph = tuple(objects) - test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) - test_idx_range = np.sort(test_idx_reorder) - - if dataset_str == 'citeseer': - # Fix citeseer dataset (there are some isolated nodes in the graph) - # Find isolated nodes, add them as zero-vecs into the right position - test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) - tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) - tx_extended[test_idx_range-min(test_idx_range), :] = tx - tx = tx_extended - ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) - ty_extended[test_idx_range-min(test_idx_range), :] = ty - ty = ty_extended - - features = sp.vstack((allx, tx)).tolil() - features[test_idx_reorder, :] = features[test_idx_range, :] - adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) - - labels = np.vstack((ally, ty)) - labels[test_idx_reorder, :] = labels[test_idx_range, :] - - idx_test = test_idx_range.tolist() - idx_train = range(len(y)) - idx_val = range(len(y), len(y)+500) - - train_mask = sample_mask(idx_train, labels.shape[0]) - val_mask = sample_mask(idx_val, labels.shape[0]) - test_mask = sample_mask(idx_test, labels.shape[0]) - - y_train = np.zeros(labels.shape) - y_val = np.zeros(labels.shape) - y_test = np.zeros(labels.shape) - y_train[train_mask, :] = labels[train_mask, :] - y_val[val_mask, :] = labels[val_mask, :] - y_test[test_mask, :] = labels[test_mask, :] - - return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask - - -def sparse_to_tuple(sparse_mx): - """Convert sparse matrix to tuple representation.""" - def to_tuple(mx): - if not sp.isspmatrix_coo(mx): - mx = mx.tocoo() - coords = np.vstack((mx.row, mx.col)).transpose() - values = mx.data - shape = mx.shape - return coords, values, shape - - if isinstance(sparse_mx, list): - for i in range(len(sparse_mx)): - sparse_mx[i] = to_tuple(sparse_mx[i]) - else: - sparse_mx = to_tuple(sparse_mx) - - return sparse_mx - - -def preprocess_features(features): - """Row-normalize feature matrix and convert to tuple representation""" - rowsum = np.array(features.sum(1)) - r_inv = np.power(rowsum, 
-1).flatten() - r_inv[np.isinf(r_inv)] = 0. - r_mat_inv = sp.diags(r_inv) - features = sp.coo_matrix(features) - features = r_mat_inv.dot(features) - return sparse_to_tuple(features) - - -def normalize_adj(adj): - """Symmetrically normalize adjacency matrix.""" - adj = sp.coo_matrix(adj) - rowsum = np.array(adj.sum(1)) - d_inv_sqrt = np.power(rowsum, -0.5).flatten() - d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. - d_mat_inv_sqrt = sp.diags(d_inv_sqrt) - return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() - - -def preprocess_adj(adj): - """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation.""" - adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0])) - return sparse_to_tuple(adj_normalized) - - -def construct_feed_dict(features, support, labels, labels_mask, placeholders): - """Construct feed dictionary.""" - feed_dict = dict() - feed_dict.update({placeholders['labels']: labels}) - feed_dict.update({placeholders['labels_mask']: labels_mask}) - feed_dict.update({placeholders['features']: features}) - feed_dict.update({placeholders['support'][i]: support[i] for i in range(len(support))}) - feed_dict.update({placeholders['num_features_nonzero']: features[1].shape}) - return feed_dict - - -def chebyshev_polynomials(adj, k): - """Calculate Chebyshev polynomials up to order k. Return a list of sparse matrices (tuple representation).""" - print("Calculating Chebyshev polynomials up to order {}...".format(k)) - - adj_normalized = normalize_adj(adj) - laplacian = sp.eye(adj.shape[0]) - adj_normalized - largest_eigval, _ = eigsh(laplacian, 1, which='LM') - scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(adj.shape[0]) - - t_k = list() - t_k.append(sp.eye(adj.shape[0])) - t_k.append(scaled_laplacian) - - def chebyshev_recurrence(t_k_minus_one, t_k_minus_two, scaled_lap): - s_lap = sp.csr_matrix(scaled_lap, copy=True) - return 2 * s_lap.dot(t_k_minus_one) - t_k_minus_two - - for i in range(2, k+1): - t_k.append(chebyshev_recurrence(t_k[-1], t_k[-2], scaled_laplacian)) - - return sparse_to_tuple(t_k) diff --git a/src/libnrl/graph.py b/src/libnrl/graph.py index 0cd061d..af57000 100644 --- a/src/libnrl/graph.py +++ b/src/libnrl/graph.py @@ -77,17 +77,6 @@ class Graph(object): vec = l.split() self.G.nodes[vec[0]]['attr'] = np.array([float(x) for x in vec[1:]]) - def read_node_label(self, path): - """ todo... read node labels and store as NetworkX graph {'node_id': {'label': values}} \n - input file format: node_id1 labels \n - node_id2 labels \n - with open(path, 'r') as fin: \n - for l in fin.readlines(): \n - vec = l.split() \n - self.G.nodes[vec[0]]['label'] = np.array([float(x) for x in vec[1:]]) \n - """ - pass # to do... - def remove_edge(self, ratio=0.0): """ randomly remove edges/links \n ratio: the percentage of edges to be removed \n @@ -100,17 +89,6 @@ class Graph(object): print('after removing, the # of edges: ', self.G.number_of_edges()) return edges_removed - def remove_node_attr(self, ratio): - """ todo... randomly remove node attributes; \n - """ - pass # to do... - - def remove_node(self, ratio): - """ todo... randomly remove nodes; \n - #self.node_mapping() #update node id index mapping is needed \n - """ - pass # to do... 
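[Note] The GCN preprocessing removed in gcn/utils.py boils down to the renormalization trick A_hat = D^{-1/2} (A + I) D^{-1/2}, which feeds the layer sketch above. A minimal SciPy restatement follows for reference only; the function name renormalized_adj is illustrative, and it returns a plain sparse matrix rather than the tuple representation used by the removed code.

    import numpy as np
    import scipy.sparse as sp

    def renormalized_adj(adj):
        # add self-loops, then symmetrically normalize: A_hat = D^{-1/2} (A + I) D^{-1/2}
        adj = sp.coo_matrix(adj) + sp.eye(adj.shape[0])
        rowsum = np.asarray(adj.sum(axis=1)).ravel()
        d_inv_sqrt = np.power(rowsum, -0.5)
        d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0   # guard for isolated nodes (a no-op once self-loops are added)
        d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
        return (d_mat_inv_sqrt @ adj @ d_mat_inv_sqrt).tocoo()

[End of note]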
- # ------------------------------------------------------------------------------------------ # --------------------commonly used APIs that will not modify graph------------------------- # ------------------------------------------------------------------------------------------ @@ -164,9 +142,3 @@ class Graph(object): def get_common_neighbors(self, node1, node2): """ return common neighbors of two nodes """ return list(nx.common_neighbors(self.G, node1, node2)) - - def get_centrality(self, centrality_type='degree'): - """ todo... return specified type of centrality \n - see https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html \n - """ - pass # to do... diff --git a/src/libnrl/graphsage/graphsageAPI.py b/src/libnrl/graphsage/graphsageAPI.py index 0b39b44..2753263 100644 --- a/src/libnrl/graphsage/graphsageAPI.py +++ b/src/libnrl/graphsage/graphsageAPI.py @@ -30,11 +30,6 @@ class graphSAGE(object): if not is_supervised: from libnrl.graphsage import unsupervised_train self.vectors = unsupervised_train.train(train_data=train_data, test_data=None, model=sage_model) - else: - # to do... - # from libnrl.graphsage import supervised_train - # self.vectors = supervised_train.train() - pass def add_train_val_test_to_G(self, test_perc=0.0, val_perc=0.1): ''' add if 'val' and/or 'test' to each node in G ''' @@ -54,7 +49,7 @@ class graphSAGE(object): G.nodes[id]['test'] = False G.nodes[id]['val'] = False # Make sure the graph has edge train_removed annotations - # (some datasets might already have this..) + # some datasets might already have this print("Loaded data.. now preprocessing..") for edge in G.edges(): if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or diff --git a/src/libnrl/graphsage/models.py b/src/libnrl/graphsage/models.py index 1d6abfa..02d6675 100644 --- a/src/libnrl/graphsage/models.py +++ b/src/libnrl/graphsage/models.py @@ -11,11 +11,6 @@ from libnrl.graphsage.aggregators import (GCNAggregator, MaxPoolingAggregator, MeanPoolingAggregator, SeqAggregator) from libnrl.graphsage.prediction import BipartiteEdgePredLayer -''' -flags = tf.app.flags -FLAGS = FLAGS -''' - # DISCLAIMER: # Boilerplate parts of this code file were originally forked from # https://github.com/tkipf/gcn diff --git a/src/libnrl/graphsage/unsupervised_train.py b/src/libnrl/graphsage/unsupervised_train.py index 03102da..a496941 100644 --- a/src/libnrl/graphsage/unsupervised_train.py +++ b/src/libnrl/graphsage/unsupervised_train.py @@ -258,11 +258,7 @@ def train(train_data, test_data, model): # only print the last iter result at the end of each epoch print("Epoch:", '%04d' % epoch, "train_loss=", "{:.5f}".format(train_cost), - # "train_mrr=", "{:.5f}".format(train_mrr), - # "train_mrr_ema=", "{:.5f}".format(train_shadow_mrr), "val_loss=", "{:.5f}".format(val_cost), - # "val_mrr=", "{:.5f}".format(val_mrr), - # "val_mrr_ema=", "{:.5f}".format(shadow_mrr), "time cost", "{:.2f}".format(t2-t1)) # no early stopping was used in original code---------------- auto-save-best-emb ------------------------------ diff --git a/src/libnrl/graphsage/utils.py b/src/libnrl/graphsage/utils.py index 1a6204e..71ca3e6 100644 --- a/src/libnrl/graphsage/utils.py +++ b/src/libnrl/graphsage/utils.py @@ -18,7 +18,6 @@ from networkx.readwrite import json_graph version_info = list(map(int, nx.__version__.split('.'))) major = version_info[0] minor = version_info[1] -#assert (major <= 1) and (minor <= 11), "networkx major version > 1.11" WALK_LEN = 5 N_WALKS = 50 @@ -27,12 +26,6 @@ N_WALKS = 50 def 
load_data(prefix, normalize=True, load_walks=False): G_data = json.load(open(prefix + "-G.json")) G = json_graph.node_link_graph(G_data) - ''' - if isinstance(G.nodes()[0], int): - conversion = lambda n : int(n) - else: - conversion = lambda n : n - ''' def conversion(n): return int(n) # compatible with networkx >2.0 if os.path.exists(prefix + "-feats.npy"): @@ -61,7 +54,7 @@ def load_data(prefix, normalize=True, load_walks=False): print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(broken_count)) # Make sure the graph has edge train_removed annotations - # (some datasets might already have this..) + # some datasets might already have this print("Loaded data.. now preprocessing..") for edge in G.edges(): if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or @@ -104,7 +97,7 @@ def run_random_walks(G, nodes, num_walks=N_WALKS): return pairs -if __name__ == "__main__": # 这个地方需要改写,可以每次运行都跑一次 +if __name__ == "__main__": """ Run random walks """ graph_file = sys.argv[1] out_file = sys.argv[2] diff --git a/src/libnrl/line.py b/src/libnrl/line.py index 1490bc5..541809f 100644 --- a/src/libnrl/line.py +++ b/src/libnrl/line.py @@ -16,8 +16,7 @@ import numpy as np import tensorflow as tf from sklearn.linear_model import LogisticRegression -from .downstream import \ - ncClassifier # to do... try use lpClassifier to choose best embeddings? +from .downstream import ncClassifier from .utils import read_node_label_downstream @@ -48,9 +47,6 @@ class _LINE(object): cur_seed = random.getrandbits(32) self.embeddings = tf.get_variable(name="embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer=tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed)) self.context_embeddings = tf.get_variable(name="context_embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer=tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed)) - # self.h_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.h), 1) - # self.t_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.t), 1) - # self.t_e_context = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.context_embeddings, self.t), 1) self.h_e = tf.nn.embedding_lookup(self.embeddings, self.h) self.t_e = tf.nn.embedding_lookup(self.embeddings, self.t) self.t_e_context = tf.nn.embedding_lookup(self.context_embeddings, self.t) @@ -88,7 +84,6 @@ class _LINE(object): edges = [(look_up[x[0]], look_up[x[1]]) for x in self.g.G.edges()] data_size = self.g.G.number_of_edges() - # edge_set = set([x[0]*numNodes+x[1] for x in edges]) shuffle_indices = np.random.permutation(np.arange(data_size)) # positive or negative mod @@ -193,7 +188,6 @@ class _LINE(object): def get_embeddings(self): vectors = {} embeddings = self.embeddings.eval(session=self.sess) - # embeddings = self.sess.run(tf.nn.l2_normalize(self.embeddings.eval(session=self.sess), 1)) look_back = self.g.look_back_list for i, embedding in enumerate(embeddings): vectors[look_back[i]] = embedding diff --git a/src/libnrl/tadw.py b/src/libnrl/tadw.py index c6593f0..55521e1 100644 --- a/src/libnrl/tadw.py +++ b/src/libnrl/tadw.py @@ -36,7 +36,6 @@ class TADW(object): look_back = self.g.look_back_list self.features = np.vstack([g.nodes[look_back[i]]['attr'] for i in range(g.number_of_nodes())]) - # self.features = self.g.get_attr_mat().todense() self.preprocessFeature() return self.features.T @@ -46,8 +45,6 @@ class TADW(object): Ud = U[:, 0:200] Sd = S[0:200] self.features = 
np.array(Ud)*Sd.reshape(200) - # from .utils import dim_reduction - # self.features = dim_reduction(self.features, dim=200, method='svd') def train(self): self.adj = self.getAdj() diff --git a/src/libnrl/utils.py b/src/libnrl/utils.py index 7647a08..792d841 100644 --- a/src/libnrl/utils.py +++ b/src/libnrl/utils.py @@ -49,8 +49,6 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False): def pairwise_similarity(mat, type='cosine'): - # XXX: possible to integrate pairwise_similarity with top_k to enhance performance? - # we'll use it elsewhere. if really needed, write a new method for this purpose if type == 'cosine': # support sprase and dense mat from sklearn.metrics.pairwise import cosine_similarity result = cosine_similarity(mat, dense_output=True) @@ -62,50 +60,19 @@ def pairwise_similarity(mat, type='cosine'): elif type == 'euclidean': from sklearn.metrics.pairwise import euclidean_distances # note: similarity = - distance - # other version: similarity = 1 - 2 / pi * arctan(distance) result = euclidean_distances(mat) result = -result - # result = 1 - 2 / np.pi * np.arctan(result) elif type == 'manhattan': from sklearn.metrics.pairwise import manhattan_distances # note: similarity = - distance - # other version: similarity = 1 - 2 / pi * arctan(distance) result = manhattan_distances(mat) result = -result - # result = 1 - 2 / np.pi * np.arctan(result) else: print('Please choose from: cosine, jaccard, euclidean or manhattan') return 'Not found!' return result - -# ---------------------------------ulits for preprocessing-------------------------------- -def node_auxi_to_attr(fin, fout): - """ TODO... - -> read auxi info associated with each node; - -> preprocessing auxi via: - 1) NLP for sentences; or 2) one-hot for discrete features; - -> then becomes node attr with m dim, and store them into attr file - """ - # https://radimrehurek.com/gensim/apiref.html - # word2vec, doc2vec, 把句子转为vec - # text2vec, tfidf, 把离散的features转为vec - pass - - -def simulate_incomplete_stru(): - pass - - -def simulate_incomplete_attr(): - pass - - -def simulate_noisy_world(): - pass - # ---------------------------------ulits for downstream tasks-------------------------------- -# XXX: read and save using panda or numpy def read_edge_label_downstream(filename): @@ -143,37 +110,6 @@ def read_node_label_downstream(filename): return X, Y -def store_embedddings(vectors, filename, dim): - """ store embeddings to file - """ - fout = open(filename, 'w') - num_nodes = len(vectors.keys()) - fout.write("{} {}\n".format(num_nodes, dim)) - for node, vec in vectors.items(): - fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) - fout.close() - print('store the resulting embeddings in file: ', filename) - - -def load_embeddings(filename): - """ load embeddings from file - """ - fin = open(filename, 'r') - num_nodes, size = [int(x) for x in fin.readline().strip().split()] - vectors = {} - while 1: - line = fin.readline() - if line == '': - break - vec = line.strip().split(' ') - assert len(vec) == size + 1 - vectors[vec[0]] = [float(x) for x in vec[1:]] - fin.close() - assert len(vectors) == num_nodes - return vectors - - -# ----------------- 以下你整理到utils,有问题的我都用中文写出来了,没有中文的暂时没啥问题,可以先不用管----------------------- def generate_edges_for_linkpred(graph, edges_removed, balance_ratio=1.0): ''' given a graph and edges_removed; generate non_edges not in [both graph and edges_removed]; @@ -216,52 +152,3 @@ def dim_reduction(mat, dim=128, method='pca'): t2 = time.time() print('END dimensionality reduction: 
{:.2f}s'.format(t2-t1)) return mat_reduced - - -def row_normalized(mat, is_transition_matrix=False): - ''' to do... - 两个问题:1)sparse矩阵在该场景下比dense慢,(至少我自己写的这块代码是) - 2)dense矩阵测试后发现所有元素加起来不是整数,似乎还是要用我以前笨方法来弥补 - 3)在is_transition_matrix时候,需要给全零行赋值,sparse时候会有点小问题,不能直接mat[i, :] = p赋值 - ''' - p = 1.0/mat.shape[0] # probability = 1/num of rows - norms = np.asarray(mat.sum(axis=1)).ravel() - for i, norm in enumerate(norms): - if norm != 0: - mat[i, :] /= norm - else: - if is_transition_matrix: - mat[i, :] = p # every row of transition matrix should sum up to 1 - else: - pass # do nothing; keep all-zero row - return mat - - -''' 笨方法如下''' - - -def rowAsPDF(mat): # make each row sum up to 1 i.e. a probabolity density distribution - mat = np.array(mat) - for i in range(mat.shape[0]): - sum_row = mat[i, :].sum() - if sum_row != 0: - mat[i, :] = mat[i, :]/sum_row # if a row [0, 1, 1, 1] -> [0, 1/3, 1/3, 1/3] -> may have some small issue... - else: - # to do... - # for node without any link... remain row as [0, 0, 0, 0] OR set to [1/n, 1/n, 1/n...]?? - pass - if mat[i, :].sum() != 1.00: # small trick to make sure each row is a pdf 笨犯法。。。 - error = 1.00 - mat[i, :].sum() - mat[i, -1] += error - return mat - - -def sparse_to_dense(): - ''' to dense np.matrix format 记得dtype用float64''' - pass - - -def dense_to_sparse(): - ''' to sparse crs format 记得dtype用float64''' - - pass diff --git a/src/libnrl/walker.py b/src/libnrl/walker.py index f6c0a50..ef17aed 100644 --- a/src/libnrl/walker.py +++ b/src/libnrl/walker.py @@ -23,9 +23,6 @@ class WeightedWalker: self.T = transition_mat self.workers = workers self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph()) # reconstructed "directed" "weighted" graph based on transition matrix - # print(nx.adjacency_matrix(self.rec_G).todense()[0:6, 0:6]) - # print(transition_mat[0:6, 0:6]) - # print(nx.adjacency_matrix(self.rec_G).todense()==transition_mat) # alias sampling for ABRW------------------------- def simulate_walks(self, num_walks, walk_length): diff --git a/src/main.py b/src/main.py index 113d907..e55823d 100644 --- a/src/main.py +++ b/src/main.py @@ -22,7 +22,6 @@ from libnrl import line # PNE method from libnrl import tadw # ANE method from libnrl.downstream import lpClassifier, ncClassifier from libnrl.graph import Graph -# from libnrl.gcn import gcnAPI # ANE method from libnrl.graphsage import graphsageAPI # ANE method from libnrl.grarep import GraRep # PNE method from libnrl.utils import generate_edges_for_linkpred, read_node_label_downstream @@ -30,8 +29,6 @@ from libnrl.utils import generate_edges_for_linkpred, read_node_label_downstream from sklearn.linear_model import LogisticRegression # to do... 1) put it in downstream.py; and 2) try SVM... from libnrl import abrw # ANE method; Attributed Biased Random Walk from libnrl import node2vec # PNE method; including deepwalk and node2vec -# from libnrl import TriDNR #to do... ANE method -# https://github.com/dfdazac/dgi #to do... 
ANE method def parse_args(): @@ -51,10 +48,6 @@ def parse_args(): help='choices of downstream tasks: none, lp, nc, lp_and_nc') parser.add_argument('--link-remove', default=0.1, type=float, help='simulate randomly missing links if necessary; a ratio ranging [0.0, 1.0]') - # parser.add_argument('--attr-remove', default=0.0, type=float, - # help='simulate randomly missing attributes if necessary; a ratio ranging [0.0, 1.0]') - # parser.add_argument('--link-reserved', default=0.7, type=float, - # help='for lp task, train/test split, a ratio ranging [0.0, 1.0]') parser.add_argument('--label-reserved', default=0.7, type=float, help='for nc task, train/test split, a ratio ranging [0.0, 1.0]') parser.add_argument('--directed', default=False, action='store_true', @@ -141,7 +134,6 @@ def main(args): assert args.attribute_file != '' g.read_node_attr(args.attribute_file) # load node label info------ - # to do... similar to attribute {'key_attribute': value}, label also loaded as {'key_label': value} t2 = time.time() print(f'STEP1: end loading data; time cost: {(t2-t1):.2f}s') @@ -204,16 +196,6 @@ def main(args): model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime())) print(f'Save node embeddings in file: {args.emb_file}') - ''' - #to do.... semi-supervised methods: gcn, graphsage, etc... - if args.method == 'gcn': #semi-supervised gcn - assert args.label_file != '' - assert args.feature_file != '' - g.read_node_label(args.label_file) - model = gcnAPI.GCN(graph=g, dropout=args.dropout, weight_decay=args.weight_decay, hidden1=args.hidden, epochs=args.epochs, clf_ratio=args.label_reserved) - print('semi-supervsied method, no embs, exit the program...') #semi-supervised gcn do not produce embs - exit(0) - ''' # ---------------------------------------STEP4: downstream task----------------------------------------------- print('\nSTEP4: start evaluating ......: ') @@ -222,7 +204,6 @@ def main(args): del model, g # ------lp task if args.task == 'lp' or args.task == 'lp_and_nc': - # X_test_lp, Y_test_lp = read_edge_label_downstream(args.label_file) # if you want to load your own lp testing data print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)') clf = lpClassifier(vectors=vectors) # similarity/distance metric as clf; basically, lp is a binary clf probelm clf.evaluate(test_node_pairs, test_edge_labels) @@ -238,7 +219,5 @@ def main(args): if __name__ == '__main__': print(f'------ START @ {time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())} ------') - # random.seed(2018) - # np.random.seed(2018) main(parse_args()) print(f'------ END @ {time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())} ------')
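
[Note] As a usage-level sketch of the link-prediction evaluation kept by this clean-up (cosine similarity as the scorer, ROC-AUC as the metric, with the same 1e-100 guard against zero-norm vectors and the below-0.5 flip used in downstream.py): vectors is assumed to be the embedding dict and test_node_pairs/test_edge_labels the split produced upstream in main.py; this is an illustration, not code from the patch.

    import numpy as np
    from sklearn.metrics import roc_auc_score

    def lp_evaluate(vectors, test_node_pairs, test_edge_labels):
        # score each candidate edge by cosine similarity of its two node embeddings
        probs = []
        for u, v in test_node_pairs:
            a, b = np.asarray(vectors[u]), np.asarray(vectors[v])
            probs.append(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-100))
        auc = roc_auc_score(np.asarray(test_edge_labels, dtype=int), probs)
        return max(auc, 1.0 - auc)   # like downstream.py, flip the score when it falls below 0.5

[End of note]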