clean up for release
This commit is contained in:
parent
f4a765efc5
commit
b9f4edbebd
@@ -57,15 +57,10 @@ class ABRW(object):
        general idea: Attribute Biased Random Walk
        i.e. a walker based on a mixed transition matrix P = alpha*T_A + (1-alpha)*T_X
        result: the ABRW transition matrix T
        *** questions: 1) what if we have some single (isolated) nodes, i.e. some rows of T_A give all 0s
                       2) the similarity/distance metric to obtain T_X
                       3) alias sampling as used in node2vec for speeding up, but this is only the case
                          if each row of P gives many 0s
                          --> how to make each row of P a pdf and meanwhile keep it sparse
        '''
        print("obtaining biased transition matrix where each row sums up to 1.0...")

        preserve_zeros = False  # compare them: 1) accuracy; 2) efficiency
        preserve_zeros = False
        T_A = row_as_probdist(A, preserve_zeros)  # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row
        print('Preserve zero rows of the adj matrix: ', preserve_zeros)

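A minimal numpy sketch of the mixed transition matrix described in the docstring above, assuming dense arrays; the function name, the alpha default and the uniform fallback for isolated nodes are illustrative, not the repo's row_as_probdist:

    import numpy as np

    def mixed_transition_matrix(A, X_sim, alpha=0.8):
        # P = alpha*T_A + (1-alpha)*T_X, where T_A and T_X are row-normalized
        def row_norm(M):
            M = np.asarray(M, dtype=np.float64)
            sums = M.sum(axis=1, keepdims=True)
            # rows that sum to 0 (isolated nodes) fall back to a uniform 1/n row
            return np.divide(M, sums, out=np.full_like(M, 1.0 / M.shape[1]), where=sums != 0)
        T_A = row_norm(A)      # structure-based transitions
        T_X = row_norm(X_sim)  # attribute-similarity-based transitions
        return alpha * T_A + (1 - alpha) * T_X  # each row still sums to 1.0

Since T_A and T_X are both row-stochastic, any convex combination of them is again row-stochastic, which settles the "each row of P is a pdf" question; sparsity, however, is generally lost once the attribute part is mixed in.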
@@ -95,8 +90,8 @@ class ABRW(object):
        print(f'ABRW biased transition matrix processing time: {(t5-t4):.2f}s')
        return T

    def save_embeddings(self, filename):  # to do... put it to utils;
        fout = open(filename, 'w')  # call it while __init__ (abrw class) with flag --save-emb=True (from main.py)
    def save_embeddings(self, filename):
        fout = open(filename, 'w')
        node_num = len(self.vectors.keys())
        fout.write("{} {}\n".format(node_num, self.dim))
        for node, vec in self.vectors.items():

@@ -73,7 +73,7 @@ class ATTRCOMB(object):
                nrl_embeddings.append(model.vectors[key])
            return np.array(nrl_embeddings)

        elif comb_with == 'node2vec':  # to do... the parameters
        elif comb_with == 'node2vec':
            model = node2vec.Node2vec(graph=self.g, path_length=80, num_paths=self.number_walks,
                                      dim=dim, workers=4, p=0.8, q=0.8, window=10)
            nrl_embeddings = []
@@ -39,7 +39,7 @@ class ncClassifier(object):
        Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]

        self.train(X_train, Y_train, Y)
        np.random.set_state(state)  # why??? for binarizer.transform??
        np.random.set_state(state)
        return self.evaluate(X_test, Y_test)

    def train(self, X, Y, Y_all):
@@ -66,7 +66,6 @@ class ncClassifier(object):
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        # print('Results, using embeddings of dimensionality', len(self.embeddings[X[0]]))
        print(results)
        return results

@@ -94,10 +93,6 @@ class lpClassifier(object):
    # clf here is simply a similarity/distance metric
    def evaluate(self, X_test, Y_test, seed=0):
        test_size = len(X_test)
        # shuffle_indices = np.random.permutation(np.arange(test_size))
        # X_test = [X_test[shuffle_indices[i]] for i in range(test_size)]
        # Y_test = [Y_test[shuffle_indices[i]] for i in range(test_size)]

        Y_true = [int(i) for i in Y_test]
        Y_probs = []
        for i in range(test_size):
@@ -114,7 +109,6 @@ class lpClassifier(object):
        if roc < 0.5:
            roc = 1.0 - roc  # since lp is a binary clf task, just predict the opposite if roc < 0.5
        print("roc=", "{:.9f}".format(roc))
        # plt_roc(Y_true, Y_probs)  # enable to plot roc curve and return auc value


def norm(a):
@@ -128,19 +122,9 @@ def cosine_similarity(a, b):
    sum = 0.0
    for i in range(len(a)):
        sum = sum + a[i] * b[i]
    # return sum/(norm(a) * norm(b))
    # fix numerical issue: 1e-100 is almost 0!
    return sum / (norm(a) * norm(b) + 1e-100)


'''
# cosine_similarity implemented by ourselves...
# or try sklearn....
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances  # we may try diff metrics
# ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
'''

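A minimal sketch of the numerical guard used in cosine_similarity above, next to the sklearn alternative mentioned in the commented block; the helper name and the eps value are illustrative:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity as sk_cosine

    def cosine_similarity_safe(a, b, eps=1e-100):
        # eps only matters when a norm is exactly 0 (e.g. an all-zero embedding)
        a = np.asarray(a, dtype=np.float64)
        b = np.asarray(b, dtype=np.float64)
        return float(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b) + eps))

    # sklearn expects 2-D input and returns a matrix:
    # sk_cosine(a.reshape(1, -1), b.reshape(1, -1))[0, 0]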
def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):
    # randomly split links/edges into training set and testing set
    # *** note: we do not assume every node must be connected after removing links
@@ -160,8 +144,6 @@ def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):

    # generate testing set that contains both pos and neg samples
    test_pos_sample = random.sample(g.G.edges(), int(test_size))
    # test_neg_sample = random.sample(list(nx.classes.function.non_edges(g.G)), int(test_size * neg_pos_link_ratio))  # using nx built-in func, not efficient, to do...
    # more efficient way:
    test_neg_sample = []
    num_neg_sample = int(test_size * neg_pos_link_ratio)
    num = 0
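A minimal sketch of the "more efficient way" hinted at above: rejection-sample random node pairs and keep only non-edges, instead of materializing nx.non_edges; the function name is illustrative and assumes an undirected networkx graph:

    import random

    def sample_negative_edges(G, num_neg):
        # draw random node pairs; keep a pair only if it is not an existing edge and not seen before
        nodes = list(G.nodes())
        neg_edges, seen = [], set()
        while len(neg_edges) < num_neg:
            u, v = random.sample(nodes, 2)
            if G.has_edge(u, v) or (u, v) in seen or (v, u) in seen:
                continue
            seen.add((u, v))
            neg_edges.append((u, v))
        return neg_edges

This stays fast on sparse graphs because most random pairs are non-edges; on very dense graphs the rejection rate grows and the nx.non_edges route may be preferable.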
@@ -1,2 +0,0 @@
from __future__ import print_function
from __future__ import division
@@ -1,162 +0,0 @@
import time

import numpy as np
import tensorflow as tf

from . import models
from .utils import *


class GCN(object):

    def __init__(self, graph, learning_rate=0.01, epochs=200,
                 hidden1=16, dropout=0.5, weight_decay=5e-4, early_stopping=10,
                 max_degree=3, clf_ratio=0.1):
        """
        learning_rate: Initial learning rate
        epochs: Number of epochs to train
        hidden1: Number of units in hidden layer 1
        dropout: Dropout rate (1 - keep probability)
        weight_decay: Weight for L2 loss on embedding matrix
        early_stopping: Tolerance for early stopping (# of epochs)
        max_degree: Maximum Chebyshev polynomial degree
        """
        self.graph = graph
        self.clf_ratio = clf_ratio
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.hidden1 = hidden1
        self.dropout = dropout
        self.weight_decay = weight_decay
        self.early_stopping = early_stopping
        self.max_degree = max_degree

        self.preprocess_data()
        self.build_placeholders()
        # Create model
        self.model = models.GCN(self.placeholders, input_dim=self.features[2][1], hidden1=self.hidden1, weight_decay=self.weight_decay, logging=True)
        # Initialize session
        self.sess = tf.Session()
        # Init variables
        self.sess.run(tf.global_variables_initializer())

        cost_val = []

        # Train model
        for epoch in range(self.epochs):

            t = time.time()
            # Construct feed dictionary
            feed_dict = self.construct_feed_dict(self.train_mask)
            feed_dict.update({self.placeholders['dropout']: self.dropout})

            # Training step
            outs = self.sess.run([self.model.opt_op, self.model.loss, self.model.accuracy], feed_dict=feed_dict)

            # Validation
            cost, acc, duration = self.evaluate(self.val_mask)
            cost_val.append(cost)

            # Print results
            print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
                  "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
                  "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t))
            ''' # something wrong with early stopping?? to do...
            if epoch > self.early_stopping and cost_val[-1] > np.mean(cost_val[-(self.early_stopping+1):-1]):
                print("Early stopping...")
                break
            '''
        print("Optimization Finished!")

        # Testing
        test_cost, test_acc, test_duration = self.evaluate(self.test_mask)
        print("Test set results:", "cost=", "{:.5f}".format(test_cost),
              "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))

    # Define model evaluation function
    def evaluate(self, mask):
        t_test = time.time()
        feed_dict_val = self.construct_feed_dict(mask)
        outs_val = self.sess.run([self.model.loss, self.model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    def build_placeholders(self):
        num_supports = 1
        self.placeholders = {
            'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
            'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(self.features[2], dtype=tf.int64)),
            'labels': tf.placeholder(tf.float32, shape=(None, self.labels.shape[1])),
            'labels_mask': tf.placeholder(tf.int32),
            'dropout': tf.placeholder_with_default(0., shape=()),
            # helper variable for sparse dropout
            'num_features_nonzero': tf.placeholder(tf.int32)
        }

    def build_label(self):
        g = self.graph.G
        look_up = self.graph.look_up_dict
        labels = []
        label_dict = {}
        label_id = 0
        for node in g.nodes():
            labels.append((node, g.nodes[node]['label']))
            for l in g.nodes[node]['label']:
                if l not in label_dict:
                    label_dict[l] = label_id
                    label_id += 1
        self.labels = np.zeros((len(labels), label_id))
        self.label_dict = label_dict
        for node, l in labels:
            node_id = look_up[node]
            for ll in l:
                l_id = label_dict[ll]
                self.labels[node_id][l_id] = 1

    def build_train_val_test(self):
        """
        build train_mask test_mask val_mask
        """
        train_precent = self.clf_ratio
        training_size = int(train_precent * self.graph.G.number_of_nodes())
        state = np.random.get_state()
        np.random.seed(0)
        shuffle_indices = np.random.permutation(np.arange(self.graph.G.number_of_nodes()))
        np.random.set_state(state)

        look_up = self.graph.look_up_dict
        g = self.graph.G

        def sample_mask(begin, end):
            mask = np.zeros(g.number_of_nodes())
            for i in range(begin, end):
                mask[shuffle_indices[i]] = 1
            return mask

        self.train_mask = sample_mask(0, training_size-100)
        self.val_mask = sample_mask(training_size-100, training_size)
        self.test_mask = sample_mask(training_size, g.number_of_nodes())

    def preprocess_data(self):
        """
        adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
        y_train, y_val, y_test can merge to y
        """
        g = self.graph.G
        look_back = self.graph.look_back_list
        self.features = np.vstack([g.nodes[look_back[i]]['feature']
                                   for i in range(g.number_of_nodes())])
        self.features = preprocess_features(self.features)
        self.build_label()
        self.build_train_val_test()
        adj = nx.adjacency_matrix(g)  # the type of graph
        self.support = [preprocess_adj(adj)]

    def construct_feed_dict(self, labels_mask):
        """Construct feed dictionary."""
        feed_dict = dict()
        feed_dict.update({self.placeholders['labels']: self.labels})
        feed_dict.update({self.placeholders['labels_mask']: labels_mask})
        feed_dict.update({self.placeholders['features']: self.features})
        feed_dict.update({self.placeholders['support'][i]: self.support[i] for i in range(len(self.support))})
        feed_dict.update({self.placeholders['num_features_nonzero']: self.features[1].shape})
        return feed_dict
@@ -1,27 +0,0 @@
import numpy as np
import tensorflow as tf


def uniform(shape, scale=0.05, name=None):
    """Uniform init."""
    initial = tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32)
    return tf.Variable(initial, name=name)


def glorot(shape, name=None):
    """Glorot & Bengio (AISTATS 2010) init."""
    init_range = np.sqrt(6.0/(shape[0]+shape[1]))
    initial = tf.random_uniform(shape, minval=-init_range, maxval=init_range, dtype=tf.float32)
    return tf.Variable(initial, name=name)


def zeros(shape, name=None):
    """All zeros."""
    initial = tf.zeros(shape, dtype=tf.float32)
    return tf.Variable(initial, name=name)


def ones(shape, name=None):
    """All ones."""
    initial = tf.ones(shape, dtype=tf.float32)
    return tf.Variable(initial, name=name)
@@ -1,191 +0,0 @@
import tensorflow as tf

from .inits import *

flags = tf.app.flags
FLAGS = flags.FLAGS

# global unique layer ID dictionary for layer name assignment
_LAYER_UIDS = {}


def get_layer_uid(layer_name=''):
    """Helper function, assigns unique layer IDs."""
    if layer_name not in _LAYER_UIDS:
        _LAYER_UIDS[layer_name] = 1
        return 1
    else:
        _LAYER_UIDS[layer_name] += 1
        return _LAYER_UIDS[layer_name]


def sparse_dropout(x, keep_prob, noise_shape):
    """Dropout for sparse tensors."""
    random_tensor = keep_prob
    random_tensor += tf.random_uniform(noise_shape)
    dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool)
    pre_out = tf.sparse_retain(x, dropout_mask)
    return pre_out * (1./keep_prob)


def dot(x, y, sparse=False):
    """Wrapper for tf.matmul (sparse vs dense)."""
    if sparse:
        res = tf.sparse_tensor_dense_matmul(x, y)
    else:
        res = tf.matmul(x, y)
    return res


class Layer(object):
    """Base layer class. Defines basic API for all layer objects.
    Implementation inspired by keras (http://keras.io).

    # Properties
        name: String, defines the variable scope of the layer.
        logging: Boolean, switches Tensorflow histogram logging on/off

    # Methods
        _call(inputs): Defines computation graph of layer
            (i.e. takes input, returns output)
        __call__(inputs): Wrapper for _call()
        _log_vars(): Log all variables
    """

    def __init__(self, **kwargs):
        allowed_kwargs = {'name', 'logging'}
        for kwarg in kwargs.keys():
            assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg
        name = kwargs.get('name')
        if not name:
            layer = self.__class__.__name__.lower()
            name = layer + '_' + str(get_layer_uid(layer))
        self.name = name
        self.vars = {}
        logging = kwargs.get('logging', False)
        self.logging = logging
        self.sparse_inputs = False

    def _call(self, inputs):
        return inputs

    def __call__(self, inputs):
        with tf.name_scope(self.name):
            if self.logging and not self.sparse_inputs:
                tf.summary.histogram(self.name + '/inputs', inputs)
            outputs = self._call(inputs)
            if self.logging:
                tf.summary.histogram(self.name + '/outputs', outputs)
            return outputs

    def _log_vars(self):
        for var in self.vars:
            tf.summary.histogram(self.name + '/vars/' + var, self.vars[var])


class Dense(Layer):
    """Dense layer."""

    def __init__(self, input_dim, output_dim, placeholders, dropout=0., sparse_inputs=False,
                 act=tf.nn.relu, bias=False, featureless=False, **kwargs):
        super(Dense, self).__init__(**kwargs)

        if dropout:
            self.dropout = placeholders['dropout']
        else:
            self.dropout = 0.

        self.act = act
        self.sparse_inputs = sparse_inputs
        self.featureless = featureless
        self.bias = bias

        # helper variable for sparse dropout
        self.num_features_nonzero = placeholders['num_features_nonzero']

        with tf.variable_scope(self.name + '_vars'):
            self.vars['weights'] = glorot([input_dim, output_dim],
                                          name='weights')
            if self.bias:
                self.vars['bias'] = zeros([output_dim], name='bias')

        if self.logging:
            self._log_vars()

    def _call(self, inputs):
        x = inputs

        # dropout
        if self.sparse_inputs:
            x = sparse_dropout(x, 1-self.dropout, self.num_features_nonzero)
        else:
            x = tf.nn.dropout(x, 1-self.dropout)

        # transform
        output = dot(x, self.vars['weights'], sparse=self.sparse_inputs)

        # bias
        if self.bias:
            output += self.vars['bias']

        return self.act(output)


class GraphConvolution(Layer):
    """Graph convolution layer."""

    def __init__(self, input_dim, output_dim, placeholders, dropout=0.,
                 sparse_inputs=False, act=tf.nn.relu, bias=False,
                 featureless=False, **kwargs):
        super(GraphConvolution, self).__init__(**kwargs)

        if dropout:
            self.dropout = placeholders['dropout']
        else:
            self.dropout = 0.

        self.act = act
        self.support = placeholders['support']
        self.sparse_inputs = sparse_inputs
        self.featureless = featureless
        self.bias = bias

        # helper variable for sparse dropout
        self.num_features_nonzero = placeholders['num_features_nonzero']

        with tf.variable_scope(self.name + '_vars'):
            for i in range(len(self.support)):
                self.vars['weights_' + str(i)] = glorot([input_dim, output_dim],
                                                        name='weights_' + str(i))
            if self.bias:
                self.vars['bias'] = zeros([output_dim], name='bias')

        if self.logging:
            self._log_vars()

    def _call(self, inputs):
        x = inputs

        # dropout
        if self.sparse_inputs:
            x = sparse_dropout(x, 1-self.dropout, self.num_features_nonzero)
        else:
            x = tf.nn.dropout(x, 1-self.dropout)

        # convolve
        supports = list()
        for i in range(len(self.support)):
            if not self.featureless:
                pre_sup = dot(x, self.vars['weights_' + str(i)],
                              sparse=self.sparse_inputs)
            else:
                pre_sup = self.vars['weights_' + str(i)]
            support = dot(self.support[i], pre_sup, sparse=True)
            supports.append(support)
        output = tf.add_n(supports)

        # bias
        if self.bias:
            output += self.vars['bias']

        return self.act(output)
@@ -1,20 +0,0 @@
import tensorflow as tf


def masked_softmax_cross_entropy(preds, labels, mask):
    """Softmax cross-entropy loss with masking."""
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    loss *= mask
    return tf.reduce_mean(loss)


def masked_accuracy(preds, labels, mask):
    """Accuracy with masking."""
    correct_prediction = tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1))
    accuracy_all = tf.cast(correct_prediction, tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    accuracy_all *= mask
    return tf.reduce_mean(accuracy_all)
@@ -1,179 +0,0 @@
from .layers import *
from .metrics import *

flags = tf.app.flags
FLAGS = flags.FLAGS


class Model(object):
    def __init__(self, **kwargs):
        allowed_kwargs = {'name', 'logging'}
        for kwarg in kwargs.keys():
            assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg
        name = kwargs.get('name')
        if not name:
            name = self.__class__.__name__.lower()
        self.name = name

        logging = kwargs.get('logging', False)
        self.logging = logging

        self.vars = {}
        self.placeholders = {}

        self.layers = []
        self.activations = []

        self.inputs = None
        self.outputs = None

        self.loss = 0
        self.accuracy = 0
        self.optimizer = None
        self.opt_op = None

    def _build(self):
        raise NotImplementedError

    def build(self):
        """ Wrapper for _build() """
        with tf.variable_scope(self.name):
            self._build()

        # Build sequential layer model
        self.activations.append(self.inputs)
        for layer in self.layers:
            hidden = layer(self.activations[-1])
            self.activations.append(hidden)
        self.outputs = self.activations[-1]

        # Store model variables for easy access
        variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
        self.vars = {var.name: var for var in variables}

        # Build metrics
        self._loss()
        self._accuracy()

        self.opt_op = self.optimizer.minimize(self.loss)

    def predict(self):
        pass

    def _loss(self):
        raise NotImplementedError

    def _accuracy(self):
        raise NotImplementedError

    def save(self, sess=None):
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        saver = tf.train.Saver(self.vars)
        save_path = saver.save(sess, "tmp/%s.ckpt" % self.name)
        print("Model saved in file: %s" % save_path)

    def load(self, sess=None):
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        saver = tf.train.Saver(self.vars)
        save_path = "tmp/%s.ckpt" % self.name
        saver.restore(sess, save_path)
        print("Model restored from file: %s" % save_path)


class MLP(Model):
    def __init__(self, placeholders, input_dim, **kwargs):
        super(MLP, self).__init__(**kwargs)

        self.inputs = placeholders['features']
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]
        self.placeholders = placeholders

        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

        self.build()

    def _loss(self):
        # Weight decay loss
        for var in self.layers[0].vars.values():
            self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)

        # Cross entropy error
        self.loss += masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'],
                                                  self.placeholders['labels_mask'])

    def _accuracy(self):
        self.accuracy = masked_accuracy(self.outputs, self.placeholders['labels'],
                                        self.placeholders['labels_mask'])

    def _build(self):
        self.layers.append(Dense(input_dim=self.input_dim,
                                 output_dim=FLAGS.hidden1,
                                 placeholders=self.placeholders,
                                 act=tf.nn.relu,
                                 dropout=True,
                                 sparse_inputs=True,
                                 logging=self.logging))

        self.layers.append(Dense(input_dim=FLAGS.hidden1,
                                 output_dim=self.output_dim,
                                 placeholders=self.placeholders,
                                 act=lambda x: x,
                                 dropout=True,
                                 logging=self.logging))

    def predict(self):
        return tf.nn.softmax(self.outputs)


class GCN(Model):
    def __init__(self, placeholders, input_dim, hidden1, weight_decay, **kwargs):
        super(GCN, self).__init__(**kwargs)

        self.inputs = placeholders['features']
        self.hidden1 = hidden1
        self.weight_decay = weight_decay
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]
        self.placeholders = placeholders

        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.01)

        self.build()

    def _loss(self):
        # Weight decay loss
        for var in self.layers[0].vars.values():
            self.loss += self.weight_decay * tf.nn.l2_loss(var)

        # Cross entropy error
        self.loss += masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'],
                                                  self.placeholders['labels_mask'])

    def _accuracy(self):
        self.accuracy = masked_accuracy(self.outputs, self.placeholders['labels'],
                                        self.placeholders['labels_mask'])

    def _build(self):

        self.layers.append(GraphConvolution(input_dim=self.input_dim,
                                            output_dim=self.hidden1,
                                            placeholders=self.placeholders,
                                            act=tf.nn.relu,
                                            dropout=True,
                                            sparse_inputs=True,
                                            logging=self.logging))

        self.layers.append(GraphConvolution(input_dim=self.hidden1,
                                            output_dim=self.output_dim,
                                            placeholders=self.placeholders,
                                            act=lambda x: x,
                                            dropout=True,
                                            logging=self.logging))

    def predict(self):
        return tf.nn.softmax(self.outputs)
@@ -1,107 +0,0 @@
from __future__ import division, print_function

import time

import tensorflow as tf

from gcn.models import GCN, MLP
from gcn.utils import *

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'cora', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn', 'Model string.')  # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')

# Load data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)

# Some preprocessing
features = preprocess_features(features)
if FLAGS.model == 'gcn':
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN
elif FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
    model_func = GCN
elif FLAGS.model == 'dense':
    support = [preprocess_adj(adj)]  # Not used
    num_supports = 1
    model_func = MLP
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

# Define placeholders
placeholders = {
    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(features[2], dtype=tf.int64)),
    'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
    'labels_mask': tf.placeholder(tf.int32),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
}

# Create model
model = model_func(placeholders, input_dim=features[2][1], logging=True)

# Initialize session
sess = tf.Session()


# Define model evaluation function
def evaluate(features, support, labels, mask, placeholders):
    t_test = time.time()
    feed_dict_val = construct_feed_dict(features, support, labels, mask, placeholders)
    outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
    return outs_val[0], outs_val[1], (time.time() - t_test)


# Init variables
sess.run(tf.global_variables_initializer())

cost_val = []

# Train model
for epoch in range(FLAGS.epochs):

    t = time.time()
    # Construct feed dictionary
    feed_dict = construct_feed_dict(features, support, y_train, train_mask, placeholders)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})

    # Training step
    outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)

    # Validation
    cost, acc, duration = evaluate(features, support, y_val, val_mask, placeholders)
    cost_val.append(cost)

    # Print results
    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
          "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
          "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t))

    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping+1):-1]):
        print("Early stopping...")
        break

print("Optimization Finished!")

# Testing
test_cost, test_acc, test_duration = evaluate(features, support, y_test, test_mask, placeholders)
print("Test set results:", "cost=", "{:.5f}".format(test_cost),
      "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))
@@ -1,153 +0,0 @@
import pickle as pkl
import sys

import networkx as nx
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh


def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)


def load_data(dataset_str):
    """Load data."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask


def sparse_to_tuple(sparse_mx):
    """Convert sparse matrix to tuple representation."""
    def to_tuple(mx):
        if not sp.isspmatrix_coo(mx):
            mx = mx.tocoo()
        coords = np.vstack((mx.row, mx.col)).transpose()
        values = mx.data
        shape = mx.shape
        return coords, values, shape

    if isinstance(sparse_mx, list):
        for i in range(len(sparse_mx)):
            sparse_mx[i] = to_tuple(sparse_mx[i])
    else:
        sparse_mx = to_tuple(sparse_mx)

    return sparse_mx


def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation"""
    rowsum = np.array(features.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = sp.coo_matrix(features)
    features = r_mat_inv.dot(features)
    return sparse_to_tuple(features)


def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()


def preprocess_adj(adj):
    """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""
    adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
    return sparse_to_tuple(adj_normalized)


def construct_feed_dict(features, support, labels, labels_mask, placeholders):
    """Construct feed dictionary."""
    feed_dict = dict()
    feed_dict.update({placeholders['labels']: labels})
    feed_dict.update({placeholders['labels_mask']: labels_mask})
    feed_dict.update({placeholders['features']: features})
    feed_dict.update({placeholders['support'][i]: support[i] for i in range(len(support))})
    feed_dict.update({placeholders['num_features_nonzero']: features[1].shape})
    return feed_dict


def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials up to order k. Return a list of sparse matrices (tuple representation)."""
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    adj_normalized = normalize_adj(adj)
    laplacian = sp.eye(adj.shape[0]) - adj_normalized
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(adj.shape[0])

    t_k = list()
    t_k.append(sp.eye(adj.shape[0]))
    t_k.append(scaled_laplacian)

    def chebyshev_recurrence(t_k_minus_one, t_k_minus_two, scaled_lap):
        s_lap = sp.csr_matrix(scaled_lap, copy=True)
        return 2 * s_lap.dot(t_k_minus_one) - t_k_minus_two

    for i in range(2, k+1):
        t_k.append(chebyshev_recurrence(t_k[-1], t_k[-2], scaled_laplacian))

    return sparse_to_tuple(t_k)
@@ -77,17 +77,6 @@ class Graph(object):
                vec = l.split()
                self.G.nodes[vec[0]]['attr'] = np.array([float(x) for x in vec[1:]])

    def read_node_label(self, path):
        """ todo... read node labels and store as NetworkX graph {'node_id': {'label': values}} \n
            input file format: node_id1 labels \n
                               node_id2 labels \n
            with open(path, 'r') as fin: \n
                for l in fin.readlines(): \n
                    vec = l.split() \n
                    self.G.nodes[vec[0]]['label'] = np.array([float(x) for x in vec[1:]]) \n
        """
        pass  # to do...

    def remove_edge(self, ratio=0.0):
        """ randomly remove edges/links \n
            ratio: the percentage of edges to be removed \n
@@ -100,17 +89,6 @@ class Graph(object):
        print('after removing, the # of edges: ', self.G.number_of_edges())
        return edges_removed

    def remove_node_attr(self, ratio):
        """ todo... randomly remove node attributes; \n
        """
        pass  # to do...

    def remove_node(self, ratio):
        """ todo... randomly remove nodes; \n
            #self.node_mapping()  # updating the node id index mapping is needed \n
        """
        pass  # to do...

    # ------------------------------------------------------------------------------------------
    # --------------------commonly used APIs that will not modify graph-------------------------
    # ------------------------------------------------------------------------------------------
@@ -164,9 +142,3 @@ class Graph(object):
    def get_common_neighbors(self, node1, node2):
        """ return common neighbors of two nodes """
        return list(nx.common_neighbors(self.G, node1, node2))

    def get_centrality(self, centrality_type='degree'):
        """ todo... return specified type of centrality \n
            see https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html \n
        """
        pass  # to do...
@@ -30,11 +30,6 @@ class graphSAGE(object):
        if not is_supervised:
            from libnrl.graphsage import unsupervised_train
            self.vectors = unsupervised_train.train(train_data=train_data, test_data=None, model=sage_model)
        else:
            # to do...
            # from libnrl.graphsage import supervised_train
            # self.vectors = supervised_train.train()
            pass

    def add_train_val_test_to_G(self, test_perc=0.0, val_perc=0.1):
        ''' add if 'val' and/or 'test' to each node in G '''
@@ -54,7 +49,7 @@ class graphSAGE(object):
            G.nodes[id]['test'] = False
            G.nodes[id]['val'] = False
        # Make sure the graph has edge train_removed annotations
        # (some datasets might already have this..)
        # some datasets might already have this
        print("Loaded data.. now preprocessing..")
        for edge in G.edges():
            if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or
@@ -11,11 +11,6 @@ from libnrl.graphsage.aggregators import (GCNAggregator, MaxPoolingAggregator,
                                          MeanPoolingAggregator, SeqAggregator)
from libnrl.graphsage.prediction import BipartiteEdgePredLayer

'''
flags = tf.app.flags
FLAGS = FLAGS
'''

# DISCLAIMER:
# Boilerplate parts of this code file were originally forked from
# https://github.com/tkipf/gcn

@@ -258,11 +258,7 @@ def train(train_data, test_data, model):
                # only print the last iter result at the end of each epoch
                print("Epoch:", '%04d' % epoch,
                      "train_loss=", "{:.5f}".format(train_cost),
                      # "train_mrr=", "{:.5f}".format(train_mrr),
                      # "train_mrr_ema=", "{:.5f}".format(train_shadow_mrr),
                      "val_loss=", "{:.5f}".format(val_cost),
                      # "val_mrr=", "{:.5f}".format(val_mrr),
                      # "val_mrr_ema=", "{:.5f}".format(shadow_mrr),
                      "time cost", "{:.2f}".format(t2-t1))

        # no early stopping was used in original code---------------- auto-save-best-emb ------------------------------
@@ -18,7 +18,6 @@ from networkx.readwrite import json_graph
version_info = list(map(int, nx.__version__.split('.')))
major = version_info[0]
minor = version_info[1]
#assert (major <= 1) and (minor <= 11), "networkx major version > 1.11"

WALK_LEN = 5
N_WALKS = 50
@@ -27,12 +26,6 @@ N_WALKS = 50
def load_data(prefix, normalize=True, load_walks=False):
    G_data = json.load(open(prefix + "-G.json"))
    G = json_graph.node_link_graph(G_data)
    '''
    if isinstance(G.nodes()[0], int):
        conversion = lambda n : int(n)
    else:
        conversion = lambda n : n
    '''
    def conversion(n): return int(n)  # compatible with networkx >2.0

    if os.path.exists(prefix + "-feats.npy"):
@@ -61,7 +54,7 @@ def load_data(prefix, normalize=True, load_walks=False):
        print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(broken_count))

    # Make sure the graph has edge train_removed annotations
    # (some datasets might already have this..)
    # some datasets might already have this
    print("Loaded data.. now preprocessing..")
    for edge in G.edges():
        if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or
@@ -104,7 +97,7 @@ def run_random_walks(G, nodes, num_walks=N_WALKS):
    return pairs


if __name__ == "__main__":  # to do: rewrite this part so the walks can be regenerated on every run
if __name__ == "__main__":
    """ Run random walks """
    graph_file = sys.argv[1]
    out_file = sys.argv[2]
@@ -16,8 +16,7 @@ import numpy as np
import tensorflow as tf
from sklearn.linear_model import LogisticRegression

from .downstream import \
    ncClassifier  # to do... try use lpClassifier to choose best embeddings?
from .downstream import ncClassifier
from .utils import read_node_label_downstream

@@ -48,9 +47,6 @@ class _LINE(object):
        cur_seed = random.getrandbits(32)
        self.embeddings = tf.get_variable(name="embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer=tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed))
        self.context_embeddings = tf.get_variable(name="context_embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer=tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed))
        # self.h_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.h), 1)
        # self.t_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.t), 1)
        # self.t_e_context = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.context_embeddings, self.t), 1)
        self.h_e = tf.nn.embedding_lookup(self.embeddings, self.h)
        self.t_e = tf.nn.embedding_lookup(self.embeddings, self.t)
        self.t_e_context = tf.nn.embedding_lookup(self.context_embeddings, self.t)
@@ -88,7 +84,6 @@ class _LINE(object):
        edges = [(look_up[x[0]], look_up[x[1]]) for x in self.g.G.edges()]

        data_size = self.g.G.number_of_edges()
        # edge_set = set([x[0]*numNodes+x[1] for x in edges])
        shuffle_indices = np.random.permutation(np.arange(data_size))

        # positive or negative mod
@@ -193,7 +188,6 @@ class _LINE(object):
    def get_embeddings(self):
        vectors = {}
        embeddings = self.embeddings.eval(session=self.sess)
        # embeddings = self.sess.run(tf.nn.l2_normalize(self.embeddings.eval(session=self.sess), 1))
        look_back = self.g.look_back_list
        for i, embedding in enumerate(embeddings):
            vectors[look_back[i]] = embedding
@@ -36,7 +36,6 @@ class TADW(object):
        look_back = self.g.look_back_list
        self.features = np.vstack([g.nodes[look_back[i]]['attr']
                                   for i in range(g.number_of_nodes())])
        # self.features = self.g.get_attr_mat().todense()
        self.preprocessFeature()
        return self.features.T

@@ -46,8 +45,6 @@ class TADW(object):
        Ud = U[:, 0:200]
        Sd = S[0:200]
        self.features = np.array(Ud)*Sd.reshape(200)
        # from .utils import dim_reduction
        # self.features = dim_reduction(self.features, dim=200, method='svd')

    def train(self):
        self.adj = self.getAdj()
@@ -49,8 +49,6 @@ def row_as_probdist(mat, dense_output=False, preserve_zeros=False):


def pairwise_similarity(mat, type='cosine'):
    # XXX: possible to integrate pairwise_similarity with top_k to enhance performance?
    # we'll use it elsewhere. if really needed, write a new method for this purpose
    if type == 'cosine':  # supports sparse and dense mat
        from sklearn.metrics.pairwise import cosine_similarity
        result = cosine_similarity(mat, dense_output=True)
@@ -62,50 +60,19 @@ def pairwise_similarity(mat, type='cosine'):
    elif type == 'euclidean':
        from sklearn.metrics.pairwise import euclidean_distances
        # note: similarity = - distance
        # other version: similarity = 1 - 2 / pi * arctan(distance)
        result = euclidean_distances(mat)
        result = -result
        # result = 1 - 2 / np.pi * np.arctan(result)
    elif type == 'manhattan':
        from sklearn.metrics.pairwise import manhattan_distances
        # note: similarity = - distance
        # other version: similarity = 1 - 2 / pi * arctan(distance)
        result = manhattan_distances(mat)
        result = -result
        # result = 1 - 2 / np.pi * np.arctan(result)
    else:
        print('Please choose from: cosine, jaccard, euclidean or manhattan')
        return 'Not found!'
    return result


# ---------------------------------utils for preprocessing--------------------------------
def node_auxi_to_attr(fin, fout):
    """ TODO...
        -> read auxi info associated with each node;
        -> preprocess auxi via:
           1) NLP for sentences; or 2) one-hot for discrete features;
        -> then becomes node attr with m dim, and store them into attr file
    """
    # https://radimrehurek.com/gensim/apiref.html
    # word2vec, doc2vec: turn sentences into vectors
    # text2vec, tfidf: turn discrete features into vectors
    pass
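A minimal sketch of the one-hot branch of the TODO above, assuming each input line is 'node_id value' with one discrete value per node; the function name and file format are assumptions for illustration:

    def node_auxi_to_attr_onehot(fin_path, fout_path):
        # read 'node_id value' pairs, one-hot encode the discrete values,
        # and store them as 'node_id x1 x2 ... xm' attribute lines
        nodes, values = [], []
        with open(fin_path) as fin:
            for line in fin:
                node_id, value = line.strip().split()[:2]
                nodes.append(node_id)
                values.append(value)
        vocab = {v: i for i, v in enumerate(sorted(set(values)))}
        with open(fout_path, 'w') as fout:
            for node_id, value in zip(nodes, values):
                vec = ['0'] * len(vocab)
                vec[vocab[value]] = '1'
                fout.write(node_id + ' ' + ' '.join(vec) + '\n')

For free-text auxiliary info, the gensim doc2vec API linked above would replace the one-hot step with an inferred sentence vector.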
def simulate_incomplete_stru():
    pass


def simulate_incomplete_attr():
    pass


def simulate_noisy_world():
    pass

# ---------------------------------utils for downstream tasks--------------------------------
# XXX: read and save using pandas or numpy


def read_edge_label_downstream(filename):
@@ -143,37 +110,6 @@ def read_node_label_downstream(filename):
    return X, Y


def store_embedddings(vectors, filename, dim):
    """ store embeddings to file
    """
    fout = open(filename, 'w')
    num_nodes = len(vectors.keys())
    fout.write("{} {}\n".format(num_nodes, dim))
    for node, vec in vectors.items():
        fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
    fout.close()
    print('store the resulting embeddings in file: ', filename)


def load_embeddings(filename):
    """ load embeddings from file
    """
    fin = open(filename, 'r')
    num_nodes, size = [int(x) for x in fin.readline().strip().split()]
    vectors = {}
    while 1:
        line = fin.readline()
        if line == '':
            break
        vec = line.strip().split(' ')
        assert len(vec) == size + 1
        vectors[vec[0]] = [float(x) for x in vec[1:]]
    fin.close()
    assert len(vectors) == num_nodes
    return vectors


# ----------------- please tidy the following into utils; the problematic spots are noted below; the rest has no issues for now and can be left alone -----------------------
def generate_edges_for_linkpred(graph, edges_removed, balance_ratio=1.0):
    ''' given a graph and edges_removed;
        generate non_edges not in [both graph and edges_removed];
@@ -216,52 +152,3 @@ def dim_reduction(mat, dim=128, method='pca'):
    t2 = time.time()
    print('END dimensionality reduction: {:.2f}s'.format(t2-t1))
    return mat_reduced


def row_normalized(mat, is_transition_matrix=False):
    ''' to do...
        known issues: 1) the sparse matrix is slower than the dense one in this scenario (at least with the code written here)
                      2) with the dense matrix, the elements of a row do not always add up to exactly 1.0; it seems the old clumsy method below is still needed to compensate
                      3) when is_transition_matrix, all-zero rows must be given a value; with a sparse matrix there is a small problem: mat[i, :] = p cannot be assigned directly
    '''
    p = 1.0/mat.shape[0]  # probability = 1/num of rows
    norms = np.asarray(mat.sum(axis=1)).ravel()
    for i, norm in enumerate(norms):
        if norm != 0:
            mat[i, :] /= norm
        else:
            if is_transition_matrix:
                mat[i, :] = p  # every row of a transition matrix should sum up to 1
            else:
                pass  # do nothing; keep the all-zero row
    return mat
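A minimal scipy sketch of the same row normalization for a sparse matrix, which sidesteps the per-row mat[i, :] = p assignment problem noted above by scaling with a diagonal matrix and only densifying the zero rows; the names are illustrative:

    import numpy as np
    import scipy.sparse as sp

    def row_normalize_sparse(mat, is_transition_matrix=True):
        mat = sp.csr_matrix(mat, dtype=np.float64)
        row_sums = np.asarray(mat.sum(axis=1)).ravel()
        scale = np.divide(1.0, row_sums, out=np.zeros_like(row_sums), where=row_sums != 0)
        mat = sp.diags(scale).dot(mat).tocsr()  # scale each nonzero row by 1/row_sum
        if is_transition_matrix:
            zero_rows = np.where(row_sums == 0)[0]
            if len(zero_rows) > 0:
                # give isolated nodes a uniform row so every row is a valid pdf
                mat = mat.tolil()
                uniform = np.full((1, mat.shape[1]), 1.0 / mat.shape[1])
                for i in zero_rows:
                    mat[i, :] = uniform
                mat = mat.tocsr()
        return mat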
''' the clumsy method follows '''


def rowAsPDF(mat):  # make each row sum up to 1, i.e. a probability density distribution
    mat = np.array(mat)
    for i in range(mat.shape[0]):
        sum_row = mat[i, :].sum()
        if sum_row != 0:
            mat[i, :] = mat[i, :]/sum_row  # if a row [0, 1, 1, 1] -> [0, 1/3, 1/3, 1/3] -> may have some small issue...
        else:
            # to do...
            # for a node without any link... keep the row as [0, 0, 0, 0] OR set it to [1/n, 1/n, 1/n...]??
            pass
        if mat[i, :].sum() != 1.00:  # small trick to make sure each row is a pdf; a clumsy workaround...
            error = 1.00 - mat[i, :].sum()
            mat[i, -1] += error
    return mat


def sparse_to_dense():
    ''' to dense np.matrix format; remember to use dtype float64 '''
    pass


def dense_to_sparse():
    ''' to sparse csr format; remember to use dtype float64 '''
    pass
@@ -23,9 +23,6 @@ class WeightedWalker:
        self.T = transition_mat
        self.workers = workers
        self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph())  # reconstructed "directed" "weighted" graph based on transition matrix
        # print(nx.adjacency_matrix(self.rec_G).todense()[0:6, 0:6])
        # print(transition_mat[0:6, 0:6])
        # print(nx.adjacency_matrix(self.rec_G).todense()==transition_mat)

    # alias sampling for ABRW-------------------------
    def simulate_walks(self, num_walks, walk_length):
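For reference, a minimal sketch of the alias method mentioned in the comment above (the same O(1)-per-draw trick node2vec uses); the function names follow the usual alias_setup/alias_draw convention and are not tied to this repo's implementation:

    import numpy as np

    def alias_setup(probs):
        # O(n) preprocessing of a discrete distribution into alias tables
        n = len(probs)
        q = np.array(probs, dtype=np.float64) * n
        J = np.zeros(n, dtype=np.int64)
        smaller = [i for i in range(n) if q[i] < 1.0]
        larger = [i for i in range(n) if q[i] >= 1.0]
        while smaller and larger:
            small, large = smaller.pop(), larger.pop()
            J[small] = large
            q[large] = q[large] + q[small] - 1.0
            (smaller if q[large] < 1.0 else larger).append(large)
        return J, q

    def alias_draw(J, q):
        # O(1) sampling: pick a column, then either keep it or jump to its alias
        i = np.random.randint(len(J))
        return i if np.random.rand() < q[i] else J[i]

Building one (J, q) pair per row of the ABRW transition matrix trades setup memory for constant-time transitions during the walks, which is exactly the trade-off questioned in the ABRW docstring when many rows of P are dense.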
21 src/main.py
@@ -22,7 +22,6 @@ from libnrl import line  # PNE method
from libnrl import tadw  # ANE method
from libnrl.downstream import lpClassifier, ncClassifier
from libnrl.graph import Graph
# from libnrl.gcn import gcnAPI  # ANE method
from libnrl.graphsage import graphsageAPI  # ANE method
from libnrl.grarep import GraRep  # PNE method
from libnrl.utils import generate_edges_for_linkpred, read_node_label_downstream
@@ -30,8 +29,6 @@ from libnrl.utils import generate_edges_for_linkpred, read_node_label_downstream
from sklearn.linear_model import LogisticRegression  # to do... 1) put it in downstream.py; and 2) try SVM...
from libnrl import abrw  # ANE method; Attributed Biased Random Walk
from libnrl import node2vec  # PNE method; including deepwalk and node2vec
# from libnrl import TriDNR  # to do... ANE method
# https://github.com/dfdazac/dgi  # to do... ANE method


def parse_args():
@@ -51,10 +48,6 @@ def parse_args():
                        help='choices of downstream tasks: none, lp, nc, lp_and_nc')
    parser.add_argument('--link-remove', default=0.1, type=float,
                        help='simulate randomly missing links if necessary; a ratio ranging [0.0, 1.0]')
    # parser.add_argument('--attr-remove', default=0.0, type=float,
    #                     help='simulate randomly missing attributes if necessary; a ratio ranging [0.0, 1.0]')
    # parser.add_argument('--link-reserved', default=0.7, type=float,
    #                     help='for lp task, train/test split, a ratio ranging [0.0, 1.0]')
    parser.add_argument('--label-reserved', default=0.7, type=float,
                        help='for nc task, train/test split, a ratio ranging [0.0, 1.0]')
    parser.add_argument('--directed', default=False, action='store_true',
@@ -141,7 +134,6 @@ def main(args):
        assert args.attribute_file != ''
        g.read_node_attr(args.attribute_file)
    # load node label info------
    # to do... similar to attribute {'key_attribute': value}, label also loaded as {'key_label': value}
    t2 = time.time()
    print(f'STEP1: end loading data; time cost: {(t2-t1):.2f}s')

@@ -204,16 +196,6 @@ def main(args):
        model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime()))
        print(f'Save node embeddings in file: {args.emb_file}')

    '''
    # to do.... semi-supervised methods: gcn, graphsage, etc...
    if args.method == 'gcn':  # semi-supervised gcn
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
        model = gcnAPI.GCN(graph=g, dropout=args.dropout, weight_decay=args.weight_decay, hidden1=args.hidden, epochs=args.epochs, clf_ratio=args.label_reserved)
        print('semi-supervised method, no embs, exit the program...')  # semi-supervised gcn does not produce embs
        exit(0)
    '''

    # ---------------------------------------STEP4: downstream task-----------------------------------------------
    print('\nSTEP4: start evaluating ......: ')
@@ -222,7 +204,6 @@ def main(args):
    del model, g
    # ------lp task
    if args.task == 'lp' or args.task == 'lp_and_nc':
        # X_test_lp, Y_test_lp = read_edge_label_downstream(args.label_file)  # if you want to load your own lp testing data
        print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)')
        clf = lpClassifier(vectors=vectors)  # similarity/distance metric as clf; basically, lp is a binary clf problem
        clf.evaluate(test_node_pairs, test_edge_labels)
@@ -238,7 +219,5 @@ def main(args):

if __name__ == '__main__':
    print(f'------ START @ {time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())} ------')
    # random.seed(2018)
    # np.random.seed(2018)
    main(parse_args())
    print(f'------ END @ {time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())} ------')