asne_checked_v0.0
This commit is contained in:
parent
7e1039cc3c
commit
2c52ce69e6
@@ -1,86 +1,39 @@
# -*- coding: utf-8 -*-
'''
TensorFlow implementation of the Social Network Embedding framework (SNE)
@author: Lizi Liao (liaolizi.llz@gmail.com)
part of the code was originally forked from https://github.com/lizi-git/ASNE
ANE method: Attributed Social Network Embedding (ASNE)

modified by Chengbin Hou 2018
1) convert OpenANE data format to ASNE data format
2) compatible with latest tensorflow 1.2
3) add more comments
4) support evaluating the testing set every xx epochs
5) as the ASNE paper stated, we add two hidden layers with the softsign activation function
2) compatible with latest tensorflow 1.10.0
3) add early stopping
4) as the ASNE paper stated, we add two hidden layers with the softsign activation function

part of the code was originally forked from https://github.com/lizi-git/ASNE
'''

import math
import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
from .classify import ncClassifier, lpClassifier, read_node_label
from sklearn.linear_model import LogisticRegression

def format_data_from_OpenANE_to_ASNE(g, dim):
    '''
    convert OpenANE data format to ASNE data format
    g: OpenANE graph data structure
    dim: final embedding dim
    '''
    attr_Matrix = g.getX()
    #attr_Matrix = g.preprocessAttrInfo(attr_Matrix, dim=200, method='svd') #similar to aane, the same preprocessing
    #print('with this preprocessing, ASNE can get better result, as well as, faster speed----------------')
    id_N = attr_Matrix.shape[0]   #n nodes
    attr_M = attr_Matrix.shape[1] #m features

    edge_num = len(g.G.edges)  #total edges for training
    X = {}  #one-to-one correspondence
    X['data_id_list'] = np.zeros(edge_num)              #start node list for training
    X['data_label_list'] = np.zeros(edge_num)           #end node list for training
    X['data_attr_list'] = np.zeros([edge_num, attr_M])  #attr corresponds to start node
    edgelist = [edge for edge in g.G.edges]
    i = 0
    for edge in edgelist:  #training sample = start node, end node, start node attr
        X['data_id_list'][i] = edge[0]
        X['data_label_list'][i] = edge[1]
        X['data_attr_list'][i] = attr_Matrix[g.look_up_dict[edge[0]]][:]
        i += 1
    X['data_id_list'] = X['data_id_list'].reshape(-1).astype(int)
    X['data_label_list'] = X['data_label_list'].reshape(-1, 1).astype(int)

    nodes = {}  #one-to-one correspondence
    nodes['node_id'] = g.look_back_list     #n nodes
    nodes['node_attr'] = list(attr_Matrix)  #m features -> n*m

    id_embedding_size = int(dim/2)
    attr_embedding_size = int(dim/2)
    print('id_embedding_size', id_embedding_size, 'attr_embedding_size', attr_embedding_size)
    return X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size


def add_layer(inputs, in_size, out_size, activation_function=None):
    # add one more layer and return the output of this layer
    Weights = tf.Variable(tf.random_uniform([in_size, out_size], -1.0, 1.0))  #init as paper stated
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b)
    return outputs

import time
#from .classify import ncClassifier, lpClassifier, read_node_label
#from sklearn.linear_model import LogisticRegression

class ASNE(BaseEstimator, TransformerMixin):
    def __init__(self, graph, dim, alpha=1.0, batch_size=128, learning_rate=0.001,
                 n_neg_samples=10, epoch=100, random_seed=2018, X_test=0, Y_test=0, task='nc', nc_ratio=0.5, lp_ratio=0.9, label_file=''):
        # bind params to class
    def __init__(self, graph, dim, alpha=1.0, learning_rate=0.0001, batch_size=128, epoch=20, n_neg_samples=10,
                 early_stopping=2000):  #it seems that overfitting can give a better result? try other early_stopping values... to do...

        t1 = time.time()
        X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size = format_data_from_OpenANE_to_ASNE(g=graph, dim=dim)
        t2 = time.time()
        print(f'transform data format from OpenANE to ASNE; time cost: {(t2-t1):.2f}s')

        self.node_N = id_N    #n
        self.attr_M = attr_M  #m
        self.X_train = X      #{'data_id_list': [], 'data_label_list': [], 'data_attr_list': []}
        self.nodes = nodes    #{'node_id': [], 'node_attr': []}
        self.id_embedding_size = id_embedding_size      # set to dim/2
        self.attr_embedding_size = attr_embedding_size  # set to dim/2
        self.vectors = {}
        self.dim = dim
        self.vectors = {}  #final embs
        self.look_back_list = graph.look_back_list  #from OpenANE data structure

        self.alpha = alpha  #set to 1.0 by default
@@ -88,19 +41,11 @@ class ASNE(BaseEstimator, TransformerMixin):
        self.batch_size = batch_size  #set to 128 by default
        self.learning_rate = learning_rate
        self.epoch = epoch  #set to 20 by default
        self.random_seed = random_seed
        self._init_graph()  #init all variables in a tensorflow graph

        self.task = task
        self.nc_ratio = nc_ratio
        self.lp_ratio = lp_ratio
        if self.task == 'lp':  #if not lp task, we do not need to keep testing edges
            self.X_test = X_test
            self.Y_test = Y_test
            self.train()  #train our tf asne model-----------------
        elif self.task == 'nc' or self.task == 'nclp':
            self.X_nc_label, self.Y_nc_label = read_node_label(label_file)
            self.train()  #train our tf asne model-----------------
        self._init_graph()  #init all variables in a tensorflow graph
        self.early_stopping = early_stopping  #early stopping if training loss increased for xx iterations
        self.train()


    def _init_graph(self):
        '''
@@ -110,7 +55,7 @@ class ASNE(BaseEstimator, TransformerMixin):
        #with self.graph.as_default(), tf.device('/gpu:0'):
        with self.graph.as_default():
            # Set graph level random seed
            tf.set_random_seed(self.random_seed)
            #tf.set_random_seed(2018)
            # Input data.
            self.train_data_id = tf.placeholder(tf.int32, shape=[None])  # batch_size * 1
            self.train_data_attr = tf.placeholder(tf.float32, shape=[None, self.attr_M])  # batch_size * attr_M
@@ -126,25 +71,26 @@ class ASNE(BaseEstimator, TransformerMixin):
            self.attr_embed = tf.matmul(self.train_data_attr, self.weights['attr_embeddings'])  # batch_size * attr_dim
            self.embed_layer = tf.concat([self.id_embed, self.alpha * self.attr_embed], 1)  # batch_size * (id_dim + attr_dim) #an error due to old tf!
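            # note: the fused representation concatenates the id (structure) embedding with the
            # attribute embedding scaled by alpha, so alpha balances structural vs. attribute information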

            ## can add hidden_layers component here!
            '''
            ## can add hidden_layers component here!----------------------------------
            #0) no hidden layer
            #1) 128
            #2) 256+128 ##--------paper stated it used two hidden layers with activation function softsign....
            #2) 256+128 ##--------paper stated it used two hidden layers with softsign
            #3) 512+256+128
            len_h1_in = self.id_embedding_size+self.attr_embedding_size
            len_h1_out = 256
            len_h1_in = self.id_embedding_size + self.attr_embedding_size
            len_h1_out = 256  #or self.id_embedding_size + self.attr_embedding_size # if only add h1
            len_h2_in = len_h1_out
            len_h2_out = 128
            len_h2_out = self.id_embedding_size + self.attr_embedding_size
            self.h1 = add_layer(inputs=self.embed_layer, in_size=len_h1_in, out_size=len_h1_out, activation_function=tf.nn.softsign)
            self.h2 = add_layer(inputs=self.h1, in_size=len_h2_in, out_size=len_h2_out, activation_function=tf.nn.softsign)

            ## -------------------------------------------------------------------------
            '''
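            # note: the block above is quoted out; a minimal sketch for enabling the two softsign hidden
            # layers described in the header would be to uncomment it and feed self.h2 (instead of
            # self.embed_layer) into the sampled softmax loss below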

            # Compute the loss, using a sample of the negative labels each time.
            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=self.weights['out_embeddings'], biases=self.weights['biases'],
                                                                  inputs=self.h2, labels=self.train_labels, num_sampled=self.n_neg_samples, num_classes=self.node_N))
            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=self.weights['out_embeddings'], biases=self.weights['biases'],  #if one needs to change layers
                                                                  inputs=self.embed_layer, labels=self.train_labels, num_sampled=self.n_neg_samples, num_classes=self.node_N))  #try inputs = self.embed_layer or self.h1 or self.h2 or ...
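            # note: tf.nn.sampled_softmax_loss approximates the full softmax over all node_N classes by
            # drawing num_sampled (= n_neg_samples) negative classes per batch, so each update costs
            # roughly O(batch_size * n_neg_samples) instead of O(batch_size * node_N)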
            # Optimizer.
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)  #tune these parameters?
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)
            # print("AdamOptimizer")

            # init
@@ -171,14 +117,18 @@ class ASNE(BaseEstimator, TransformerMixin):
        start_index = np.random.randint(0, len(data) - batch_size)
        return data[start_index:(start_index + batch_size)]

    def train(self):  # fit a dataset
        self.Embeddings = []
        print('Using in + out embedding')

        for epoch in range(self.epoch):
            total_batch = int(len(self.X_train['data_id_list']) / self.batch_size)  #total_batch * batch_size ~= number of training links
            # print('total_batch in 1 epoch: ', total_batch)
            # Loop over all batches
    def train(self):
        self.Embeddings = []
        total_batch = int(len(self.X_train['data_id_list']) / self.batch_size)
        iter_count = 0
        train_loss_best = 0
        train_loss_keep_increasing = 0
        early_stopping = self.early_stopping  #early stopping if training loss keeps increasing
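        # note: the counter below tracks consecutive batches whose loss fails to improve on the best
        # loss seen so far; batch-level losses are noisy, which is presumably why the default patience
        # (early_stopping=2000 iterations) is set so large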

        for epoch in range(self.epoch):
            t1 = time.time()

            for i in range(total_batch):
                # generate a batch of data
                batch_xs = {}
@@ -188,25 +138,40 @@ class ASNE(BaseEstimator, TransformerMixin):
                batch_xs['batch_data_label'] = self.X_train['data_label_list'][start_index:(start_index + self.batch_size)]

                # Fit training using batch data
                cost = self.partial_fit(batch_xs)

            # Display logs per epoch
            Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
            Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
            self.Embeddings = Embeddings_out + Embeddings_in  #element-wise sum of out and in embeddings as the final embedding; try mean or concat? to do...
            #print('training tensorflow asne model, epoch: ', epoch+1, ' / ', self.epoch)
            #to save training time, we removed the eval on testing data @ each epoch
                train_loss = self.partial_fit(batch_xs)
                iter_count += 1
                if iter_count == 1:
                    train_loss_best = train_loss
                else:
                    if train_loss_best > train_loss:  # training loss decreasing
                        train_loss_best = train_loss
                        train_loss_keep_increasing = 0  # reset
                    else:  # training loss increasing
                        train_loss_keep_increasing += 1
                        if train_loss_keep_increasing > early_stopping:  # early stopping
                            print(f'early stopping @ iter {iter_count}; take out embs and return')
                            Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
                            Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
                            self.Embeddings = Embeddings_out + Embeddings_in  # element-wise sum of out and in embeddings as the final embedding; try mean or concat? to do...
                            ind = 0
                            for id in self.nodes['node_id']:  #self.nodes['node_id'] = self.look_back_list
                                self.vectors[id] = self.Embeddings[ind]
                                ind += 1
                            return self.vectors
                        else:
                            pass
            t2 = time.time()
            print(f'epoch @ {epoch+1}/{self.epoch}; time cost: {(t2-t1):.2f}s')

            #-----------every xx epochs, save embeddings {node_id1: [], node_id2: [], ...}----------
            if (epoch+1) % 1 == 0 and epoch != 0:  #every xx epochs, try eval
                print('@@@ epoch ------- ', epoch+1, ' / ', self.epoch)
                ind = 0
                for id in self.nodes['node_id']:  #self.nodes['node_id'] = self.look_back_list
                    self.vectors[id] = self.Embeddings[ind]
                    ind += 1
                #self.eval(vectors=self.vectors)
        print('please note that the final embedding returned and its output file are not the best embedding!')
        print('for the best embeddings, please check which epoch got the best eval metric(s)......')
        print(f'finish all {self.epoch} epochs; take out embs and return')
        Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
        Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
        self.Embeddings = Embeddings_out + Embeddings_in  # element-wise sum of out and in embeddings as the final embedding; try mean or concat? to do...
        ind = 0
        for id in self.nodes['node_id']:  #self.nodes['node_id'] = self.look_back_list
            self.vectors[id] = self.Embeddings[ind]
            ind += 1
        return self.vectors


    def getEmbedding(self, type, nodes):
@@ -224,21 +189,59 @@ class ASNE(BaseEstimator, TransformerMixin):
        '''
        fout = open(filename, 'w')
        node_num = len(self.vectors.keys())
        fout.write("{} {}\n".format(node_num, self.dim))
        fout.write("{} {}\n".format(node_num, self.id_embedding_size + self.attr_embedding_size))
        for node, vec in self.vectors.items():
            fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
        fout.close()
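        # note: the file written above follows the usual word2vec-style text format, e.g. (hypothetical values):
        #   2708 128
        #   node_1 0.12 -0.03 ...   (one line per node: node id followed by its embedding vector)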

    def eval(self, vectors):
        #------nc task
        if self.task == 'nc' or self.task == 'nclp':
            print("Training nc classifier using {:.2f}% node labels...".format(self.nc_ratio*100))
            clf = ncClassifier(vectors=vectors, clf=LogisticRegression())  #use Logistic Regression as clf; we may choose SVM or more advanced ones
            clf.split_train_evaluate(self.X_nc_label, self.Y_nc_label, self.nc_ratio)
        #------lp task
        if self.task == 'lp':
            #X_test, Y_test = read_edge_label(args.label_file)  #enable this if you want to load your own lp testing data, see classify.py
            print("During embedding we have used {:.2f}% links and the remaining are left for lp evaluation...".format(self.lp_ratio*100))
            clf = lpClassifier(vectors=vectors)  #similarity/distance metric as clf; basically, lp is a binary clf problem
            clf.evaluate(self.X_test, self.Y_test)


# ---------------------------------------------- ASNE utils ------------------------------------------------
def format_data_from_OpenANE_to_ASNE(g, dim):
    ''' convert OpenANE data format to ASNE data format '''
    attr_Matrix = g.get_attr_mat(is_sparse=False)
    id_N = attr_Matrix.shape[0]   #n nodes
    attr_M = attr_Matrix.shape[1] #m features

    X = {}
    X['data_id_list'] = []
    X['data_label_list'] = []
    X['data_attr_list'] = []
    edgelist = [edge for edge in g.G.edges]
    print('If an edge only has one direction, double it......')
    cnt = 0
    for edge in edgelist:  #training sample = start node, end node, start node attr
        X['data_id_list'].append(edge[0])
        X['data_label_list'].append(edge[1])
        X['data_attr_list'].append(attr_Matrix[g.look_up_dict[edge[0]]][:])
        cnt += 1
        if (edge[1], edge[0]) not in edgelist:  # double it! as the paper said--------------
            X['data_id_list'].append(edge[1])
            X['data_label_list'].append(edge[0])
            X['data_attr_list'].append(attr_Matrix[g.look_up_dict[edge[1]]][:])
            cnt += 1
    print(f'edges before doubling: {g.get_num_edges()}')
    print(f'edges after doubling: {cnt}')
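    # note: the membership test '(edge[1], edge[0]) not in edgelist' above scans a Python list, i.e. O(E)
    # per edge; for large graphs, building a set once (e.g. edgeset = set(edgelist)) and testing against
    # it would make the doubling pass roughly O(E) overall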

    X['data_id_list'] = np.array(X['data_id_list']).reshape(-1).astype(int)
    X['data_label_list'] = np.array(X['data_label_list']).reshape(-1, 1).astype(int)
    X['data_attr_list'] = np.array(X['data_attr_list']).reshape(cnt, attr_M)

    nodes = {}
    nodes['node_id'] = g.look_back_list
    nodes['node_attr'] = attr_Matrix

    id_embedding_size = int(dim/2)
    attr_embedding_size = int(dim/2)
    print('id_embedding_size', id_embedding_size, '\nattr_embedding_size', attr_embedding_size)
    return X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size


def add_layer(inputs, in_size, out_size, activation_function=None):
    # add one more layer and return the output of this layer
    Weights = tf.Variable(tf.random_uniform([in_size, out_size], -1.0, 1.0))  #init as paper stated
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b)
    return outputs
src/main.py
@@ -194,14 +194,8 @@ def main(args):
        model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
    elif args.method == 'sagegcn':  #parameters for graphsage models are in 'graphsage' -> '__init__.py'
        model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)

    elif args.method == 'asne':
        if args.task == 'nc':
            model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size,
                              X_test=None, Y_test=None, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file)
        else:
            model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size,
                              X_test=test_node_pairs, Y_test=test_edge_labels, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file)
        model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
    else:
        print('method not found...')
        exit(0)
@@ -215,9 +209,9 @@ def main(args):
    '''
    #to do.... semi-supervised methods: gcn, graphsage, etc...
    if args.method == 'gcn':  #semi-supervised gcn
        assert args.label_file != ''    #must have node labels
        assert args.feature_file != ''  #different from previous ANE methods
        g.read_node_label(args.label_file)  #gcn is an end-to-end supervised ANE method
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
        model = gcnAPI.GCN(graph=g, dropout=args.dropout, weight_decay=args.weight_decay, hidden1=args.hidden, epochs=args.epochs, clf_ratio=args.label_reserved)
        print('semi-supervised method, no embs, exit the program...')  #semi-supervised gcn does not produce embs
        exit(0)