asne_checked_v0.0

Chengbin Hou 2018-11-22 22:49:28 +00:00
parent 7e1039cc3c
commit 2c52ce69e6
2 changed files with 132 additions and 135 deletions


@@ -1,86 +1,39 @@
# -*- coding: utf-8 -*-
'''
ANE method: Attributed Social Network Embedding (ASNE)

modified by Chengbin Hou 2018
1) convert OpenANE data format to ASNE data format
2) compatible with latest tensorflow 1.10.0
3) add early stopping
4) as ASNE paper stated, we add two hidden layers with softsign activation func

part of code was originally forked from https://github.com/lizi-git/ASNE
'''

import math
import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
import time
#from .classify import ncClassifier, lpClassifier, read_node_label
#from sklearn.linear_model import LogisticRegression
class ASNE(BaseEstimator, TransformerMixin):
    def __init__(self, graph, dim, alpha=1.0, learning_rate=0.0001, batch_size=128, epoch=20, n_neg_samples=10,
                 early_stopping=2000):  #it seems that overfitting can give a better result? try other early_stopping values... to do...
        # bind params to class
        t1 = time.time()
        X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size = format_data_from_OpenANE_to_ASNE(g=graph, dim=dim)
        t2 = time.time()
        print(f'transform data format from OpenANE to ASNE; time cost: {(t2-t1):.2f}s')

        self.node_N = id_N                               #n
        self.attr_M = attr_M                             #m
        self.X_train = X                                 #{'data_id_list': [], 'data_label_list': [], 'data_attr_list': []}
        self.nodes = nodes                               #{'node_id': [], 'node_attr': []}
        self.id_embedding_size = id_embedding_size       #set to dim/2
        self.attr_embedding_size = attr_embedding_size   #set to dim/2
        self.vectors = {}                                #final embs
        self.look_back_list = graph.look_back_list       #from OpenANE data structure
        self.alpha = alpha                               #set to 1.0 by default
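        # Editor's note -- a hedged illustration, not part of this commit: X_train holds one
        # training triple per directed edge, aligned by index; e.g. for edges (0,1) and (1,2):
        #   X_train['data_id_list']    -> [0, 1]        # start nodes
        #   X_train['data_label_list'] -> [[1], [2]]    # end nodes, shape (edge_num, 1)
        #   X_train['data_attr_list']  -> attribute rows of nodes 0 and 1, shape (edge_num, attr_M)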
@@ -88,19 +41,11 @@ class ASNE(BaseEstimator, TransformerMixin):
        self.batch_size = batch_size    #set to 128 by default
        self.learning_rate = learning_rate
        self.epoch = epoch              #set to 20 by default

        self._init_graph()              #init all variables in a tensorflow graph
        self.early_stopping = early_stopping    #early stopping if training loss increased for xx iterations
        self.train()

    def _init_graph(self):
        '''
@@ -110,7 +55,7 @@ class ASNE(BaseEstimator, TransformerMixin):
        #with self.graph.as_default(), tf.device('/gpu:0'):
        with self.graph.as_default():
            # Set graph level random seed
            #tf.set_random_seed(2018)
            # Input data.
            self.train_data_id = tf.placeholder(tf.int32, shape=[None])                    # batch_size * 1
            self.train_data_attr = tf.placeholder(tf.float32, shape=[None, self.attr_M])   # batch_size * attr_M
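            # Editor's note -- a hedged sketch, not part of this commit: these placeholders are fed
            # per mini-batch inside partial_fit (not shown in this hunk), roughly as
            #   feed_dict = {self.train_data_id: batch_xs['batch_data_id'],
            #                self.train_data_attr: batch_xs['batch_data_attr'],
            #                self.train_labels: batch_xs['batch_data_label']}
            # where the 'batch_data_id' / 'batch_data_attr' keys are assumed by analogy with the
            # 'batch_data_label' key used in train() below.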
@@ -126,25 +71,26 @@ class ASNE(BaseEstimator, TransformerMixin):
            self.attr_embed = tf.matmul(self.train_data_attr, self.weights['attr_embeddings'])   # batch_size * attr_dim
            self.embed_layer = tf.concat([self.id_embed, self.alpha * self.attr_embed], 1)       # batch_size * (id_dim + attr_dim) #an error due to old tf!
            '''
            ## can add hidden_layers component here!----------------------------------
            #0) no hidden layer
            #1) 128
            #2) 256+128 ##--------paper stated it used two hidden layers with softsign
            #3) 512+256+128
            len_h1_in = self.id_embedding_size + self.attr_embedding_size
            len_h1_out = 256    #or self.id_embedding_size + self.attr_embedding_size # if only add h1
            len_h2_in = len_h1_out
            len_h2_out = self.id_embedding_size + self.attr_embedding_size
            self.h1 = add_layer(inputs=self.embed_layer, in_size=len_h1_in, out_size=len_h1_out, activation_function=tf.nn.softsign)
            self.h2 = add_layer(inputs=self.h1, in_size=len_h2_in, out_size=len_h2_out, activation_function=tf.nn.softsign)
            ## -------------------------------------------------------------------------
            '''

            # Compute the loss, using a sample of the negative labels each time.
            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=self.weights['out_embeddings'], biases=self.weights['biases'],   #if one needs to change layers
                                       inputs=self.embed_layer, labels=self.train_labels, num_sampled=self.n_neg_samples, num_classes=self.node_N))   #try inputs = self.embed_layer or self.h1 or self.h2 or ...
            # Optimizer.
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)
            # print("AdamOptimizer")

            # init
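            # Editor's note -- a hedged sketch, not part of this commit: sampled_softmax_loss is a
            # training-time approximation; the exact (full) softmax loss could be monitored with
            # something like
            #   logits = tf.matmul(self.embed_layer, tf.transpose(self.weights['out_embeddings'])) + self.weights['biases']
            #   full_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            #                   labels=tf.squeeze(self.train_labels, axis=1), logits=logits))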
@@ -171,14 +117,18 @@ class ASNE(BaseEstimator, TransformerMixin):
        start_index = np.random.randint(0, len(data) - batch_size)
        return data[start_index:(start_index + batch_size)]

    def train(self):
        self.Embeddings = []
        total_batch = int(len(self.X_train['data_id_list']) / self.batch_size)
        iter_count = 0
        train_loss_best = 0
        train_loss_keep_increasing = 0
        early_stopping = self.early_stopping    #early stopping if training loss increased
        for epoch in range(self.epoch):
            t1 = time.time()
            for i in range(total_batch):
                # generate a batch data
                batch_xs = {}
@@ -188,25 +138,40 @@ class ASNE(BaseEstimator, TransformerMixin):
                batch_xs['batch_data_label'] = self.X_train['data_label_list'][start_index:(start_index + self.batch_size)]

                # Fit training using batch data
                train_loss = self.partial_fit(batch_xs)
                iter_count += 1
                if iter_count == 1:
                    train_loss_best = train_loss
                else:
                    if train_loss_best > train_loss:    # training loss decreasing
                        train_loss_best = train_loss
                        train_loss_keep_increasing = 0  # reset
                    else:                               # training loss increasing
                        train_loss_keep_increasing += 1
                        if train_loss_keep_increasing > early_stopping:   # early stopping
                            print(f'early stopping @ iter {iter_count}; take out embs and return')
                            Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
                            Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
                            self.Embeddings = Embeddings_out + Embeddings_in   # simply mean them and as final embedding; try concat? to do...
                            ind = 0
                            for id in self.nodes['node_id']:    #self.nodes['node_id'] = self.look_back_list
                                self.vectors[id] = self.Embeddings[ind]
                                ind += 1
                            return self.vectors
                        else:
                            pass
            t2 = time.time()
            print(f'epoch @ {epoch+1}/{self.epoch}; time cost: {(t2-t1):.2f}s')

        print(f'finish all {self.epoch} epochs; take out embs and return')
        Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
        Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
        self.Embeddings = Embeddings_out + Embeddings_in   # simply mean them and as final embedding; try concat? to do...
        ind = 0
        for id in self.nodes['node_id']:    #self.nodes['node_id'] = self.look_back_list
            self.vectors[id] = self.Embeddings[ind]
            ind += 1
        return self.vectors

    def getEmbedding(self, type, nodes):
@@ -224,21 +189,59 @@ class ASNE(BaseEstimator, TransformerMixin):
        '''
        fout = open(filename, 'w')
        node_num = len(self.vectors.keys())
        fout.write("{} {}\n".format(node_num, self.id_embedding_size + self.attr_embedding_size))
        for node, vec in self.vectors.items():
            fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
        fout.close()
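
# Editor's note -- a hedged helper sketch, not part of this commit: it only mirrors the
# whitespace-separated format written above ("<node_num> <dim>" header, then one
# "<node_id> <vec...>" line per node), in case the file needs to be read back.
def load_embeddings(filename):
    vectors = {}
    with open(filename) as fin:
        node_num, dim = fin.readline().split()    # header line
        for line in fin:
            parts = line.strip().split()
            if not parts:
                continue
            vectors[parts[0]] = [float(x) for x in parts[1:]]
    return vectors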
# ---------------------------------------------- ASNE utils ------------------------------------------------
def format_data_from_OpenANE_to_ASNE(g, dim):
    ''' convert OpenANE data format to ASNE data format '''
    attr_Matrix = g.get_attr_mat(is_sparse=False)
    id_N = attr_Matrix.shape[0]     #n nodes
    attr_M = attr_Matrix.shape[1]   #m features

    X = {}
    X['data_id_list'] = []
    X['data_label_list'] = []
    X['data_attr_list'] = []
    edgelist = [edge for edge in g.G.edges]
    print('If an edge only has one direction, double it......')
    cnt = 0
    for edge in edgelist:   #training sample = start node, end node, start node attr
        X['data_id_list'].append(edge[0])
        X['data_label_list'].append(edge[1])
        X['data_attr_list'].append(attr_Matrix[ g.look_up_dict[edge[0]] ][:])
        cnt += 1
        if (edge[1], edge[0]) not in edgelist:   # double! as paper said--------------
            X['data_id_list'].append(edge[1])
            X['data_label_list'].append(edge[0])
            X['data_attr_list'].append(attr_Matrix[ g.look_up_dict[edge[1]] ][:])
            cnt += 1
    print(f'edges before doubling: {g.get_num_edges()}')
    print(f'edges after doubling: {cnt}')

    X['data_id_list'] = np.array(X['data_id_list']).reshape(-1).astype(int)
    X['data_label_list'] = np.array(X['data_label_list']).reshape(-1, 1).astype(int)
    X['data_attr_list'] = np.array(X['data_attr_list']).reshape(cnt, attr_M)

    nodes = {}
    nodes['node_id'] = g.look_back_list
    nodes['node_attr'] = attr_Matrix

    id_embedding_size = int(dim/2)
    attr_embedding_size = int(dim/2)
    print('id_embedding_size', id_embedding_size, '\nattr_embedding_size', attr_embedding_size)
    return X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size
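# Editor's note -- a hedged optimization sketch, not part of this commit: the reverse-edge check
# `(edge[1], edge[0]) not in edgelist` scans a Python list, so the doubling loop is O(E^2) overall;
# building a set first keeps the same behaviour with O(1) membership tests, e.g.
#   edgeset = set(edgelist)
#   if (edge[1], edge[0]) not in edgeset:
#       ... (append the reversed training triple as above)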
def add_layer(inputs, in_size, out_size, activation_function=None):
    # add one more layer and return the output of this layer
    Weights = tf.Variable(tf.random_uniform([in_size, out_size], -1.0, 1.0))   #init as paper stated
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b)
    return outputs
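For orientation, a minimal usage sketch of the class above. Assumptions, not part of this commit: the OpenANE graph object `g` is built elsewhere in the repo, and the embedding-saving method whose body appears in the last hunk is named `save_embeddings(filename)`, following the usual OpenNE/OpenANE convention.

# hedged usage sketch; `g` is an OpenANE graph object prepared elsewhere (hypothetical setup)
model = ASNE(graph=g, dim=128, alpha=1.0, learning_rate=0.0001, batch_size=128,
             epoch=20, n_neg_samples=10, early_stopping=2000)   # training runs inside __init__
print(len(model.vectors))                 # {node_id: embedding} dict built by train()
model.save_embeddings('asne_emb.txt')     # assumed method name; its body is shown in the hunk above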


@@ -194,14 +194,8 @@ def main(args):
        model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
    elif args.method == 'sagegcn':   #parameters for graphsage models are in 'graphsage' -> '__init__.py'
        model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)
    elif args.method == 'asne':
        model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
    else:
        print('method not found...')
        exit(0)
@@ -215,9 +209,9 @@ def main(args):
    '''
    #to do.... semi-supervised methods: gcn, graphsage, etc...
    if args.method == 'gcn':   #semi-supervised gcn
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
        model = gcnAPI.GCN(graph=g, dropout=args.dropout, weight_decay=args.weight_decay, hidden1=args.hidden, epochs=args.epochs, clf_ratio=args.label_reserved)
        print('semi-supervised method, no embs, exit the program...')   #semi-supervised gcn does not produce embs
        exit(0)