asne_checked_v0.0

This commit is contained in:
Chengbin Hou 2018-11-22 22:49:28 +00:00
parent 7e1039cc3c
commit 2c52ce69e6
2 changed files with 132 additions and 135 deletions


@@ -1,86 +1,39 @@
# -*- coding: utf-8 -*-
'''
Tensorflow implementation of Social Network Embedding framework (SNE)
@author: Lizi Liao (liaolizi.llz@gmail.com)
ANE method: Attributed Social Network Embedding (ASNE)
modified by Chengbin Hou 2018:
1) convert OpenANE data format to ASNE data format
2) compatible with the latest tensorflow 1.10.0
3) add early stopping
4) as the ASNE paper stated, we add two hidden layers with softsign activation func
part of the code was originally forked from https://github.com/lizi-git/ASNE
'''
import math
import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
import time

#from .classify import ncClassifier, lpClassifier, read_node_label
#from sklearn.linear_model import LogisticRegression

class ASNE(BaseEstimator, TransformerMixin):
def __init__(self, graph, dim, alpha=1.0, learning_rate=0.0001, batch_size=128, epoch=20, n_neg_samples=10,
early_stopping=2000): #it seems that slight overfitting can give better results? try other early_stopping values... to do...
# bind params to class
t1 = time.time()
X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size = format_data_from_OpenANE_to_ASNE(g=graph, dim=dim)
t2 = time.time()
print(f'transform data format from OpenANE to ASNE; time cost: {(t2-t1):.2f}s')
self.node_N = id_N #n
self.attr_M = attr_M #m
self.X_train = X #{'data_id_list': [], 'data_label_list': [], 'data_attr_list': []}
self.nodes = nodes #{'node_id': [], 'node_attr': []}
self.id_embedding_size = id_embedding_size # set to dim/2
self.attr_embedding_size = attr_embedding_size # set to dim/2
self.vectors = {} #final embs
self.look_back_list = graph.look_back_list #from OpenANE data structure
self.alpha = alpha #set to 1.0 by default
@@ -88,19 +41,11 @@ class ASNE(BaseEstimator, TransformerMixin):
self.batch_size = batch_size #set to 128 by default
self.learning_rate = learning_rate
self.epoch = epoch #set to 20 by default
self._init_graph() #init all variables in a tensorflow graph
self.early_stopping = early_stopping #early stopping if the training loss keeps increasing for xx iterations
self.train()
def _init_graph(self):
'''
@@ -110,7 +55,7 @@
#with self.graph.as_default(), tf.device('/gpu:0'):
with self.graph.as_default():
# Set graph level random seed
#tf.set_random_seed(2018)
# Input data.
self.train_data_id = tf.placeholder(tf.int32, shape=[None]) # batch_size * 1
self.train_data_attr = tf.placeholder(tf.float32, shape=[None, self.attr_M]) # batch_size * attr_M
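# a minimal sketch (not part of this commit) of how a batch is expected to feed
# these placeholders; partial_fit() lies outside this hunk, so names are illustrative:
#   feed = {self.train_data_id:   batch_xs['batch_data_id'],    # shape (batch,)
#           self.train_data_attr: batch_xs['batch_data_attr'],  # shape (batch, attr_M)
#           self.train_labels:    batch_xs['batch_data_label']} # shape (batch, 1)
#   _, train_loss = sess.run([self.optimizer, self.loss], feed_dict=feed)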
@@ -126,25 +71,26 @@
self.attr_embed = tf.matmul(self.train_data_attr, self.weights['attr_embeddings']) # batch_size * attr_dim
self.embed_layer = tf.concat([self.id_embed, self.alpha * self.attr_embed], 1) # batch_size * (id_dim + attr_dim) #an error due to old tf!
'''
## can add hidden_layers component here!----------------------------------
#0) no hidden layer
#1) 128
#2) 256+128 ##--------the paper stated it used two hidden layers with softsign
#3) 512+256+128
len_h1_in = self.id_embedding_size + self.attr_embedding_size
len_h1_out = 256 #or self.id_embedding_size + self.attr_embedding_size # if only h1 is added
len_h2_in = len_h1_out
len_h2_out = self.id_embedding_size + self.attr_embedding_size
self.h1 = add_layer(inputs=self.embed_layer, in_size=len_h1_in, out_size=len_h1_out, activation_function=tf.nn.softsign)
self.h2 = add_layer(inputs=self.h1, in_size=len_h2_in, out_size=len_h2_out, activation_function=tf.nn.softsign)
## -------------------------------------------------------------------------
'''
# Compute the loss, using a sample of the negative labels each time.
self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=self.weights['out_embeddings'], biases=self.weights['biases'], #if one needs to change layers
inputs=self.embed_layer, labels=self.train_labels, num_sampled=self.n_neg_samples, num_classes=self.node_N)) #try inputs = self.embed_layer, self.h1, self.h2, ...
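# a hedged sketch (an assumption, not the committed wiring) of re-enabling the two
# softsign hidden layers quoted above: uncomment the h1/h2 block and point the
# sampled softmax at the last hidden layer instead of the raw concatenation, e.g.
#   self.h1 = add_layer(self.embed_layer, len_h1_in, len_h1_out, tf.nn.softsign)
#   self.h2 = add_layer(self.h1, len_h2_in, len_h2_out, tf.nn.softsign)
#   self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=self.weights['out_embeddings'],
#       biases=self.weights['biases'], inputs=self.h2, labels=self.train_labels,
#       num_sampled=self.n_neg_samples, num_classes=self.node_N))
# since len_h2_out == id_embedding_size + attr_embedding_size, the out_embeddings
# weights keep a compatible shape either way.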
# Optimizer.
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)
# print("AdamOptimizer")
# init
@@ -171,14 +117,18 @@
start_index = np.random.randint(0, len(data) - batch_size)
return data[start_index:(start_index + batch_size)]
def train(self):
self.Embeddings = []
total_batch = int(len(self.X_train['data_id_list']) / self.batch_size)
iter_count = 0
train_loss_best = 0
train_loss_keep_increasing = 0
early_stopping = self.early_stopping #early stopping if the training loss keeps increasing
for epoch in range(self.epoch):
t1 = time.time()
for i in range(total_batch):
# generate a batch data
batch_xs = {}
@@ -188,25 +138,40 @@
batch_xs['batch_data_label'] = self.X_train['data_label_list'][start_index:(start_index + self.batch_size)]
# Fit training using batch data
train_loss = self.partial_fit(batch_xs)
iter_count += 1
if iter_count == 1:
train_loss_best = train_loss
else:
if train_loss_best > train_loss: # training loss decreasing
train_loss_best = train_loss
train_loss_keep_increasing = 0 # reset
else: # training loss increasing
train_loss_keep_increasing += 1
if train_loss_keep_increasing > early_stopping: # early stopping
print(f'early stopping @ iter {iter_count}; take out embs and return')
Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
self.Embeddings = Embeddings_out + Embeddings_in # element-wise sum of in/out embeddings as the final embedding (their mean up to a factor of 2); try concat? to do...
for ind, id in enumerate(self.nodes['node_id']): #self.nodes['node_id'] = self.look_back_list
self.vectors[id] = self.Embeddings[ind]
return self.vectors
t2 = time.time()
print(f'epoch @ {epoch+1}/{self.epoch}; time cost: {(t2-t1):.2f}s')
print(f'finish all {self.epoch} epochs; take out embs and return')
Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
self.Embeddings = Embeddings_out + Embeddings_in # element-wise sum of in/out embeddings as the final embedding; try concat? to do...
for ind, id in enumerate(self.nodes['node_id']): #self.nodes['node_id'] = self.look_back_list
self.vectors[id] = self.Embeddings[ind]
return self.vectors
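# a hypothetical alternative (not in this commit) to summing the in/out embeddings:
# concatenate them instead, which doubles the embedding dimension, e.g.
#   self.Embeddings = np.concatenate([Embeddings_in, Embeddings_out], axis=1)
# save_embeddings() would then have to write the doubled dimension in its header.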
def getEmbedding(self, type, nodes):
@@ -224,21 +189,59 @@
'''
fout = open(filename, 'w')
node_num = len(self.vectors)
fout.write("{} {}\n".format(node_num, self.id_embedding_size + self.attr_embedding_size))
for node, vec in self.vectors.items():
fout.write("{} {}\n".format(node,' '.join([str(x) for x in vec])))
fout.close()
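# the saved file follows the word2vec text format; e.g. for 3 nodes and dim 4
# (illustrative values only):
#   3 4
#   node_a 0.12 -0.30 0.55 0.01
#   node_b ...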
# ---------------------------------------------- ASNE utils ------------------------------------------------
def format_data_from_OpenANE_to_ASNE(g, dim):
''' convert OpenANE data format to ASNE data format '''
attr_Matrix = g.get_attr_mat(is_sparse=False)
id_N = attr_Matrix.shape[0] #n nodes
attr_M = attr_Matrix.shape[1] #m features
X = {}
X['data_id_list'] = []
X['data_label_list'] = []
X['data_attr_list'] = []
edgelist = [edge for edge in g.G.edges]
edge_set = set(edgelist) #set membership test, avoids O(E^2) lookups in the list
print('If an edge has only one direction, double it......')
cnt = 0
for edge in edgelist: #training sample = (start node, end node, start node attr)
X['data_id_list'].append(edge[0])
X['data_label_list'].append(edge[1])
X['data_attr_list'].append(attr_Matrix[ g.look_up_dict[edge[0]] ][:])
cnt += 1
if (edge[1], edge[0]) not in edge_set: # double it! as the paper said--------------
X['data_id_list'].append(edge[1])
X['data_label_list'].append(edge[0])
X['data_attr_list'].append(attr_Matrix[ g.look_up_dict[edge[1]] ][:])
cnt += 1
print(f'edges before doubling: {g.get_num_edges()}')
print(f'edges after doubling: {cnt}')
X['data_id_list'] = np.array(X['data_id_list']).reshape(-1).astype(int)
X['data_label_list'] = np.array(X['data_label_list']).reshape(-1,1).astype(int)
X['data_attr_list'] = np.array(X['data_attr_list']).reshape(cnt,attr_M)
nodes={}
nodes['node_id'] = g.look_back_list
nodes['node_attr'] = attr_Matrix
id_embedding_size = int(dim/2)
attr_embedding_size = int(dim/2)
print('id_embedding_size', id_embedding_size, '\nattr_embedding_size', attr_embedding_size)
return X, nodes, id_N, attr_M, id_embedding_size, attr_embedding_size
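# shapes produced above, assuming cnt edges after doubling:
#   X['data_id_list']    -> (cnt,)        int start-node indices
#   X['data_label_list'] -> (cnt, 1)      int end-node indices
#   X['data_attr_list']  -> (cnt, attr_M) start-node attributes
#   nodes['node_id']     -> n node ids;   nodes['node_attr'] -> n * attr_M matrix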
def add_layer(inputs, in_size, out_size, activation_function=None):
# add one more layer and return the output of this layer
Weights = tf.Variable(tf.random_uniform([in_size, out_size], -1.0, 1.0)) #init as paper stated
biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
Wx_plus_b = tf.matmul(inputs, Weights) + biases
if activation_function is None:
outputs = Wx_plus_b
else:
outputs = activation_function(Wx_plus_b)
return outputs
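# minimal usage sketch (hypothetical; assumes an OpenANE-style graph object `g`
# exposing get_attr_mat / look_up_dict / look_back_list / G as used above):
#   model = ASNE(graph=g, dim=128, alpha=1.0, learning_rate=1e-4, batch_size=128,
#                epoch=20, n_neg_samples=10, early_stopping=2000) #training runs inside __init__
#   model.save_embeddings('asne_embs.txt')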


@@ -194,14 +194,8 @@ def main(args):
model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
elif args.method == 'sagegcn': #parameters for graphsage models are in 'graphsage' -> '__init__.py'
model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)
elif args.method == 'asne':
model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
else:
print('method not found...')
exit(0)
@@ -215,9 +209,9 @@ def main(args):
'''
#to do.... semi-supervised methods: gcn, graphsage, etc...
if args.method == 'gcn': #semi-supervised gcn
assert args.label_file != '' #must have node labels
assert args.feature_file != '' #different from previous ANE methods
g.read_node_label(args.label_file) #gcn is an end-to-end supervised ANE method
model = gcnAPI.GCN(graph=g, dropout=args.dropout, weight_decay=args.weight_decay, hidden1=args.hidden, epochs=args.epochs, clf_ratio=args.label_reserved)
print('semi-supervised method, no embs, exit the program...') #semi-supervised gcn does not produce embs
exit(0)