From 11a13922f0b2002e5127e355a4614bb90d42f38c Mon Sep 17 00:00:00 2001 From: Dongzy Date: Sun, 18 Nov 2018 11:46:50 +0800 Subject: [PATCH 01/13] Integrate graph.py API Finished abrw, deepwalk, aane, attrpure, attrcomb and part of tadw and asne --- src/libnrl/aane.py | 4 ++-- src/libnrl/attrcomb.py | 13 ++++++++----- src/libnrl/attrpure.py | 10 +++++++--- src/libnrl/graphsage/graphsageAPI.py | 2 +- src/libnrl/grarep.py | 2 +- src/libnrl/tadw.py | 12 ++++++++---- src/libnrl/utils.py | 5 ++++- src/main.py | 2 +- 8 files changed, 32 insertions(+), 18 deletions(-) diff --git a/src/libnrl/aane.py b/src/libnrl/aane.py index d375788..ad50a4f 100644 --- a/src/libnrl/aane.py +++ b/src/libnrl/aane.py @@ -36,8 +36,8 @@ class AANE: self.look_back_list = graph.look_back_list #look back node id for A and X if mode == 'comb': print('==============AANE-comb mode: jointly learn emb from both structure and attribute info========') - Net = sparse.csr_matrix(graph.getA()) - Attri = sparse.csr_matrix(graph.getX()) + Net = sparse.csr_matrix(graph.get_adj_mat()) + Attri = sparse.csr_matrix(graph.get_attr_mat()) elif mode == 'pure': print('======================AANE-pure mode: learn emb from structure info purely====================') Net = graph.getA() diff --git a/src/libnrl/attrcomb.py b/src/libnrl/attrcomb.py index a52bfe8..de1e195 100644 --- a/src/libnrl/attrcomb.py +++ b/src/libnrl/attrcomb.py @@ -1,8 +1,12 @@ # -*- coding: utf-8 -*- -import numpy as np import time + import networkx as nx -from . import node2vec, line, grarep +import numpy as np + +from . import grarep, line, node2vec +from .utils import dim_reduction + ''' #----------------------------------------------------------------------------- @@ -58,8 +62,8 @@ class ATTRCOMB(object): def train_attr(self, dim): - X = self.g.getX() - X_compressed = self.g.preprocessAttrInfo(X=X, dim=dim, method='svd') #svd or pca for dim reduction + X = self.g.get_attr_mat() + X_compressed = dim_reduction(X, dim=dim, method='svd') #svd or pca for dim reduction print('X_compressed shape: ', X_compressed.shape) return np.array(X_compressed) #n*dim matrix, each row corresponding to node ID stored in graph.look_back_list @@ -93,4 +97,3 @@ class ATTRCOMB(object): fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) fout.close() - diff --git a/src/libnrl/attrpure.py b/src/libnrl/attrpure.py index ab3e234..63a89af 100644 --- a/src/libnrl/attrpure.py +++ b/src/libnrl/attrpure.py @@ -1,7 +1,11 @@ # -*- coding: utf-8 -*- -import numpy as np import time + import networkx as nx +import numpy as np + +from .utils import dim_reduction + ''' #----------------------------------------------------------------------------- @@ -23,8 +27,8 @@ class ATTRPURE(object): self.vectors[key] = embeddings[ind] def train(self): - X = self.g.getX() - X_compressed = self.g.preprocessAttrInfo(X=X, dim=self.dim, method='svd') #svd or pca for dim reduction + X = self.g.get_attr_mat() + X_compressed = dim_reduction(X, dim=self.dim, method='svd') #svd or pca for dim reduction return X_compressed #n*dim matrix, each row corresponding to node ID stored in graph.look_back_list diff --git a/src/libnrl/graphsage/graphsageAPI.py b/src/libnrl/graphsage/graphsageAPI.py index 9c0ca23..c40e127 100644 --- a/src/libnrl/graphsage/graphsageAPI.py +++ b/src/libnrl/graphsage/graphsageAPI.py @@ -75,7 +75,7 @@ def tranform_data_for_graphsage(graph): #conversion = lambda n : int(n) # compatible with networkx >2.0 #id_map = {conversion(k):int(v) for k,v in id_map.items()} # due to graphSAGE requirement - feats 
= np.array([G.nodes[id]['feature'] for id in id_map.keys()]) + feats = np.array([G.nodes[id]['attr'] for id in id_map.keys()]) normalize = True #declared in __init__.py if normalize and not feats is None: print("-------------row norm of node attributes/features------------------") diff --git a/src/libnrl/grarep.py index ebcb56e..db3f7f3 100644 --- a/src/libnrl/grarep.py +++ b/src/libnrl/grarep.py @@ -14,7 +14,7 @@ class GraRep(object): def getAdjMat(self): graph = self.g.G - node_size = self.g.node_size + node_size = self.g.get_num_nodes() look_up = self.g.look_up_dict adj = np.zeros((node_size, node_size)) for edge in self.g.G.edges(): diff --git a/src/libnrl/tadw.py index 276eff4..11ddd75 100644 --- a/src/libnrl/tadw.py +++ b/src/libnrl/tadw.py @@ -1,10 +1,15 @@ # -*- coding: utf-8 -*- from __future__ import print_function + import math + import numpy as np from numpy import linalg as la from sklearn.preprocessing import normalize + from .gcn.utils import * +from .utils import row_as_probdist + ''' #----------------------------------------------------------------------------- @@ -34,14 +39,14 @@ class TADW(object): # ScaleSimMat return adj/np.sum(adj, axis=1) #orignal way may get numerical error sometimes... ''' - A = self.g.getA() - return self.g.rowAsPDF(A) + A = self.g.get_adj_mat() + return row_as_probdist(A) def getT(self): #changed with the same data preprocessing method g = self.g.G look_back = self.g.look_back_list - self.features = np.vstack([g.nodes[look_back[i]]['feature'] + self.features = np.vstack([g.nodes[look_back[i]]['attr'] for i in range(g.number_of_nodes())]) self.preprocessFeature() #call the orig data preprocessing method return self.features.T @@ -125,4 +130,3 @@ class TADW(object): look_back = self.g.look_back_list for i, embedding in enumerate(self.Vecs): self.vectors[look_back[i]] = embedding - \ No newline at end of file diff --git a/src/libnrl/utils.py index 72db1c8..57352b3 100644 --- a/src/libnrl/utils.py +++ b/src/libnrl/utils.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- +import time + import numpy as np from scipy import sparse + # from sklearn.model_selection import train_test_split @@ -257,4 +260,4 @@ def sparse_to_dense(): def dense_to_sparse(): ''' to sparse csr format; to be filled in, remember to use dtype float64''' import scipy.sparse as sp - pass \ No newline at end of file + pass diff --git a/src/main.py index ad7678a..aa3270a 100644 --- a/src/main.py +++ b/src/main.py @@ -177,7 +177,7 @@ def main(args): X_test=None, Y_test=None, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) else: model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, - X_test=X_test_lp, Y_test=Y_test_lp, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) + X_test=test_node_pairs, Y_test=test_edge_labels, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) elif args.method == 'aane': model = aane.AANE(graph=g, dim=args.dim, lambd=args.AANE_lamb, mode=args.AANE_mode) elif args.method == 'tadw': model = tadw.TADW(graph=g, dim=args.dim, lamb=args.TADW_lamb) From 45f4600f984d5eb89d51fb7bdb096b1dfcad292b Mon Sep 17 00:00:00 2001 From: Chengbin Hou Date: Sun, 18 Nov 2018 20:09:40 +0000 Subject: [PATCH 02/13] aane_checked_v0.0 --- src/libnrl/aane.py | 61 +++++++++++++++++++++++++--------------------- src/main.py | 19 ++++++++------- 2 files
changed, 43 insertions(+), 37 deletions(-) diff --git a/src/libnrl/aane.py b/src/libnrl/aane.py index ad50a4f..c91755e 100644 --- a/src/libnrl/aane.py +++ b/src/libnrl/aane.py @@ -1,17 +1,17 @@ -# -*- coding: utf-8 -*- +""" +ANE method: Accelerated Attributed Network Embedding (AANE) + +modified by Chengbin Hou 2018 + +originally from https://github.com/xhuang31/AANE_Python +""" + import numpy as np from scipy import sparse from scipy.sparse import csc_matrix from scipy.sparse.linalg import svds from math import ceil -''' -#----------------------------------------------------------------------------- -# modified by Chengbin Hou 2018 -# part of code was originally forked from https://github.com/xhuang31/AANE_Python -#----------------------------------------------------------------------------- -''' - class AANE: """Jointly embed Net and Attri into embedding representation H H = AANE(Net,Attri,d).function() @@ -31,36 +31,36 @@ class AANE: Copyright 2017 & 2018, Xiao Huang and Jundong Li. $Revision: 1.0.2 $ $Date: 2018/02/19 00:00:00 $ """ - def __init__(self, graph, dim=100, lambd=0.05, rho=5, mode='comb', *varargs): #paper said lambd should not too large; suggest [0, 0.1]; lambd=0 -> attrpure - self.d = dim - self.look_back_list = graph.look_back_list #look back node id for A and X + def __init__(self, graph, dim, lambd=0.05, rho=5, maxiter=5, mode='comb', *varargs): + self.dim = dim + self.look_back_list = graph.look_back_list #look back node id for Net and Attr + self.lambd = lambd # Initial regularization parameter + self.rho = rho # Initial penalty parameter + self.maxiter = maxiter # Max num of iteration + splitnum = 1 # number of pieces we split the SA for limited cache if mode == 'comb': print('==============AANE-comb mode: jointly learn emb from both structure and attribute info========') - Net = sparse.csr_matrix(graph.get_adj_mat()) - Attri = sparse.csr_matrix(graph.get_attr_mat()) + Net = graph.get_adj_mat() + Attri = graph.get_attr_mat() elif mode == 'pure': - print('======================AANE-pure mode: learn emb from structure info purely====================') - Net = graph.getA() + print('======================AANE-pure mode: learn emb purely from structure info====================') + Net = graph.get_adj_mat() Attri = Net else: exit(0) - self.maxiter = 2 # Max num of iteration [self.n, m] = Attri.shape # n = Total num of nodes, m = attribute category num Net = sparse.lil_matrix(Net) Net.setdiag(np.zeros(self.n)) Net = csc_matrix(Net) Attri = csc_matrix(Attri) - self.lambd = 0.05 # Initial regularization parameter - self.rho = 5 # Initial penalty parameter - splitnum = 1 # number of pieces we split the SA for limited cache if len(varargs) >= 4 and varargs[3] == 'Att': sumcol = np.arange(m) np.random.shuffle(sumcol) - self.H = svds(Attri[:, sumcol[0:min(10 * d, m)]], d)[0] + self.H = svds(Attri[:, sumcol[0:min(10 * self.dim, m)]], self.dim)[0] else: sumcol = Net.sum(0) - self.H = svds(Net[:, sorted(range(self.n), key=lambda k: sumcol[0, k], reverse=True)[0:min(10 * self.d, self.n)]], self.d)[0] + self.H = svds(Net[:, sorted(range(self.n), key=lambda k: sumcol[0, k], reverse=True)[0:min(10 * self.dim, self.n)]], self.dim)[0] if len(varargs) > 0: self.lambd = varargs[0] @@ -75,17 +75,17 @@ class AANE: self.Attri = Attri.transpose() * sparse.diags(np.ravel(np.power(Attri.power(2).sum(1), -0.5))) self.Z = self.H.copy() self.affi = -1 # Index for affinity matrix sa - self.U = np.zeros((self.n, self.d)) + self.U = np.zeros((self.n, self.dim)) self.nexidx = np.split(Net.indices, 
Net.indptr[1:-1]) self.Net = np.split(Net.data, Net.indptr[1:-1]) self.vectors = {} - self.function() #run aane + self.function() #run aane---------------------------- '''################# Update functions #################''' def updateH(self): - xtx = np.dot(self.Z.transpose(), self.Z) * 2 + self.rho * np.eye(self.d) + xtx = np.dot(self.Z.transpose(), self.Z) * 2 + self.rho * np.eye(self.dim) for blocki in range(self.splitnum): # Split nodes into different Blocks indexblock = self.block * blocki # Index for splitting blocks if self.affi != blocki: @@ -99,14 +99,14 @@ class AANE: nzidx = normi_j != 0 # Non-equal Index if np.any(nzidx): normi_j = (self.lambd * self.Net[i][nzidx]) / normi_j[nzidx] - self.H[i, :] = np.linalg.solve(xtx + normi_j.sum() * np.eye(self.d), sums[i - indexblock, :] + ( + self.H[i, :] = np.linalg.solve(xtx + normi_j.sum() * np.eye(self.dim), sums[i - indexblock, :] + ( neighbor[nzidx, :] * normi_j.reshape((-1, 1))).sum(0) + self.rho * ( self.Z[i, :] - self.U[i, :])) else: self.H[i, :] = np.linalg.solve(xtx, sums[i - indexblock, :] + self.rho * ( self.Z[i, :] - self.U[i, :])) def updateZ(self): - xtx = np.dot(self.H.transpose(), self.H) * 2 + self.rho * np.eye(self.d) + xtx = np.dot(self.H.transpose(), self.H) * 2 + self.rho * np.eye(self.dim) for blocki in range(self.splitnum): # Split nodes into different Blocks indexblock = self.block * blocki # Index for splitting blocks if self.affi != blocki: @@ -120,7 +120,7 @@ class AANE: nzidx = normi_j != 0 # Non-equal Index if np.any(nzidx): normi_j = (self.lambd * self.Net[i][nzidx]) / normi_j[nzidx] - self.Z[i, :] = np.linalg.solve(xtx + normi_j.sum() * np.eye(self.d), sums[i - indexblock, :] + ( + self.Z[i, :] = np.linalg.solve(xtx + normi_j.sum() * np.eye(self.dim), sums[i - indexblock, :] + ( neighbor[nzidx, :] * normi_j.reshape((-1, 1))).sum(0) + self.rho * ( self.H[i, :] + self.U[i, :])) else: @@ -130,10 +130,15 @@ class AANE: def function(self): self.updateH() '''################# Iterations #################''' - for __ in range(self.maxiter - 1): + for i in range(self.maxiter): + import time + t1=time.time() self.updateZ() self.U = self.U + self.H - self.Z self.updateH() + t2=time.time() + print(f'iter: {i+1}/{self.maxiter}; time cost {t2-t1:0.2f}s') + #-------save emb to self.vectors and return ind = 0 for id in self.look_back_list: diff --git a/src/main.py b/src/main.py index 2832cb7..98d8626 100644 --- a/src/main.py +++ b/src/main.py @@ -78,8 +78,8 @@ def parse_args(): help='balance struc and attr info; ranging [0, inf]') parser.add_argument('--AANE-rho', default=5, type=float, help='penalty parameter; ranging [0, inf]') - parser.add_argument('--AANE-mode', default='comb', type=str, - help='choices of mode: comb, pure') + parser.add_argument('--AANE-maxiter', default=10, type=float, + help='penalty parameter; ranging [0, inf]') parser.add_argument('--ASNE-lamb', default=1.0, type=float, help='balance struc and attr info; ranging [0, inf]') parser.add_argument('--AttrComb-mode', default='concat', type=str, @@ -167,11 +167,16 @@ def main(args): if args.method == 'abrw': model = abrw.ABRW(graph=g, dim=args.dim, alpha=args.ABRW_alpha, topk=args.ABRW_topk, number_walks=args.number_walks, walk_length=args.walk_length, window=args.window_size, workers=args.workers) + elif args.method == 'aane': + model = aane.AANE(graph=g, dim=args.dim, lambd=args.AANE_lamb, rho=args.AANE_rho, maxiter=args.AANE_maxiter, + mode='comb') #mode: 'comb' struc and attri or 'pure' struc + elif args.method == 'tadw': + model = 
tadw.TADW(graph=g, dim=args.dim, lamb=args.TADW_lamb) elif args.method == 'attrpure': - model = attrpure.ATTRPURE(graph=g, dim=args.dim) + model = attrpure.ATTRPURE(graph=g, dim=args.dim) #mode: pca or svd elif args.method == 'attrcomb': - model = attrcomb.ATTRCOMB(graph=g, dim=args.dim, comb_with='deepwalk', - num_paths=args.number_walks, comb_method=args.AttrComb_mode) #concat, elementwise-mean, elementwise-max + model = attrcomb.ATTRCOMB(graph=g, dim=args.dim, comb_with='deepwalk', num_paths=args.number_walks, + comb_method=args.AttrComb_mode) #comb_method: concat, elementwise-mean, elementwise-max elif args.method == 'asne': if args.task == 'nc': model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, @@ -179,10 +184,6 @@ def main(args): else: model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, X_test=test_node_pairs, Y_test=test_edge_labels, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) - elif args.method == 'aane': - model = aane.AANE(graph=g, dim=args.dim, lambd=args.AANE_lamb, mode=args.AANE_mode) - elif args.method == 'tadw': - model = tadw.TADW(graph=g, dim=args.dim, lamb=args.TADW_lamb) elif args.method == 'deepwalk': model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.dim, From 526787797f22d0f13d1afaffefc6c4ceb1f73f54 Mon Sep 17 00:00:00 2001 From: Chengbin Hou Date: Sun, 18 Nov 2018 22:23:13 +0000 Subject: [PATCH 03/13] tadw_checked_v0.0 --- src/libnrl/tadw.py | 86 +++++++++++++++++++++++---------------------- src/libnrl/utils.py | 3 +- src/main.py | 14 ++++---- 3 files changed, 54 insertions(+), 49 deletions(-) diff --git a/src/libnrl/tadw.py b/src/libnrl/tadw.py index 11ddd75..312813a 100644 --- a/src/libnrl/tadw.py +++ b/src/libnrl/tadw.py @@ -1,4 +1,13 @@ -# -*- coding: utf-8 -*- +""" +ANE method: Text Associated DeepWalk (TADW) + +modified by Chengbin Hou 2018 + +originally from https://github.com/thunlp/OpenNE/blob/master/src/openne/tadw.py +to do... sparse computation and remove unnecessary self vars; +otherwise, not scalable to large network; +""" + from __future__ import print_function import math @@ -7,27 +16,18 @@ import numpy as np from numpy import linalg as la from sklearn.preprocessing import normalize -from .gcn.utils import * from .utils import row_as_probdist - -''' -#----------------------------------------------------------------------------- -# part of code was originally forked from https://github.com/thunlp/OpenNE -# modified by Chengbin Hou 2018 -# Email: Chengbin.Hou10@foxmail.com -#----------------------------------------------------------------------------- -''' - class TADW(object): - def __init__(self, graph, dim, lamb=0.2): + def __init__(self, graph, dim, lamb=0.2, maxiter=10): self.g = graph self.lamb = lamb self.dim = dim + self.maxiter = maxiter self.train() - def getAdj(self): #changed with the same data preprocessing, and our preprocessing obtain better result + def getAdj(self): ''' graph = self.g.G node_size = self.g.node_size @@ -37,54 +37,45 @@ class TADW(object): adj[look_up[edge[0]]][look_up[edge[1]]] = 1.0 adj[look_up[edge[1]]][look_up[edge[0]]] = 1.0 # ScaleSimMat - return adj/np.sum(adj, axis=1) #orignal way may get numerical error sometimes... + return adj/np.sum(adj, axis=1) #original may get numerical error sometimes... 
''' - A = self.g.get_adj_mat() - return row_as_probdist(A) + A = self.g.get_adj_mat() #by defalut, return a sparse matrix + return np.array(row_as_probdist(A).todense()) #only support np.array, otherwise dim error... - def getT(self): #changed with the same data preprocessing method + def getT(self): g = self.g.G look_back = self.g.look_back_list self.features = np.vstack([g.nodes[look_back[i]]['attr'] - for i in range(g.number_of_nodes())]) - self.preprocessFeature() #call the orig data preprocessing method + for i in range(g.number_of_nodes())]) + #self.features = self.g.get_attr_mat().todense() + self.preprocessFeature() return self.features.T - ''' - #changed with the same data preprocessing method, see self.g.preprocessAttrInfo(X=X, dim=200, method='svd') - #seems get better result? - X = self.g.getX() - self.features = self.g.preprocessAttrInfo(X=X, dim=200, method='svd') #svd or pca for dim reduction - return np.transpose(self.features) - ''' - def preprocessFeature(self): #the orignal data preprocess method - U, S, VT = la.svd(self.features) - Ud = U[:, 0:200] - Sd = S[0:200] - self.features = np.array(Ud)*Sd.reshape(200) + def preprocessFeature(self): + if self.features.shape[1] > 200: + U, S, VT = la.svd(self.features) + Ud = U[:, 0:200] + Sd = S[0:200] + self.features = np.array(Ud)*Sd.reshape(200) + #from .utils import dim_reduction + #self.features = dim_reduction(self.features, dim=200, method='svd') - def save_embeddings(self, filename): - fout = open(filename, 'w') - node_num = len(self.vectors.keys()) - fout.write("{} {}\n".format(node_num, self.dim)) - for node, vec in self.vectors.items(): - fout.write("{} {}\n".format(node,' '.join([str(x) for x in vec]))) - fout.close() - def train(self): self.adj = self.getAdj() # M=(A+A^2)/2 where A is the row-normalized adjacency matrix self.M = (self.adj + np.dot(self.adj, self.adj))/2 # T is feature_size*node_num, text features - self.T = self.getT() #transpose of self.features!!! + self.T = self.getT() #transpose of self.features self.node_size = self.adj.shape[0] self.feature_size = self.features.shape[1] self.W = np.random.randn(self.dim, self.node_size) self.H = np.random.randn(self.dim, self.feature_size) # Update - for i in range(20): #trade-off between acc and speed, 20-50 - print('Iteration ', i) + + import time + for i in range(self.maxiter): + t1=time.time() # Update W B = np.dot(self.H, self.T) drv = 2 * np.dot(np.dot(B, B.T), self.W) - \ @@ -124,9 +115,20 @@ class TADW(object): bt = np.dot(rt.T, rt)/np.dot(rtmp.T, rtmp) dt = rt + bt * dt self.H = np.reshape(vecH, (self.dim, self.feature_size)) + t2=time.time() + print(f'iter: {i+1}/{self.maxiter}; time cost {t2-t1:0.2f}s') + self.Vecs = np.hstack((normalize(self.W.T), normalize(np.dot(self.T.T, self.H.T)))) # get embeddings self.vectors = {} look_back = self.g.look_back_list for i, embedding in enumerate(self.Vecs): self.vectors[look_back[i]] = embedding + + def save_embeddings(self, filename): + fout = open(filename, 'w') + node_num = len(self.vectors.keys()) + fout.write("{} {}\n".format(node_num, self.dim)) + for node, vec in self.vectors.items(): + fout.write("{} {}\n".format(node,' '.join([str(x) for x in vec]))) + fout.close() diff --git a/src/libnrl/utils.py b/src/libnrl/utils.py index 383639e..eb6f518 100644 --- a/src/libnrl/utils.py +++ b/src/libnrl/utils.py @@ -17,7 +17,7 @@ from scipy import sparse # ---------------------------------ulits for calculation-------------------------------- -def row_as_probdist(mat): +def row_as_probdist(mat): #to do... 
also return dense matrix via a flag setting """Make each row of matrix sums up to 1.0, i.e., a probability distribution. Support both dense and sparse matrix. @@ -35,6 +35,7 @@ def row_as_probdist(mat): dense or sparse matrix: return dense matrix if input is dense matrix or numpy array return sparse matrix for sparse matrix input + (note: np.array & np.matrix are diff; and may cause some dim issues...) """ row_sum = np.array(mat.sum(axis=1)).ravel() # type: np.array zero_rows = row_sum == 0 diff --git a/src/main.py b/src/main.py index 98d8626..5e49302 100644 --- a/src/main.py +++ b/src/main.py @@ -71,15 +71,17 @@ def parse_args(): parser.add_argument('--ABRW-topk', default=30, type=int, help='select the most attr similar top k nodes of a node; ranging [0, # of nodes]') parser.add_argument('--ABRW-alpha', default=0.8, type=float, - help='balance struc and attr info; ranging [0, 1]') - parser.add_argument('--TADW-lamb', default=0.2, type=float, - help='balance struc and attr info; ranging [0, inf]') + help='balance struc and attr info; ranging [0, 1]') parser.add_argument('--AANE-lamb', default=0.05, type=float, help='balance struc and attr info; ranging [0, inf]') parser.add_argument('--AANE-rho', default=5, type=float, help='penalty parameter; ranging [0, inf]') - parser.add_argument('--AANE-maxiter', default=10, type=float, - help='penalty parameter; ranging [0, inf]') + parser.add_argument('--AANE-maxiter', default=10, type=int, + help='max iter') + parser.add_argument('--TADW-lamb', default=0.2, type=float, + help='balance struc and attr info; ranging [0, inf]') + parser.add_argument('--TADW-maxiter', default=10, type=int, + help='max iter') parser.add_argument('--ASNE-lamb', default=1.0, type=float, help='balance struc and attr info; ranging [0, inf]') parser.add_argument('--AttrComb-mode', default='concat', type=str, @@ -171,7 +173,7 @@ def main(args): model = aane.AANE(graph=g, dim=args.dim, lambd=args.AANE_lamb, rho=args.AANE_rho, maxiter=args.AANE_maxiter, mode='comb') #mode: 'comb' struc and attri or 'pure' struc elif args.method == 'tadw': - model = tadw.TADW(graph=g, dim=args.dim, lamb=args.TADW_lamb) + model = tadw.TADW(graph=g, dim=args.dim, lamb=args.TADW_lamb, maxiter=args.TADW_maxiter) elif args.method == 'attrpure': model = attrpure.ATTRPURE(graph=g, dim=args.dim) #mode: pca or svd elif args.method == 'attrcomb': From 5344141369b633e02b3c0357de7af4cfa69053c2 Mon Sep 17 00:00:00 2001 From: Chengbin Hou Date: Sun, 18 Nov 2018 22:34:35 +0000 Subject: [PATCH 04/13] attrpure_checked_v0.0 --- src/libnrl/attrpure.py | 30 +++++++++++++++++------------- src/main.py | 2 +- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/libnrl/attrpure.py b/src/libnrl/attrpure.py index 63a89af..4b112a3 100644 --- a/src/libnrl/attrpure.py +++ b/src/libnrl/attrpure.py @@ -1,4 +1,9 @@ -# -*- coding: utf-8 -*- +""" +NE method: use only attribute information (AttrPure) + +by Chengbin Hou 2018 +""" + import time import networkx as nx @@ -6,19 +11,12 @@ import numpy as np from .utils import dim_reduction - -''' -#----------------------------------------------------------------------------- -# author: Chengbin Hou 2018 -# Email: Chengbin.Hou10@foxmail.com -#----------------------------------------------------------------------------- -''' - class ATTRPURE(object): - def __init__(self, graph, dim): + def __init__(self, graph, dim, mode): self.g = graph self.dim = dim + self.mode = mode print("Learning representation...") self.vectors = {} @@ -27,9 +25,15 @@ class ATTRPURE(object): 
self.vectors[key] = embeddings[ind] def train(self): - X = self.g.get_attr_mat() - X_compressed = dim_reduction(X, dim=self.dim, method='svd') #svd or pca for dim reduction - return X_compressed #n*dim matrix, each row corresponding to node ID stored in graph.look_back_list + X = self.g.get_attr_mat().todense() + X_compressed = None + if self.mode == 'pca': + X_compressed = dim_reduction(X, dim=self.dim, method='pca') + elif self.mode == 'svd': + X_compressed = dim_reduction(X, dim=self.dim, method='svd') + else: + print('unknown dim reduction technique...') + return X_compressed def save_embeddings(self, filename): diff --git a/src/main.py b/src/main.py index 5e49302..0025495 100644 --- a/src/main.py +++ b/src/main.py @@ -175,7 +175,7 @@ def main(args): elif args.method == 'tadw': model = tadw.TADW(graph=g, dim=args.dim, lamb=args.TADW_lamb, maxiter=args.TADW_maxiter) elif args.method == 'attrpure': - model = attrpure.ATTRPURE(graph=g, dim=args.dim) #mode: pca or svd + model = attrpure.ATTRPURE(graph=g, dim=args.dim, mode='pca') #mode: pca or svd elif args.method == 'attrcomb': model = attrcomb.ATTRCOMB(graph=g, dim=args.dim, comb_with='deepwalk', num_paths=args.number_walks, comb_method=args.AttrComb_mode) #comb_method: concat, elementwise-mean, elementwise-max From 441ee99d76342d2683579b4021913923b238acc2 Mon Sep 17 00:00:00 2001 From: Chengbin Hou Date: Sun, 18 Nov 2018 22:52:54 +0000 Subject: [PATCH 05/13] attrcomb_checked_v0.0 --- src/libnrl/attrcomb.py | 31 ++++++++++++++++--------------- src/main.py | 4 ++-- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/libnrl/attrcomb.py b/src/libnrl/attrcomb.py index de1e195..7b60923 100644 --- a/src/libnrl/attrcomb.py +++ b/src/libnrl/attrcomb.py @@ -1,4 +1,9 @@ -# -*- coding: utf-8 -*- +""" +NE method: naively combine AttrPure and DeepWalk (AttrComb) + +by Chengbin Hou 2018 +""" + import time import networkx as nx @@ -7,20 +12,14 @@ import numpy as np from . 
import grarep, line, node2vec from .utils import dim_reduction - -''' -#----------------------------------------------------------------------------- -# author: Chengbin Hou 2018 -# Email: Chengbin.Hou10@foxmail.com -#----------------------------------------------------------------------------- -''' - class ATTRCOMB(object): - - def __init__(self, graph, dim, comb_method='concat', num_paths=10, comb_with='deepWalk'): + def __init__(self, graph, dim, comb_method='concat', comb_with='deepWalk', number_walks=10, walk_length=80, window=10, workers=8): self.g = graph self.dim = dim - self.num_paths = num_paths + self.number_walks= number_walks + self.walk_length = walk_length + self.window = window + self.workers = workers print("Learning representation...") self.vectors = {} @@ -71,14 +70,16 @@ class ATTRCOMB(object): def train_nrl(self, dim, comb_with): print('attr naively combined with ', comb_with, '=====================') if comb_with == 'deepWalk': - model = node2vec.Node2vec(graph=self.g, path_length=80, num_paths=self.num_paths, dim=dim, workers=4, window=10, dw=True) + model = node2vec.Node2vec(graph=self.g, dim=dim, path_length=self.walk_length, #do not use self.dim here + num_paths=self.number_walks, workers=self.workers, window=self.window, dw=True) nrl_embeddings = [] for key in self.g.look_back_list: nrl_embeddings.append(model.vectors[key]) return np.array(nrl_embeddings) - elif args.method == 'node2vec': - model = node2vec.Node2vec(graph=self.g, path_length=80, num_paths=self.num_paths, dim=dim, workers=4, p=0.8, q=0.8, window=10) + elif comb_with == 'node2vec': #to do... the parameters + model = node2vec.Node2vec(graph=self.g, path_length=80, num_paths=self.number_walks, + dim=dim, workers=4, p=0.8, q=0.8, window=10) nrl_embeddings = [] for key in self.g.look_back_list: nrl_embeddings.append(model.vectors[key]) diff --git a/src/main.py b/src/main.py index 0025495..bedf744 100644 --- a/src/main.py +++ b/src/main.py @@ -177,8 +177,8 @@ def main(args): elif args.method == 'attrpure': model = attrpure.ATTRPURE(graph=g, dim=args.dim, mode='pca') #mode: pca or svd elif args.method == 'attrcomb': - model = attrcomb.ATTRCOMB(graph=g, dim=args.dim, comb_with='deepwalk', num_paths=args.number_walks, - comb_method=args.AttrComb_mode) #comb_method: concat, elementwise-mean, elementwise-max + model = attrcomb.ATTRCOMB(graph=g, dim=args.dim, comb_with='deepwalk', number_walks=args.number_walks, walk_length=args.walk_length, + window=args.window_size, workers=args.workers, comb_method=args.AttrComb_mode) #comb_method: concat, elementwise-mean, elementwise-max elif args.method == 'asne': if args.task == 'nc': model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, From 2f17e0db5c84ce9d8a0894d84f21851ed1d6ed6a Mon Sep 17 00:00:00 2001 From: Dongzy Date: Mon, 19 Nov 2018 08:45:37 +0800 Subject: [PATCH 06/13] support dense output for row_as_probdist --- src/libnrl/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/libnrl/utils.py b/src/libnrl/utils.py index eb6f518..920d9a7 100644 --- a/src/libnrl/utils.py +++ b/src/libnrl/utils.py @@ -17,7 +17,7 @@ from scipy import sparse # ---------------------------------ulits for calculation-------------------------------- -def row_as_probdist(mat): #to do... also return dense matrix via a flag setting +def row_as_probdist(mat, dense_output=False): """Make each row of matrix sums up to 1.0, i.e., a probability distribution. 
Support both dense and sparse matrix. @@ -44,6 +44,8 @@ def row_as_probdist(mat, dense_output=False): mat = diag.dot(mat) mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1]))) + if dense_output and sparse.issparse(mat): + return mat.todense() return mat From a3a99fc539889de0986a16c7e9f0825a50132673 Mon Sep 17 00:00:00 2001 From: Dongzy Date: Mon, 19 Nov 2018 08:47:09 +0800 Subject: [PATCH 07/13] fixup --- src/libnrl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libnrl/utils.py index 920d9a7..d7a2803 100644 --- a/src/libnrl/utils.py +++ b/src/libnrl/utils.py @@ -25,7 +25,8 @@ def row_as_probdist(mat, dense_output=False): ---------- mat : scipy sparse matrix or dense matrix or numpy array The matrix to be normalized - + dense_output : bool + whether to force dense output Note ---- For row with all entries 0, we normalize it to a vector with all entries 1/n From d3f7555108e31d4099b13dd7c17b07e3b6900db1 Mon Sep 17 00:00:00 2001 From: Dongzy Date: Mon, 19 Nov 2018 08:57:25 +0800 Subject: [PATCH 08/13] support perserve_zeros for row_as_probdist and fix tadw --- src/libnrl/tadw.py | 2 +- src/libnrl/utils.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/libnrl/tadw.py index 312813a..3d9cbe5 100644 --- a/src/libnrl/tadw.py +++ b/src/libnrl/tadw.py @@ -40,7 +40,7 @@ class TADW(object): return adj/np.sum(adj, axis=1) #original may get numerical error sometimes... ''' A = self.g.get_adj_mat() #by defalut, return a sparse matrix - return np.array(row_as_probdist(A).todense()) #only support np.array, otherwise dim error... + return np.array(row_as_probdist(A, dense_output=True, preserve_zeros=True)) #only support np.array, otherwise dim error... def getT(self): diff --git a/src/libnrl/utils.py index d7a2803..b9d99fe 100644 --- a/src/libnrl/utils.py +++ b/src/libnrl/utils.py @@ -17,7 +17,7 @@ from scipy import sparse # ---------------------------------ulits for calculation-------------------------------- -def row_as_probdist(mat, dense_output=False): +def row_as_probdist(mat, dense_output=False, preserve_zeros=False): """Make each row of matrix sums up to 1.0, i.e., a probability distribution. Support both dense and sparse matrix. @@ -27,10 +27,9 @@ def row_as_probdist(mat, dense_output=False): The matrix to be normalized dense_output : bool whether to force dense output - Note - ---- - For row with all entries 0, we normalize it to a vector with all entries 1/n - + preserve_zeros : bool + If False, for row with all entries 0, we normalize it to a vector with all entries 1/n.
+ Otherwise, leave such rows as all zeros. Returns ------- dense or sparse matrix: return dense matrix if input is dense matrix or numpy array return sparse matrix for sparse matrix input @@ -43,7 +42,8 @@ def row_as_probdist(mat, dense_output=False): row_sum[zero_rows] = 1 diag = sparse.dia_matrix((1 / row_sum, 0), (mat.shape[0], mat.shape[0])) mat = diag.dot(mat) - mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1]))) + if not preserve_zeros: + mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1]))) if dense_output and sparse.issparse(mat): return mat.todense() return mat From cd2399706f685e8c4486920b95d1909bc9c85244 Mon Sep 17 00:00:00 2001 From: Chengbin Hou Date: Mon, 19 Nov 2018 20:01:43 +0000 Subject: [PATCH 09/13] node2vec&deepwalk_checked_v0.0 & alias samp detail --- src/libnrl/graph.py | 77 ++++++++++++---------- src/libnrl/node2vec.py | 14 ++-- src/libnrl/walker.py | 141 +++++++++++++++++++++------------------ src/main.py | 21 +++--- 4 files changed, 144 insertions(+), 109 deletions(-) diff --git a/src/libnrl/graph.py index eb988c6..1d71fd8 100644 --- a/src/libnrl/graph.py +++ b/src/libnrl/graph.py @@ -22,9 +22,9 @@ class Graph(object): #--------------------commonly used APIs that will modify graph------------------------- #-------------------------------------------------------------------------------------- def node_mapping(self): - """ node id and index mapping; - based on the order given by networkx G.nodes(); - NB: updating is needed if any node is added/removed; + """ node id and index mapping; \n + based on the order given by networkx G.nodes(); \n + NB: updating is needed if any node is added/removed; \n """ i = 0 #node index self.look_up_dict = {} #init @@ -35,10 +35,10 @@ class Graph(object): i += 1 def read_adjlist(self, path, directed=False): - """ read adjacency list format graph; - support unweighted and (un)directed graph; - format: see https://networkx.github.io/documentation/stable/reference/readwrite/adjlist.html - NB: not supoort weighted graph + """ read adjacency list format graph; \n + support unweighted and (un)directed graph; \n + format: see https://networkx.github.io/documentation/stable/reference/readwrite/adjlist.html \n + NB: does not support weighted graphs \n """ if directed: self.G = nx.read_adjlist(path, create_using=nx.DiGraph()) @@ -47,9 +47,9 @@ class Graph(object): self.node_mapping() #update node id index mapping def read_edgelist(self, path, weighted=False, directed=False): - """ read edge list format graph; - support (un)weighted and (un)directed graph; - format: see https://networkx.github.io/documentation/stable/reference/readwrite/edgelist.html + """ read edge list format graph; \n + support (un)weighted and (un)directed graph; \n + format: see https://networkx.github.io/documentation/stable/reference/readwrite/edgelist.html \n """ if directed: self.G = nx.read_edgelist(path, create_using=nx.DiGraph()) else: self.G = nx.read_edgelist(path, create_using=nx.Graph()) self.node_mapping() #update node id index mapping + def add_edge_weight(self, equal_weight=1.0): + ''' add weights to networkx graph; \n + currently only supports adding 1.0 to all existing edges; \n + some NE methods may require the 'weight' attribute specified in the networkx graph; \n + to do... support user-specified weights e.g.
from file (similar to read_node_attr): node_id1 node_id2 weight \n + https://networkx.github.io/documentation/stable/reference/generated/networkx.classes.function.set_edge_attributes.html#networkx.classes.function.set_edge_attributes + ''' + nx.set_edge_attributes(self.G, equal_weight, 'weight') #check the url and use dict to assign diff weights to diff edges + def read_node_attr(self, path): - """ read node attributes and store as NetworkX graph {'node_id': {'attr': values}} - input file format: node_id1 attr1 attr2 ... attrM - node_id2 attr1 attr2 ... attrM + """ read node attributes and store as NetworkX graph {'node_id': {'attr': values}} \n + input file format: node_id1 attr1 attr2 ... attrM \n + node_id2 attr1 attr2 ... attrM \n """ with open(path, 'r') as fin: for l in fin.readlines(): @@ -68,20 +77,20 @@ class Graph(object): self.G.nodes[vec[0]]['attr'] = np.array([float(x) for x in vec[1:]]) def read_node_label(self, path): - """ todo... read node labels and store as NetworkX graph {'node_id': {'label': values}} - input file format: node_id1 labels - node_id2 labels - with open(path, 'r') as fin: - for l in fin.readlines(): - vec = l.split() - self.G.nodes[vec[0]]['label'] = np.array([float(x) for x in vec[1:]]) + """ todo... read node labels and store as NetworkX graph {'node_id': {'label': values}} \n + input file format: node_id1 labels \n + node_id2 labels \n + with open(path, 'r') as fin: \n + for l in fin.readlines(): \n + vec = l.split() \n + self.G.nodes[vec[0]]['label'] = np.array([float(x) for x in vec[1:]]) \n """ pass #to do... def remove_edge(self, ratio=0.0): - """ randomly remove edges/links - ratio: the percentage of edges to be removed - edges_removed: return removed edges, each of which is a pair of nodes + """ randomly remove edges/links \n + ratio: the percentage of edges to be removed \n + edges_removed: return removed edges, each of which is a pair of nodes \n """ num_edges_removed = int( ratio * self.G.number_of_edges() ) #random.seed(2018) @@ -92,13 +101,13 @@ class Graph(object): return edges_removed def remove_node_attr(self, ratio): - """ todo... randomly remove node attributes; + """ todo... randomly remove node attributes; \n """ pass #to do... def remove_node(self, ratio): - """ todo... randomly remove nodes; - #self.node_mapping() #update node id index mapping is needed + """ todo... randomly remove nodes; \n + #self.node_mapping() #update node id index mapping is needed \n """ pass #to do... 
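For reference while reviewing the Graph API above, a minimal driver sketch showing how it is typically used end to end; the import path and data file names below are illustrative assumptions, not part of these patches:

from libnrl.graph import Graph  # assumed import path; adjust to your layout

g = Graph()
g.read_edgelist('data/cora/edgelist.txt', weighted=False, directed=False)  # placeholder file
g.read_node_attr('data/cora/attr.txt')                                     # placeholder file
A = g.get_adj_mat()   # scipy csr adjacency; rows follow g.look_back_list order
X = g.get_attr_mat()  # csr attribute matrix in the same node order
print(g.get_num_nodes(), g.get_num_edges())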
@@ -106,8 +115,8 @@ class Graph(object): #--------------------commonly used APIs that will not modify graph------------------------- #------------------------------------------------------------------------------------------ def get_adj_mat(self, is_sparse=True): - """ return adjacency matrix; - use 'csr' format for sparse matrix + """ return adjacency matrix; \n + use 'csr' format for sparse matrix \n """ if is_sparse: return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64') @@ -115,8 +124,8 @@ class Graph(object): return nx.to_numpy_matrix(self.G, nodelist=self.look_back_list, dtype='float64') def get_attr_mat(self, is_sparse=True): - """ return attribute matrix; - use 'csr' format for sparse matrix + """ return attribute matrix; \n + use 'csr' format for sparse matrix \n """ attr_dense_narray = np.vstack([self.G.nodes[self.look_back_list[i]]['attr'] for i in range(self.get_num_nodes())]) if is_sparse: @@ -132,6 +141,10 @@ class Graph(object): """ return the number of edges """ return nx.number_of_edges(self.G) + def get_density(self): + """ return the density of a graph """ + return nx.density(self.G) + def get_num_isolates(self): """ return the number of isolated nodes """ return len(list(nx.isolates(self.G))) @@ -153,8 +166,8 @@ class Graph(object): return list(nx.common_neighbors(self.G, node1, node2)) def get_centrality(self, centrality_type='degree'): - """ todo... return specified type of centrality - see https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html + """ todo... return specified type of centrality \n + see https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html \n """ pass #to do... diff --git a/src/libnrl/node2vec.py b/src/libnrl/node2vec.py index 923fce2..3ab8076 100644 --- a/src/libnrl/node2vec.py +++ b/src/libnrl/node2vec.py @@ -1,3 +1,11 @@ +""" +NE method: DeepWalk and Node2Vec + +modified by Chengbin Hou and Zeyu Dong 2018 + +originally from https://github.com/thunlp/OpenNE/blob/master/src/openne/node2vec.py +""" + from __future__ import print_function import time import warnings @@ -7,9 +15,7 @@ from . 
import walker class Node2vec(object): - def __init__(self, graph, path_length, num_paths, dim, p=1.0, q=1.0, dw=False, **kwargs): - kwargs["workers"] = kwargs.get("workers", 1) if dw: kwargs["hs"] = 1 @@ -18,9 +24,9 @@ class Node2vec(object): self.graph = graph if dw: - self.walker = walker.BasicWalker(graph, workers=kwargs["workers"]) + self.walker = walker.BasicWalker(graph, workers=kwargs["workers"]) #walker for deepwalk else: - self.walker = walker.Walker(graph, p=p, q=q, workers=kwargs["workers"]) + self.walker = walker.Walker(graph, p=p, q=q, workers=kwargs["workers"]) #walker for node2vec print("Preprocess transition probs...") self.walker.preprocess_transition_probs() sentences = self.walker.simulate_walks(num_walks=num_paths, walk_length=path_length) diff --git a/src/libnrl/walker.py b/src/libnrl/walker.py index a9c701d..53d131b 100644 --- a/src/libnrl/walker.py +++ b/src/libnrl/walker.py @@ -14,12 +14,7 @@ import numpy as np from networkx import nx -def deepwalk_walk_wrapper(class_instance, walk_length, start_node): - class_instance.deepwalk_walk(walk_length, start_node) - # ===========================================ABRW-weighted-walker============================================ - - class WeightedWalker: ''' Weighted Walker for Attributed Biased Randomw Walks (ABRW) method ''' @@ -28,52 +23,59 @@ class WeightedWalker: self.look_back_list = node_id_map self.T = transition_mat self.workers = workers - # self.G = nx.to_networkx_graph(self.T, create_using=nx.Graph()) # wrong... will return symt transition mat - self.G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph()) # reconstructed graph based on transition matrix - # print(nx.adjacency_matrix(self.G).todense()[0:6, 0:6]) + # self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.Graph()) # wrong... 
will return symt transition mat + self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph()) # reconstructed "directed" "weighted" graph based on transition matrix + # print(nx.adjacency_matrix(self.rec_G).todense()[0:6, 0:6]) # print(transition_mat[0:6, 0:6]) - # print(nx.adjacency_matrix(self.G).todense()==transition_mat) + # print(nx.adjacency_matrix(self.rec_G).todense()==transition_mat) # alias sampling for ABRW------------------------- def simulate_walks(self, num_walks, walk_length): t1 = time.time() - self.preprocess_transition_probs(G=self.G) # construct alias table; adapted from node2vec + self.preprocess_transition_probs(weighted_G=self.rec_G) # construct alias table; adapted from node2vec t2 = time.time() print(f'Time for construct alias table: {(t2-t1):.2f}') walks = [] - nodes = list(self.G.nodes()) - print('Walk iteration:') + nodes = list(self.rec_G.nodes()) for walk_iter in range(num_walks): - print(str(walk_iter+1), '/', str(num_walks)) + t1 = time.time() # random.seed(2018) random.shuffle(nodes) for node in nodes: - walks.append(self.node2vec_walk(G=self.G, walk_length=walk_length, start_node=node)) + walks.append(self.weighted_walk(weighted_G=self.rec_G, walk_length=walk_length, start_node=node)) + t2 = time.time() + print(f'Walk iteration: {walk_iter+1}/{num_walks}; time cost: {(t2-t1):.2f}') - for i in range(len(walks)): # use ind to retrive orignal node ID + for i in range(len(walks)): # use ind to retrive orignal node ID for j in range(len(walks[0])): walks[i][j] = self.look_back_list[int(walks[i][j])] return walks - def node2vec_walk(self, G, walk_length, start_node): # more efficient way instead of copy from node2vec - alias_nodes = self.alias_nodes + def weighted_walk(self, weighted_G, walk_length, start_node): # more efficient way instead of copy from node2vec + G = weighted_G walk = [start_node] while len(walk) < walk_length: cur = walk[-1] cur_nbrs = list(G.neighbors(cur)) - if len(cur_nbrs) > 0: - walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) - else: - break + if len(cur_nbrs) > 0: # if non-isolated node + walk.append(cur_nbrs[alias_draw(self.alias_nodes[cur][0], self.alias_nodes[cur][1])]) # alias sampling in O(1) time to get the index of + else: # if isolated node # 1) randomly choose a nbr; 2) judege if use nbr or its alias + break return walk - def preprocess_transition_probs(self, G): + def preprocess_transition_probs(self, weighted_G): + ''' reconstructed G mush be weighted; \n + return a dict of alias table for each node + ''' + G = weighted_G alias_nodes = {} # unlike node2vec, the reconstructed graph is based on transtion matrix for node in G.nodes(): # no need to normalize again - probs = [G[node][nbr]['weight'] for nbr in G.neighbors(node)] - alias_nodes[node] = alias_setup(probs) - self.alias_nodes = alias_nodes + probs = [G[node][nbr]['weight'] for nbr in G.neighbors(node)] #pick prob of neighbors with non-zero weight --> sum up to 1.0 + #print(f'sum of prob: {np.sum(probs)}') + alias_nodes[node] = alias_setup(probs) #alias table format {node_id: (array1, array2)} + self.alias_nodes = alias_nodes #where array1 gives alias node indexes; array2 gives its prob + #print(self.alias_nodes) ''' @@ -127,20 +129,23 @@ class WeightedWalker: return self.walks ''' + + +def deepwalk_walk_wrapper(class_instance, walk_length, start_node): + class_instance.deepwalk_walk(walk_length, start_node) + # ===========================================deepWalk-walker============================================ - - class BasicWalker: 
- def __init__(self, G, workers): - self.G = G.G - self.node_size = G.get_num_nodes() - self.look_up_dict = G.look_up_dict + def __init__(self, g, workers): + self.g = g + self.node_size = g.get_num_nodes() + self.look_up_dict = g.look_up_dict def deepwalk_walk(self, walk_length, start_node): ''' Simulate a random walk starting from start node. ''' - G = self.G + G = self.g.G look_up_dict = self.look_up_dict node_size = self.node_size @@ -159,37 +164,48 @@ class BasicWalker: ''' Repeatedly simulate random walks from each node. ''' - G = self.G + G = self.g.G walks = [] nodes = list(G.nodes()) - print('Walk iteration:') for walk_iter in range(num_walks): + t1 = time.time() # pool = multiprocessing.Pool(processes = 4) - print(str(walk_iter+1), '/', str(num_walks)) random.shuffle(nodes) for node in nodes: # walks.append(pool.apply_async(deepwalk_walk_wrapper, (self, walk_length, node, ))) walks.append(self.deepwalk_walk(walk_length=walk_length, start_node=node)) # pool.close() # pool.join() + t2 = time.time() + print(f'Walk iteration: {walk_iter+1}/{num_walks}; time cost: {(t2-t1):.2f}') # print(len(walks)) return walks # ===========================================node2vec-walker============================================ class Walker: - def __init__(self, G, p, q, workers): - self.G = G.G + def __init__(self, g, p, q, workers): + self.g = g self.p = p self.q = q - self.node_size = G.node_size - self.look_up_dict = G.look_up_dict + + if self.g.get_isweighted(): + #print('is weighted graph: ', self.g.get_isweighted()) + #print(self.g.get_adj_mat(is_sparse=False)[0:6,0:6]) + pass + else: #otherwise, add equal weights 1.0 to all existing edges + #print('is weighted graph: ', self.g.get_isweighted()) + self.g.add_edge_weight(equal_weight=1.0) #add 'weight' to networkx graph + #print(self.g.get_adj_mat(is_sparse=False)[0:6,0:6]) + + self.node_size = g.get_num_nodes() + self.look_up_dict = g.look_up_dict def node2vec_walk(self, walk_length, start_node): ''' Simulate a random walk starting from start node. ''' - G = self.G + G = self.g.G alias_nodes = self.alias_nodes alias_edges = self.alias_edges look_up_dict = self.look_up_dict @@ -205,9 +221,7 @@ class Walker: walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) else: prev = walk[-2] - pos = (prev, cur) - next = cur_nbrs[alias_draw(alias_edges[pos][0], - alias_edges[pos][1])] + next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], alias_edges[(prev, cur)][1])] walk.append(next) else: break @@ -217,22 +231,23 @@ class Walker: ''' Repeatedly simulate random walks from each node. ''' - G = self.G + G = self.g.G walks = [] nodes = list(G.nodes()) - print('Walk iteration:') for walk_iter in range(num_walks): - print(str(walk_iter+1), '/', str(num_walks)) + t1 = time.time() random.shuffle(nodes) for node in nodes: walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node)) + t2 = time.time() + print(f'Walk iteration: {walk_iter+1}/{num_walks}; time cost: {(t2-t1):.2f}') return walks def get_alias_edge(self, src, dst): ''' Get the alias edge setup lists for a given edge. ''' - G = self.G + G = self.g.G p = self.p q = self.q @@ -246,18 +261,16 @@ class Walker: unnormalized_probs.append(G[dst][dst_nbr]['weight']/q) norm_const = sum(unnormalized_probs) normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs] - return alias_setup(normalized_probs) def preprocess_transition_probs(self): ''' Preprocessing of transition probabilities for guiding the random walks. 
''' - G = self.G - + G = self.g.G alias_nodes = {} for node in G.nodes(): - unnormalized_probs = [G[node][nbr]['weight'] for nbr in G.neighbors(node)] + unnormalized_probs = [G[node][nbr]['weight'] for nbr in G.neighbors(node)] #pick prob of neighbors with non-zero weight norm_const = sum(unnormalized_probs) normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs] alias_nodes[node] = alias_setup(normalized_probs) @@ -266,16 +279,20 @@ class Walker: triads = {} look_up_dict = self.look_up_dict - node_size = self.node_size #to do... node2vec directed and undirected - for edge in G.edges(): #https://github.com/aditya-grover/node2vec/blob/master/src/node2vec.py - alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) + node_size = self.node_size + if self.g.get_isdirected(): + for edge in G.edges(): + alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) + else: #if undirected, duplicate the reverse direction; otherwise may get key error + for edge in G.edges(): + alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) + alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) self.alias_nodes = alias_nodes self.alias_edges = alias_edges - return - +#========================================= utils: alias sampling method ==================================================== def alias_setup(probs): ''' Compute utility lists for non-uniform sampling from discrete distributions. @@ -295,7 +312,7 @@ def alias_setup(probs): else: larger.append(kk) - while len(smaller) > 0 and len(larger) > 0: + while len(smaller) > 0 and len(larger) > 0: #it is all about use large prob to compensate small prob untill reach the average small = smaller.pop() large = larger.pop() @@ -306,7 +323,7 @@ def alias_setup(probs): else: larger.append(large) - return J, q + return J, q #the values in J are indexes; it is possible to have repeated indexes if that that index have large prob to compensate others def alias_draw(J, q): @@ -315,8 +332,8 @@ def alias_draw(J, q): ''' K = len(J) - kk = int(np.floor(np.random.rand()*K)) - if np.random.rand() < q[kk]: - return kk + kk = int(np.floor(np.random.rand()*K)) #randomly choose a nbr (an index) + if np.random.rand() < q[kk]: #use alias table to choose + return kk #either that nbr node (an index) else: - return J[kk] + return J[kk] #or the nbr's alias node (an index) \ No newline at end of file diff --git a/src/main.py b/src/main.py index bedf744..f954851 100644 --- a/src/main.py +++ b/src/main.py @@ -86,7 +86,7 @@ def parse_args(): help='balance struc and attr info; ranging [0, inf]') parser.add_argument('--AttrComb-mode', default='concat', type=str, help='choices of mode: concat, elementwise-mean, elementwise-max') - parser.add_argument('--Node2Vec-p', default=0.5, type=float, + parser.add_argument('--Node2Vec-p', default=0.5, type=float, #if p=q=1.0 node2vec = deepwalk help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]') parser.add_argument('--Node2Vec-q', default=0.5, type=float, help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]') @@ -179,20 +179,12 @@ def main(args): elif args.method == 'attrcomb': model = attrcomb.ATTRCOMB(graph=g, dim=args.dim, comb_with='deepwalk', number_walks=args.number_walks, walk_length=args.walk_length, window=args.window_size, workers=args.workers, comb_method=args.AttrComb_mode) #comb_method: concat, elementwise-mean, elementwise-max - elif args.method == 'asne': - if args.task == 'nc': - model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, 
learning_rate=args.learning_rate, batch_size=args.batch_size, - X_test=None, Y_test=None, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) - else: - model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, - X_test=test_node_pairs, Y_test=test_edge_labels, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) elif args.method == 'deepwalk': - model = node2vec.Node2vec(graph=g, path_length=args.walk_length, - num_paths=args.number_walks, dim=args.dim, + model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.dim, workers=args.workers, window=args.window_size, dw=True) elif args.method == 'node2vec': model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.dim, - workers=args.workers, p=args.Node2Vec_p, q=args.Node2Vec_q, window=args.window_size) + workers=args.workers, window=args.window_size, p=args.Node2Vec_p, q=args.Node2Vec_q) elif args.method == 'grarep': model = GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim) elif args.method == 'line': if args.label_file and not args.no_auto_save: model = line.LINE(g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order, label_file=args.label_file, clf_ratio=args.label_reserved) else: model = line.LINE(g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order) + elif args.method == 'asne': + if args.task == 'nc': + model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, + X_test=None, Y_test=None, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) + else: + model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, + X_test=test_node_pairs, Y_test=test_edge_labels, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file) elif args.method == 'graphsage': model = graphsageAPI.graphsage_unsupervised_train(graph=g, graphsage_model = 'graphsage_mean') #we follow the default parameters, see __init__.py in graphsage file From 359d0aeea7894aa2043c9893184b9492416098d6 Mon Sep 17 00:00:00 2001 From: Chengbin Hou Date: Mon, 19 Nov 2018 21:14:19 +0000 Subject: [PATCH 10/13] abrw_v0.0 fixup --- src/libnrl/abrw.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/libnrl/abrw.py index c4e03a2..37066aa 100644 --- a/src/libnrl/abrw.py +++ b/src/libnrl/abrw.py @@ -64,7 +64,9 @@ class ABRW(object): ''' print("obtaining biased transition matrix where each row sums up to 1.0...") - T_A = row_as_probdist(A) # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row + preserve_zeros = False # compare them: 1) accuracy; 2) efficiency + T_A = row_as_probdist(A, preserve_zeros=preserve_zeros) # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row; NB: pass by keyword, the second positional parameter is dense_output + print('Preserve zero rows of the adj matrix: ', preserve_zeros) t1 = time.time() X_sim = pairwise_similarity(X) # attr similarity mat; X_sim is a square mat, but X is not t2 = time.time() print(f'keep the top {self.topk} attribute similar nodes w.r.t. a node') cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1) - X_sim[(X_sim < cutoff)] = 0 + X_sim[(X_sim < cutoff)] = 0 # improve both accuracy and efficiency X_sim = sparse.csr_matrix(X_sim) t3 = time.time()
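The top-k sparsification used in abrw.py above, isolated as a small numpy sketch; note that strict per-row thresholding needs the cutoff reshaped to a column, since comparing a square matrix against a flat array broadcasts per column:

import numpy as np

X_sim = np.array([[0.9, 0.1, 0.4, 0.7],
                  [0.2, 0.8, 0.3, 0.5]])
topk = 2
# k-th largest value per row via a partial sort
cutoff = np.partition(X_sim, -topk, axis=1)[:, -topk:].min(axis=1)
X_sim[X_sim < cutoff[:, None]] = 0  # zero out everything below each row's top-k
print(X_sim)  # [[0.9 0.  0.  0.7] [0.  0.8 0.  0.5]]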
From 768d04eda02f23317111e61ff7754b4445a7901f Mon Sep 17 00:00:00 2001 From: Chengbin Hou Date: Tue, 20 Nov 2018 11:33:24 +0000 Subject: [PATCH 11/13] grarep_checked_v0.0 --- src/libnrl/grarep.py | 13 +++++++++++++ src/main.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/libnrl/grarep.py index db3f7f3..2649a27 100644 --- a/src/libnrl/grarep.py +++ b/src/libnrl/grarep.py @@ -1,7 +1,16 @@ +""" +a matrix factorization based NE method: GraRep + +modified by Chengbin Hou 2018 + +originally from https://github.com/thunlp/OpenNE/blob/master/src/openne/grarep.py +""" + import math import numpy as np from numpy import linalg as la from sklearn.preprocessing import normalize +from .utils import row_as_probdist class GraRep(object): @@ -13,6 +22,7 @@ class GraRep(object): self.train() def getAdjMat(self): + ''' graph = self.g.G node_size = self.g.get_num_nodes() look_up = self.g.look_up_dict adj = np.zeros((node_size, node_size)) for edge in self.g.G.edges(): adj[look_up[edge[0]]][look_up[edge[1]]] = 1.0 adj[look_up[edge[1]]][look_up[edge[0]]] = 1.0 # ScaleSimMat return np.matrix(adj/np.sum(adj, axis=1)) + ''' + adj = self.g.get_adj_mat() #for isolated node row, normalize to [1/n, 1/n, ...] + return row_as_probdist(adj, dense_output=True, preserve_zeros=False) def GetProbTranMat(self, Ak): probTranMat = np.log(Ak/np.tile( diff --git a/src/main.py index f954851..7d2b6d4 100644 --- a/src/main.py +++ b/src/main.py @@ -91,7 +91,7 @@ def parse_args(): parser.add_argument('--Node2Vec-q', default=0.5, type=float, help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]') parser.add_argument('--GraRep-kstep', default=4, type=int, - help='use k-step transition probability matrix') + help='use k-step transition probability matrix; requires dim % Kstep == 0') parser.add_argument('--LINE-order', default=3, type=int, help='choices of the order(s), 1st order, 2nd order, 1st+2nd order') parser.add_argument('--LINE-no-auto-save', action='store_true', From 1dd38e348d429c7f65d22238a5247b15785bb123 Mon Sep 17 00:00:00 2001 From: Chengbin Hou Date: Tue, 20 Nov 2018 21:31:39 +0000 Subject: [PATCH 12/13] line_checked_v0.0 & fixup args --- src/libnrl/line.py | 53 ++++++++++++++++++++++++++---------------- src/libnrl/tadw.py | 1 + src/main.py | 27 ++++++++++------------- 3 files changed, 44 insertions(+), 37 deletions(-) diff --git a/src/libnrl/line.py index 9e39f58..97e5e94 100644 --- a/src/libnrl/line.py +++ b/src/libnrl/line.py @@ -1,10 +1,19 @@ +""" +NE method: Large-scale Information Network Embedding (LINE) + +modified by Chengbin Hou 2018 + +originally from https://github.com/thunlp/OpenNE/blob/master/src/openne/line.py +the main diff: adapt to our graph.py APIs; and use 'micro-F1' to find the best emb if auto_save +""" + from __future__ import print_function import random import math import numpy as np from sklearn.linear_model import LogisticRegression import tensorflow as tf -from .classify import ncClassifier, lpClassifier, read_node_label, read_edge_label +from .classify import ncClassifier, lpClassifier, read_node_label, read_edge_label #to do... try use lpClassifier to choose best embeddings?
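For orientation while reading _LINE below, this is the negative-sampling objective it optimizes, sketched in plain numpy; sign is +1 for sampled edges and -1 for negative samples (a sketch of the standard LINE loss, not the exact TensorFlow graph built further down):

import numpy as np

def line_loss(h_emb, t_emb, sign):
    # mean of -log sigmoid(sign * <h, t>) over a batch
    inner = np.sum(h_emb * t_emb, axis=1)
    return -np.mean(np.log(1.0 / (1.0 + np.exp(-sign * inner))))

rng = np.random.default_rng(0)
h = rng.normal(size=(4, 8))  # embeddings of source nodes
t = rng.normal(size=(4, 8))  # (context) embeddings of target nodes
print(line_loss(h, t, np.array([1.0, 1.0, -1.0, -1.0])))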
class _LINE(object): @@ -32,8 +41,8 @@ class _LINE(object): self.sign = tf.placeholder(tf.float32, [None]) cur_seed = random.getrandbits(32) - self.embeddings = tf.get_variable(name="embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer = tf.contrib.layers.xavier_initializer(uniform = False, seed=cur_seed)) - self.context_embeddings = tf.get_variable(name="context_embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer = tf.contrib.layers.xavier_initializer(uniform = False, seed=cur_seed)) + self.embeddings = tf.get_variable(name="embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed)) + self.context_embeddings = tf.get_variable(name="context_embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed)) # self.h_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.h), 1) # self.t_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.t), 1) # self.t_e_context = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.context_embeddings, self.t), 1) @@ -61,7 +70,7 @@ class _LINE(object): self.t : t, self.sign : sign, } - _, cur_loss = self.sess.run([self.train_op, self.loss],feed_dict) + _, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict) sum_loss += cur_loss batch_id += 1 print('epoch:{} sum of loss:{!s}'.format(self.cur_epoch, sum_loss)) @@ -163,7 +172,7 @@ class _LINE(object): cur_large_block = large_block[num_large_block] self.edge_prob[cur_small_block] = norm_prob[cur_small_block] self.edge_alias[cur_small_block] = cur_large_block - norm_prob[cur_large_block] = norm_prob[cur_large_block] + norm_prob[cur_small_block] -1 + norm_prob[cur_large_block] = norm_prob[cur_large_block] + norm_prob[cur_small_block]-1 if norm_prob[cur_large_block] < 1: small_block[num_small_block] = cur_large_block num_small_block += 1 @@ -188,55 +197,57 @@ class _LINE(object): vectors[look_back[i]] = embedding return vectors + class LINE(object): - def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file = None, clf_ratio = 0.5, auto_save = True): + def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file=None, clf_ratio=0.5, auto_save=True): + print('auto save the best embeddings', auto_save) self.rep_size = rep_size self.order = order self.best_result = 0 self.vectors = {} - if order == 3: + self.g = graph + + if not self.g.get_isweighted(): #add equal weights 1.0 to all existing edges + self.g.add_edge_weight(equal_weight=1.0) #add 'weight' to networkx graph + + if order == 3: #if order 3 i.e. 
concatenate the 1st-order and 2nd-order embeddings
             self.model1 = _LINE(graph, rep_size/2, batch_size, negative_ratio, order=1)
             self.model2 = _LINE(graph, rep_size/2, batch_size, negative_ratio, order=2)
             for i in range(epoch):
                 self.model1.train_one_epoch()
                 self.model2.train_one_epoch()
-            '''
             if label_file:
                 self.get_embeddings()
                 X, Y = read_node_label(label_file)
                 print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
-                clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
+                clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
                 result = clf.split_train_evaluate(X, Y, clf_ratio)

-                if result['macro'] > self.best_result:
-                    self.best_result = result['macro']
+                if result['micro'] > self.best_result:
+                    self.best_result = result['micro']
                     if auto_save:
                         self.best_vector = self.vectors
-            '''

-        else:
+        else: #if order 1 or 2
             self.model = _LINE(graph, rep_size, batch_size, negative_ratio, order=self.order)
             for i in range(epoch):
                 self.model.train_one_epoch()
-            '''
             if label_file:
                 self.get_embeddings()
                 X, Y = read_node_label(label_file)
                 print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
-                clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
+                clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
                 result = clf.split_train_evaluate(X, Y, clf_ratio)

-                if result['macro'] > self.best_result:
-                    self.best_result = result['macro']
+                if result['micro'] > self.best_result:
+                    self.best_result = result['micro']
                     if auto_save:
                         self.best_vector = self.vectors
-            '''

         self.get_embeddings()
         if auto_save and label_file:
-            #self.vectors = self.best_vector
-            pass
+            self.vectors = self.best_vector

     def get_embeddings(self):
         self.last_vectors = self.vectors
@@ -256,4 +267,4 @@ class LINE(object):

         for node, vec in self.vectors.items():
             fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
-        fout.close()
+        fout.close()
\ No newline at end of file
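Note on the order == 3 branch above: two half-dimensional _LINE models (1st- and 2nd-order) are trained side by side, and their per-node vectors are joined into one rep_size embedding inside LINE.get_embeddings (not fully shown in this hunk). A rough sketch of such a join, assuming each sub-model exposes a node -> vector dict like self.vectors; the helper name concat_order_embeddings is purely illustrative and not part of the patch:

import numpy as np

def concat_order_embeddings(vectors1, vectors2):
    # vectors1/vectors2: node -> np.ndarray of length rep_size/2, from the
    # 1st-order and 2nd-order _LINE models respectively
    return {node: np.concatenate([vectors1[node], vectors2[node]])
            for node in vectors1}

One caveat worth flagging: rep_size/2 is float division under Python 3, so each _LINE receives a non-integer dimension; rep_size//2 would be the safer spelling if TensorFlow rejects a float in the variable shape.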
diff --git a/src/libnrl/tadw.py b/src/libnrl/tadw.py
index 3d9cbe5..1dbc57f 100644
--- a/src/libnrl/tadw.py
+++ b/src/libnrl/tadw.py
@@ -4,6 +4,7 @@ ANE method: Text Associated DeepWalk (TADW)
 modified by Chengbin Hou 2018

 originally from https://github.com/thunlp/OpenNE/blob/master/src/openne/tadw.py
+the main diff: adapt to our graph.py APIs
 to do... sparse computation and remove unnecessary self vars;
         otherwise, not scalable to large network;
 """

diff --git a/src/main.py b/src/main.py
index 7d2b6d4..f9307ef 100644
--- a/src/main.py
+++ b/src/main.py
@@ -43,11 +43,7 @@ def parse_args():
     parser.add_argument('--attribute-file', default='data/cora/cora_attr.txt',
                         help='node attribute/feature file')
     parser.add_argument('--label-file', default='data/cora/cora_label.txt',
-                        help='node label file')
-    parser.add_argument('--emb-file', default='emb/unnamed_node_embs.txt',
-                        help='node embeddings file; suggest: data_method_dim_embs.txt')
-    parser.add_argument('--save-emb', default=False, type=bool,
-                        help='save emb to disk if True')
+                        help='node label file')
     parser.add_argument('--dim', default=128, type=int,
                         help='node embeddings dimensions')
     parser.add_argument('--task', default='lp_and_nc', choices=['none', 'lp', 'nc', 'lp_and_nc'],
@@ -60,10 +56,14 @@ def parse_args():
     #                     help='for lp task, train/test split, a ratio ranging [0.0, 1.0]')
     parser.add_argument('--label-reserved', default=0.7, type=float,
                         help='for nc task, train/test split, a ratio ranging [0.0, 1.0]')
-    parser.add_argument('--directed', default=False, type=bool,
+    parser.add_argument('--directed', default=False, action='store_true',
                         help='directed or undirected graph')
-    parser.add_argument('--weighted', default=False, type=bool,
+    parser.add_argument('--weighted', default=False, action='store_true',
                         help='weighted or unweighted graph')
+    parser.add_argument('--save-emb', default=False, action='store_true',
+                        help='save emb to disk if True')
+    parser.add_argument('--emb-file', default='emb/unnamed_node_embs.txt',
+                        help='node embeddings file; suggest: data_method_dim_embs.txt')
     #-------------------------------------------------method settings-----------------------------------------------------------
     parser.add_argument('--method', default='abrw', choices=['node2vec', 'deepwalk', 'line', 'gcn', 'grarep', 'tadw', 'abrw', 'asne', 'aane', 'attrpure', 'attrcomb', 'graphsage'],
@@ -93,9 +93,7 @@ def parse_args():
     parser.add_argument('--GraRep-kstep', default=4, type=int,
                         help='use k-step transition probability matrix; requires dim % Kstep == 0')
     parser.add_argument('--LINE-order', default=3, type=int,
-                        help='choices of the order(s), 1st order, 2nd order, 1st+2nd order')
-    parser.add_argument('--LINE-no-auto-save', action='store_true',
-                        help='no save the best embeddings when training LINE')
+                        help='choices of the order(s): 1->1st, 2->2nd, 3->1st+2nd')
     parser.add_argument('--LINE-negative-ratio', default=5, type=int,
                         help='the negative ratio')
     #for walk based methods; some Word2Vec SkipGram parameters are not specified here
@@ -187,12 +185,9 @@ def main(args):
                                   workers=args.workers, window=args.window_size, p=args.Node2Vec_p, q=args.Node2Vec_q)
     elif args.method == 'grarep':
         model = GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
-    elif args.method == 'line':
-        if args.label_file and not args.LINE_no_auto_save:
-            model = line.LINE(g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order,
-                              label_file=args.label_file, clf_ratio=args.label_reserved)
-        else:
-            model = line.LINE(g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order)
+    elif args.method == 'line': #if auto_save, use label to select the best embeddings by looking at node classification micro-F1 score
+        model = line.LINE(graph=g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order, batch_size=args.batch_size, negative_ratio=args.LINE_negative_ratio,
+                          label_file=args.label_file, clf_ratio=args.label_reserved, auto_save=True)
     elif args.method == 'asne':
         if args.task == 'nc':
             model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size,
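Note on the argparse fixup above: the old default=False, type=bool pattern is subtly broken, because bool() applied to any non-empty command-line string — including the string 'False' — is True; action='store_true' is the standard flag idiom. A standalone illustration, not part of the patch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--buggy', default=False, type=bool)             # old pattern
parser.add_argument('--fixed', default=False, action='store_true')  # fixed pattern

print(parser.parse_args(['--buggy', 'False']).buggy)  # True -- bool('False') is True
print(parser.parse_args([]).fixed)                    # False (flag absent)
print(parser.parse_args(['--fixed']).fixed)           # True  (flag present)

(The explicit default=False alongside store_true is redundant but harmless, since store_true already defaults to False.)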
From ce0ce07d4c2d641efc87d02aab1e862882f72283 Mon Sep 17 00:00:00 2001
From: Chengbin Hou
Date: Tue, 20 Nov 2018 21:47:12 +0000
Subject: [PATCH 13/13] line_checked_v0.0_minor_fixup

---
 src/libnrl/line.py | 12 ++++++------
 src/main.py        | 11 ++++-------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/libnrl/line.py b/src/libnrl/line.py
index 97e5e94..0161b8a 100644
--- a/src/libnrl/line.py
+++ b/src/libnrl/line.py
@@ -200,8 +200,8 @@ class _LINE(object):

 class LINE(object):
-    def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file=None, clf_ratio=0.5, auto_save=True):
-        print('auto save the best embeddings', auto_save)
+    def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file=None, clf_ratio=0.5, auto_save=True, best='micro'):
+        print('auto save the best embeddings: ', auto_save, ' by looking at: ', best, '-F1')
         self.rep_size = rep_size
         self.order = order
         self.best_result = 0
@@ -224,8 +224,8 @@ class LINE(object):
                 clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
                 result = clf.split_train_evaluate(X, Y, clf_ratio)

-                if result['micro'] > self.best_result:
-                    self.best_result = result['micro']
+                if result[best] > self.best_result:
+                    self.best_result = result[best]
                     if auto_save:
                         self.best_vector = self.vectors

@@ -240,8 +240,8 @@ class LINE(object):
                 clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
                 result = clf.split_train_evaluate(X, Y, clf_ratio)

-                if result['micro'] > self.best_result:
-                    self.best_result = result['micro']
+                if result[best] > self.best_result:
+                    self.best_result = result[best]
                     if auto_save:
                         self.best_vector = self.vectors

diff --git a/src/main.py b/src/main.py
index f9307ef..ced48d0 100644
--- a/src/main.py
+++ b/src/main.py
@@ -185,9 +185,9 @@ def main(args):
                                   workers=args.workers, window=args.window_size, p=args.Node2Vec_p, q=args.Node2Vec_q)
     elif args.method == 'grarep':
         model = GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
-    elif args.method == 'line': #if auto_save, use label to select the best embeddings by looking at node classification micro-F1 score
+    elif args.method == 'line': #if auto_save, use label to select the best embeddings by looking at micro / macro-F1 score
         model = line.LINE(graph=g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order, batch_size=args.batch_size, negative_ratio=args.LINE_negative_ratio,
-                          label_file=args.label_file, clf_ratio=args.label_reserved, auto_save=True)
+                          label_file=args.label_file, clf_ratio=args.label_reserved, auto_save=True, best='micro')
     elif args.method == 'asne':
         if args.task == 'nc':
             model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size,
                              X_test=None, Y_test=None, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file)
         else:
             model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size,
                              X_test=test_node_pairs, Y_test=test_edge_labels, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file)
-    elif args.method == 'graphsage':
-        model = graphsageAPI.graphsage_unsupervised_train(graph=g, graphsage_model = 'graphsage_mean')
-        #we follow the default parameters, see __init__.py in graphsage file
-        #choices: graphsage_mean, gcn ......
-        #model.save_embeddings(args.emb_file) #to do...
+    elif args.method == 'graphsage': #we follow the default parameters, see __init__.py in graphsage file
+        model = graphsageAPI.graphsage_unsupervised_train(graph=g, graphsage_model = 'graphsage_mean')
     elif args.method == 'gcn':
         model = graphsageAPI.graphsage_unsupervised_train(graph=g, graphsage_model = 'gcn')  #graphsage-gcn
     else:
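Note: patches 10 and 11 both route through utils.row_as_probdist, whose body never appears in this series. Below is a minimal sketch consistent with the call sites shown here — row_as_probdist(A, preserve_zeros) in abrw and row_as_probdist(adj, dense_output=True, preserve_zeros=False) in grarep — and with the comments about isolated nodes; the parameter order is inferred from those call sites, and the real utils.py implementation may differ (e.g. it presumably also handles scipy sparse input natively):

import numpy as np
from scipy import sparse

def row_as_probdist(mat, preserve_zeros=False, dense_output=False):
    # Normalize each row of mat so it sums to 1.0.
    # An all-zero row (isolated node) either stays all-zero
    # (preserve_zeros=True) or becomes the uniform row [1/n, ..., 1/n].
    mat = np.asarray(mat, dtype=float)  # sketch: dense input only
    n = mat.shape[1]
    row_sum = mat.sum(axis=1)
    zero_rows = row_sum == 0
    row_sum[zero_rows] = 1.0            # avoid division by zero
    dist = mat / row_sum[:, None]
    if not preserve_zeros:
        dist[zero_rows] = 1.0 / n       # uniform 1/n row for isolated nodes
    return dist if dense_output else sparse.csr_matrix(dist)

Under this reading, the abrw call row_as_probdist(A, preserve_zeros) toggles only the isolated-node handling, while grarep additionally asks for dense output because its downstream log/tile algebra works on dense matrices.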