Merge branch 'master' into parallel
commit b48f2e38b7
@@ -64,7 +64,9 @@ class ABRW(object):
'''
print("obtaining biased transition matrix where each row sums up to 1.0...")

T_A = row_as_probdist(A) # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row
preserve_zeros = False # compare them: 1) accuracy; 2) efficiency
T_A = row_as_probdist(A, preserve_zeros) # norm adj/struc info mat; for isolated node, return all-zeros row or all-1/m row
print('Preserve zero rows of the adj matrix: ', preserve_zeros)

t1 = time.time()
X_sim = pairwise_similarity(X) # attr similarity mat; X_sim is a square mat, but X is not
@@ -72,7 +74,7 @@ class ABRW(object):
t2 = time.time()
print(f'keep the top {self.topk} attribute similar nodes w.r.t. a node')
cutoff = np.partition(X_sim, -self.topk, axis=1)[:, -self.topk:].min(axis=1)
X_sim[(X_sim < cutoff)] = 0
X_sim[(X_sim < cutoff)] = 0 # improve both accuracy and efficiency
X_sim = sparse.csr_matrix(X_sim)

t3 = time.time()
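For reference, a small self-contained sketch (NumPy/SciPy only, not code from this commit) of the per-row top-k cut used above: keep each node's topk most similar attribute neighbours and store the result as a sparse matrix.

import numpy as np
from scipy import sparse

def keep_topk_per_row(X_sim, topk):
    # per-row cutoff = the topk-th largest similarity in that row
    cutoff = np.partition(X_sim, -topk, axis=1)[:, -topk:].min(axis=1)
    # drop everything below the cutoff, then switch to sparse storage as in the hunk
    X_sim = np.where(X_sim < cutoff[:, None], 0.0, X_sim)
    return sparse.csr_matrix(X_sim)

X = np.random.rand(5, 5)              # toy similarity matrix
print(keep_topk_per_row(X, topk=2).toarray())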
@@ -22,9 +22,9 @@ class Graph(object):
#--------------------commonly used APIs that will modify graph-------------------------
#--------------------------------------------------------------------------------------
def node_mapping(self):
""" node id and index mapping;
based on the order given by networkx G.nodes();
NB: updating is needed if any node is added/removed;
""" node id and index mapping; \n
based on the order given by networkx G.nodes(); \n
NB: updating is needed if any node is added/removed; \n
"""
i = 0 #node index
self.look_up_dict = {} #init
@@ -35,10 +35,10 @@ class Graph(object):
i += 1

def read_adjlist(self, path, directed=False):
""" read adjacency list format graph;
support unweighted and (un)directed graph;
format: see https://networkx.github.io/documentation/stable/reference/readwrite/adjlist.html
NB: not supoort weighted graph
""" read adjacency list format graph; \n
support unweighted and (un)directed graph; \n
format: see https://networkx.github.io/documentation/stable/reference/readwrite/adjlist.html \n
NB: not supoort weighted graph \n
"""
if directed:
self.G = nx.read_adjlist(path, create_using=nx.DiGraph())
@@ -47,9 +47,9 @@ class Graph(object):
self.node_mapping() #update node id index mapping

def read_edgelist(self, path, weighted=False, directed=False):
""" read edge list format graph;
support (un)weighted and (un)directed graph;
format: see https://networkx.github.io/documentation/stable/reference/readwrite/edgelist.html
""" read edge list format graph; \n
support (un)weighted and (un)directed graph; \n
format: see https://networkx.github.io/documentation/stable/reference/readwrite/edgelist.html \n
"""
if directed:
self.G = nx.read_edgelist(path, create_using=nx.DiGraph())
@@ -57,10 +57,19 @@ class Graph(object):
self.G = nx.read_edgelist(path, create_using=nx.Graph())
self.node_mapping() #update node id index mapping

def add_edge_weight(self, equal_weight=1.0):
''' add weights to networkx graph; \n
currently only support adding 1.0 to all existing edges; \n
some NE method may require 'weight' attribute spcified in networkx graph; \n
to do... support user-specified weights e.g. from file (similar to read_node_attr): node_id1 node_id2 weight \n
https://networkx.github.io/documentation/stable/reference/generated/networkx.classes.function.set_edge_attributes.html#networkx.classes.function.set_edge_attributes
'''
nx.set_edge_attributes(self.G, equal_weight, 'weight') #check the url and use dict to assign diff weights to diff edges

def read_node_attr(self, path):
""" read node attributes and store as NetworkX graph {'node_id': {'attr': values}}
input file format: node_id1 attr1 attr2 ... attrM
node_id2 attr1 attr2 ... attrM
""" read node attributes and store as NetworkX graph {'node_id': {'attr': values}} \n
input file format: node_id1 attr1 attr2 ... attrM \n
node_id2 attr1 attr2 ... attrM \n
"""
with open(path, 'r') as fin:
for l in fin.readlines():
@@ -68,20 +77,20 @@ class Graph(object):
self.G.nodes[vec[0]]['attr'] = np.array([float(x) for x in vec[1:]])

def read_node_label(self, path):
""" todo... read node labels and store as NetworkX graph {'node_id': {'label': values}}
input file format: node_id1 labels
node_id2 labels
with open(path, 'r') as fin:
for l in fin.readlines():
vec = l.split()
self.G.nodes[vec[0]]['label'] = np.array([float(x) for x in vec[1:]])
""" todo... read node labels and store as NetworkX graph {'node_id': {'label': values}} \n
input file format: node_id1 labels \n
node_id2 labels \n
with open(path, 'r') as fin: \n
for l in fin.readlines(): \n
vec = l.split() \n
self.G.nodes[vec[0]]['label'] = np.array([float(x) for x in vec[1:]]) \n
"""
pass #to do...

def remove_edge(self, ratio=0.0):
""" randomly remove edges/links
ratio: the percentage of edges to be removed
edges_removed: return removed edges, each of which is a pair of nodes
""" randomly remove edges/links \n
ratio: the percentage of edges to be removed \n
edges_removed: return removed edges, each of which is a pair of nodes \n
"""
num_edges_removed = int( ratio * self.G.number_of_edges() )
#random.seed(2018)
@@ -92,13 +101,13 @@ class Graph(object):
return edges_removed

def remove_node_attr(self, ratio):
""" todo... randomly remove node attributes;
""" todo... randomly remove node attributes; \n
"""
pass #to do...

def remove_node(self, ratio):
""" todo... randomly remove nodes;
#self.node_mapping() #update node id index mapping is needed
""" todo... randomly remove nodes; \n
#self.node_mapping() #update node id index mapping is needed \n
"""
pass #to do...

@@ -106,8 +115,8 @@ class Graph(object):
#--------------------commonly used APIs that will not modify graph-------------------------
#------------------------------------------------------------------------------------------
def get_adj_mat(self, is_sparse=True):
""" return adjacency matrix;
use 'csr' format for sparse matrix
""" return adjacency matrix; \n
use 'csr' format for sparse matrix \n
"""
if is_sparse:
return nx.to_scipy_sparse_matrix(self.G, nodelist=self.look_back_list, format='csr', dtype='float64')
@@ -115,8 +124,8 @@ class Graph(object):
return nx.to_numpy_matrix(self.G, nodelist=self.look_back_list, dtype='float64')

def get_attr_mat(self, is_sparse=True):
""" return attribute matrix;
use 'csr' format for sparse matrix
""" return attribute matrix; \n
use 'csr' format for sparse matrix \n
"""
attr_dense_narray = np.vstack([self.G.nodes[self.look_back_list[i]]['attr'] for i in range(self.get_num_nodes())])
if is_sparse:
@@ -132,6 +141,10 @@ class Graph(object):
""" return the number of edges """
return nx.number_of_edges(self.G)

def get_density(self):
""" return the density of a graph """
return nx.density(self.G)

def get_num_isolates(self):
""" return the number of isolated nodes """
return len(list(nx.isolates(self.G)))
@@ -153,8 +166,8 @@ class Graph(object):
return list(nx.common_neighbors(self.G, node1, node2))

def get_centrality(self, centrality_type='degree'):
""" todo... return specified type of centrality
see https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html
""" todo... return specified type of centrality \n
see https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html \n
"""
pass #to do...
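For context, a minimal networkx-only sketch of what the Graph wrapper's read/add/get helpers amount to (toy in-memory graph; look_up_dict/look_back_list mirror the wrapper's naming, everything else here is illustrative):

import networkx as nx

# tiny stand-in for read_edgelist() + add_edge_weight() + node_mapping()
G = nx.Graph([('a', 'b'), ('b', 'c'), ('c', 'a'), ('c', 'd')])
nx.set_edge_attributes(G, 1.0, 'weight')                         # add_edge_weight(equal_weight=1.0)
look_back_list = list(G.nodes())                                 # index -> node id
look_up_dict = {nid: i for i, nid in enumerate(look_back_list)}  # node id -> index

# get_adj_mat(is_sparse=True), ordered by look_back_list as in the hunk
A = nx.to_scipy_sparse_matrix(G, nodelist=look_back_list, format='csr', dtype='float64')
print(look_up_dict, A.shape)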
@@ -1,7 +1,16 @@
"""
a matrix factorization based NE method: GraRep

modified by Chengbin Hou 2018

originally from https://github.com/thunlp/OpenNE/blob/master/src/openne/grarep.py
"""

import math
import numpy as np
from numpy import linalg as la
from sklearn.preprocessing import normalize
from .utils import row_as_probdist

class GraRep(object):

@@ -13,6 +22,7 @@ class GraRep(object):
self.train()

def getAdjMat(self):
'''
graph = self.g.G
node_size = self.g.get_num_nodes()
look_up = self.g.look_up_dict
@@ -22,6 +32,9 @@ class GraRep(object):
adj[look_up[edge[1]]][look_up[edge[0]]] = 1.0
# ScaleSimMat
return np.matrix(adj/np.sum(adj, axis=1))
'''
adj = self.g.get_adj_mat() #for isolated node row, normalize to [1/n, 1/n, ...]
return row_as_probdist(adj, dense_output=True, preserve_zeros=False)

def GetProbTranMat(self, Ak):
probTranMat = np.log(Ak/np.tile(
@@ -1,10 +1,19 @@
"""
ANE method: Text Associated DeepWalk (TADW)

modified by Chengbin Hou 2018

originally from https://github.com/thunlp/OpenNE/blob/master/src/openne/line.py
the main diff: adapt to our graph.py APIs; and use 'micro-F1' to find the best emb if auto_save
"""

from __future__ import print_function
import random
import math
import numpy as np
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from .classify import ncClassifier, lpClassifier, read_node_label, read_edge_label
from .classify import ncClassifier, lpClassifier, read_node_label, read_edge_label #to do... try use lpClassifier to choose best embeddings?


class _LINE(object):
@@ -32,8 +41,8 @@ class _LINE(object):
self.sign = tf.placeholder(tf.float32, [None])

cur_seed = random.getrandbits(32)
self.embeddings = tf.get_variable(name="embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer = tf.contrib.layers.xavier_initializer(uniform = False, seed=cur_seed))
self.context_embeddings = tf.get_variable(name="context_embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer = tf.contrib.layers.xavier_initializer(uniform = False, seed=cur_seed))
self.embeddings = tf.get_variable(name="embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed))
self.context_embeddings = tf.get_variable(name="context_embeddings"+str(self.order), shape=[self.node_size, self.rep_size], initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed))
# self.h_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.h), 1)
# self.t_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.t), 1)
# self.t_e_context = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.context_embeddings, self.t), 1)
@@ -61,7 +70,7 @@ class _LINE(object):
self.t : t,
self.sign : sign,
}
_, cur_loss = self.sess.run([self.train_op, self.loss],feed_dict)
_, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict)
sum_loss += cur_loss
batch_id += 1
print('epoch:{} sum of loss:{!s}'.format(self.cur_epoch, sum_loss))
@@ -163,7 +172,7 @@ class _LINE(object):
cur_large_block = large_block[num_large_block]
self.edge_prob[cur_small_block] = norm_prob[cur_small_block]
self.edge_alias[cur_small_block] = cur_large_block
norm_prob[cur_large_block] = norm_prob[cur_large_block] + norm_prob[cur_small_block] -1
norm_prob[cur_large_block] = norm_prob[cur_large_block] + norm_prob[cur_small_block]-1
if norm_prob[cur_large_block] < 1:
small_block[num_small_block] = cur_large_block
num_small_block += 1
@@ -188,55 +197,57 @@ class _LINE(object):
vectors[look_back[i]] = embedding
return vectors


class LINE(object):

def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file = None, clf_ratio = 0.5, auto_save = True):
def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file=None, clf_ratio=0.5, auto_save=True, best='micro'):
print('auto save the best embeddings: ', auto_save, ' by looking at: ', best, '-F1')
self.rep_size = rep_size
self.order = order
self.best_result = 0
self.vectors = {}
if order == 3:
self.g = graph

if not self.g.get_isweighted(): #add equal weights 1.0 to all existing edges
self.g.add_edge_weight(equal_weight=1.0) #add 'weight' to networkx graph

if order == 3: #if order 3 i.e. concat embeddings by 1 and 2
self.model1 = _LINE(graph, rep_size/2, batch_size, negative_ratio, order=1)
self.model2 = _LINE(graph, rep_size/2, batch_size, negative_ratio, order=2)
for i in range(epoch):
self.model1.train_one_epoch()
self.model2.train_one_epoch()
'''
if label_file:
self.get_embeddings()
X, Y = read_node_label(label_file)
print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
result = clf.split_train_evaluate(X, Y, clf_ratio)

if result['macro'] > self.best_result:
self.best_result = result['macro']
if result[best] > self.best_result:
self.best_result = result[best]
if auto_save:
self.best_vector = self.vectors
'''

else:
else: #if order 1 or 2
self.model = _LINE(graph, rep_size, batch_size, negative_ratio, order=self.order)
for i in range(epoch):
self.model.train_one_epoch()
'''
if label_file:
self.get_embeddings()
X, Y = read_node_label(label_file)
print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
result = clf.split_train_evaluate(X, Y, clf_ratio)

if result['macro'] > self.best_result:
self.best_result = result['macro']
if result[best] > self.best_result:
self.best_result = result[best]
if auto_save:
self.best_vector = self.vectors
'''

self.get_embeddings()
if auto_save and label_file:
#self.vectors = self.best_vector
pass
self.vectors = self.best_vector

def get_embeddings(self):
self.last_vectors = self.vectors
@@ -256,4 +267,4 @@ class LINE(object):
for node, vec in self.vectors.items():
fout.write("{} {}\n".format(node,
' '.join([str(x) for x in vec])))
fout.close()
fout.close()
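The auto_save change above keeps the embeddings with the best micro-F1 seen so far. A hypothetical, self-contained sketch of that pattern (random stand-ins for embeddings and labels, sklearn only; not the repo's ncClassifier):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

rng = np.random.RandomState(0)
labels = rng.randint(0, 3, 100)                      # hypothetical node labels
best_result, best_vectors = 0.0, None

for epoch in range(3):
    vectors = rng.rand(100, 16)                      # stand-in for the embeddings after this epoch
    train, test = np.arange(70), np.arange(70, 100)  # clf_ratio = 0.7 style split
    clf = LogisticRegression(max_iter=200).fit(vectors[train], labels[train])
    result = f1_score(labels[test], clf.predict(vectors[test]), average='micro')
    if result > best_result:                         # keep the best-scoring embeddings
        best_result, best_vectors = result, vectors.copy()

print('best micro-F1:', round(best_result, 3))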
@@ -1,3 +1,11 @@
"""
NE method: DeepWalk and Node2Vec

modified by Chengbin Hou and Zeyu Dong 2018

originally from https://github.com/thunlp/OpenNE/blob/master/src/openne/node2vec.py
"""

from __future__ import print_function
import time
import warnings
@@ -7,9 +15,7 @@ from . import walker


class Node2vec(object):

def __init__(self, graph, path_length, num_paths, dim, p=1.0, q=1.0, dw=False, **kwargs):

kwargs["workers"] = kwargs.get("workers", 1)
if dw:
kwargs["hs"] = 1
@@ -18,9 +24,9 @@ class Node2vec(object):

self.graph = graph
if dw:
self.walker = walker.BasicWalker(graph, workers=kwargs["workers"])
self.walker = walker.BasicWalker(graph, workers=kwargs["workers"]) #walker for deepwalk
else:
self.walker = walker.Walker(graph, p=p, q=q, workers=kwargs["workers"])
self.walker = walker.Walker(graph, p=p, q=q, workers=kwargs["workers"]) #walker for node2vec
print("Preprocess transition probs...")
self.walker.preprocess_transition_probs()
sentences = self.walker.simulate_walks(num_walks=num_paths, walk_length=path_length)
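Once walks are generated they are fed to Word2Vec as sentences; dw=True switches on hierarchical softmax (hs=1), and p=q=1.0 makes node2vec behave like DeepWalk. A toy sketch of that last step, assuming gensim 3.x parameter names and made-up walks:

from gensim.models import Word2Vec

# hypothetical walks produced by a walker (lists of node-id strings)
walks = [['0', '1', '2', '1'], ['2', '1', '0', '3'], ['3', '0', '2', '1']]

# dw=True in the hunk maps to hierarchical softmax (hs=1); sg=1 selects skip-gram
model = Word2Vec(walks, size=8, window=2, min_count=0, sg=1, hs=1, workers=1)
print(model.wv['0'].shape)   # (8,)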
@@ -4,6 +4,7 @@ ANE method: Text Associated DeepWalk (TADW)
modified by Chengbin Hou 2018

originally from https://github.com/thunlp/OpenNE/blob/master/src/openne/tadw.py
the main diff: adapt to our graph.py APIs
to do... sparse computation and remove unnecessary self vars;
otherwise, not scalable to large network;
"""
@@ -40,7 +41,7 @@ class TADW(object):
return adj/np.sum(adj, axis=1) #original may get numerical error sometimes...
'''
A = self.g.get_adj_mat() #by defalut, return a sparse matrix
return np.array(row_as_probdist(A).todense()) #only support np.array, otherwise dim error...
return np.array(row_as_probdist(A, dense_output=True, preserve_zeros=True)) #only support np.array, otherwise dim error...


def getT(self):
@@ -17,7 +17,7 @@ from scipy import sparse
# ---------------------------------ulits for calculation--------------------------------


def row_as_probdist(mat): #to do... also return dense matrix via a flag setting
def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
"""Make each row of matrix sums up to 1.0, i.e., a probability distribution.
Support both dense and sparse matrix.

@@ -25,11 +25,11 @@ def row_as_probdist(mat): #to do... also return dense matrix via a flag setting
----------
mat : scipy sparse matrix or dense matrix or numpy array
The matrix to be normalized

Note
----
For row with all entries 0, we normalize it to a vector with all entries 1/n

dense_output : bool
whether forced dense output
perserve_zeros : bool
If False, for row with all entries 0, we normalize it to a vector with all entries 1/n.
Leave 0 otherwise
Returns
-------
dense or sparse matrix:
@@ -42,8 +42,11 @@ def row_as_probdist(mat): #to do... also return dense matrix via a flag setting
row_sum[zero_rows] = 1
diag = sparse.dia_matrix((1 / row_sum, 0), (mat.shape[0], mat.shape[0]))
mat = diag.dot(mat)
mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))
if not preserve_zeros:
mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))

if dense_output and sparse.issparse(mat):
return mat.todense()
return mat
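A dense-only sketch of what row_as_probdist(mat, dense_output, preserve_zeros) is doing (illustrative re-implementation for NumPy arrays, not the repo's sparse version):

import numpy as np

def row_as_probdist_dense(mat, preserve_zeros=False):
    mat = np.asarray(mat, dtype=float)
    row_sum = mat.sum(axis=1)
    zero_rows = row_sum == 0
    row_sum[zero_rows] = 1                    # avoid division by zero
    mat = mat / row_sum[:, None]              # each row now sums to 1.0 (or stays all zero)
    if not preserve_zeros:
        mat[zero_rows] = 1.0 / mat.shape[1]   # all-zero rows -> uniform distribution 1/n
    return mat

print(row_as_probdist_dense(np.array([[1, 1, 2], [0, 0, 0], [3, 0, 1]])))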
@@ -14,12 +14,7 @@ import numpy as np
from networkx import nx


def deepwalk_walk_wrapper(class_instance, walk_length, start_node):
class_instance.deepwalk_walk(walk_length, start_node)

# ===========================================ABRW-weighted-walker============================================


class WeightedWalker:
''' Weighted Walker for Attributed Biased Randomw Walks (ABRW) method
'''
@@ -28,11 +23,11 @@ class WeightedWalker:
self.look_back_list = node_id_map
self.T = transition_mat
self.workers = workers
# self.G = nx.to_networkx_graph(self.T, create_using=nx.Graph()) # wrong... will return symt transition mat
self.G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph()) # reconstructed graph based on transition matrix
# print(nx.adjacency_matrix(self.G).todense()[0:6, 0:6])
# self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.Graph()) # wrong... will return symt transition mat
self.rec_G = nx.to_networkx_graph(self.T, create_using=nx.DiGraph()) # reconstructed "directed" "weighted" graph based on transition matrix
# print(nx.adjacency_matrix(self.rec_G).todense()[0:6, 0:6])
# print(transition_mat[0:6, 0:6])
# print(nx.adjacency_matrix(self.G).todense()==transition_mat)
# print(nx.adjacency_matrix(self.rec_G).todense()==transition_mat)

# alias sampling for ABRW-------------------------
def simulate_walks(self, num_walks, walk_length):
@@ -40,7 +35,7 @@ class WeightedWalker:
P_G = self.G

t1 = time.time()
self.preprocess_transition_probs(G=self.G) # construct alias table; adapted from node2vec
self.preprocess_transition_probs(weighted_G=self.rec_G) # construct alias table; adapted from node2vec
t2 = time.time()

global alias_nodes
@@ -48,23 +43,28 @@ class WeightedWalker:
print(f'Time for construct alias table: {(t2-t1):.2f}')

walks = []
nodes = list(self.G.nodes())
print('Walk iteration:')
nodes = list(self.rec_G.nodes())
pool = multiprocessing.Pool(self.workers)
for walk_iter in range(num_walks):
print(str(walk_iter+1), '/', str(num_walks))
t1 = time.time()
random.shuffle(nodes)
walks += pool.map(functools.partial(node2vec_walk, walk_length=walk_length), nodes)
t2 = time.time()
print(f'Walk iteration: {walk_iter+1}/{num_walks}; time cost: {(t2-t1):.2f}')
pool.close()
pool.join()
del alias_nodes, P_G

for i in range(len(walks)): # use ind to retrive orignal node ID
for i in range(len(walks)): # use ind to retrive orignal node ID
for j in range(len(walks[0])):
walks[i][j] = self.look_back_list[int(walks[i][j])]
return walks

def preprocess_transition_probs(self, G):
def preprocess_transition_probs(self, weighted_G):
''' reconstructed G mush be weighted; \n
return a dict of alias table for each node
'''
G = weighted_G
alias_nodes = {}
nodes = G.nodes()
@@ -95,19 +95,23 @@ def get_alias_node(node):
probs = [P_G[node][nbr]['weight'] for nbr in P_G.neighbors(node)]
return alias_setup(probs)



def deepwalk_walk_wrapper(class_instance, walk_length, start_node):
class_instance.deepwalk_walk(walk_length, start_node)

# ===========================================deepWalk-walker============================================


class BasicWalker:
def __init__(self, G, workers):
self.G = G.G
self.look_up_dict = G.look_up_dict
def __init__(self, g, workers):
self.g = g
self.node_size = g.get_num_nodes()
self.look_up_dict = g.look_up_dict

def deepwalk_walk(self, walk_length, start_node):
'''
Simulate a random walk starting from start node.
'''
G = self.G
G = self.g.G
look_up_dict = self.look_up_dict
node_size = self.node_size

@@ -126,37 +130,48 @@ class BasicWalker:
'''
Repeatedly simulate random walks from each node.
'''
G = self.G
G = self.g.G
walks = []
nodes = list(G.nodes())
print('Walk iteration:')
for walk_iter in range(num_walks):
t1 = time.time()
# pool = multiprocessing.Pool(processes = 4)
print(str(walk_iter+1), '/', str(num_walks))
random.shuffle(nodes)
for node in nodes:
# walks.append(pool.apply_async(deepwalk_walk_wrapper, (self, walk_length, node, )))
walks.append(self.deepwalk_walk(walk_length=walk_length, start_node=node))
# pool.close()
# pool.join()
t2 = time.time()
print(f'Walk iteration: {walk_iter+1}/{num_walks}; time cost: {(t2-t1):.2f}')
# print(len(walks))
return walks

# ===========================================node2vec-walker============================================
class Walker:
def __init__(self, G, p, q, workers):
self.G = G.G
def __init__(self, g, p, q, workers):
self.g = g
self.p = p
self.q = q
self.node_size = G.node_size
self.look_up_dict = G.look_up_dict

if self.g.get_isweighted():
#print('is weighted graph: ', self.g.get_isweighted())
#print(self.g.get_adj_mat(is_sparse=False)[0:6,0:6])
pass
else: #otherwise, add equal weights 1.0 to all existing edges
#print('is weighted graph: ', self.g.get_isweighted())
self.g.add_edge_weight(equal_weight=1.0) #add 'weight' to networkx graph
#print(self.g.get_adj_mat(is_sparse=False)[0:6,0:6])

self.node_size = g.get_num_nodes()
self.look_up_dict = g.look_up_dict

def node2vec_walk(self, walk_length, start_node):
'''
Simulate a random walk starting from start node.
'''
G = self.G
G = self.g.G
alias_nodes = self.alias_nodes
alias_edges = self.alias_edges
look_up_dict = self.look_up_dict
@@ -172,9 +187,7 @@ class Walker:
walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
else:
prev = walk[-2]
pos = (prev, cur)
next = cur_nbrs[alias_draw(alias_edges[pos][0],
alias_edges[pos][1])]
next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], alias_edges[(prev, cur)][1])]
walk.append(next)
else:
break
@@ -184,22 +197,23 @@ class Walker:
'''
Repeatedly simulate random walks from each node.
'''
G = self.G
G = self.g.G
walks = []
nodes = list(G.nodes())
print('Walk iteration:')
for walk_iter in range(num_walks):
print(str(walk_iter+1), '/', str(num_walks))
t1 = time.time()
random.shuffle(nodes)
for node in nodes:
walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
t2 = time.time()
print(f'Walk iteration: {walk_iter+1}/{num_walks}; time cost: {(t2-t1):.2f}')
return walks

def get_alias_edge(self, src, dst):
'''
Get the alias edge setup lists for a given edge.
'''
G = self.G
G = self.g.G
p = self.p
q = self.q

@@ -213,18 +227,16 @@ class Walker:
unnormalized_probs.append(G[dst][dst_nbr]['weight']/q)
norm_const = sum(unnormalized_probs)
normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]

return alias_setup(normalized_probs)

def preprocess_transition_probs(self):
'''
Preprocessing of transition probabilities for guiding the random walks.
'''
G = self.G

G = self.g.G
alias_nodes = {}
for node in G.nodes():
unnormalized_probs = [G[node][nbr]['weight'] for nbr in G.neighbors(node)]
unnormalized_probs = [G[node][nbr]['weight'] for nbr in G.neighbors(node)] #pick prob of neighbors with non-zero weight
norm_const = sum(unnormalized_probs)
normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
alias_nodes[node] = alias_setup(normalized_probs)
@@ -233,16 +245,20 @@ class Walker:
triads = {}

look_up_dict = self.look_up_dict
node_size = self.node_size #to do... node2vec directed and undirected
for edge in G.edges(): #https://github.com/aditya-grover/node2vec/blob/master/src/node2vec.py
alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
node_size = self.node_size
if self.g.get_isdirected():
for edge in G.edges():
alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
else: #if undirected, duplicate the reverse direction; otherwise may get key error
for edge in G.edges():
alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])

self.alias_nodes = alias_nodes
self.alias_edges = alias_edges

return


#========================================= utils: alias sampling method ====================================================
def alias_setup(probs):
'''
Compute utility lists for non-uniform sampling from discrete distributions.
@@ -262,7 +278,7 @@ def alias_setup(probs):
else:
larger.append(kk)

while len(smaller) > 0 and len(larger) > 0:
while len(smaller) > 0 and len(larger) > 0: #it is all about use large prob to compensate small prob untill reach the average
small = smaller.pop()
large = larger.pop()

@@ -273,7 +289,7 @@ def alias_setup(probs):
else:
larger.append(large)

return J, q
return J, q #the values in J are indexes; it is possible to have repeated indexes if that that index have large prob to compensate others


def alias_draw(J, q):
@@ -282,8 +298,8 @@ def alias_draw(J, q):
'''
K = len(J)

kk = int(np.floor(np.random.rand()*K))
if np.random.rand() < q[kk]:
return kk
kk = int(np.floor(np.random.rand()*K)) #randomly choose a nbr (an index)
if np.random.rand() < q[kk]: #use alias table to choose
return kk #either that nbr node (an index)
else:
return J[kk]
return J[kk] #or the nbr's alias node (an index)
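For reference, a compact self-contained version of the alias sampling pair (alias_setup/alias_draw) that all the walkers above rely on; this is the standard O(1)-draw alias method, not copied verbatim from the repo:

import numpy as np

def alias_setup(probs):
    # build alias tables J (alias indices) and q (scaled probabilities)
    K = len(probs)
    q = np.array(probs, dtype=float) * K
    J = np.zeros(K, dtype=int)
    smaller = [k for k in range(K) if q[k] < 1.0]
    larger = [k for k in range(K) if q[k] >= 1.0]
    while smaller and larger:
        small, large = smaller.pop(), larger.pop()
        J[small] = large                          # the large entry compensates the small one
        q[large] = q[large] + q[small] - 1.0
        (smaller if q[large] < 1.0 else larger).append(large)
    return J, q

def alias_draw(J, q):
    # O(1) draw from the discrete distribution encoded by (J, q)
    kk = int(np.floor(np.random.rand() * len(J)))
    return kk if np.random.rand() < q[kk] else J[kk]

J, q = alias_setup([0.1, 0.3, 0.6])
counts = np.zeros(3)
for _ in range(10000):
    counts[alias_draw(J, q)] += 1
print(counts / counts.sum())   # roughly [0.1, 0.3, 0.6]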
src/main.py
@@ -43,11 +43,7 @@ def parse_args():
parser.add_argument('--attribute-file', default='data/cora/cora_attr.txt',
help='node attribute/feature file')
parser.add_argument('--label-file', default='data/cora/cora_label.txt',
help='node label file')
parser.add_argument('--emb-file', default='emb/unnamed_node_embs.txt',
help='node embeddings file; suggest: data_method_dim_embs.txt')
parser.add_argument('--save-emb', default=False, type=bool,
help='save emb to disk if True')
help='node label file')
parser.add_argument('--dim', default=128, type=int,
help='node embeddings dimensions')
parser.add_argument('--task', default='lp_and_nc', choices=['none', 'lp', 'nc', 'lp_and_nc'],
@@ -60,10 +56,14 @@ def parse_args():
# help='for lp task, train/test split, a ratio ranging [0.0, 1.0]')
parser.add_argument('--label-reserved', default=0.7, type=float,
help='for nc task, train/test split, a ratio ranging [0.0, 1.0]')
parser.add_argument('--directed', default=False, type=bool,
parser.add_argument('--directed', default=False, action='store_true',
help='directed or undirected graph')
parser.add_argument('--weighted', default=False, type=bool,
parser.add_argument('--weighted', default=False, action='store_true',
help='weighted or unweighted graph')
parser.add_argument('--save-emb', default=False, action='store_true',
help='save emb to disk if True')
parser.add_argument('--emb-file', default='emb/unnamed_node_embs.txt',
help='node embeddings file; suggest: data_method_dim_embs.txt')
#-------------------------------------------------method settings-----------------------------------------------------------
parser.add_argument('--method', default='abrw', choices=['node2vec', 'deepwalk', 'line', 'gcn', 'grarep', 'tadw',
'abrw', 'asne', 'aane', 'attrpure', 'attrcomb', 'graphsage'],
@@ -86,16 +86,14 @@ def parse_args():
help='balance struc and attr info; ranging [0, inf]')
parser.add_argument('--AttrComb-mode', default='concat', type=str,
help='choices of mode: concat, elementwise-mean, elementwise-max')
parser.add_argument('--Node2Vec-p', default=0.5, type=float,
parser.add_argument('--Node2Vec-p', default=0.5, type=float, #if p=q=1.0 node2vec = deepwalk
help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]')
parser.add_argument('--Node2Vec-q', default=0.5, type=float,
help='trade-off BFS and DFS; rid search [0.25; 0.50; 1; 2; 4]')
parser.add_argument('--GraRep-kstep', default=4, type=int,
help='use k-step transition probability matrix')
help='use k-step transition probability matrix, error if dim%Kstep!=0')
parser.add_argument('--LINE-order', default=3, type=int,
help='choices of the order(s), 1st order, 2nd order, 1st+2nd order')
parser.add_argument('--LINE-no-auto-save', action='store_true',
help='no save the best embeddings when training LINE')
help='choices of the order(s): 1->1st, 2->2nd, 3->1st+2nd')
parser.add_argument('--LINE-negative-ratio', default=5, type=int,
help='the negative ratio')
#for walk based methods; some Word2Vec SkipGram parameters are not specified here
@@ -179,6 +177,17 @@ def main(args):
elif args.method == 'attrcomb':
model = attrcomb.ATTRCOMB(graph=g, dim=args.dim, comb_with='deepwalk', number_walks=args.number_walks, walk_length=args.walk_length,
window=args.window_size, workers=args.workers, comb_method=args.AttrComb_mode) #comb_method: concat, elementwise-mean, elementwise-max
elif args.method == 'deepwalk':
model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.dim,
workers=args.workers, window=args.window_size, dw=True)
elif args.method == 'node2vec':
model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.dim,
workers=args.workers, window=args.window_size, p=args.Node2Vec_p, q=args.Node2Vec_q)
elif args.method == 'grarep':
model = GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
elif args.method == 'line': #if auto_save, use label to justifiy the best embeddings by looking at micro / macro-F1 score
model = line.LINE(graph=g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order, batch_size=args.batch_size, negative_ratio=args.LINE_negative_ratio,
label_file=args.label_file, clf_ratio=args.label_reserved, auto_save=True, best='micro')
elif args.method == 'asne':
if args.task == 'nc':
model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size,
@@ -186,26 +195,8 @@ def main(args):
else:
model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size,
X_test=test_node_pairs, Y_test=test_edge_labels, task=args.task, nc_ratio=args.label_reserved, lp_ratio=args.link_reserved, label_file=args.label_file)
elif args.method == 'deepwalk':
model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
num_paths=args.number_walks, dim=args.dim,
workers=args.workers, window=args.window_size, dw=True)
elif args.method == 'node2vec':
model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.dim,
workers=args.workers, p=args.Node2Vec_p, q=args.Node2Vec_q, window=args.window_size)
elif args.method == 'grarep':
model = GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
elif args.method == 'line':
if args.label_file and not args.LINE_no_auto_save:
model = line.LINE(g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order,
label_file=args.label_file, clf_ratio=args.label_reserved)
else:
model = line.LINE(g, epoch = args.epochs, rep_size=args.dim, order=args.LINE_order)
elif args.method == 'graphsage':
model = graphsageAPI.graphsage_unsupervised_train(graph=g, graphsage_model = 'graphsage_mean')
#we follow the default parameters, see __inti__.py in graphsage file
#choices: graphsage_mean, gcn ......
#model.save_embeddings(args.emb_file) #to do...
elif args.method == 'graphsage': #we follow the default parameters, see __inti__.py in graphsage file
model = graphsageAPI.graphsage_unsupervised_train(graph=g, graphsage_model = 'graphsage_mean')
elif args.method == 'gcn':
model = graphsageAPI.graphsage_unsupervised_train(graph=g, graphsage_model = 'gcn') #graphsage-gcn
else:
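The switch from type=bool to action='store_true' above matters because argparse passes the flag's value through bool(), and bool('False') is still True, so the old flags could never be turned off from the command line. A minimal sketch of the fixed behaviour:

import argparse

p = argparse.ArgumentParser()
# with action='store_true' the flag is False unless it is present on the command line
p.add_argument('--directed', default=False, action='store_true',
               help='treat the input graph as directed')
print(p.parse_args([]).directed)              # False
print(p.parse_args(['--directed']).directed)  # True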