Merge branch 'master' into parallel
This commit is contained in:
commit
4c788680ad
@ -1,15 +1,15 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import numpy as np
|
||||
import time
|
||||
from numpy import linalg as la
|
||||
import warnings
|
||||
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
|
||||
import gensim
|
||||
|
||||
import numpy as np
|
||||
from gensim.models import Word2Vec
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
from . import walker
|
||||
import networkx as nx
|
||||
from libnrl.utils import *
|
||||
import multiprocessing
|
||||
from .utils import *
|
||||
|
||||
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
|
||||
|
||||
'''
|
||||
#-----------------------------------------------------------------------------
|
||||
@ -18,11 +18,6 @@ import multiprocessing
|
||||
#-----------------------------------------------------------------------------
|
||||
'''
|
||||
|
||||
def multiprocessor_argpartition(vec, topk=20):
    """Return the indices of the ``topk`` largest entries of ``vec``.

    The returned indices are in no particular order (``np.argpartition``
    only partitions, it does not sort).

    Args:
        vec: 1-D sequence or array of comparable values.
        topk: how many of the largest entries to select; defaults to 20,
            the value that used to be hard-coded here.

    Returns:
        A 1-D integer array of indices into ``vec``. When ``vec`` holds
        fewer than ``topk`` entries, every index is returned; when
        ``topk`` is not positive, the result is empty.
    """
    n = len(vec)
    print('len of vec...', n)
    if topk < 1:
        # ``[-0:]`` would select everything, so answer explicitly.
        return np.arange(0)
    if topk > n:
        # np.argpartition raises ValueError when kth is out of bounds;
        # with fewer than topk entries, all of them are "top".
        return np.arange(n)
    return np.argpartition(vec, -topk)[-topk:]
|
||||
|
||||
|
||||
class ABRW(object):
|
||||
|
||||
@ -32,27 +27,27 @@ class ABRW(object):
|
||||
self.topk = int(topk)
|
||||
kwargs["workers"] = kwargs.get("workers", 1)
|
||||
|
||||
self.P = self.biasedTransProb() #obtain biased transition probs mat
|
||||
weighted_walker = walker.BiasedWalker(g=self.g, P=self.P, workers=kwargs["workers"]) #instance weighted walker
|
||||
#generate sentences according to biased transition probs mat P
|
||||
self.P = self.biasedTransProb() # obtain biased transition probs mat
|
||||
weighted_walker = walker.BiasedWalker(g=self.g, P=self.P, workers=kwargs["workers"]) # instance weighted walker
|
||||
# generate sentences according to biased transition probs mat P
|
||||
sentences = weighted_walker.simulate_walks(num_walks=num_paths, walk_length=path_length)
|
||||
|
||||
#skip-gram parameters
|
||||
# skip-gram parameters
|
||||
kwargs["sentences"] = sentences
|
||||
kwargs["min_count"] = kwargs.get("min_count", 0)
|
||||
kwargs["size"] = kwargs.get("size", dim)
|
||||
kwargs["sg"] = 1 #use skip-gram; but see deepwalk which uses 'hs' = 1
|
||||
kwargs["sg"] = 1 # use skip-gram; but see deepwalk which uses 'hs' = 1
|
||||
self.size = kwargs["size"]
|
||||
#learning embedding by skip-gram model
|
||||
# learning embedding by skip-gram model
|
||||
print("Learning representation...")
|
||||
word2vec = Word2Vec(**kwargs)
|
||||
#save emb for later eval
|
||||
# save emb for later eval
|
||||
self.vectors = {}
|
||||
for word in self.g.G.nodes():
|
||||
self.vectors[word] = word2vec.wv[word] #save emb
|
||||
self.vectors[word] = word2vec.wv[word] # save emb
|
||||
del word2vec
|
||||
|
||||
#----------------------------------------key of our method---------------------------------------------
|
||||
# ----------------------------------------key of our method---------------------------------------------
|
||||
def biasedTransProb(self):
|
||||
'''
|
||||
given: A and X --> P_A and P_X
|
||||
@ -70,62 +65,54 @@ class ABRW(object):
|
||||
print("obtaining biased transition probs mat...")
|
||||
t1 = time.time()
|
||||
|
||||
A = self.g.get_adj_mat() #adj/struc info mat
|
||||
P_A = row_as_probdist(A) #if single node, return [0, 0, 0 ..] we will fix this later
|
||||
A = self.g.get_adj_mat() # adj/struc info mat
|
||||
P_A = row_as_probdist(A) # if single node, return [0, 0, 0 ..] we will fix this later
|
||||
|
||||
X = self.g.get_attr_mat() #attr info mat
|
||||
X_compressed = X #if need speed up, try to use svd or pca for compression, but will loss some acc
|
||||
#X_compressed = self.g.preprocessAttrInfo(X=X, dim=200, method='pca') #svd or pca for dim reduction; follow TADW setting use svd with dim=200
|
||||
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances # we may try diff metrics
|
||||
#ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
|
||||
#t1=time.time()
|
||||
X = self.g.get_attr_mat() # attr info mat
|
||||
X_compressed = X # if need speed up, try to use svd or pca for compression, but will loss some acc
|
||||
# X_compressed = self.g.preprocessAttrInfo(X=X, dim=200, method='pca') #svd or pca for dim reduction; follow TADW setting use svd with dim=200
|
||||
X_sim = cosine_similarity(X_compressed, X_compressed)
|
||||
#t2=time.time()
|
||||
#print('======no need pre proce', t2-t1)
|
||||
|
||||
|
||||
#way5: a faster implementation of way5 by Zeyu Dong
|
||||
# way5: a faster implementation of way5 by Zeyu Dong
|
||||
topk = self.topk
|
||||
print('way5 remain self---------topk = ', topk)
|
||||
t1 = time.time()
|
||||
cutoff = np.partition(X_sim, -topk, axis=1)[:,-topk:].min(axis=1)
|
||||
cutoff = np.partition(X_sim, -topk, axis=1)[:, -topk:].min(axis=1)
|
||||
X_sim[(X_sim < cutoff)] = 0
|
||||
t2 = time.time()
|
||||
|
||||
|
||||
P_X = row_as_probdist(X_sim)
|
||||
t3 = time.time()
|
||||
for i in range(P_X.shape[0]):
|
||||
sum_row = P_X[i].sum()
|
||||
if sum_row != 1.0: #to avoid some numerical issue...
|
||||
delta = 1.0 - sum_row #delta is very very samll number say 1e-10 or even less...
|
||||
P_X[i][i] = P_X[i][i] + delta #the diagnoal must be largest of the that row + delta --> almost no effect
|
||||
if sum_row != 1.0: # to avoid some numerical issue...
|
||||
delta = 1.0 - sum_row # delta is very very samll number say 1e-10 or even less...
|
||||
P_X[i, i] = P_X[i, i] + delta # the diagnoal must be largest of the that row + delta --> almost no effect
|
||||
t4 = time.time()
|
||||
print('topk time: ',t2-t1 ,'row normlize time: ',t3-t2, 'dealing numerical issue time: ', t4-t3)
|
||||
print('topk time: ', t2-t1, 'row normlize time: ', t3-t2, 'dealing numerical issue time: ', t4-t3)
|
||||
del A, X, X_compressed, X_sim
|
||||
|
||||
#=====================================core of our idea========================================
|
||||
# =====================================core of our idea========================================
|
||||
print('------alpha for P = alpha * P_A + (1-alpha) * P_X----: ', self.alpha)
|
||||
n = self.g.get_num_nodes()
|
||||
P = np.zeros((n,n), dtype=float)
|
||||
P = np.zeros((n, n), dtype=float)
|
||||
# TODO: Vectorization
|
||||
for i in range(n):
|
||||
if (P_A[i] == 0).all(): #single node case if the whole row are 0s
|
||||
#if P_A[i].sum() == 0:
|
||||
P[i] = P_X[i] #use 100% attr info to compensate
|
||||
else: #non-single node case; use (1.0-self.alpha) attr info to compensate
|
||||
if (P_A[i] == 0).toarray().all(): # single node case if the whole row are 0s
|
||||
# if P_A[i].sum() == 0:
|
||||
P[i] = P_X[i] # use 100% attr info to compensate
|
||||
else: # non-single node case; use (1.0-self.alpha) attr info to compensate
|
||||
P[i] = self.alpha * P_A[i] + (1.0-self.alpha) * P_X[i]
|
||||
print('# of single nodes for P_A: ', n - P_A.sum(axis=1).sum(), ' # of non-zero entries of P_A: ', np.count_nonzero(P_A))
|
||||
print('# of single nodes for P_A: ', n - P_A.sum(axis=1).sum(), ' # of non-zero entries of P_A: ', P_A.count_nonzero())
|
||||
print('# of single nodes for P_X: ', n - P_X.sum(axis=1).sum(), ' # of non-zero entries of P_X: ', np.count_nonzero(P_X))
|
||||
t5 = time.time()
|
||||
print('ABRW biased transition prob preprocessing time: {:.2f}s'.format(t5-t4))
|
||||
return P
|
||||
|
||||
|
||||
def save_embeddings(self, filename):
    """Write the embeddings held in ``self.vectors`` to a text file.

    The first line is ``"<number of nodes> <self.size>"``; each following
    line is a node key followed by the space-separated components of its
    vector. Every node is written exactly once, so the number of rows
    matches the count declared in the header.

    Args:
        filename: path of the file to create or overwrite.
    """
    # The context manager closes the file even if a write raises.
    with open(filename, 'w') as fout:
        fout.write("{} {}\n".format(len(self.vectors), self.size))
        for node, vec in self.vectors.items():
            fout.write("{} {}\n".format(node, ' '.join(str(x) for x in vec)))
|
@ -33,12 +33,12 @@ def row_as_probdist(mat):
|
||||
return dense matrix if input is dense matrix or numpy array
|
||||
return sparse matrix for sparse matrix input
|
||||
"""
|
||||
row_sum = np.array(mat.sum(axis=1)) # type: np.array
|
||||
row_sum = np.array(mat.sum(axis=1)).ravel() # type: np.array
|
||||
zero_rows = row_sum == 0
|
||||
row_sum[zero_rows] = 1
|
||||
diag = sparse.dia_matrix((1 / row_sum, 0), (mat.shape[0], mat.shape[0]))
|
||||
mat = diag.dot(mat)
|
||||
mat += sparse.bsr_matrix(zero_rows.astype(int)).T.dot(sparse.bsr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))
|
||||
mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))
|
||||
|
||||
return mat
|
||||
|
||||
|
@ -33,7 +33,6 @@ class BiasedWalker: # ------ our method
|
||||
self.G = g.G # nx data stcuture
|
||||
self.P = P # biased transition probability; n*n; each row is a pdf for a node
|
||||
self.workers = workers
|
||||
self.node_size = g.node_size
|
||||
self.look_back_list = g.look_back_list
|
||||
self.look_up_dict = g.look_up_dict
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user