merge_abrw

Chengbin Hou 2018-11-17 16:59:28 +00:00
commit 252d4fdb47
4 changed files with 153 additions and 56 deletions

.gitignore

@@ -1,3 +1,4 @@
#chengbin--------------------------
# Windows:
Thumbs.db
ehthumbs.db
@@ -14,4 +15,114 @@ __pycache__
# My configurations:
db.ini
deploy_key_rsa
deploy_key_rsa
#zeyu--------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# vscode
.vscode


@@ -1,15 +1,15 @@
# -*- coding: utf-8 -*-
import numpy as np
import time
from numpy import linalg as la
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from . import walker
import networkx as nx
from libnrl.utils import *
import multiprocessing
from .utils import *
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
'''
#-----------------------------------------------------------------------------
@@ -18,11 +13,6 @@ import multiprocessing
#-----------------------------------------------------------------------------
'''
def multiprocessor_argpartition(vec):
topk = 20
print('len of vec...',len(vec))
return np.argpartition(vec, -topk)[-topk:]
class ABRW(object):
@@ -32,27 +27,27 @@ class ABRW(object):
self.topk = int(topk)
kwargs["workers"] = kwargs.get("workers", 1)
self.P = self.biasedTransProb() #obtain biased transition probs mat
weighted_walker = walker.BiasedWalker(g=self.g, P=self.P, workers=kwargs["workers"]) #instantiate weighted walker
#generate sentences according to biased transition probs mat P
self.P = self.biasedTransProb() # obtain biased transition probs mat
weighted_walker = walker.BiasedWalker(g=self.g, P=self.P, workers=kwargs["workers"]) # instantiate weighted walker
# generate sentences according to biased transition probs mat P
sentences = weighted_walker.simulate_walks(num_walks=num_paths, walk_length=path_length)
#skip-gram parameters
# skip-gram parameters
kwargs["sentences"] = sentences
kwargs["min_count"] = kwargs.get("min_count", 0)
kwargs["size"] = kwargs.get("size", dim)
kwargs["sg"] = 1 #use skip-gram; but see deepwalk which uses 'hs' = 1
kwargs["sg"] = 1 # use skip-gram; but see deepwalk which uses 'hs' = 1
self.size = kwargs["size"]
#learning embedding by skip-gram model
# learning embedding by skip-gram model
print("Learning representation...")
word2vec = Word2Vec(**kwargs)
#save emb for later eval
# save emb for later eval
self.vectors = {}
for word in self.g.G.nodes():
self.vectors[word] = word2vec.wv[word] #save emb
self.vectors[word] = word2vec.wv[word] # save emb
del word2vec
#----------------------------------------key of our method---------------------------------------------
# ----------------------------------------key of our method---------------------------------------------
def biasedTransProb(self):
'''
given: A and X --> P_A and P_X
@@ -70,62 +65,54 @@ class ABRW(object):
print("obtaining biased transition probs mat...")
t1 = time.time()
A = self.g.get_adj_mat() #adj/struc info mat
P_A = row_as_probdist(A) #if single node, return [0, 0, 0 ..] we will fix this later
A = self.g.get_adj_mat() # adj/struc info mat
P_A = row_as_probdist(A) # if single node, return [0, 0, 0 ..] we will fix this later
X = self.g.get_attr_mat() #attr info mat
X_compressed = X #if need speed up, try to use svd or pca for compression, but will lose some acc
#X_compressed = self.g.preprocessAttrInfo(X=X, dim=200, method='pca') #svd or pca for dim reduction; follow TADW setting use svd with dim=200
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances # we may try diff metrics
#ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
#t1=time.time()
X = self.g.get_attr_mat() # attr info mat
X_compressed = X # if need speed up, try to use svd or pca for compression, but will lose some acc
# X_compressed = self.g.preprocessAttrInfo(X=X, dim=200, method='pca') #svd or pca for dim reduction; follow TADW setting use svd with dim=200
X_sim = cosine_similarity(X_compressed, X_compressed)
#t2=time.time()
#print('======no need pre proce', t2-t1)
#way5: a faster implementation of way5 by Zeyu Dong
# way5: a faster implementation of way5 by Zeyu Dong
topk = self.topk
print('way5 remain self---------topk = ', topk)
t1 = time.time()
cutoff = np.partition(X_sim, -topk, axis=1)[:,-topk:].min(axis=1)
cutoff = np.partition(X_sim, -topk, axis=1)[:, -topk:].min(axis=1)
X_sim[(X_sim < cutoff)] = 0
t2 = time.time()
P_X = row_as_probdist(X_sim)
t3 = time.time()
for i in range(P_X.shape[0]):
sum_row = P_X[i].sum()
if sum_row != 1.0: #to avoid some numerical issue...
delta = 1.0 - sum_row #delta is a very, very small number, say 1e-10 or even less...
P_X[i][i] = P_X[i][i] + delta #the diagonal must be the largest of that row + delta --> almost no effect
if sum_row != 1.0: # to avoid some numerical issue...
delta = 1.0 - sum_row # delta is a very, very small number, say 1e-10 or even less...
P_X[i, i] = P_X[i, i] + delta # the diagonal must be the largest of that row + delta --> almost no effect
t4 = time.time()
print('topk time: ',t2-t1 ,'row normalize time: ',t3-t2, 'dealing numerical issue time: ', t4-t3)
print('topk time: ', t2-t1, 'row normalize time: ', t3-t2, 'dealing numerical issue time: ', t4-t3)
del A, X, X_compressed, X_sim
#=====================================core of our idea========================================
# =====================================core of our idea========================================
print('------alpha for P = alpha * P_A + (1-alpha) * P_X----: ', self.alpha)
n = self.g.get_num_nodes()
P = np.zeros((n,n), dtype=float)
P = np.zeros((n, n), dtype=float)
# TODO: Vectorization
for i in range(n):
if (P_A[i] == 0).all(): #single node case if the whole row are 0s
#if P_A[i].sum() == 0:
P[i] = P_X[i] #use 100% attr info to compensate
else: #non-single node case; use (1.0-self.alpha) attr info to compensate
if (P_A[i] == 0).toarray().all(): # single node case if the whole row are 0s
# if P_A[i].sum() == 0:
P[i] = P_X[i] # use 100% attr info to compensate
else: # non-single node case; use (1.0-self.alpha) attr info to compensate
P[i] = self.alpha * P_A[i] + (1.0-self.alpha) * P_X[i]
print('# of single nodes for P_A: ', n - P_A.sum(axis=1).sum(), ' # of non-zero entries of P_A: ', np.count_nonzero(P_A))
print('# of single nodes for P_A: ', n - P_A.sum(axis=1).sum(), ' # of non-zero entries of P_A: ', P_A.count_nonzero())
print('# of single nodes for P_X: ', n - P_X.sum(axis=1).sum(), ' # of non-zero entries of P_X: ', np.count_nonzero(P_X))
t5 = time.time()
print('ABRW biased transition prob preprocessing time: {:.2f}s'.format(t5-t4))
return P
def save_embeddings(self, filename):
fout = open(filename, 'w')
node_num = len(self.vectors.keys())
fout.write("{} {}\n".format(node_num, self.size))
for node, vec in self.vectors.items():
fout.write("{} {}\n".format(node,
' '.join([str(x) for x in vec])))
fout.close()
fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
fout.close()
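
Read together, the biasedTransProb hunks above do the following: row-normalise the adjacency matrix into P_A, build P_X from the node attributes by taking cosine similarity, keeping only the top-k entries per row and row-normalising, and then blend the two as P = alpha * P_A + (1 - alpha) * P_X, with rows of isolated nodes taken entirely from P_X. Below is a minimal dense-NumPy sketch of that step; the name fuse_transition_probs is illustrative only, and the committed code operates on scipy.sparse matrices via row_as_probdist rather than the dense arithmetic shown here.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def fuse_transition_probs(A, X, alpha, topk):
    """Sketch of P = alpha * P_A + (1 - alpha) * P_X on dense arrays (assumes topk < n)."""
    A = np.asarray(A, dtype=float)
    # structural transition probs; rows of isolated nodes stay all-zero for now
    row_sum = A.sum(axis=1, keepdims=True)
    P_A = np.divide(A, row_sum, out=np.zeros_like(A), where=row_sum != 0)

    # attribute transition probs: cosine similarity, keep the top-k entries per row
    X_sim = cosine_similarity(X)
    cutoff = np.partition(X_sim, -topk, axis=1)[:, -topk:].min(axis=1)
    X_sim[X_sim < cutoff[:, None]] = 0          # per-row cutoff
    denom = X_sim.sum(axis=1, keepdims=True)
    denom[denom == 0] = 1.0                     # guard against all-zero similarity rows
    P_X = X_sim / denom

    # blend the two views; isolated nodes fall back entirely on attribute info
    P = alpha * P_A + (1.0 - alpha) * P_X
    single = row_sum.ravel() == 0
    P[single] = P_X[single]
    return P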


@@ -33,12 +33,12 @@ def row_as_probdist(mat):
return dense matrix if input is dense matrix or numpy array
return sparse matrix for sparse matrix input
"""
row_sum = np.array(mat.sum(axis=1)) # type: np.array
row_sum = np.array(mat.sum(axis=1)).ravel() # type: np.array
zero_rows = row_sum == 0
row_sum[zero_rows] = 1
diag = sparse.dia_matrix((1 / row_sum, 0), (mat.shape[0], mat.shape[0]))
mat = diag.dot(mat)
mat += sparse.bsr_matrix(zero_rows.astype(int)).T.dot(sparse.bsr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))
mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))
return mat
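
For context, row_as_probdist (whose zero-row handling the fix above adjusts by flattening row_sum and switching from bsr to csr matrices) turns each row of the input matrix into a probability distribution and replaces all-zero rows with a uniform distribution over all columns. A dense-NumPy sketch of the same behaviour, under the assumption that the uniform fallback is the intended semantics (the _dense name is illustrative):

import numpy as np

def row_as_probdist_dense(mat):
    """Each row becomes a probability distribution; all-zero rows become uniform."""
    mat = np.asarray(mat, dtype=float)
    row_sum = mat.sum(axis=1)
    zero_rows = row_sum == 0
    row_sum[zero_rows] = 1.0                    # avoid division by zero
    out = mat / row_sum[:, None]                # non-empty rows now sum to 1
    out[zero_rows] = 1.0 / mat.shape[1]         # uniform fallback for empty rows
    return out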


@@ -33,7 +33,6 @@ class BiasedWalker: # ------ our method
self.G = g.G # nx data structure
self.P = P # biased transition probability; n*n; each row is a pdf for a node
self.workers = workers
self.node_size = g.node_size
self.look_back_list = g.look_back_list
self.look_up_dict = g.look_up_dict
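
BiasedWalker consumes P row by row when simulating walks: at each step the next node is sampled from the probability distribution stored in the current node's row of P. A minimal single-walk sketch under that assumption follows; simulate_single_walk is an illustrative name, and the committed walker additionally maps node ids through look_up_dict/look_back_list and parallelises across workers.

import numpy as np

def simulate_single_walk(P, start, walk_length, seed=None):
    """Draw one walk of walk_length nodes by sampling each step from the current row of P."""
    rng = np.random.default_rng(seed)
    walk = [start]
    for _ in range(walk_length - 1):
        probs = np.asarray(P[walk[-1]]).ravel()
        if probs.sum() == 0:                    # no outgoing probability mass: stop early
            break
        walk.append(int(rng.choice(len(probs), p=probs)))
    return walk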