merge_abrw

Chengbin Hou 2018-11-17 16:59:28 +00:00
commit 252d4fdb47
4 changed files with 153 additions and 56 deletions

.gitignore

@@ -1,3 +1,4 @@
#chengbin--------------------------
# Windows:
Thumbs.db
ehthumbs.db
@@ -14,4 +15,114 @@ __pycache__
# My configurations:
db.ini
deploy_key_rsa
deploy_key_rsa
#zeyu--------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# vscode
.vscode


@@ -1,15 +1,15 @@
# -*- coding: utf-8 -*-
import numpy as np
import time
from numpy import linalg as la
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from . import walker
import networkx as nx
from libnrl.utils import *
import multiprocessing
from .utils import *
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
'''
#-----------------------------------------------------------------------------
@@ -18,11 +13,6 @@ import multiprocessing
#-----------------------------------------------------------------------------
'''
def multiprocessor_argpartition(vec):
topk = 20
print('len of vec...',len(vec))
return np.argpartition(vec, -topk)[-topk:]
class ABRW(object):
@@ -32,27 +27,27 @@ class ABRW(object):
self.topk = int(topk)
kwargs["workers"] = kwargs.get("workers", 1)
self.P = self.biasedTransProb() #obtain biased transition probs mat
weighted_walker = walker.BiasedWalker(g=self.g, P=self.P, workers=kwargs["workers"]) #instantiate weighted walker
#generate sentences according to biased transition probs mat P
self.P = self.biasedTransProb() # obtain biased transition probs mat
weighted_walker = walker.BiasedWalker(g=self.g, P=self.P, workers=kwargs["workers"]) # instantiate weighted walker
# generate sentences according to biased transition probs mat P
sentences = weighted_walker.simulate_walks(num_walks=num_paths, walk_length=path_length)
#skip-gram parameters
# skip-gram parameters
kwargs["sentences"] = sentences
kwargs["min_count"] = kwargs.get("min_count", 0)
kwargs["size"] = kwargs.get("size", dim)
kwargs["sg"] = 1 #use skip-gram; but see deepwalk which uses 'hs' = 1
kwargs["sg"] = 1 # use skip-gram; but see deepwalk which uses 'hs' = 1
self.size = kwargs["size"]
#learning embedding by skip-gram model
# learning embedding by skip-gram model
print("Learning representation...")
word2vec = Word2Vec(**kwargs)
#save emb for later eval
# save emb for later eval
self.vectors = {}
for word in self.g.G.nodes():
self.vectors[word] = word2vec.wv[word] #save emb
self.vectors[word] = word2vec.wv[word] # save emb
del word2vec
#----------------------------------------key of our method---------------------------------------------
# ----------------------------------------key of our method---------------------------------------------
def biasedTransProb(self):
'''
given: A and X --> P_A and P_X
@@ -70,62 +65,54 @@ class ABRW(object):
print("obtaining biased transition probs mat...")
t1 = time.time()
A = self.g.get_adj_mat() #adj/struc info mat
P_A = row_as_probdist(A) #if single node, return [0, 0, 0 ..] we will fix this later
A = self.g.get_adj_mat() # adj/struc info mat
P_A = row_as_probdist(A) # if single node, return [0, 0, 0 ..] we will fix this later
X = self.g.get_attr_mat() #attr info mat
X_compressed = X #if need speed up, try to use svd or pca for compression, but will lose some acc
#X_compressed = self.g.preprocessAttrInfo(X=X, dim=200, method='pca') #svd or pca for dim reduction; follow TADW setting use svd with dim=200
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances # we may try diff metrics
#ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
#t1=time.time()
X = self.g.get_attr_mat() # attr info mat
X_compressed = X # if need speed up, try to use svd or pca for compression, but will lose some acc
# X_compressed = self.g.preprocessAttrInfo(X=X, dim=200, method='pca') #svd or pca for dim reduction; follow TADW setting use svd with dim=200
X_sim = cosine_similarity(X_compressed, X_compressed)
#t2=time.time()
#print('======no need pre proce', t2-t1)
#way5: a faster implementation of way5 by Zeyu Dong
# way5: a faster implementation of way5 by Zeyu Dong
topk = self.topk
print('way5 remain self---------topk = ', topk)
t1 = time.time()
cutoff = np.partition(X_sim, -topk, axis=1)[:,-topk:].min(axis=1)
cutoff = np.partition(X_sim, -topk, axis=1)[:, -topk:].min(axis=1)
X_sim[(X_sim < cutoff)] = 0
t2 = time.time()
P_X = row_as_probdist(X_sim)
t3 = time.time()
for i in range(P_X.shape[0]):
sum_row = P_X[i].sum()
if sum_row != 1.0: #to avoid some numerical issue...
delta = 1.0 - sum_row #delta is a very, very small number, say 1e-10 or even less...
P_X[i][i] = P_X[i][i] + delta #the diagonal must be the largest of that row + delta --> almost no effect
if sum_row != 1.0: # to avoid some numerical issue...
delta = 1.0 - sum_row # delta is a very, very small number, say 1e-10 or even less...
P_X[i, i] = P_X[i, i] + delta # the diagonal must be the largest of that row + delta --> almost no effect
t4 = time.time()
print('topk time: ',t2-t1 ,'row normalize time: ',t3-t2, 'dealing numerical issue time: ', t4-t3)
print('topk time: ', t2-t1, 'row normalize time: ', t3-t2, 'dealing numerical issue time: ', t4-t3)
del A, X, X_compressed, X_sim
#=====================================core of our idea========================================
# =====================================core of our idea========================================
print('------alpha for P = alpha * P_A + (1-alpha) * P_X----: ', self.alpha)
n = self.g.get_num_nodes()
P = np.zeros((n,n), dtype=float)
P = np.zeros((n, n), dtype=float)
# TODO: Vectorization
for i in range(n):
if (P_A[i] == 0).all(): #single node case if the whole row are 0s
#if P_A[i].sum() == 0:
P[i] = P_X[i] #use 100% attr info to compensate
else: #non-single node case; use (1.0-self.alpha) attr info to compensate
if (P_A[i] == 0).toarray().all(): # single node case if the whole row are 0s
# if P_A[i].sum() == 0:
P[i] = P_X[i] # use 100% attr info to compensate
else: # non-single node case; use (1.0-self.alpha) attr info to compensate
P[i] = self.alpha * P_A[i] + (1.0-self.alpha) * P_X[i]
print('# of single nodes for P_A: ', n - P_A.sum(axis=1).sum(), ' # of non-zero entries of P_A: ', np.count_nonzero(P_A))
print('# of single nodes for P_A: ', n - P_A.sum(axis=1).sum(), ' # of non-zero entries of P_A: ', P_A.count_nonzero())
print('# of single nodes for P_X: ', n - P_X.sum(axis=1).sum(), ' # of non-zero entries of P_X: ', np.count_nonzero(P_X))
t5 = time.time()
print('ABRW biased transition prob preprocessing time: {:.2f}s'.format(t5-t4))
return P
def save_embeddings(self, filename):
fout = open(filename, 'w')
node_num = len(self.vectors.keys())
fout.write("{} {}\n".format(node_num, self.size))
for node, vec in self.vectors.items():
fout.write("{} {}\n".format(node,
' '.join([str(x) for x in vec])))
fout.close()
fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
fout.close()
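
Read together, the biasedTransProb hunks above do the following: row-normalise the adjacency matrix into P_A, build P_X from the node attributes by taking cosine similarity, keeping only the top-k entries per row and row-normalising, and then blend the two as P = alpha * P_A + (1 - alpha) * P_X, with rows of isolated nodes taken entirely from P_X. Below is a minimal dense-NumPy sketch of that step; the name fuse_transition_probs is illustrative only, and the committed code operates on scipy.sparse matrices via row_as_probdist rather than the dense arithmetic shown here.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def fuse_transition_probs(A, X, alpha, topk):
    """Sketch of P = alpha * P_A + (1 - alpha) * P_X on dense arrays (assumes topk < n)."""
    A = np.asarray(A, dtype=float)
    # structural transition probs; rows of isolated nodes stay all-zero for now
    row_sum = A.sum(axis=1, keepdims=True)
    P_A = np.divide(A, row_sum, out=np.zeros_like(A), where=row_sum != 0)

    # attribute transition probs: cosine similarity, keep the top-k entries per row
    X_sim = cosine_similarity(X)
    cutoff = np.partition(X_sim, -topk, axis=1)[:, -topk:].min(axis=1)
    X_sim[X_sim < cutoff[:, None]] = 0          # per-row cutoff
    denom = X_sim.sum(axis=1, keepdims=True)
    denom[denom == 0] = 1.0                     # guard against all-zero similarity rows
    P_X = X_sim / denom

    # blend the two views; isolated nodes fall back entirely on attribute info
    P = alpha * P_A + (1.0 - alpha) * P_X
    single = row_sum.ravel() == 0
    P[single] = P_X[single]
    return P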


@@ -33,12 +33,12 @@ def row_as_probdist(mat):
return dense matrix if input is dense matrix or numpy array
return sparse matrix for sparse matrix input
"""
row_sum = np.array(mat.sum(axis=1)) # type: np.array
row_sum = np.array(mat.sum(axis=1)).ravel() # type: np.array
zero_rows = row_sum == 0
row_sum[zero_rows] = 1
diag = sparse.dia_matrix((1 / row_sum, 0), (mat.shape[0], mat.shape[0]))
mat = diag.dot(mat)
mat += sparse.bsr_matrix(zero_rows.astype(int)).T.dot(sparse.bsr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))
mat += sparse.csr_matrix(zero_rows.astype(int)).T.dot(sparse.csr_matrix(np.repeat(1 / mat.shape[1], mat.shape[1])))
return mat
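
For context, row_as_probdist (whose zero-row handling the fix above adjusts by flattening row_sum and switching from bsr to csr matrices) turns each row of the input matrix into a probability distribution and replaces all-zero rows with a uniform distribution over all columns. A dense-NumPy sketch of the same behaviour, under the assumption that the uniform fallback is the intended semantics (the _dense name is illustrative):

import numpy as np

def row_as_probdist_dense(mat):
    """Each row becomes a probability distribution; all-zero rows become uniform."""
    mat = np.asarray(mat, dtype=float)
    row_sum = mat.sum(axis=1)
    zero_rows = row_sum == 0
    row_sum[zero_rows] = 1.0                    # avoid division by zero
    out = mat / row_sum[:, None]                # non-empty rows now sum to 1
    out[zero_rows] = 1.0 / mat.shape[1]         # uniform fallback for empty rows
    return out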


@@ -33,7 +33,6 @@ class BiasedWalker: # ------ our method
self.G = g.G # nx data structure
self.P = P # biased transition probability; n*n; each row is a pdf for a node
self.workers = workers
self.node_size = g.node_size
self.look_back_list = g.look_back_list
self.look_up_dict = g.look_up_dict
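
BiasedWalker consumes P row by row when simulating walks: at each step the next node is sampled from the probability distribution stored in the current node's row of P. A minimal single-walk sketch under that assumption follows; simulate_single_walk is an illustrative name, and the committed walker additionally maps node ids through look_up_dict/look_back_list and parallelises across workers.

import numpy as np

def simulate_single_walk(P, start, walk_length, seed=None):
    """Draw one walk of walk_length nodes by sampling each step from the current row of P."""
    rng = np.random.default_rng(seed)
    walk = [start]
    for _ in range(walk_length - 1):
        probs = np.asarray(P[walk[-1]]).ravel()
        if probs.sum() == 0:                    # no outgoing probability mass: stop early
            break
        walk.append(int(rng.choice(len(probs), p=probs)))
    return walk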