parallel walker

Dongzy 2018-11-17 21:39:30 +08:00
parent b046b20090
commit cbf674e3cd


@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import functools
import multiprocessing
import random
import time
from itertools import chain
import numpy as np
import networkx as nx
@@ -40,105 +40,70 @@ class BiasedWalker: # ------ our method
# alias sampling for ABRW-------------------------------------------------------------------
def simulate_walks(self, num_walks, walk_length):
self.P_G = nx.to_networkx_graph(self.P, create_using=nx.DiGraph()) # create a new nx graph based on ABRW transition prob matrix
global P_G
P_G = self.P_G
t1 = time.time()
self.preprocess_transition_probs() # note: we simply adapt node2vec
t2 = time.time()
global alias_nodes
alias_nodes = self.alias_nodes
print('Time to construct the alias table: {:.2f}s'.format(t2-t1))
walks = []
nodes = list(self.P_G.nodes())
print('Walk iteration:')
pool = multiprocessing.Pool(self.workers)
for walk_iter in range(num_walks):
print(str(walk_iter+1), '/', str(num_walks))
random.shuffle(nodes)
for node in nodes:
walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
# random.shuffle(nodes)
walks += pool.map(functools.partial(node2vec_walk, walk_length=walk_length), nodes)
pool.close()
pool.join()
del alias_nodes, P_G
for i in range(len(walks)): # use the index to retrieve the original node ID
for j in range(len(walks[0])):
walks[i][j] = self.look_back_list[int(walks[i][j])]
return walks
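A note on the parallel pattern used here: Pool workers cannot easily receive a bound method together with the whole walker object, so the walk function is moved to module level and reads P_G and alias_nodes from module globals. The snippet below is a minimal, self-contained sketch of the same pool.map + functools.partial idiom; the walk_from worker and the SHARED dictionary are illustrative stand-ins, not code from this repository.

import functools
import multiprocessing

SHARED = {'offset': 10}  # stand-in for module-level state such as P_G / alias_nodes

def walk_from(start_node, walk_length):
    # The worker must live at module level so the pool can pickle a reference to it;
    # each node arrives as the single positional argument supplied by pool.map.
    return [start_node + SHARED['offset']] * walk_length

if __name__ == '__main__':
    pool = multiprocessing.Pool(4)
    walks = pool.map(functools.partial(walk_from, walk_length=3), range(5))
    pool.close()
    pool.join()
    print(walks)  # e.g. [[10, 10, 10], [11, 11, 11], ...]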
def node2vec_walk(self, walk_length, start_node): # to do...
G = self.P_G # reuse the existing graph; more efficient than making a copy as in node2vec
alias_nodes = self.alias_nodes
walk = [start_node]
while len(walk) < walk_length:
cur = walk[-1]
cur_nbrs = list(G.neighbors(cur))
if len(cur_nbrs) > 0:
walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
else:
break
return walk
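node2vec_walk depends on alias_draw, which is outside this hunk; assuming the standard Walker alias method used by the node2vec reference code, a single O(1) draw from the precomputed tables J (alias indices) and q (acceptance probabilities) looks roughly like this sketch:

import numpy as np

def alias_draw(J, q):
    # Pick a column uniformly at random, keep it with probability q[k],
    # otherwise fall back to its alias J[k]; constant time per sample.
    K = len(J)
    k = int(np.floor(np.random.rand() * K))
    if np.random.rand() < q[k]:
        return k
    return J[k]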
def preprocess_transition_probs(self):
G = self.P_G
alias_nodes = {}
for node in G.nodes():
unnormalized_probs = [G[node][nbr]['weight'] for nbr in G.neighbors(node)]
norm_const = sum(unnormalized_probs)
normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
alias_nodes[node] = alias_setup(normalized_probs)
nodes = G.nodes()
pool = multiprocessing.Pool(self.workers)
alias_nodes = dict(zip(nodes, pool.map(get_alias_node, nodes)))
pool.close()
pool.join()
self.alias_nodes = alias_nodes
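The alias_setup called above (and in get_alias_node further down) is likewise not shown in this hunk; a standard O(K) construction of the (J, q) tables from a normalized probability vector is sketched below, again following the usual node2vec-style implementation rather than code from this file:

import numpy as np

def alias_setup(probs):
    # Build alias tables for O(1) sampling; 'probs' must sum to 1.
    K = len(probs)
    q = np.zeros(K)
    J = np.zeros(K, dtype=np.int64)
    smaller, larger = [], []
    for k, prob in enumerate(probs):
        q[k] = K * prob
        (smaller if q[k] < 1.0 else larger).append(k)
    while smaller and larger:
        small, large = smaller.pop(), larger.pop()
        J[small] = large                      # overflow from 'large' fills up 'small'
        q[large] = q[large] + q[small] - 1.0
        (smaller if q[large] < 1.0 else larger).append(large)
    return J, q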
'''
#naive sampling for ABRW-------------------------------------------------------------------
def weighted_walk(self, start_node):
#
#Simulate a weighted walk starting from start node.
#
G = self.G
look_up_dict = self.look_up_dict
look_back_list = self.look_back_list
node_size = self.node_size
walk = [start_node]
def node2vec_walk(start_node, walk_length): # to do...
global P_G # reuse the existing graph via a module-level global; more efficient than copying it as in node2vec
global alias_nodes
walk = [start_node]
while len(walk) < walk_length:
cur = walk[-1]
cur_nbrs = list(P_G.neighbors(cur))
if len(cur_nbrs) > 0:
walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
else:
break
return walk
while len(walk) < self.walk_length:
cur_node = walk[-1] #the last one entry/node
cur_ind = look_up_dict[cur_node] #key -> index
pdf = self.P[cur_ind,:] #the pdf of node with ind
#pdf = np.random.randn(18163)+10 #......test multiprocessor
#pdf = pdf / pdf.sum() #......test multiprocessor
#next_ind = int( np.array( nx.utils.random_sequence.discrete_sequence(n=1,distribution=pdf) ) )
next_ind = np.random.choice(len(pdf), 1, p=pdf)[0] #faster than nx
#next_ind = 0 #......test multiprocessor
next_node = look_back_list[next_ind] #index -> key
walk.append(next_node)
return walk
def simulate_walks(self, num_walks, walk_length):
#
#Repeatedly simulate weighted walks from each node.
#
G = self.G
self.num_walks = num_walks
self.walk_length = walk_length
self.walks = [] #what we all need later as input to skip-gram
nodes = list(G.nodes())
print('Walk iteration:')
for walk_iter in range(num_walks):
t1 = time.time()
random.shuffle(nodes)
for node in nodes: #single-CPU path: prefer it if # of nodes < 2000 (faster) or > 20000 (avoids memory errors)
self.walks.append(self.weighted_walk(node))
#pool = multiprocessing.Pool(processes=3) #uses all CPUs by default, or specify processes=xx
#self.walks.append(pool.map(self.weighted_walk, nodes)) #ref: https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
#pool.close()
#pool.join()
t2 = time.time()
print(str(walk_iter+1), '/', str(num_walks), ' each itr last for: {:.2f}s'.format(t2-t1))
#self.walks = list(chain.from_iterable(self.walks)) #unlist...[[[x,x],[x,x]]] -> [x,x], [x,x]
return self.walks
'''
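For contrast with the alias tables, the commented-out naive sampler above draws every step with np.random.choice over a full row of the transition matrix P, which costs O(n) per step instead of O(1). A stand-alone sketch of that step, using a hypothetical 3-node row-stochastic matrix:

import numpy as np

P = np.array([[0.0, 0.7, 0.3],   # hypothetical transition matrix; each row sums to 1
              [0.5, 0.0, 0.5],
              [0.2, 0.8, 0.0]])

def naive_step(cur_ind):
    pdf = P[cur_ind, :]                            # O(n) work per step
    return int(np.random.choice(len(pdf), 1, p=pdf)[0])

walk = [0]
for _ in range(4):
    walk.append(naive_step(walk[-1]))
print(walk)   # e.g. [0, 1, 2, 1, 0]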
def get_alias_node(node):
global P_G
unnormalized_probs = [P_G[node][nbr]['weight'] for nbr in P_G.neighbors(node)]
norm_const = sum(unnormalized_probs)
normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
return alias_setup(normalized_probs)
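get_alias_node reads the global P_G that simulate_walks sets just before creating the pool, which works when workers are forked and inherit the parent's globals; under a spawn start method (Windows, or macOS on recent Pythons) those globals are not inherited. A common alternative is a Pool initializer; the sketch below uses a hypothetical helper and a toy per-node task, not this repository's code:

import multiprocessing
import networkx as nx

_G = None  # module-level slot, filled inside every worker by the initializer

def _init_worker(graph):
    # Runs once per worker process, so spawned workers also see the graph.
    global _G
    _G = graph

def _degree(node):
    # Toy per-node task standing in for get_alias_node.
    return len(list(_G.neighbors(node)))

if __name__ == '__main__':
    g = nx.path_graph(5)
    pool = multiprocessing.Pool(2, initializer=_init_worker, initargs=(g,))
    print(dict(zip(g.nodes(), pool.map(_degree, list(g.nodes())))))
    pool.close()
    pool.join()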
# ===========================================deepWalk-walker============================================
class BasicWalker:
def __init__(self, G, workers):
self.G = G.G
self.node_size = G.get_num_nodes()
self.look_up_dict = G.look_up_dict
def deepwalk_walk(self, walk_length, start_node):
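The body of deepwalk_walk is cut off at this point in the hunk; for reference, a minimal sketch of the usual DeepWalk step, repeatedly moving to a uniformly random neighbor until the walk reaches walk_length, would look like this (a sketch, not necessarily this file's implementation):

import random

def deepwalk_walk_sketch(G, walk_length, start_node):
    walk = [start_node]
    while len(walk) < walk_length:
        cur_nbrs = list(G.neighbors(walk[-1]))
        if not cur_nbrs:          # dead end: stop early
            break
        walk.append(random.choice(cur_nbrs))
    return walk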