Add Struc2Vec

Weichen Shen 2019-02-14 21:17:53 +08:00
parent be70529fed
commit 4b06fe76da
6 changed files with 691 additions and 3 deletions


@@ -0,0 +1,88 @@
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE

from ge import Node2Vec
from ge.classify import read_node_label, Classifier
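
# Train node2vec on the Brazil air-traffic graph, then evaluate the embeddings
# with a logistic-regression node classifier and visualize them with t-SNE.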


def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    # project the embeddings to 2-D with t-SNE for visualization
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    # group node indices by class label so each class gets one color
    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/flight/brazil-airports.edgelist',
                         create_using=nx.DiGraph(), nodetype=None,
                         data=[('weight', int)])

    model = Node2Vec(G, 10, 80, p=0.25, q=2, workers=1)  # walk length 10, 80 walks per node
    model.train()
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)


@@ -0,0 +1,88 @@
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE

from ge import Struc2Vec
from ge.classify import read_node_label, Classifier
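
# Same pipeline as the node2vec example, with embeddings from Struc2Vec instead.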


def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings):
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', skip_head=True)

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/flight/brazil-airports.edgelist',
                         create_using=nx.DiGraph(), nodetype=None,
                         data=[('weight', int)])

    model = Struc2Vec(G, walk_length=10, num_walks=80, workers=4, verbose=40)
    model.train()
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)


@@ -41,7 +41,7 @@ class Classifier(object):
         results = {}
         for average in averages:
             results[average] = f1_score(Y, Y_, average=average)
-        #results['acc'] = accuracy_score(Y,Y_)
+        results['acc'] = accuracy_score(Y, Y_)
         print('-------------------')
         print(results)
         return results


@@ -1,7 +1,8 @@
 from .deepwalk import DeepWalk
-from .line import LINE
 from .node2vec import Node2Vec
+from .line import LINE
 from .sdne import SDNE
+from .struc2vec import Struc2Vec
-__all__ = ["DeepWalk", "LINE", "Node2Vec", "SDNE"]
+__all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec"]

ge/models/struc2vec.py (new file)

@@ -0,0 +1,434 @@
# -*- coding:utf-8 -*-
"""
Author:
Weichen Shen,wcshen1994@163.com
Reference:
[1] Ribeiro L F R, Saverese P H P, Figueiredo D R. struc2vec: Learning node representations from structural identity[C]//Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. ACM, 2017: 385-394.(https://arxiv.org/pdf/1704.03165.pdf)
"""
import math
import os
import shutil
from collections import ChainMap, deque
import numpy as np
import pandas as pd
from fastdtw import fastdtw
from gensim.models import Word2Vec
from joblib import Parallel, delayed
from tqdm import tqdm
from ..alias import create_alias_table
from ..utils import partition_dict, preprocess_nxgraph
from ..walker import BiasedWalker
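

# Pipeline: 1) compute pairwise structural distances and build the multi-layer
# context graph, 2) pre-compute the layer statistics used by the biased walk,
# 3) simulate the walks with BiasedWalker, 4) fit Word2Vec on them in train().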
class Struc2Vec():
    def __init__(self, graph, walk_length=10, num_walks=100, workers=1, verbose=0,
                 stay_prob=0.3, opt1_reduce_len=True, opt2_reduce_sim_calc=True,
                 opt3_num_layers=None, temp_path='./temp_struc2vec/', reuse=False):
self.graph = graph
self.idx2node, self.node2idx = preprocess_nxgraph(graph)
self.idx = list(range(len(self.idx2node)))
self.opt1_reduce_len = opt1_reduce_len
self.opt2_reduce_sim_calc = opt2_reduce_sim_calc
self.opt3_num_layers = opt3_num_layers
        self.reuse = reuse
self.temp_path = temp_path
if not os.path.exists(self.temp_path):
os.mkdir(self.temp_path)
if not reuse:
shutil.rmtree(self.temp_path)
os.mkdir(self.temp_path)
self.create_context_graph(self.opt3_num_layers, workers, verbose)
self.prepare_biased_walk()
self.walker = BiasedWalker(self.idx2node, self.temp_path)
        self.sentences = self.walker.simulate_walks(
            num_walks, walk_length, stay_prob, workers, verbose)

        self.w2v_model = None
        self._embeddings = {}
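
    # Build the multi-layer context graph: per-layer structural distances,
    # adjacency lists, and accept/alias tables, all pickled under temp_path so
    # that later stages (and reuse=True) can load them back.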
def create_context_graph(self, max_num_layers, workers=1, verbose=0,):
pair_distances = self._compute_structural_distance(
max_num_layers, workers, verbose,)
layers_adj, layers_distances = self._get_layer_rep(pair_distances)
pd.to_pickle(layers_adj, self.temp_path + 'layers_adj.pkl')
layers_accept, layers_alias = self._get_transition_probs(
layers_adj, layers_distances)
pd.to_pickle(layers_alias, self.temp_path + 'layers_alias.pkl')
pd.to_pickle(layers_accept, self.temp_path + 'layers_accept.pkl')
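
    # From the per-layer normalized weights written by _get_transition_probs,
    # pre-compute each layer's average edge weight and, per node, gamma = the
    # number of incident edges whose weight exceeds that average; the walker
    # turns gamma into the probability of moving up a layer.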
def prepare_biased_walk(self,):
sum_weights = {}
sum_edges = {}
average_weight = {}
gamma = {}
layer = 0
while (os.path.exists(self.temp_path+'norm_weights_distance-layer-' + str(layer)+'.pkl')):
probs = pd.read_pickle(
self.temp_path+'norm_weights_distance-layer-' + str(layer)+'.pkl')
for v, list_weights in probs.items():
sum_weights.setdefault(layer, 0)
sum_edges.setdefault(layer, 0)
sum_weights[layer] += sum(list_weights)
sum_edges[layer] += len(list_weights)
average_weight[layer] = sum_weights[layer] / sum_edges[layer]
gamma.setdefault(layer, {})
for v, list_weights in probs.items():
num_neighbours = 0
for w in list_weights:
if (w > average_weight[layer]):
num_neighbours += 1
gamma[layer][v] = num_neighbours
layer += 1
        pd.to_pickle(average_weight, self.temp_path + 'average_weight.pkl')
pd.to_pickle(gamma, self.temp_path + 'gamma.pkl')
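
    # Skip-gram with hierarchical softmax over the structural walks; min_count=0
    # keeps every node in the vocabulary (gensim < 4 API: size= / iter=).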
def train(self, embed_size=128, window_size=5, workers=3, iter=5):
        sentences = self.sentences
print("Learning representation...")
model = Word2Vec(sentences, size=embed_size, window=window_size, min_count=0, hs=1, sg=1, workers=workers,
iter=iter)
print("Learning representation done!")
self.w2v_model = model
return model
def get_embeddings(self,):
if self.w2v_model is None:
print("model not train")
return {}
self._embeddings = {}
for word in self.graph.nodes():
self._embeddings[word] = self.w2v_model.wv[word]
return self._embeddings
def _compute_ordered_degreelist(self, max_num_layers):
degreeList = {}
vertices = self.idx # self.g.nodes()
for v in vertices:
degreeList[v] = self._get_order_degreelist_node(v, max_num_layers)
return degreeList
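
    # BFS outwards from root, recording for every hop distance (layer) the
    # sorted degree sequence of that ring. With opt1 the sequence is compressed
    # to sorted (degree, count) pairs, which shortens what DTW has to compare.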
def _get_order_degreelist_node(self, root, max_num_layers=None):
if max_num_layers is None:
max_num_layers = float('inf')
ordered_degree_sequence_dict = {}
visited = [False] * len(self.graph.nodes())
queue = deque()
level = 0
queue.append(root)
visited[root] = True
while (len(queue) > 0 and level <= max_num_layers):
count = len(queue)
if self.opt1_reduce_len:
degree_list = {}
else:
degree_list = []
while (count > 0):
top = queue.popleft()
node = self.idx2node[top]
degree = len(self.graph[node])
if self.opt1_reduce_len:
degree_list[degree] = degree_list.get(degree, 0) + 1
else:
degree_list.append(degree)
for nei in self.graph[node]:
nei_idx = self.node2idx[nei]
if not visited[nei_idx]:
visited[nei_idx] = True
queue.append(nei_idx)
count -= 1
            if self.opt1_reduce_len:
                ordered_degree_list = [(degree, freq)
                                       for degree, freq in degree_list.items()]
                ordered_degree_list.sort(key=lambda x: x[0])
            else:
                ordered_degree_list = sorted(degree_list)
            ordered_degree_sequence_dict[level] = ordered_degree_list
level += 1
return ordered_degree_sequence_dict
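
    # Per-layer structural distance between two nodes = DTW over their degree
    # sequences at that layer (accumulated across layers afterwards). With opt2,
    # each node is compared only against ~2*log2(n) nodes of similar degree
    # rather than against all pairs. Results are cached under temp_path.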
def _compute_structural_distance(self, max_num_layers, workers=1, verbose=0,):
if os.path.exists(self.temp_path+'structural_dist.pkl'):
structural_dist = pd.read_pickle(
self.temp_path+'structural_dist.pkl')
else:
if self.opt1_reduce_len:
dist_func = cost_max
else:
dist_func = cost
if os.path.exists(self.temp_path + 'degreelist.pkl'):
degreeList = pd.read_pickle(self.temp_path + 'degreelist.pkl')
else:
degreeList = self._compute_ordered_degreelist(max_num_layers)
pd.to_pickle(degreeList, self.temp_path + 'degreelist.pkl')
if self.opt2_reduce_sim_calc:
degrees = self._create_vectors()
degreeListsSelected = {}
vertices = {}
n_nodes = len(self.idx)
            for v in self.idx:
                nbs = get_vertices(
                    v, len(self.graph[self.idx2node[v]]), degrees, n_nodes)
                vertices[v] = nbs  # nodes whose degree is closest to v's
                degreeListsSelected[v] = degreeList[v]  # keep v's own sequences
                for n in nbs:
                    # and the sequences of its candidate neighbors
                    degreeListsSelected[n] = degreeList[n]
else:
vertices = {}
for v in degreeList:
vertices[v] = [vd for vd in degreeList.keys() if vd > v]
results = Parallel(n_jobs=workers, verbose=verbose,)(
delayed(compute_dtw_dist)(part_list, degreeList, dist_func) for part_list in partition_dict(vertices, workers))
dtw_dist = dict(ChainMap(*results))
structural_dist = convert_dtw_struc_dist(dtw_dist)
pd.to_pickle(structural_dist, self.temp_path +
'structural_dist.pkl')
return structural_dist
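
    # Index nodes by degree, and chain each degree to its neighbors in sorted
    # order ('before'/'after') so get_vertices can scan outwards from any degree.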
def _create_vectors(self):
        degrees = {}  # degree -> {'vertices': [...], 'before': ..., 'after': ...}
        degrees_sorted = set()  # every distinct degree in the graph
G = self.graph
for v in self.idx:
degree = len(G[self.idx2node[v]])
degrees_sorted.add(degree)
if (degree not in degrees):
degrees[degree] = {}
degrees[degree]['vertices'] = []
degrees[degree]['vertices'].append(v)
degrees_sorted = np.array(list(degrees_sorted), dtype='int')
degrees_sorted = np.sort(degrees_sorted)
        num_degrees = len(degrees_sorted)
        for index, degree in enumerate(degrees_sorted):
            if (index > 0):
                degrees[degree]['before'] = degrees_sorted[index - 1]
            if (index < (num_degrees - 1)):
                degrees[degree]['after'] = degrees_sorted[index + 1]
return degrees
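
    # Re-key pair -> {layer: distance} into per-layer structures: one adjacency
    # list and one edge-distance map per layer of the context graph.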
def _get_layer_rep(self, pair_distances):
layer_distances = {}
layer_adj = {}
for v_pair, layer_dist in pair_distances.items():
for layer, distance in layer_dist.items():
vx = v_pair[0]
vy = v_pair[1]
layer_distances.setdefault(layer, {})
layer_distances[layer][vx, vy] = distance
layer_adj.setdefault(layer, {})
layer_adj[layer].setdefault(vx, [])
layer_adj[layer].setdefault(vy, [])
layer_adj[layer][vx].append(vy)
layer_adj[layer][vy].append(vx)
return layer_adj, layer_distances
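
    # Context-graph edge weights are w = exp(-distance), normalized per node;
    # create_alias_table converts each weight list into accept/alias arrays so
    # the walker can sample a neighbor in O(1).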
def _get_transition_probs(self, layers_adj, layers_distances):
layers_alias = {}
layers_accept = {}
for layer in layers_adj:
neighbors = layers_adj[layer]
layer_distances = layers_distances[layer]
node_alias_dict = {}
node_accept_dict = {}
norm_weights = {}
            for v, v_neighbors in neighbors.items():
                e_list = []
                sum_w = 0.0

                for n in v_neighbors:
                    if (v, n) in layer_distances:
                        wd = layer_distances[v, n]
                    else:
                        wd = layer_distances[n, v]
w = np.exp(-float(wd))
e_list.append(w)
sum_w += w
e_list = [x / sum_w for x in e_list]
norm_weights[v] = e_list
accept, alias = create_alias_table(e_list)
node_alias_dict[v] = alias
node_accept_dict[v] = accept
pd.to_pickle(
norm_weights, self.temp_path + 'norm_weights_distance-layer-' + str(layer)+'.pkl')
layers_alias[layer] = node_alias_dict
layers_accept[layer] = node_accept_dict
return layers_accept, layers_alias
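

# DTW step costs from the paper: the cost between degrees a and b is
# (max + ep) / (min + ep) - 1, where ep = 0.5 smooths degree-0 nodes. cost()
# works on raw degree lists; cost_min()/cost_max() work on opt1-compressed
# (degree, count) pairs, scaling the ratio by the smaller/larger count.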
def cost(a, b):
ep = 0.5
m = max(a, b) + ep
mi = min(a, b) + ep
return ((m / mi) - 1)
def cost_min(a, b):
ep = 0.5
m = max(a[0], b[0]) + ep
mi = min(a[0], b[0]) + ep
return ((m / mi) - 1) * min(a[1], b[1])
def cost_max(a, b):
ep = 0.5
m = max(a[0], b[0]) + ep
mi = min(a[0], b[0]) + ep
return ((m / mi) - 1) * max(a[1], b[1])
def convert_dtw_struc_dist(distances, startLayer=1):
    """Accumulate the per-layer DTW distances so that the structural distance
    at layer k is the sum of the DTW distances of layers <= k (f_k in the paper).

    :param distances: dict mapping a node pair to {layer: dtw_distance}
    :param startLayer: first layer at which accumulation starts
    :return: the same dict, modified in place
    """
for vertices, layers in distances.items():
keys_layers = sorted(layers.keys())
startLayer = min(len(keys_layers), startLayer)
for layer in range(0, startLayer):
keys_layers.pop(0)
for layer in keys_layers:
layers[layer] += layers[layer - 1]
return distances
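

# opt2 candidate selection: collect up to ~2*log2(n) nodes, starting with those
# of the same degree as v and scanning outwards along the ordered degree list;
# StopIteration doubles as an early exit from the nested loops.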
def get_vertices(v, degree_v, degrees, n_nodes):
a_vertices_selected = 2 * math.log(n_nodes, 2)
vertices = []
try:
c_v = 0
for v2 in degrees[degree_v]['vertices']:
if (v != v2):
vertices.append(v2) # same degree
c_v += 1
if (c_v > a_vertices_selected):
raise StopIteration
if ('before' not in degrees[degree_v]):
degree_b = -1
else:
degree_b = degrees[degree_v]['before']
if ('after' not in degrees[degree_v]):
degree_a = -1
else:
degree_a = degrees[degree_v]['after']
if (degree_b == -1 and degree_a == -1):
raise StopIteration # not anymore v
degree_now = verifyDegrees(degrees, degree_v, degree_a, degree_b)
# nearest valid degree
while True:
for v2 in degrees[degree_now]['vertices']:
if (v != v2):
vertices.append(v2)
c_v += 1
if (c_v > a_vertices_selected):
raise StopIteration
if (degree_now == degree_b):
if ('before' not in degrees[degree_b]):
degree_b = -1
else:
degree_b = degrees[degree_b]['before']
else:
if ('after' not in degrees[degree_a]):
degree_a = -1
else:
degree_a = degrees[degree_a]['after']
if (degree_b == -1 and degree_a == -1):
raise StopIteration
degree_now = verifyDegrees(degrees, degree_v, degree_a, degree_b)
except StopIteration:
return list(vertices)
return list(vertices)
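

# Of the two adjacent degrees still unexplored, pick the one closer to the
# root's degree (-1 marks an exhausted direction).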
def verifyDegrees(degrees, degree_v_root, degree_a, degree_b):
if(degree_b == -1):
degree_now = degree_a
elif(degree_a == -1):
degree_now = degree_b
elif(abs(degree_b - degree_v_root) < abs(degree_a - degree_v_root)):
degree_now = degree_b
else:
degree_now = degree_a
return degree_now
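

# joblib worker: for every assigned (v1, v2) pair, run fastdtw (radius=1) layer
# by layer over the two nodes' degree sequences.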
def compute_dtw_dist(part_list, degreeList, dist_func):
dtw_dist = {}
    for v1, nbs in part_list:
        lists_v1 = degreeList[v1]  # ordered degree sequences of v1, per layer
        for v2 in nbs:
            lists_v2 = degreeList[v2]  # ordered degree sequences of v2, per layer
            max_layer = min(len(lists_v1), len(lists_v2))  # layers both nodes have
dtw_dist[v1, v2] = {}
for layer in range(0, max_layer):
dist, path = fastdtw(
lists_v1[layer], lists_v2[layer], radius=1, dist=dist_func)
dtw_dist[v1, v2][layer] = dist
return dtw_dist


@@ -138,3 +138,80 @@ class RandomWalker:
self.alias_edges = alias_edges
return
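

# Walker over the multi-layer context graph pickled by Struc2Vec: at each step
# it stays in the current layer with probability stay_prob and moves to an
# alias-sampled structural neighbor, otherwise it hops up or down one layer.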
class BiasedWalker:
def __init__(self, idx2node, temp_path):
self.idx2node = idx2node
self.idx = list(range(len(self.idx2node)))
        self.temp_path = temp_path
def simulate_walks(self, num_walks, walk_length, stay_prob=0.3, workers=1, verbose=0):
layers_adj = pd.read_pickle(self.temp_path+'layers_adj.pkl')
layers_alias = pd.read_pickle(self.temp_path+'layers_alias.pkl')
layers_accept = pd.read_pickle(self.temp_path+'layers_accept.pkl')
gamma = pd.read_pickle(self.temp_path+'gamma.pkl')
        nodes = self.idx
results = Parallel(n_jobs=workers, verbose=verbose, )(
delayed(self._simulate_walks)(nodes, num, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma) for num in
partition_num(num_walks, workers))
walks = list(itertools.chain(*results))
return walks
def _simulate_walks(self, nodes, num_walks, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma):
walks = []
for _ in range(num_walks):
random.shuffle(nodes)
for v in nodes:
walks.append(self._exec_random_walk(layers_adj, layers_accept, layers_alias,
v, walk_length, gamma, stay_prob))
return walks
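
    # One walk of length walk_length starting at node v in layer 0. The path
    # stores node labels (via idx2node); layer hops only change which layer's
    # alias tables the next neighbor is sampled from.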
def _exec_random_walk(self, graphs, layers_accept, layers_alias, v, walk_length, gamma, stay_prob=0.3):
initialLayer = 0
layer = initialLayer
path = []
path.append(self.idx2node[v])
while len(path) < walk_length:
r = random.random()
if(r < stay_prob): # same layer
v = chooseNeighbor(v, graphs, layers_alias,
layers_accept, layer)
path.append(self.idx2node[v])
            else:  # hop to an adjacent layer
                r = random.random()
                try:
                    # move-up probability from the paper:
                    # p = log(gamma + e) / (log(gamma + e) + 1); gamma = 0 gives
                    # x = 1 and p = 0.5, and larger gamma pushes p toward 1.
                    x = math.log(gamma[layer][v] + math.e)
                    p_moveup = (x / (x + 1))
                except KeyError:
                    raise ValueError(
                        'no gamma entry for layer {} node {}'.format(layer, v))
if(r > p_moveup):
if(layer > initialLayer):
layer = layer - 1
else:
if((layer + 1) in graphs and v in graphs[layer + 1]):
layer = layer + 1
return path
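

# Draw a neighbor of v in the given layer in O(1) with the alias method.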
def chooseNeighbor(v, graphs, layers_alias, layers_accept, layer):
v_list = graphs[layer][v]
idx = alias_sample(layers_accept[layer][v], layers_alias[layer][v])
v = v_list[idx]
return v