update

This commit is contained in:
parent 4b060fd879
commit 831dcce14e

README.md (45 changed lines)
@@ -1 +1,44 @@
# GraphEmbedding

# Method

| Model    | Paper                                                                                                                      | Note                                                                                                            |
| :------: | :------------------------------------------------------------------------------------------------------------------------ | :-------------------------------------------------------------------------------------------------------------- |
| DeepWalk | [KDD 2014] [DeepWalk: Online Learning of Social Representations](http://www.perozzi.net/publications/14_kdd_deepwalk.pdf) | [【Graph Embedding】DeepWalk: Algorithm, Implementation and Application](https://zhuanlan.zhihu.com/p/56380812) |
| LINE     | [WWW 2015] [LINE: Large-scale Information Network Embedding](https://arxiv.org/pdf/1503.03578.pdf)                        | [【Graph Embedding】LINE: Algorithm, Implementation and Application](https://zhuanlan.zhihu.com/p/56478167)     |

# How to run examples

1. Clone the repo and make sure you have installed `tensorflow` or `tensorflow-gpu` on your local machine.
2. Run the following commands:

```bash
python setup.py install
cd examples
python deepwalk_wiki.py
```

# Usage

The design and implementation follow a simple principle (**graph in, embedding out**) as much as possible.

## Input format

We use `networkx` to build graphs. The input edge list has one edge per line:

`node1 node2 <edge_weight>`

![](./pics/edge_list.png)
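
For illustration, here is a minimal sketch of such an edge list and how it is loaded with `networkx` (the file name, nodes and weights are made up):

```python
import networkx as nx

# write a tiny illustrative edge list
with open('toy_edgelist.txt', 'w') as f:
    f.write("0 1 3\n1 2 1\n2 0 2\n")

G = nx.read_edgelist('toy_edgelist.txt', create_using=nx.DiGraph(),
                     nodetype=None, data=[('weight', int)])
print(G['0']['1']['weight'])  # nodes are read as strings; prints 3
```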
## DeepWalk

```python
G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                     create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])  # read graph

model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)  # init model
model.train(window_size=5, iter=3)  # train model
embeddings = model.get_embeddings()  # get embedding vectors
```

## LINE

```python
G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                     create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])  # read graph

model = LINE(G, embedding_size=128, order='second')  # init model; order can be 'first', 'second' or 'all'
model.train(batch_size=1024, epochs=50, verbose=2)  # train model
embeddings = model.get_embeddings()  # get embedding vectors
```

data/wiki/Wiki_category.txt (new file, 2405 lines): file diff suppressed because it is too large
data/wiki/Wiki_edgelist.txt (new file, 17981 lines): file diff suppressed because it is too large
data/wiki/wiki_labels.txt (new file, 2405 lines): file diff suppressed because it is too large

examples/deepwalk_wiki.py (new file, 52 lines)
@@ -0,0 +1,52 @@
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE

from ge import DeepWalk
from ge.classify import Classifier, read_node_label


def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    # project embeddings to 2D with t-SNE and color nodes by their first label
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

    model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
    model.train(window_size=5, iter=3)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)

examples/line_wiki.py (new file, 52 lines)
@@ -0,0 +1,52 @@
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE

from ge import LINE
from ge.classify import Classifier, read_node_label


def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    # project embeddings to 2D with t-SNE and color nodes by their first label
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

    model = LINE(G, embedding_size=128, order='second')
    model.train(batch_size=1024, epochs=45, verbose=2)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)

ge/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
from .models import *

ge/alias.py (new file, 54 lines)
@@ -0,0 +1,54 @@
import numpy as np


def create_alias_table(area_ratio):
    """Build the accept/alias tables for Vose's alias method.

    :param area_ratio: positive weights of the discrete distribution (e.g. normalized so that sum(area_ratio)=1)
    :return: accept, alias
    """
    l = len(area_ratio)
    accept, alias = [0] * l, [0] * l
    small, large = [], []

    # Rescale so the average bucket area is exactly 1: the comparisons against 1.0
    # below require this. Working on a copy also avoids mutating the caller's list.
    area_ratio_ = np.array(area_ratio, dtype=np.float64) * (l / np.sum(area_ratio))

    for i, prob in enumerate(area_ratio_):
        if prob < 1.0:
            small.append(i)
        else:
            large.append(i)

    while small and large:
        small_idx, large_idx = small.pop(), large.pop()
        accept[small_idx] = area_ratio_[small_idx]
        alias[small_idx] = large_idx
        area_ratio_[large_idx] = area_ratio_[large_idx] - \
            (1 - area_ratio_[small_idx])
        if area_ratio_[large_idx] < 1.0:
            small.append(large_idx)
        else:
            large.append(large_idx)

    while large:
        large_idx = large.pop()
        accept[large_idx] = 1
    while small:
        small_idx = small.pop()
        accept[small_idx] = 1

    return accept, alias


def alias_sample(accept, alias):
    """Draw one sample in O(1) from the tables built by create_alias_table.

    :param accept: acceptance probability per bucket
    :param alias: alias index per bucket
    :return: sample index
    """
    N = len(accept)
    i = int(np.random.random() * N)
    r = np.random.random()
    if r < accept[i]:
        return i
    else:
        return alias[i]
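
A minimal sketch of how these two functions fit together (the probabilities below are made up for illustration):

```python
from collections import Counter

from ge.alias import create_alias_table, alias_sample

probs = [0.5, 0.3, 0.2]                    # toy distribution, sums to 1
accept, alias = create_alias_table(probs)  # O(n) preprocessing
draws = Counter(alias_sample(accept, alias) for _ in range(100000))  # O(1) per draw
print({k: v / 100000.0 for k, v in sorted(draws.items())})  # roughly {0: 0.5, 1: 0.3, 2: 0.2}
```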

ge/classify.py (new file, 84 lines)
@@ -0,0 +1,84 @@
from __future__ import print_function

import numpy
from sklearn.metrics import f1_score, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer


class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            # keep the k most probable labels for sample i
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()
            probs_[:] = 0
            probs_[labels] = 1
            all_labels.append(probs_)
        return numpy.asarray(all_labels)


class Classifier(object):

    def __init__(self, embeddings, clf):
        self.embeddings = embeddings
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        self.binarizer.fit(Y_all)
        X_train = [self.embeddings[x] for x in X]
        Y = self.binarizer.transform(Y)
        self.clf.fit(X_train, Y)

    def evaluate(self, X, Y):
        top_k_list = [len(l) for l in Y]
        Y_ = self.predict(X, top_k_list)
        Y = self.binarizer.transform(Y)
        averages = ["micro", "macro", "samples", "weighted"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        # results['acc'] = accuracy_score(Y, Y_)
        print('-------------------')
        print(results)
        return results

    def predict(self, X, top_k_list):
        X_ = numpy.asarray([self.embeddings[x] for x in X])
        Y = self.clf.predict(X_, top_k_list=top_k_list)
        return Y

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        state = numpy.random.get_state()

        training_size = int(train_precent * len(X))
        numpy.random.seed(seed)
        shuffle_indices = numpy.random.permutation(numpy.arange(len(X)))
        X_train = [X[shuffle_indices[i]] for i in range(training_size)]
        Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
        X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
        Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]

        self.train(X_train, Y_train, Y)
        numpy.random.set_state(state)
        return self.evaluate(X_test, Y_test)


def read_node_label(filename, skip_head=False):
    fin = open(filename, 'r')
    X = []
    Y = []
    if skip_head:
        fin.readline()
    while 1:
        l = fin.readline()
        if l == '':
            break
        vec = l.strip().split(' ')
        X.append(vec[0])
        Y.append(vec[1:])
    fin.close()
    return X, Y
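
`read_node_label` expects one node per line followed by its label(s), space-separated. A short sketch of the expected layout and of driving the classifier; the file name and labels are made up, and `embeddings` is assumed to already exist (e.g. from `DeepWalk(...).get_embeddings()`):

```python
from sklearn.linear_model import LogisticRegression

from ge.classify import Classifier, read_node_label

# hypothetical toy_labels.txt:
#   0 politics
#   1 sports history
#   2 sports
X, Y = read_node_label('toy_labels.txt')   # X: node ids, Y: list of label lists

# embeddings: dict mapping node id -> vector (assumed to exist)
clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
clf.split_train_evaluate(X, Y, 0.8)        # train on 80% of the nodes, report F1 on the rest
```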

ge/models/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from .deepwalk import DeepWalk
from .line import LINE


__all__ = ["DeepWalk", "LINE"]
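
Because of these re-exports (together with `from .models import *` in `ge/__init__.py` above), both models can be imported directly from the package root, which is what the example scripts do:

```python
from ge import DeepWalk, LINE  # equivalent to importing from ge.models
```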

ge/models/deepwalk.py (new file, 46 lines)
@@ -0,0 +1,46 @@
from gensim.models import Word2Vec

from ..walker import RandomWalker


class DeepWalk:
    def __init__(self, graph, walk_length, num_walks, workers=1):

        self.graph = graph
        self.w2v_model = None
        self._embeddings = {}

        # p=q=1 makes the walker perform plain uniform random walks (DeepWalk)
        self.walker = RandomWalker(graph, p=1, q=1)
        self.sentences = self.walker.simulate_walks(
            num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)

    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

        kwargs["sentences"] = self.sentences
        kwargs["min_count"] = kwargs.get("min_count", 0)
        kwargs["size"] = embed_size  # gensim 3.x keyword (renamed to vector_size in gensim 4)
        kwargs["sg"] = 1  # skip-gram
        kwargs["hs"] = 1  # DeepWalk uses hierarchical softmax
        kwargs["workers"] = workers
        kwargs["window"] = window_size
        kwargs["iter"] = iter  # gensim 3.x keyword (renamed to epochs in gensim 4)

        print("Learning representation...")
        model = Word2Vec(**kwargs)
        print("Learning representation done!")

        self.w2v_model = model
        return model

    def get_embeddings(self):
        if self.w2v_model is None:
            print("model not trained")
            return {}

        self._embeddings = {}
        for word in self.graph.nodes():
            self._embeddings[word] = self.w2v_model.wv[word]

        return self._embeddings
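
Since `train` returns the underlying gensim `Word2Vec` model, it can also be queried directly, for example to find a node's nearest neighbors in embedding space. A sketch, assuming `G` is a loaded networkx graph and node id `'0'` exists in it:

```python
from ge import DeepWalk

model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)  # G: a networkx graph (assumed)
w2v = model.train(window_size=5, iter=3)
print(w2v.wv.most_similar('0', topn=5))  # 5 closest nodes to node '0' by cosine similarity
```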

ge/models/line.py (new file, 194 lines)
@@ -0,0 +1,194 @@
import math
import random

import numpy as np
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Embedding, Input, Lambda
from tensorflow.python.keras.models import Model

from ..alias import create_alias_table, alias_sample
from ..utils import preprocess_nxgraph


def line_loss(y_true, y_pred):
    # negative log-likelihood of sigmoid(y_true * y_pred);
    # y_true is +1 for observed edges and -1 for negative samples
    return -K.mean(K.log(K.sigmoid(y_true * y_pred)))


def create_model(numNodes, embedding_size, order='second'):

    v_i = Input(shape=(1,))
    v_j = Input(shape=(1,))

    first_emb = Embedding(numNodes, embedding_size, name='first_emb')
    second_emb = Embedding(numNodes, embedding_size, name='second_emb')
    context_emb = Embedding(numNodes, embedding_size, name='context_emb')

    v_i_emb = first_emb(v_i)
    v_j_emb = first_emb(v_j)

    v_i_emb_second = second_emb(v_i)
    v_j_context_emb = context_emb(v_j)

    # inner products for first- and second-order proximity
    # (keep_dims is the TF 1.x spelling of keepdims)
    first = Lambda(lambda x: tf.reduce_sum(
        x[0] * x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb])
    second = Lambda(lambda x: tf.reduce_sum(
        x[0] * x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb])

    if order == 'first':
        output_list = [first]
    elif order == 'second':
        output_list = [second]
    else:
        output_list = [first, second]

    model = Model(inputs=[v_i, v_j], outputs=output_list)

    return model, {'first': first_emb, 'second': second_emb}


class LINE:
    def __init__(self, graph, embedding_size=8, negative_ratio=5, order='second'):
        """
        :param graph: networkx graph
        :param embedding_size: dimension of the embedding vectors
        :param negative_ratio: number of negative samples per positive edge
        :param order: 'first', 'second' or 'all'
        """
        if order not in ['first', 'second', 'all']:
            raise ValueError("order must be 'first', 'second' or 'all'")

        self.graph = graph
        self.idx2node, self.node2idx = preprocess_nxgraph(graph)
        self.use_alias = True

        self.rep_size = embedding_size
        self.order = order

        self._embeddings = {}
        self.negative_ratio = negative_ratio

        self.node_size = graph.number_of_nodes()
        self.edge_size = graph.number_of_edges()
        self.samples_per_epoch = self.edge_size * (1 + negative_ratio)

        self._gen_sampling_table()
        self.reset_model()

    def reset_training_config(self, batch_size, times):
        self.batch_size = batch_size
        self.steps_per_epoch = (
            (self.samples_per_epoch - 1) // self.batch_size + 1) * times

    def reset_model(self, opt='adam'):

        self.model, self.embedding_dict = create_model(
            self.node_size, self.rep_size, self.order)
        self.model.compile(opt, line_loss)
        self.batch_it = self.batch_iter(self.node2idx)

    def _gen_sampling_table(self):

        # create sampling table for vertices (degree^0.75, as in word2vec negative sampling)
        power = 0.75
        numNodes = self.node_size
        node_degree = np.zeros(numNodes)  # out degree
        node2idx = self.node2idx

        for edge in self.graph.edges():
            node_degree[node2idx[edge[0]]
                        ] += self.graph[edge[0]][edge[1]].get('weight', 1.0)

        total_sum = sum([math.pow(node_degree[i], power)
                         for i in range(numNodes)])
        norm_prob = [float(math.pow(node_degree[j], power)) /
                     total_sum for j in range(numNodes)]

        self.node_accept, self.node_alias = create_alias_table(norm_prob)

        # create sampling table for edges (proportional to edge weight)
        numEdges = self.graph.number_of_edges()
        total_sum = sum([self.graph[edge[0]][edge[1]].get('weight', 1.0)
                         for edge in self.graph.edges()])
        norm_prob = [self.graph[edge[0]][edge[1]].get('weight', 1.0) *
                     numEdges / total_sum for edge in self.graph.edges()]

        self.edge_accept, self.edge_alias = create_alias_table(norm_prob)

    def batch_iter(self, node2idx):

        edges = [(node2idx[x[0]], node2idx[x[1]]) for x in self.graph.edges()]

        data_size = self.graph.number_of_edges()
        shuffle_indices = np.random.permutation(np.arange(data_size))
        # mod == 0: yield positive edges; mod > 0: yield negative samples for the same heads
        mod = 0
        mod_size = 1 + self.negative_ratio
        h = []
        t = []
        sign = 0
        count = 0
        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while True:
            if mod == 0:
                h = []
                t = []
                for i in range(start_index, end_index):
                    if random.random() >= self.edge_accept[shuffle_indices[i]]:
                        shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]
                    cur_h = edges[shuffle_indices[i]][0]
                    cur_t = edges[shuffle_indices[i]][1]
                    h.append(cur_h)
                    t.append(cur_t)
                sign = np.ones(len(h))
            else:
                sign = np.ones(len(h)) * -1
                t = []
                for i in range(len(h)):
                    t.append(alias_sample(
                        self.node_accept, self.node_alias))

            if self.order == 'all':
                yield ([np.array(h), np.array(t)], [sign, sign])
            else:
                yield ([np.array(h), np.array(t)], [sign])
            mod += 1
            mod %= mod_size
            if mod == 0:
                start_index = end_index
                end_index = min(start_index + self.batch_size, data_size)

            if start_index >= data_size:
                count += 1
                mod = 0
                h = []
                shuffle_indices = np.random.permutation(np.arange(data_size))
                start_index = 0
                end_index = min(start_index + self.batch_size, data_size)

    def get_embeddings(self):
        self._embeddings = {}
        if self.order == 'first':
            embeddings = self.embedding_dict['first'].get_weights()[0]
        elif self.order == 'second':
            embeddings = self.embedding_dict['second'].get_weights()[0]
        else:
            embeddings = np.hstack((self.embedding_dict['first'].get_weights()[
                                    0], self.embedding_dict['second'].get_weights()[0]))
        idx2node = self.idx2node
        for i, embedding in enumerate(embeddings):
            self._embeddings[idx2node[i]] = embedding

        return self._embeddings

    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1):
        self.reset_training_config(batch_size, times)
        hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch,
                                        steps_per_epoch=self.steps_per_epoch, verbose=verbose)
        return hist
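
As a sanity check on `line_loss`, here is a numpy-only sketch of the same computation with made-up values: a positive pair (`y_true = +1`) with a large inner product gives a small loss, while a negative sample (`y_true = -1`) with the same inner product is penalized heavily.

```python
import numpy as np

def line_loss_np(y_true, y_pred):
    # numpy equivalent of the Keras line_loss above
    return -np.mean(np.log(1.0 / (1.0 + np.exp(-y_true * y_pred))))

print(line_loss_np(np.array([1.0]), np.array([3.0])))   # ~0.049: positive edge scored high, low loss
print(line_loss_np(np.array([-1.0]), np.array([3.0])))  # ~3.049: negative sample scored high, high loss
```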

ge/utils.py (new file, 46 lines)
@@ -0,0 +1,46 @@
def preprocess_nxgraph(graph):
    """Build node <-> integer index mappings for a networkx graph."""
    node2idx = {}
    idx2node = []
    node_size = 0
    for node in graph.nodes():
        node2idx[node] = node_size
        idx2node.append(node)
        node_size += 1
    return idx2node, node2idx


def partition_dict(vertices, workers):
    batch_size = (len(vertices) - 1) // workers + 1
    part_list = []
    part = []
    count = 0
    for v1, nbs in vertices.items():
        part.append((v1, nbs))
        count += 1
        if count % batch_size == 0:
            part_list.append(part)
            part = []
    if len(part) > 0:
        part_list.append(part)
    return part_list


def partition_list(vertices, workers):
    batch_size = (len(vertices) - 1) // workers + 1
    part_list = []
    part = []
    count = 0
    for v1, nbs in enumerate(vertices):
        part.append((v1, nbs))
        count += 1
        if count % batch_size == 0:
            part_list.append(part)
            part = []
    if len(part) > 0:
        part_list.append(part)
    return part_list


def partition_num(num, workers):
    """Split num tasks into per-worker chunk sizes."""
    if num % workers == 0:
        return [num // workers] * workers
    else:
        return [num // workers] * workers + [num % workers]
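
`partition_num` is what `simulate_walks` in `ge/walker.py` uses to split the requested number of walks across workers; a quick illustration:

```python
from ge.utils import partition_num

print(partition_num(80, 3))  # [26, 26, 26, 2]: three equal chunks plus the remainder
print(partition_num(80, 4))  # [20, 20, 20, 20]: divides evenly
```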

ge/walker.py (new file, 237 lines)
@@ -0,0 +1,237 @@
from __future__ import print_function

import itertools
import math
import random

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import trange

from .alias import alias_sample, create_alias_table
from .utils import partition_num


class RandomWalker:
    def __init__(self, G, p=1, q=1):
        """
        :param G: networkx graph
        :param p: return parameter, controls the likelihood of immediately revisiting a node in the walk.
        :param q: in-out parameter, allows the search to differentiate between "inward" and "outward" nodes.
        """
        self.G = G
        self.p = p
        self.q = q

    def deepwalk_walk(self, walk_length, start_node):

        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(self.G.neighbors(cur))
            if len(cur_nbrs) > 0:
                walk.append(random.choice(cur_nbrs))
            else:
                break
        return walk

    def node2vec_walk(self, walk_length, start_node):
        '''
        Simulate a biased random walk starting from start_node.
        '''
        G = self.G
        alias_nodes = self.alias_nodes
        alias_edges = self.alias_edges

        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(G.neighbors(cur))
            if len(cur_nbrs) > 0:
                if len(walk) == 1:
                    walk.append(
                        cur_nbrs[alias_sample(alias_nodes[cur][0], alias_nodes[cur][1])])
                else:
                    prev = walk[-2]
                    pos = (prev, cur)
                    next_node = cur_nbrs[alias_sample(alias_edges[pos][0],
                                                      alias_edges[pos][1])]
                    walk.append(next_node)
            else:
                break

        return walk

    def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):
        '''
        Repeatedly simulate random walks from each node.
        '''
        G = self.G

        nodes = list(G.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose)(
            delayed(self._simulate_walks)(nodes, num, walk_length) for num in
            partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))

        return walks

    def _simulate_walks(self, nodes, num_walks, walk_length):
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for v in nodes:
                if self.p == 1 and self.q == 1:
                    # unbiased walk (DeepWalk)
                    walks.append(self.deepwalk_walk(
                        walk_length=walk_length, start_node=v))
                else:
                    # biased walk (node2vec); requires preprocess_transition_probs() first
                    walks.append(self.node2vec_walk(
                        walk_length=walk_length, start_node=v))
        return walks

    def get_alias_edge(self, src, dst):
        '''
        Get the alias edge setup lists for a given edge.
        '''
        G = self.G
        p = self.p
        q = self.q

        unnormalized_probs = []
        for dst_nbr in G.neighbors(dst):
            weight = G[dst][dst_nbr].get('weight', 1.0)
            if dst_nbr == src:
                unnormalized_probs.append(weight / p)
            elif G.has_edge(dst_nbr, src):
                unnormalized_probs.append(weight)
            else:
                unnormalized_probs.append(weight / q)
        norm_const = sum(unnormalized_probs)
        normalized_probs = [
            float(u_prob) / norm_const for u_prob in unnormalized_probs]

        return create_alias_table(normalized_probs)

    def preprocess_transition_probs(self):
        '''
        Preprocessing of transition probabilities for guiding the random walks.
        '''
        G = self.G

        alias_nodes = {}
        for node in G.nodes():
            unnormalized_probs = [G[node][nbr].get('weight', 1.0)
                                  for nbr in G.neighbors(node)]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [
                float(u_prob) / norm_const for u_prob in unnormalized_probs]
            alias_nodes[node] = create_alias_table(normalized_probs)

        alias_edges = {}

        for edge in G.edges():
            alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])

        self.alias_nodes = alias_nodes
        self.alias_edges = alias_edges

        return
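
A short usage sketch for `RandomWalker` (the graph `G` and the parameter values are assumed): with `p=q=1` the walks are plain DeepWalk walks; for any other values, `preprocess_transition_probs()` must be called first so that `alias_nodes`/`alias_edges` exist before `node2vec_walk` is used.

```python
from ge.walker import RandomWalker

walker = RandomWalker(G, p=0.25, q=4)  # G: a weighted networkx graph (assumed)
walker.preprocess_transition_probs()   # build alias tables for nodes and edges
walks = walker.simulate_walks(num_walks=10, walk_length=40, workers=1, verbose=1)
print(len(walks), walks[0][:5])        # num_walks * |V| walks; first 5 nodes of one walk
```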
class BiasedWalker:
    def __init__(self, idx2node, temp_path):

        self.idx2node = idx2node
        self.idx = list(range(len(self.idx2node)))
        self.temp_path = temp_path

    def simulate_walks(self, num_walks, walk_length):

        # adjacency lists of each layer
        layers_adj = pd.read_pickle(self.temp_path + 'layers_adj.pkl')
        layers_alias = pd.read_pickle(self.temp_path + 'layers_alias.pkl')
        layers_accept = pd.read_pickle(self.temp_path + 'layers_accept.pkl')
        # gamma: number of neighbors with weight above the layer average
        gamma = pd.read_pickle(self.temp_path + 'gamma.pkl')
        walks = []
        initialLayer = 0

        nodes = self.idx  # list(self.g.nodes())
        print('Walk iteration:')
        for walk_iter in trange(num_walks):
            random.shuffle(nodes)
            for v in nodes:
                walks.append(self._exec_random_walk(layers_adj, layers_alias,
                                                    layers_accept, v, walk_length, gamma))
        pd.to_pickle(walks, self.temp_path + 'walks.pkl')
        return walks

    def _exec_random_walk(self, graphs, layers_alias, layers_accept, v, walk_length, gamma):
        original_v = v
        initialLayer = 0
        layer = initialLayer

        path = []
        path.append(self.idx2node[v])

        while len(path) < walk_length:
            r = random.random()

            if r < 0.3:  # stay in the same layer
                v = chooseNeighbor(v, graphs, layers_alias,
                                   layers_accept, layer)
                path.append(self.idx2node[v])

            else:  # move to a different layer
                r = random.random()
                try:
                    limiar_moveup = prob_moveup(gamma[layer][v])
                except Exception:
                    print(layer, v)
                    raise ValueError()

                if r > limiar_moveup:
                    if layer > initialLayer:
                        layer = layer - 1
                else:
                    if (layer + 1) in graphs and v in graphs[layer + 1]:
                        layer = layer + 1

        return path


def chooseNeighbor(v, graphs, layers_alias, layers_accept, layer):

    v_list = graphs[layer][v]

    idx = alias_sample(layers_accept[layer][v], layers_alias[layer][v])
    v = v_list[idx]

    return v


def prob_moveup(gamma):
    x = math.log(gamma + math.e)
    p = (x / (x + 1))
    return p

pics/edge_list.png (new binary file, 26 KiB): binary file not shown

setup.py (new file, 50 lines)
@@ -0,0 +1,50 @@
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

REQUIRED_PACKAGES = [
    # 'tensorflow>=1.4.0,<=1.12.0',
    'gensim',
    'networkx',
    'scikit-learn',
    'numpy'
]

setuptools.setup(
    name="ge",
    version="0.0.0",
    author="Weichen Shen",
    author_email="wcshen1994@163.com",
    url="https://github.com/shenweichen/GraphEmbedding",
    packages=setuptools.find_packages(exclude=[]),
    python_requires='>=3.4',  # 3.4.6
    install_requires=REQUIRED_PACKAGES,
    extras_require={
        "tf": ['tensorflow>=1.4.0,!=1.7.*,!=1.8.*'],
        "tf_gpu": ['tensorflow-gpu>=1.4.0,!=1.7.*,!=1.8.*'],
    },
    entry_points={
    },
    license="MIT license",
)