Weichen Shen 2019-02-13 21:04:23 +08:00
parent 004167d410
commit 1ec2d36704
7 changed files with 290 additions and 1 deletions

README.md

@@ -8,6 +8,7 @@
| DeepWalk | [KDD 2014][DeepWalk: Online Learning of Social Representations](http://www.perozzi.net/publications/14_kdd_deepwalk.pdf) | [【Graph Embedding】DeepWalk: algorithm principles, implementation and application](https://zhuanlan.zhihu.com/p/56380812) |
| LINE | [WWW 2015][LINE: Large-scale Information Network Embedding](https://arxiv.org/pdf/1503.03578.pdf) | [【Graph Embedding】LINE: algorithm principles, implementation and application](https://zhuanlan.zhihu.com/p/56478167) |
| Node2Vec | [KDD 2016][node2vec: Scalable Feature Learning for Networks](https://www.kdd.org/kdd2016/papers/files/rfp0218-groverA.pdf) | [【Graph Embedding】Node2Vec: algorithm principles, implementation and application](https://zhuanlan.zhihu.com/p/56542707) |
| SDNE | [KDD 2016][Structural Deep Network Embedding](https://www.kdd.org/kdd2016/papers/files/rfp0191-wangAemb.pdf) | [【Graph Embedding】SDNE: algorithm principles, implementation and application](https://zhuanlan.zhihu.com/p/56637181) |
# How to run examples
1. Clone the repo and make sure `tensorflow` or `tensorflow-gpu` is installed on your local machine.
@@ -53,3 +54,12 @@ model = Node2Vec(G, walk_length=10, num_walks=80, p=0.25, q=4, workers=1)  # init model
model.train(window_size=5, iter=3)  # train model
embeddings = model.get_embeddings()  # get embedding vectors
```
## SDNE
```python
G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])  # read graph
model = SDNE(G, hidden_size=[256, 128])  # init model
model.train(batch_size=3000, epochs=40, verbose=2)  # train model
embeddings = model.get_embeddings()  # get embedding vectors
```
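After training, the embeddings can be fed to any downstream classifier, which is exactly what `examples/sdne_wiki.py` below does. A minimal sketch, assuming the wiki label file shipped in `data/wiki/` and the repo's `Classifier` helper:

```python
from sklearn.linear_model import LogisticRegression
from ge.classify import read_node_label, Classifier

X, Y = read_node_label('../data/wiki/wiki_labels.txt')  # node ids and their labels
clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
clf.split_train_evaluate(X, Y, 0.8)  # train on 80% of nodes, evaluate on the rest
```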

examples/sdne_wiki.py Normal file

@@ -0,0 +1,54 @@
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE

from ge.classify import read_node_label, Classifier
from ge import SDNE


def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = np.array([embeddings[k] for k in X])

    # project the embeddings to 2-D for visualization
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    # group node indices by label so each class gets one color
    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

    model = SDNE(G, hidden_size=[256, 128])
    model.train(batch_size=3000, epochs=40, verbose=2)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)

ge/models/__init__.py

@@ -1,6 +1,7 @@
from .deepwalk import DeepWalk
from .line import LINE
from .node2vec import Node2Vec
from .sdne import SDNE
__all__ = ["DeepWalk", "LINE", "Node2Vec"]
__all__ = ["DeepWalk", "LINE", "Node2Vec", "SDNE"]

ge/models/deepwalk.py

@@ -1,3 +1,22 @@
# -*- coding:utf-8 -*-
"""
Author:
    Weichen Shen, wcshen1994@163.com

Reference:
    [1] Perozzi B, Al-Rfou R, Skiena S. Deepwalk: Online learning of social representations[C]//Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2014: 701-710.(http://www.perozzi.net/publications/14_kdd_deepwalk.pdf)
"""
from ..walker import RandomWalker
from gensim.models import Word2Vec
import pandas as pd

ge/models/line.py

@@ -1,3 +1,22 @@
# -*- coding:utf-8 -*-
"""
Author:
    Weichen Shen, wcshen1994@163.com

Reference:
    [1] Tang J, Qu M, Wang M, et al. Line: Large-scale information network embedding[C]//Proceedings of the 24th International Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2015: 1067-1077.(https://arxiv.org/pdf/1503.03578.pdf)
"""
import math
import random

ge/models/node2vec.py

@@ -1,3 +1,23 @@
# -*- coding:utf-8 -*-
"""
Author:
    Weichen Shen, wcshen1994@163.com

Reference:
    [1] Grover A, Leskovec J. node2vec: Scalable feature learning for networks[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2016: 855-864.(https://www.kdd.org/kdd2016/papers/files/rfp0218-groverA.pdf)
"""
from gensim.models import Word2Vec
import pandas as pd

ge/models/sdne.py Normal file

@@ -0,0 +1,166 @@
# -*- coding:utf-8 -*-
"""
Author:
    Weichen Shen, wcshen1994@163.com

Reference:
    [1] Wang D, Cui P, Zhu W. Structural deep network embedding[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2016: 1225-1234.(https://www.kdd.org/kdd2016/papers/files/rfp0191-wangAemb.pdf)
"""
import time

import numpy as np
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.callbacks import History
from tensorflow.python.keras.layers import Dense, Input
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.regularizers import l1_l2

from ..utils import preprocess_nxgraph
def l_2nd(beta):
    def loss_2nd(y_true, y_pred):
        # B from the paper: reconstruction errors on observed (non-zero) entries
        # are penalized beta times more heavily than errors on zeros.
        # Note: numpy ops (np.ones_like / boolean assignment) do not work on
        # symbolic tensors, so the mask is built with tf.where.
        b_ = tf.where(tf.not_equal(y_true, 0),
                      beta * tf.ones_like(y_true), tf.ones_like(y_true))
        x = K.square((y_true - y_pred) * b_)
        t = K.sum(x, axis=-1)
        return K.mean(t)
    return loss_2nd

def l_1st(alpha):
    def loss_1st(y_true, y_pred):
        L = y_true  # batch block of the graph Laplacian
        Y = y_pred  # embeddings of the nodes in the batch
        batch_size = tf.to_float(K.shape(L)[0])
        # 2 * tr(Y^T L Y) equals sum_{i,j} w_ij * ||y_i - y_j||^2
        return alpha * 2 * tf.linalg.trace(tf.matmul(tf.matmul(Y, L, transpose_a=True), Y)) / batch_size
    return loss_1st

def create_model(node_size, hidden_size=[256, 128], l1=1e-5, l2=1e-4):
    A = Input(shape=(node_size,))  # a batch of adjacency-matrix rows
    L = Input(shape=(None,))  # the matching block of the Laplacian

    # encoder: the last hidden layer is the embedding Y (1st-order output)
    fc = A
    for i in range(len(hidden_size)):
        if i == len(hidden_size) - 1:
            fc = Dense(hidden_size[i], activation='relu',
                       kernel_regularizer=l1_l2(l1, l2), name='1st')(fc)
        else:
            fc = Dense(hidden_size[i], activation='relu',
                       kernel_regularizer=l1_l2(l1, l2))(fc)
    Y = fc

    # decoder: reconstruct the adjacency row (2nd-order output)
    for i in reversed(range(len(hidden_size) - 1)):
        fc = Dense(hidden_size[i], activation='relu',
                   kernel_regularizer=l1_l2(l1, l2))(fc)
    A_ = Dense(node_size, activation='relu', name='2nd')(fc)

    model = Model(inputs=[A, L], outputs=[A_, Y])
    emb = Model(inputs=A, outputs=Y)
    return model, emb

class SDNE(object):
    def __init__(self, graph, hidden_size=[32, 16], alpha=1e-6, beta=5., nu1=1e-5, nu2=1e-4):
        self.graph = graph
        # self.g.remove_edges_from(self.g.selfloop_edges())
        self.idx2node, self.node2idx = preprocess_nxgraph(self.graph)

        self.node_size = self.graph.number_of_nodes()
        self.hidden_size = hidden_size
        self.alpha = alpha
        self.beta = beta
        self.nu1 = nu1
        self.nu2 = nu2

        self.A, self.L = self._create_A_L(
            self.graph, self.node2idx)  # adjacency matrix A and Laplacian L
        self.reset_model()
        self.inputs = [self.A, self.L]
        self._embeddings = {}

    def reset_model(self, opt='adam'):
        self.model, self.emb_model = create_model(self.node_size, hidden_size=self.hidden_size,
                                                  l1=self.nu1, l2=self.nu2)
        self.model.compile(opt, [l_2nd(self.beta), l_1st(self.alpha)])
        self.get_embeddings()
    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1):
        if batch_size >= self.node_size:
            if batch_size > self.node_size:
                print('batch_size({0}) > node_size({1}), setting batch_size = {1}'.format(
                    batch_size, self.node_size))
                batch_size = self.node_size
            return self.model.fit([self.A, self.L], [self.A, self.L], batch_size=batch_size,
                                  epochs=epochs, initial_epoch=initial_epoch, verbose=verbose,
                                  shuffle=False)
        else:
            steps_per_epoch = (self.node_size - 1) // batch_size + 1
            hist = History()
            hist.on_train_begin()
            logs = {}
            for epoch in range(initial_epoch, epochs):
                start_time = time.time()
                losses = np.zeros(3)
                for i in range(steps_per_epoch):
                    index = np.arange(
                        i * batch_size, min((i + 1) * batch_size, self.node_size))
                    A_train = self.A[index, :]
                    L_mat_train = self.L[index][:, index]  # Laplacian block for this batch
                    inp = [A_train, L_mat_train]
                    batch_losses = self.model.train_on_batch(inp, inp)
                    losses += batch_losses
                losses = losses / steps_per_epoch
                logs['loss'] = losses[0]
                logs['2nd_loss'] = losses[1]
                logs['1st_loss'] = losses[2]
                epoch_time = int(time.time() - start_time)
                hist.on_epoch_end(epoch, logs)
                if verbose > 0:
                    print('Epoch {0}/{1}'.format(epoch + 1, epochs))
                    print('{0}s - loss: {1:.4f} - 2nd_loss: {2:.4f} - 1st_loss: {3:.4f}'.format(
                        epoch_time, losses[0], losses[1], losses[2]))
            return hist
    def evaluate(self):
        return self.model.evaluate(x=self.inputs, y=self.inputs, batch_size=self.node_size)

    def get_embeddings(self):
        self._embeddings = {}
        embeddings = self.emb_model.predict(self.A, batch_size=self.node_size)
        look_back = self.idx2node
        for i, embedding in enumerate(embeddings):
            self._embeddings[look_back[i]] = embedding
        return self._embeddings
    def _create_A_L(self, graph, node2idx):
        node_size = graph.number_of_nodes()
        A = np.zeros((node_size, node_size))   # directed adjacency, fed to the autoencoder
        A_ = np.zeros((node_size, node_size))  # symmetrized adjacency, used for the Laplacian
        for edge in graph.edges():
            v1, v2 = edge
            edge_weight = graph[v1][v2].get('weight', 1)
            A[node2idx[v1]][node2idx[v2]] = edge_weight
            A_[node2idx[v1]][node2idx[v2]] = edge_weight
            A_[node2idx[v2]][node2idx[v1]] = edge_weight

        D = np.zeros_like(A)
        for i in range(node_size):
            D[i][i] = np.sum(A_[i])
        L = D - A_  # unnormalized graph Laplacian
        return A, L
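The first-order loss in `l_1st` relies on the identity 2·tr(YᵀLY) = Σᵢⱼ wᵢⱼ‖yᵢ − yⱼ‖², where L = D − W is the unnormalized Laplacian built by `_create_A_L`; this is what lets SDNE penalize first-order proximity with a single matrix product per batch. A standalone numpy sketch (toy 4-node graph and random embeddings, purely illustrative) checking that the two forms agree:

```python
import numpy as np

# toy symmetric weight matrix for a 4-node graph
W = np.array([[0., 1., 0., 2.],
              [1., 0., 3., 0.],
              [0., 3., 0., 1.],
              [2., 0., 1., 0.]])
D = np.diag(W.sum(axis=1))
L = D - W                    # unnormalized Laplacian, as in _create_A_L

Y = np.random.rand(4, 2)     # stand-in for the learned embeddings

trace_form = 2 * np.trace(Y.T @ L @ Y)
pairwise_form = sum(W[i, j] * np.sum((Y[i] - Y[j]) ** 2)
                    for i in range(4) for j in range(4))

assert np.allclose(trace_form, pairwise_form)  # identical up to float error
```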