parent b39ff95370
commit c8efad063c

.github/workflows/ci.yml (vendored, new file, 74 lines)
@@ -0,0 +1,74 @@
+name: CI
+
+on:
+  push:
+    paths:
+      - 'ge/*'
+      - 'tests/*'
+  pull_request:
+    paths:
+      - 'ge/*'
+      - 'tests/*'
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    timeout-minutes: 180
+    strategy:
+      matrix:
+        python-version: [3.6,3.7,3.8]
+        tf-version: [1.4.0,1.15.0,2.5.0,2.6.0,2.7.0,2.8.0,2.9.0]
+
+        exclude:
+          - python-version: 3.7
+            tf-version: 1.4.0
+          - python-version: 3.7
+            tf-version: 1.15.0
+          - python-version: 3.8
+            tf-version: 1.4.0
+          - python-version: 3.8
+            tf-version: 1.14.0
+          - python-version: 3.8
+            tf-version: 1.15.0
+          - python-version: 3.6
+            tf-version: 2.7.0
+          - python-version: 3.6
+            tf-version: 2.8.0
+          - python-version: 3.6
+            tf-version: 2.9.0
+          - python-version: 3.9
+            tf-version: 1.4.0
+          - python-version: 3.9
+            tf-version: 1.15.0
+          - python-version: 3.9
+            tf-version: 2.2.0
+    steps:
+
+      - uses: actions/checkout@v3
+
+      - name: Setup python environment
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 install -q tensorflow==${{ matrix.tf-version }}
+          pip install -q protobuf==3.19.0
+          pip install -q requests
+          pip install -e .
+      - name: Test with pytest
+        timeout-minutes: 180
+        run: |
+          pip install -q pytest
+          pip install -q pytest-cov
+          pip install -q python-coveralls
+          pytest --cov=ge --cov-report=xml
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3.1.0
+        with:
+          token: ${{secrets.CODECOV_TOKEN}}
+          file: ./coverage.xml
+          flags: pytest
+          name: py${{ matrix.python-version }}-tf${{ matrix.tf-version }}
README.md (13 changes)
@@ -1,5 +1,14 @@
 # GraphEmbedding
+
+[![GitHub Issues](https://img.shields.io/github/issues/shenweichen/graphembedding.svg)](https://github.com/shenweichen/graphembedding/issues)
+![CI status](https://github.com/shenweichen/graphembedding/workflows/CI/badge.svg)
+[![codecov](https://codecov.io/gh/shenweichen/graphembedding/branch/master/graph/badge.svg)](https://codecov.io/gh/shenweichen/graphembedding)
+[![Codacy Badge](https://app.codacy.com/project/badge/Grade/c46407f5931f40048e28860dccf7dabc)](https://www.codacy.com/gh/shenweichen/GraphEmbedding/dashboard?utm_source=github.com&utm_medium=referral&utm_content=shenweichen/GraphEmbedding&utm_campaign=Badge_Grade)
+[![Discussion](https://img.shields.io/badge/chat-wechat-brightgreen?style=flat)](./README.md#disscussiongroup--related-projects)
+
+[comment]: <> ([![License](https://img.shields.io/github/license/shenweichen/graphembedding.svg)](https://github.com/shenweichen/graphembedding/blob/master/LICENSE))
+
 
 # Method
 
 
@@ -27,7 +36,7 @@ python deepwalk_wiki.py
 <table style="margin-left: 20px; margin-right: auto;">
   <tr>
     <td>
-      公众号:<b>浅梦的学习笔记</b><br><br>
+      公众号:<b>浅梦学习笔记</b><br><br>
       <a href="https://github.com/shenweichen/GraphEmbedding">
         <img align="center" src="./pics/code.png" />
       </a>
@@ -101,7 +110,7 @@ embeddings = model.get_embeddings()# get embedding vectors
 ```python
 G = nx.read_edgelist('../data/flight/brazil-airports.edgelist',create_using=nx.DiGraph(),nodetype=None,data=[('weight',int)])#read graph
 
-model = model = Struc2Vec(G, 10, 80, workers=4, verbose=40, ) #init model
+model = Struc2Vec(G, 10, 80, workers=4, verbose=40, ) #init model
 model.train(window_size = 5, iter = 3)# train model
 embeddings = model.get_embeddings()# get embedding vectors
 ```
ge/alias.py
@@ -22,7 +22,7 @@ def create_alias_table(area_ratio):
         accept[small_idx] = area_ratio_[small_idx]
         alias[small_idx] = large_idx
         area_ratio_[large_idx] = area_ratio_[large_idx] - \
             (1 - area_ratio_[small_idx])
         if area_ratio_[large_idx] < 1.0:
             small.append(large_idx)
         else:
@@ -46,7 +46,7 @@ def alias_sample(accept, alias):
     :return: sample index
     """
     N = len(accept)
-    i = int(np.random.random()*N)
+    i = int(np.random.random() * N)
     r = np.random.random()
     if r < accept[i]:
         return i
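For context: `create_alias_table` and `alias_sample` implement the alias method, which draws from a fixed discrete distribution in O(1) per sample after an O(n) table build. A minimal usage sketch against this file's two functions (the probability values are illustrative):

```python
import numpy as np

from ge.alias import alias_sample, create_alias_table

# Build the accept/alias tables once for a normalized distribution.
probs = [0.1, 0.2, 0.3, 0.4]
accept, alias = create_alias_table(probs)

# Each draw is O(1): pick a bucket uniformly, then either accept it
# or fall through to its alias.
samples = [alias_sample(accept, alias) for _ in range(10000)]
print(np.bincount(samples) / len(samples))  # roughly [0.1, 0.2, 0.3, 0.4]
```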
ge/classify.py
@@ -1,6 +1,5 @@
 from __future__ import print_function
 
-
 import numpy
 from sklearn.metrics import f1_score, accuracy_score
 from sklearn.multiclass import OneVsRestClassifier
@@ -41,11 +40,10 @@ class Classifier(object):
         results = {}
         for average in averages:
             results[average] = f1_score(Y, Y_, average=average)
-        results['acc'] = accuracy_score(Y,Y_)
+        results['acc'] = accuracy_score(Y, Y_)
         print('-------------------')
         print(results)
         return results
-        print('-------------------')
 
     def predict(self, X, top_k_list):
         X_ = numpy.asarray([self.embeddings[x] for x in X])
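The hunk above also drops an unreachable `print` after `return results`. For orientation, `Classifier` wraps a one-vs-rest scikit-learn estimator over node embeddings; a hedged usage sketch, assuming this file's `Classifier(embeddings, clf)` constructor and its `split_train_evaluate(X, Y, train_precent)` helper (both from this repo; the toy data below is invented):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

from ge.classify import Classifier

# Toy stand-ins: 4 nodes with 8-dim embeddings and single-label lists.
embeddings = {str(i): np.random.rand(8) for i in range(4)}
X = ['0', '1', '2', '3']
Y = [['a'], ['b'], ['a'], ['b']]

clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
# `train_precent` is the repo's own parameter spelling; this prints
# micro/macro F1 and accuracy via the f1_score/accuracy_score calls above.
clf.split_train_evaluate(X, Y, train_precent=0.5)
```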
ge/models/deepwalk.py
@@ -6,7 +6,7 @@
 
 Author:
 
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 
 
@@ -17,9 +17,9 @@ Reference:
 
 
 """
-from ..walker import RandomWalker
 from gensim.models import Word2Vec
-import pandas as pd
+
+from ..walker import RandomWalker
 
 
 class DeepWalk:
@@ -38,12 +38,12 @@ class DeepWalk:
 
         kwargs["sentences"] = self.sentences
         kwargs["min_count"] = kwargs.get("min_count", 0)
-        kwargs["size"] = embed_size
+        kwargs["vector_size"] = embed_size
         kwargs["sg"] = 1  # skip gram
         kwargs["hs"] = 1  # deepwalk use Hierarchical Softmax
         kwargs["workers"] = workers
         kwargs["window"] = window_size
-        kwargs["iter"] = iter
+        kwargs["epochs"] = iter
 
         print("Learning embedding vectors...")
         model = Word2Vec(**kwargs)
@@ -52,7 +52,7 @@ class DeepWalk:
         self.w2v_model = model
         return model
 
-    def get_embeddings(self,):
+    def get_embeddings(self, ):
         if self.w2v_model is None:
             print("model not train")
             return {}
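The `size` → `vector_size` and `iter` → `epochs` renames track the gensim 4.x `Word2Vec` API, which this commit moves to (see also `setup.py` below, which bumps the pin to `gensim>=4.0.0`). A minimal sketch of the call DeepWalk now assembles (the walks and dimensions here are illustrative):

```python
from gensim.models import Word2Vec

walks = [['a', 'b', 'c'], ['b', 'd', 'a']]  # random walks as token lists

model = Word2Vec(
    sentences=walks,
    vector_size=128,  # gensim >= 4.0; was `size` in 3.x
    window=5,
    min_count=0,
    sg=1,             # skip-gram
    hs=1,             # hierarchical softmax, as DeepWalk uses
    workers=1,
    epochs=3,         # gensim >= 4.0; was `iter` in 3.x
)
print(model.wv['a'])  # embedding vector for token 'a'
```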
ge/models/line.py
@@ -6,7 +6,7 @@
 
 Author:
 
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 
 
@@ -21,7 +21,7 @@ import math
 import random
 
 import numpy as np
-import tensorflow as tf
+from deepctr.layers.utils import reduce_sum
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.layers import Embedding, Input, Lambda
 from tensorflow.python.keras.models import Model
@@ -31,11 +31,10 @@ from ..utils import preprocess_nxgraph
 
 
 def line_loss(y_true, y_pred):
-    return -K.mean(K.log(K.sigmoid(y_true*y_pred)))
+    return -K.mean(K.log(K.sigmoid(y_true * y_pred)))
 
 
 def create_model(numNodes, embedding_size, order='second'):
-
     v_i = Input(shape=(1,))
     v_j = Input(shape=(1,))
 
@@ -49,10 +48,10 @@ def create_model(numNodes, embedding_size, order='second'):
     v_i_emb_second = second_emb(v_i)
     v_j_context_emb = context_emb(v_j)
 
-    first = Lambda(lambda x: tf.reduce_sum(
-        x[0]*x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb])
-    second = Lambda(lambda x: tf.reduce_sum(
-        x[0]*x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
+    first = Lambda(lambda x: reduce_sum(
+        x[0] * x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb])
+    second = Lambda(lambda x: reduce_sum(
+        x[0] * x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
 
     if order == 'first':
         output_list = [first]
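Swapping `tf.reduce_sum` for deepctr's `reduce_sum` wrapper here is presumably about running under both TF 1.x and 2.x: TF 2.x dropped the `keep_dims` spelling in favor of `keepdims`, so the bare `tf.reduce_sum(..., keep_dims=False)` calls above would break on newer TF. A compat shim in the same spirit might look like this (a sketch under that assumption, not deepctr's actual source):

```python
import tensorflow as tf


def reduce_sum(input_tensor, axis=None, keep_dims=False, name=None):
    # TF 1.x call sites in this file pass `keep_dims`; TF 2.x only
    # accepts `keepdims`, so dispatch on the runtime version.
    if tf.__version__ < '2.0.0':
        return tf.reduce_sum(input_tensor, axis=axis,
                             keep_dims=keep_dims, name=name)
    return tf.reduce_sum(input_tensor, axis=axis,
                         keepdims=keep_dims, name=name)
```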
@@ -67,7 +66,7 @@ def create_model(numNodes, embedding_size, order='second'):
 
 
 class LINE:
-    def __init__(self, graph, embedding_size=8, negative_ratio=5, order='second',):
+    def __init__(self, graph, embedding_size=8, negative_ratio=5, order='second', ):
         """
 
         :param graph:
@@ -91,7 +90,7 @@ class LINE:
 
         self.node_size = graph.number_of_nodes()
         self.edge_size = graph.number_of_edges()
-        self.samples_per_epoch = self.edge_size*(1+negative_ratio)
+        self.samples_per_epoch = self.edge_size * (1 + negative_ratio)
 
         self._gen_sampling_table()
         self.reset_model()
@@ -99,7 +98,7 @@ class LINE:
     def reset_training_config(self, batch_size, times):
         self.batch_size = batch_size
         self.steps_per_epoch = (
-            (self.samples_per_epoch - 1) // self.batch_size + 1)*times
+            (self.samples_per_epoch - 1) // self.batch_size + 1) * times
 
     def reset_model(self, opt='adam'):
 
@@ -118,7 +117,7 @@ class LINE:
 
         for edge in self.graph.edges():
             node_degree[node2idx[edge[0]]
                         ] += self.graph[edge[0]][edge[1]].get('weight', 1.0)
 
         total_sum = sum([math.pow(node_degree[i], power)
                          for i in range(numNodes)])
@@ -165,10 +164,9 @@ class LINE:
                 t.append(cur_t)
             sign = np.ones(len(h))
         else:
-            sign = np.ones(len(h))*-1
+            sign = np.ones(len(h)) * -1
             t = []
             for i in range(len(h)):
-
                 t.append(alias_sample(
                     self.node_accept, self.node_alias))
 
@@ -190,7 +188,7 @@ class LINE:
             start_index = 0
             end_index = min(start_index + self.batch_size, data_size)
 
-    def get_embeddings(self,):
+    def get_embeddings(self, ):
         self._embeddings = {}
         if self.order == 'first':
             embeddings = self.embedding_dict['first'].get_weights()[0]
@@ -198,7 +196,7 @@ class LINE:
             embeddings = self.embedding_dict['second'].get_weights()[0]
         else:
             embeddings = np.hstack((self.embedding_dict['first'].get_weights()[
                 0], self.embedding_dict['second'].get_weights()[0]))
         idx2node = self.idx2node
         for i, embedding in enumerate(embeddings):
             self._embeddings[idx2node[i]] = embedding
@@ -207,7 +205,8 @@ class LINE:
 
     def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1):
         self.reset_training_config(batch_size, times)
-        hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, steps_per_epoch=self.steps_per_epoch,
+        hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch,
+                                        steps_per_epoch=self.steps_per_epoch,
                                         verbose=verbose)
 
         return hist
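For reference, `line_loss` above is logistic loss on signed inner products: the ±1 `sign` arrays built by the batch generator feed `y_true`, so positive edges pull endpoint embeddings together while negative samples push them apart. The same expression in NumPy, with illustrative values:

```python
import numpy as np


def line_loss_np(sign, inner_product):
    # -mean(log(sigmoid(y_true * y_pred))), as in line_loss above.
    return -np.mean(np.log(1.0 / (1.0 + np.exp(-sign * inner_product))))


# A positive edge with a large inner product is cheap; a negative
# sample with the same inner product is expensive.
print(line_loss_np(np.array([1.0]), np.array([3.0])))   # ~0.049
print(line_loss_np(np.array([-1.0]), np.array([3.0])))  # ~3.049
```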
ge/models/node2vec.py
@@ -6,7 +6,7 @@
 
 Author:
 
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 
 
@@ -19,14 +19,13 @@ Reference:
 """
 
 from gensim.models import Word2Vec
-import pandas as pd
 
 from ..walker import RandomWalker
 
 
 class Node2Vec:
 
-    def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0):
+    def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=False):
 
         self.graph = graph
         self._embeddings = {}
@@ -57,7 +56,7 @@ class Node2Vec:
 
         return model
 
-    def get_embeddings(self,):
+    def get_embeddings(self, ):
         if self.w2v_model is None:
             print("model not train")
             return {}
ge/models/sdne.py
@@ -6,7 +6,7 @@
 
 Author:
 
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 
 
@@ -88,8 +88,7 @@ class SDNE(object):
         self.nu1 = nu1
         self.nu2 = nu2
 
-        self.A, self.L = self._create_A_L(
-            self.graph, self.node2idx)  # Adj Matrix,L Matrix
+        self.A, self.L = _create_A_L(self.graph, self.node2idx)  # Adj Matrix,L Matrix
         self.reset_model()
         self.inputs = [self.A, self.L]
         self._embeddings = {}
@@ -151,24 +150,25 @@ class SDNE(object):
 
         return self._embeddings
 
-    def _create_A_L(self, graph, node2idx):
-        node_size = graph.number_of_nodes()
-        A_data = []
-        A_row_index = []
-        A_col_index = []
-
-        for edge in graph.edges():
-            v1, v2 = edge
-            edge_weight = graph[v1][v2].get('weight', 1)
-
-            A_data.append(edge_weight)
-            A_row_index.append(node2idx[v1])
-            A_col_index.append(node2idx[v2])
-
-        A = sp.csr_matrix((A_data, (A_row_index, A_col_index)), shape=(node_size, node_size))
-        A_ = sp.csr_matrix((A_data + A_data, (A_row_index + A_col_index, A_col_index + A_row_index)),
-                           shape=(node_size, node_size))
-
-        D = sp.diags(A_.sum(axis=1).flatten().tolist()[0])
-        L = D - A_
-        return A, L
+
+def _create_A_L(graph, node2idx):
+    node_size = graph.number_of_nodes()
+    A_data = []
+    A_row_index = []
+    A_col_index = []
+
+    for edge in graph.edges():
+        v1, v2 = edge
+        edge_weight = graph[v1][v2].get('weight', 1)
+
+        A_data.append(edge_weight)
+        A_row_index.append(node2idx[v1])
+        A_col_index.append(node2idx[v2])
+
+    A = sp.csr_matrix((A_data, (A_row_index, A_col_index)), shape=(node_size, node_size))
+    A_ = sp.csr_matrix((A_data + A_data, (A_row_index + A_col_index, A_col_index + A_row_index)),
+                       shape=(node_size, node_size))
+
+    D = sp.diags(A_.sum(axis=1).flatten().tolist()[0])
+    L = D - A_
+    return A, L
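The relocated `_create_A_L` (now a module-level helper, since it never used `self`) builds the weighted adjacency `A`, its symmetrized form `A_`, and the unnormalized graph Laplacian `L = D - A_` that SDNE's first-order loss consumes. A tiny check of that identity on a toy two-node graph (illustrative values):

```python
import numpy as np
import scipy.sparse as sp

# One undirected edge of weight 2 between nodes 0 and 1.
A_ = sp.csr_matrix(np.array([[0.0, 2.0],
                             [2.0, 0.0]]))
D = sp.diags(np.asarray(A_.sum(axis=1)).ravel())  # degree matrix
L = D - A_
print(L.toarray())  # [[ 2. -2.]
                    #  [-2.  2.]]
```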
ge/models/struc2vec.py
@@ -6,7 +6,7 @@
 
 Author:
 
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 
 
@@ -28,7 +28,6 @@ import pandas as pd
 from fastdtw import fastdtw
 from gensim.models import Word2Vec
 from joblib import Parallel, delayed
-from tqdm import tqdm
 
 from ..alias import create_alias_table
 from ..utils import partition_dict, preprocess_nxgraph
@@ -36,7 +35,8 @@ from ..walker import BiasedWalker
 
 
 class Struc2Vec():
-    def __init__(self, graph, walk_length=10, num_walks=100, workers=1, verbose=0, stay_prob=0.3, opt1_reduce_len=True, opt2_reduce_sim_calc=True, opt3_num_layers=None, temp_path='./temp_struc2vec/', reuse=False):
+    def __init__(self, graph, walk_length=10, num_walks=100, workers=1, verbose=0, stay_prob=0.3, opt1_reduce_len=True,
+                 opt2_reduce_sim_calc=True, opt3_num_layers=None, temp_path='./temp_struc2vec/', reuse=False):
         self.graph = graph
         self.idx2node, self.node2idx = preprocess_nxgraph(graph)
         self.idx = list(range(len(self.idx2node)))
@@ -62,10 +62,10 @@ class Struc2Vec():
 
         self._embeddings = {}
 
-    def create_context_graph(self, max_num_layers, workers=1, verbose=0,):
+    def create_context_graph(self, max_num_layers, workers=1, verbose=0, ):
 
         pair_distances = self._compute_structural_distance(
-            max_num_layers, workers, verbose,)
+            max_num_layers, workers, verbose, )
         layers_adj, layers_distances = self._get_layer_rep(pair_distances)
         pd.to_pickle(layers_adj, self.temp_path + 'layers_adj.pkl')
 
@@ -74,16 +74,16 @@ class Struc2Vec():
         pd.to_pickle(layers_alias, self.temp_path + 'layers_alias.pkl')
         pd.to_pickle(layers_accept, self.temp_path + 'layers_accept.pkl')
 
-    def prepare_biased_walk(self,):
+    def prepare_biased_walk(self, ):
 
         sum_weights = {}
         sum_edges = {}
         average_weight = {}
         gamma = {}
         layer = 0
-        while (os.path.exists(self.temp_path+'norm_weights_distance-layer-' + str(layer)+'.pkl')):
+        while (os.path.exists(self.temp_path + 'norm_weights_distance-layer-' + str(layer) + '.pkl')):
             probs = pd.read_pickle(
-                self.temp_path+'norm_weights_distance-layer-' + str(layer)+'.pkl')
+                self.temp_path + 'norm_weights_distance-layer-' + str(layer) + '.pkl')
             for v, list_weights in probs.items():
                 sum_weights.setdefault(layer, 0)
                 sum_edges.setdefault(layer, 0)
@@ -112,14 +112,15 @@ class Struc2Vec():
         sentences = self.sentences
 
         print("Learning representation...")
-        model = Word2Vec(sentences, size=embed_size, window=window_size, min_count=0, hs=1, sg=1, workers=workers,
-                         iter=iter)
+        model = Word2Vec(sentences, vector_size=embed_size, window=window_size, min_count=0, hs=1, sg=1,
+                         workers=workers,
+                         epochs=iter)
         print("Learning representation done!")
         self.w2v_model = model
 
         return model
 
-    def get_embeddings(self,):
+    def get_embeddings(self, ):
         if self.w2v_model is None:
             print("model not train")
             return {}
@@ -184,11 +185,11 @@ class Struc2Vec():
 
         return ordered_degree_sequence_dict
 
-    def _compute_structural_distance(self, max_num_layers, workers=1, verbose=0,):
+    def _compute_structural_distance(self, max_num_layers, workers=1, verbose=0, ):
 
-        if os.path.exists(self.temp_path+'structural_dist.pkl'):
+        if os.path.exists(self.temp_path + 'structural_dist.pkl'):
             structural_dist = pd.read_pickle(
-                self.temp_path+'structural_dist.pkl')
+                self.temp_path + 'structural_dist.pkl')
         else:
             if self.opt1_reduce_len:
                 dist_func = cost_max
@@ -219,8 +220,9 @@ class Struc2Vec():
             for v in degreeList:
                 vertices[v] = [vd for vd in degreeList.keys() if vd > v]
 
-            results = Parallel(n_jobs=workers, verbose=verbose,)(
-                delayed(compute_dtw_dist)(part_list, degreeList, dist_func) for part_list in partition_dict(vertices, workers))
+            results = Parallel(n_jobs=workers, verbose=verbose, )(
+                delayed(compute_dtw_dist)(part_list, degreeList, dist_func) for part_list in
+                partition_dict(vertices, workers))
             dtw_dist = dict(ChainMap(*results))
 
             structural_dist = convert_dtw_struc_dist(dtw_dist)
@@ -303,7 +305,7 @@ class Struc2Vec():
             node_accept_dict[v] = accept
 
             pd.to_pickle(
-                norm_weights, self.temp_path + 'norm_weights_distance-layer-' + str(layer)+'.pkl')
+                norm_weights, self.temp_path + 'norm_weights_distance-layer-' + str(layer) + '.pkl')
 
         layers_alias[layer] = node_alias_dict
         layers_accept[layer] = node_accept_dict
@@ -406,12 +408,11 @@ def get_vertices(v, degree_v, degrees, n_nodes):
 
 
 def verifyDegrees(degrees, degree_v_root, degree_a, degree_b):
-
-    if(degree_b == -1):
+    if (degree_b == -1):
         degree_now = degree_a
-    elif(degree_a == -1):
+    elif (degree_a == -1):
         degree_now = degree_b
-    elif(abs(degree_b - degree_v_root) < abs(degree_a - degree_v_root)):
+    elif (abs(degree_b - degree_v_root) < abs(degree_a - degree_v_root)):
         degree_now = degree_b
     else:
         degree_now = degree_a
ge/utils.py
@@ -43,6 +43,6 @@ def partition_list(vertices, workers):
 
 def partition_num(num, workers):
     if num % workers == 0:
-        return [num//workers]*workers
+        return [num // workers] * workers
     else:
-        return [num//workers]*workers + [num % workers]
+        return [num // workers] * workers + [num % workers]
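`partition_num` splits a walk count across workers, with any remainder as a final short share; the parallel walkers in `ge/walker.py` below consume these counts. A quick check of the behavior (values are illustrative):

```python
from ge.utils import partition_num

assert partition_num(9, 3) == [3, 3, 3]      # divides evenly
assert partition_num(10, 3) == [3, 3, 3, 1]  # remainder gets its own share
```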
ge/walker.py (36 changes)
@@ -2,17 +2,15 @@ import itertools
 import math
 import random
 
-import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
-from tqdm import trange
 
 from .alias import alias_sample, create_alias_table
 from .utils import partition_num
 
 
 class RandomWalker:
-    def __init__(self, G, p=1, q=1, use_rejection_sampling=0):
+    def __init__(self, G, p=1, q=1, use_rejection_sampling=False):
         """
         :param G:
         :param p: Return parameter,controls the likelihood of immediately revisiting a node in the walk.
@@ -130,7 +128,7 @@ class RandomWalker:
 
         return walks
 
-    def _simulate_walks(self, nodes, num_walks, walk_length,):
+    def _simulate_walks(self, nodes, num_walks, walk_length, ):
         walks = []
         for _ in range(num_walks):
             random.shuffle(nodes)
@@ -161,14 +159,14 @@ class RandomWalker:
         for x in G.neighbors(v):
             weight = G[v][x].get('weight', 1.0)  # w_vx
             if x == t:  # d_tx == 0
-                unnormalized_probs.append(weight/p)
+                unnormalized_probs.append(weight / p)
             elif G.has_edge(x, t):  # d_tx == 1
                 unnormalized_probs.append(weight)
             else:  # d_tx > 1
-                unnormalized_probs.append(weight/q)
+                unnormalized_probs.append(weight / q)
         norm_const = sum(unnormalized_probs)
         normalized_probs = [
-            float(u_prob)/norm_const for u_prob in unnormalized_probs]
+            float(u_prob) / norm_const for u_prob in unnormalized_probs]
 
         return create_alias_table(normalized_probs)
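The `weight/p`, `weight`, `weight/q` branches above are node2vec's search bias, keyed on the distance from the previous node `t` to the candidate `x`. The same rule as a standalone sketch (`node2vec_bias` is a hypothetical helper mirroring this hunk, not a function in the repo):

```python
import networkx as nx


def node2vec_bias(G, t, v, x, p, q):
    """Unnormalized weight for stepping v -> x, having arrived via t -> v."""
    weight = G[v][x].get('weight', 1.0)  # w_vx
    if x == t:             # d_tx == 0: step straight back to t
        return weight / p
    if G.has_edge(x, t):   # d_tx == 1: stay within t's neighborhood
        return weight
    return weight / q      # d_tx > 1: move outward


G = nx.Graph([('t', 'v'), ('v', 'x1'), ('x1', 't'), ('v', 'x2')])
print(node2vec_bias(G, 't', 'v', 'x2', p=0.25, q=4))  # 0.25: outward move damped by q
```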
@@ -183,7 +181,7 @@ class RandomWalker:
                                   for nbr in G.neighbors(node)]
             norm_const = sum(unnormalized_probs)
             normalized_probs = [
-                float(u_prob)/norm_const for u_prob in unnormalized_probs]
+                float(u_prob) / norm_const for u_prob in unnormalized_probs]
             alias_nodes[node] = create_alias_table(normalized_probs)
 
         if not self.use_rejection_sampling:
@@ -209,17 +207,16 @@ class BiasedWalker:
 
     def simulate_walks(self, num_walks, walk_length, stay_prob=0.3, workers=1, verbose=0):
 
-        layers_adj = pd.read_pickle(self.temp_path+'layers_adj.pkl')
-        layers_alias = pd.read_pickle(self.temp_path+'layers_alias.pkl')
-        layers_accept = pd.read_pickle(self.temp_path+'layers_accept.pkl')
-        gamma = pd.read_pickle(self.temp_path+'gamma.pkl')
-        walks = []
-        initialLayer = 0
+        layers_adj = pd.read_pickle(self.temp_path + 'layers_adj.pkl')
+        layers_alias = pd.read_pickle(self.temp_path + 'layers_alias.pkl')
+        layers_accept = pd.read_pickle(self.temp_path + 'layers_accept.pkl')
+        gamma = pd.read_pickle(self.temp_path + 'gamma.pkl')
 
         nodes = self.idx  # list(self.g.nodes())
 
         results = Parallel(n_jobs=workers, verbose=verbose, )(
-            delayed(self._simulate_walks)(nodes, num, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma) for num in
+            delayed(self._simulate_walks)(nodes, num, walk_length, stay_prob, layers_adj, layers_accept, layers_alias,
+                                          gamma) for num in
             partition_num(num_walks, workers))
 
         walks = list(itertools.chain(*results))
@@ -243,7 +240,7 @@ class BiasedWalker:
 
         while len(path) < walk_length:
             r = random.random()
-            if(r < stay_prob):  # same layer
+            if (r < stay_prob):  # same layer
                 v = chooseNeighbor(v, graphs, layers_alias,
                                    layers_accept, layer)
                 path.append(self.idx2node[v])
@@ -256,18 +253,17 @@ class BiasedWalker:
                     print(layer, v)
                     raise ValueError()
 
-                if(r > p_moveup):
-                    if(layer > initialLayer):
+                if (r > p_moveup):
+                    if (layer > initialLayer):
                         layer = layer - 1
                 else:
-                    if((layer + 1) in graphs and v in graphs[layer + 1]):
+                    if ((layer + 1) in graphs and v in graphs[layer + 1]):
                         layer = layer + 1
-
         return path
 
 
 def chooseNeighbor(v, graphs, layers_alias, layers_accept, layer):
 
     v_list = graphs[layer][v]
 
     idx = alias_sample(layers_accept[layer][v], layers_alias[layer][v])
setup.py (15 changes)
@@ -7,16 +7,17 @@ with open("README.md", "r") as fh:
 
 
 REQUIRED_PACKAGES = [
-    # 'tensorflow>=1.4.0,<=1.12.0',
-    'gensim==3.6.0',
-    'networkx==2.1',
-    'joblib==0.13.0',
-    'fastdtw==0.3.2',
+    # 'tensorflow>=1.4.0',
+    'gensim>=4.0.0',
+    'networkx',
+    'joblib',
+    'fastdtw',
     'tqdm',
     'numpy',
     'scikit-learn',
     'pandas',
     'matplotlib',
+    'deepctr'
 ]
 
 
@@ -28,13 +29,13 @@ setuptools.setup(
 
     author="Weichen Shen",
 
-    author_email="wcshen1994@163.com",
+    author_email="weichenswc@163.com",
 
     url="https://github.com/shenweichen/GraphEmbedding",
 
     packages=setuptools.find_packages(exclude=[]),
 
-    python_requires='>=3.4',  # 3.4.6
+    python_requires='>=3.5',  # 3.4.6
 
     install_requires=REQUIRED_PACKAGES,
 
tests/Wiki_edgelist.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
+0 1
+0 2
+0 3
+1 2
+2 3
tests/__init__.py (new file, empty)
tests/deepwalk_test.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+import networkx as nx
+
+from ge import DeepWalk
+
+
+def test_DeepWalk():
+    G = nx.read_edgelist('./tests/Wiki_edgelist.txt',
+                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
+
+    model = DeepWalk(G, walk_length=3, num_walks=2, workers=1)
+    model.train(window_size=3, iter=1)
+    embeddings = model.get_embeddings()
+
+
+if __name__ == "__main__":
+    pass
tests/line_test.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+import networkx as nx
+
+from ge import LINE
+
+
+def test_LINE():
+    G = nx.read_edgelist('./tests/Wiki_edgelist.txt',
+                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
+
+    model = LINE(G, embedding_size=2, order='second')
+    model.train(batch_size=2, epochs=1, verbose=2)
+    embeddings = model.get_embeddings()
+
+
+if __name__ == "__main__":
+    pass
tests/node2vec_test.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+import networkx as nx
+import pytest
+
+from ge import Node2Vec
+
+
+@pytest.mark.parametrize(
+    'use_rejection_sampling',
+    [True, False
+     ]
+)
+def test_Node2Vec(use_rejection_sampling):
+    G = nx.read_edgelist('./tests/Wiki_edgelist.txt',
+                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
+    model = Node2Vec(G, walk_length=10, num_walks=80,
+                     p=0.25, q=4, workers=1, use_rejection_sampling=use_rejection_sampling)
+    model.train(window_size=5, iter=3)
+    embeddings = model.get_embeddings()
+
+
+if __name__ == "__main__":
+    pass
tests/sdne_test.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+import networkx as nx
+import tensorflow as tf
+
+from ge import SDNE
+
+
+def test_SDNE():
+    if tf.__version__ >= '1.15.0':
+        return  # todo
+    G = nx.read_edgelist('./tests/Wiki_edgelist.txt',
+                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
+
+    model = SDNE(G, hidden_size=[8, 4], )
+    model.train(batch_size=2, epochs=1, verbose=2)
+    embeddings = model.get_embeddings()
+
+
+if __name__ == "__main__":
+    pass
tests/struct2vec_test.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+import networkx as nx
+
+from ge import Struc2Vec
+
+
+def test_Struc2Vec():
+    G = nx.read_edgelist('./tests/Wiki_edgelist.txt', create_using=nx.DiGraph(), nodetype=None,
+                         data=[('weight', int)])
+
+    model = Struc2Vec(G, 3, 1, workers=1, verbose=40, )
+    model.train()
+    embeddings = model.get_embeddings()
+
+
+if __name__ == "__main__":
+    pass