format
This commit is contained in:
parent
4903c0ef0e
commit
2d1783bfb1
@ -1,26 +1,18 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function
|
||||
"""
|
||||
downstream tasks; each task is a class;
|
||||
by Chengbin Hou & Zeyu Dong
|
||||
"""
|
||||
|
||||
import math
|
||||
import random
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from sklearn.metrics import f1_score, roc_auc_score
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
|
||||
warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn')
|
||||
|
||||
'''
|
||||
#-----------------------------------------------------------------------------
|
||||
# by Chengbin Hou 2018
|
||||
# Email: Chengbin.Hou10@foxmail.com
|
||||
#-----------------------------------------------------------------------------
|
||||
'''
|
||||
|
||||
# node classification classifier
|
||||
|
||||
# ------------------node classification task---------------------------
|
||||
|
||||
class ncClassifier(object):
|
||||
|
||||
@ -68,8 +60,6 @@ class ncClassifier(object):
|
||||
results[average] = f1_score(Y, Y_, average=average)
|
||||
print(results)
|
||||
return results
|
||||
|
||||
|
||||
class TopKRanker(OneVsRestClassifier): # orignal LR or SVM is for binary clf
|
||||
def predict(self, X, top_k_list): # re-define predict func of OneVsRestClassifier
|
||||
probs = np.asarray(super(TopKRanker, self).predict_proba(X))
|
||||
@ -84,7 +74,7 @@ class TopKRanker(OneVsRestClassifier): # orignal LR or SVM is for binary clf
|
||||
return np.asarray(all_labels)
|
||||
|
||||
|
||||
# link prediction binary classifier
|
||||
# ------------------link prediction task---------------------------
|
||||
class lpClassifier(object):
|
||||
|
||||
def __init__(self, vectors):
|
||||
@ -110,21 +100,19 @@ class lpClassifier(object):
|
||||
roc = 1.0 - roc # since lp is binary clf task, just predict the opposite if<0.5
|
||||
print("roc=", "{:.9f}".format(roc))
|
||||
|
||||
|
||||
def norm(a):
    """Return the Euclidean (L2) norm of the vector ``a``.

    Args:
        a: an iterable of numbers (e.g. a list or a 1-D array).

    Returns:
        The square root of the sum of squared elements; 0.0 for an
        empty input.
    """
    # Plain left-to-right accumulation, starting from a float so that an
    # empty input still yields a float result.
    squared_total = 0.0
    for value in a:
        squared_total = squared_total + value * value
    return math.sqrt(squared_total)
|
||||
|
||||
|
||||
def cosine_similarity(a, b):
    """Return the cosine similarity between the vectors ``a`` and ``b``.

    Args:
        a: an iterable of numbers; its length drives the dot product.
        b: an indexable sequence of numbers, read at positions
            ``0 .. len(a) - 1``.

    Returns:
        dot(a, b) / (norm(a) * norm(b) + 1e-100).

    Raises:
        IndexError: if ``b`` has fewer elements than ``a``.
    """
    dot_product = 0.0
    for index, value in enumerate(a):
        # Index into b (rather than zip) so a too-short b fails loudly
        # instead of being silently truncated.
        dot_product = dot_product + value * b[index]
    # The tiny constant keeps the denominator non-zero when either vector
    # has zero norm, so the result is 0.0 instead of a ZeroDivisionError.
    return dot_product / (norm(a) * norm(b) + 1e-100)
|
||||
|
||||
|
||||
'''
|
||||
def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):
|
||||
# randomly split links/edges into training set and testing set
|
||||
# *** note: we do not assume every node must be connected after removing links
|
||||
@ -166,3 +154,4 @@ def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):
|
||||
print("# training links {0}; # positive testing links {1}; # negative testing links {2},".format(
|
||||
g.numDiEdges(), len(test_pos_sample), len(test_neg_sample)))
|
||||
return g.G, test_edge_pair, test_edge_label
|
||||
'''
|
@ -1,20 +1,16 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
commonly used utils
|
||||
by Chengbin Hou & Zeyu Dong
|
||||
"""
|
||||
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
|
||||
|
||||
'''
|
||||
#-----------------------------------------------------------------------------
|
||||
# Chengbin Hou @ SUSTech 2018
|
||||
# Email: Chengbin.Hou10@foxmail.com
|
||||
#-----------------------------------------------------------------------------
|
||||
'''
|
||||
|
||||
# ---------------------------------utils for calculation--------------------------------
|
||||
|
||||
|
||||
def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
|
||||
"""Make each row of matrix sums up to 1.0, i.e., a probability distribution.
|
||||
Support both dense and sparse matrix.
|
||||
@ -72,8 +68,8 @@ def pairwise_similarity(mat, type='cosine'):
|
||||
return 'Not found!'
|
||||
return result
|
||||
|
||||
# ---------------------------------utils for downstream tasks--------------------------------
|
||||
|
||||
# ---------------------------------utils for downstream tasks--------------------------------
|
||||
|
||||
def read_edge_label_downstream(filename):
|
||||
fin = open(filename, 'r')
|
||||
@ -133,6 +129,8 @@ def generate_edges_for_linkpred(graph, edges_removed, balance_ratio=1.0):
|
||||
return test_node_pairs, test_edge_labels
|
||||
|
||||
|
||||
# ---------------------------------others--------------------------------
|
||||
|
||||
def dim_reduction(mat, dim=128, method='pca'):
|
||||
''' dimensionality reduction: PCA, SVD, etc...
|
||||
dim = # of columns
|
||||
|
35
src/main.py
35
src/main.py
@ -7,28 +7,28 @@ STEP4: downstream evaluations
|
||||
|
||||
python src/main.py --method abrw
|
||||
|
||||
by Chengbin Hou 2018 <chengbin.hou10@foxmail.com>
|
||||
by Chengbin HOU 2018 <chengbin.hou10@foxmail.com>
|
||||
'''
|
||||
|
||||
import time
|
||||
# import random
|
||||
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
|
||||
|
||||
from libnrl import abrw # ANE method; Attributed Biased Random Walk
|
||||
from libnrl import aane # ANE method
|
||||
from libnrl import tadw # ANE method
|
||||
from libnrl import asne # ANE method
|
||||
from libnrl.graphsage import graphsageAPI # ANE method
|
||||
from libnrl import attrcomb # ANE method
|
||||
from libnrl import attrpure # NE method simply use svd or pca for dim reduction
|
||||
from libnrl import line # PNE method
|
||||
from libnrl import tadw # ANE method
|
||||
from libnrl.downstream import lpClassifier, ncClassifier
|
||||
from libnrl import grarep # PNE method
|
||||
from libnrl import node2vec # PNE method; including deepwalk and node2vec
|
||||
from libnrl.graph import Graph
|
||||
from libnrl.graphsage import graphsageAPI # ANE method
|
||||
from libnrl.grarep import GraRep # PNE method
|
||||
from libnrl.downstream import lpClassifier, ncClassifier
|
||||
from libnrl.utils import generate_edges_for_linkpred, read_node_label_downstream
|
||||
|
||||
from sklearn.linear_model import LogisticRegression # to do... 1) put it in downstream.py; and 2) try SVM...
|
||||
from libnrl import abrw # ANE method; Attributed Biased Random Walk
|
||||
from libnrl import node2vec # PNE method; including deepwalk and node2vec
|
||||
from sklearn.linear_model import LogisticRegression # to do... try SVM...
|
||||
|
||||
|
||||
def parse_args():
|
||||
@ -175,17 +175,16 @@ def main(args):
|
||||
model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.dim,
|
||||
workers=args.workers, window=args.window_size, p=args.Node2Vec_p, q=args.Node2Vec_q)
|
||||
elif args.method == 'grarep':
|
||||
model = GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
|
||||
model = grarep.GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
|
||||
elif args.method == 'line': # if auto_save, use label to justifiy the best embeddings by looking at micro / macro-F1 score
|
||||
model = line.LINE(graph=g, epoch=args.epochs, rep_size=args.dim, order=args.LINE_order, batch_size=args.batch_size, negative_ratio=args.LINE_negative_ratio,
|
||||
label_file=args.label_file, clf_ratio=args.label_reserved, auto_save=True, best='micro')
|
||||
|
||||
elif args.method == 'asne':
|
||||
model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
|
||||
elif args.method == 'sagemean': # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v
|
||||
model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
|
||||
elif args.method == 'sagegcn': # parameters for graphsage models are in 'graphsage' -> '__init__.py'
|
||||
model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)
|
||||
elif args.method == 'asne':
|
||||
model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
|
||||
else:
|
||||
print('method not found...')
|
||||
exit(0)
|
||||
@ -193,10 +192,10 @@ def main(args):
|
||||
print(f'STEP3: end learning embeddings; time cost: {(t2-t1):.2f}s')
|
||||
|
||||
if args.save_emb:
|
||||
model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime()))
|
||||
#model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime()))
|
||||
model.save_embeddings(args.emb_file)
|
||||
print(f'Save node embeddings in file: {args.emb_file}')
|
||||
|
||||
|
||||
# ---------------------------------------STEP4: downstream task-----------------------------------------------
|
||||
print('\nSTEP4: start evaluating ......: ')
|
||||
t1 = time.time()
|
||||
@ -205,14 +204,14 @@ def main(args):
|
||||
# ------lp task
|
||||
if args.task == 'lp' or args.task == 'lp_and_nc':
|
||||
print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)')
|
||||
clf = lpClassifier(vectors=vectors) # similarity/distance metric as clf; basically, lp is a binary clf probelm
|
||||
clf.evaluate(test_node_pairs, test_edge_labels)
|
||||
ds_task = lpClassifier(vectors=vectors) # similarity/distance metric as clf; basically, lp is a binary clf probelm
|
||||
ds_task.evaluate(test_node_pairs, test_edge_labels)
|
||||
# ------nc task
|
||||
if args.task == 'nc' or args.task == 'lp_and_nc':
|
||||
X, Y = read_node_label_downstream(args.label_file)
|
||||
print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%')
|
||||
clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) # use Logistic Regression as clf; we may choose SVM or more advanced ones
|
||||
clf.split_train_evaluate(X, Y, args.label_reserved)
|
||||
ds_task = ncClassifier(vectors=vectors, clf=LogisticRegression()) # use Logistic Regression as clf; we may choose SVM or more advanced ones
|
||||
ds_task.split_train_evaluate(X, Y, args.label_reserved)
|
||||
t2 = time.time()
|
||||
print(f'STEP4: end evaluating; time cost: {(t2-t1):.2f}s')
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user