This commit is contained in:
Chengbin Hou 2018-11-30 21:32:14 +00:00
parent 4903c0ef0e
commit 2d1783bfb1
3 changed files with 33 additions and 47 deletions

View File

@ -1,26 +1,18 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
"""
downstream tasks; each task is a class;
by Chengbin Hou & Zeyu Dong
"""
import math
import random
import warnings
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn')
'''
#-----------------------------------------------------------------------------
# by Chengbin Hou 2018
# Email: Chengbin.Hou10@foxmail.com
#-----------------------------------------------------------------------------
'''
# node classification classifier
# ------------------node classification task---------------------------
class ncClassifier(object):
@ -68,8 +60,6 @@ class ncClassifier(object):
results[average] = f1_score(Y, Y_, average=average)
print(results)
return results
class TopKRanker(OneVsRestClassifier): # orignal LR or SVM is for binary clf
def predict(self, X, top_k_list): # re-define predict func of OneVsRestClassifier
probs = np.asarray(super(TopKRanker, self).predict_proba(X))
@ -84,7 +74,7 @@ class TopKRanker(OneVsRestClassifier): # orignal LR or SVM is for binary clf
return np.asarray(all_labels)
# link prediction binary classifier
# ------------------link prediction task---------------------------
class lpClassifier(object):
def __init__(self, vectors):
@ -110,21 +100,19 @@ class lpClassifier(object):
roc = 1.0 - roc # since lp is binary clf task, just predict the opposite if<0.5
print("roc=", "{:.9f}".format(roc))
def norm(a):
    """Return the Euclidean (L2) norm of vector ``a``.

    Args:
        a: an iterable of numbers (e.g. a list, tuple or 1-D array).

    Returns:
        The square root of the sum of squared entries; ``0.0`` for an
        empty input.
    """
    # Use the builtin ``sum`` (previously shadowed by a local variable);
    # the 0.0 start value keeps the accumulator a float, as before.
    return math.sqrt(sum((x * x for x in a), 0.0))
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Args:
        a: an iterable of numbers; its length drives the dot product.
        b: an indexable sequence of numbers, read at positions
            ``0 .. len(a) - 1``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b) + 1e-100)``. The tiny constant in
        the denominator avoids division by zero for all-zero vectors, in
        which case the result is ``0.0``.

    Raises:
        IndexError: if ``b`` is shorter than ``a``.
    """
    # Index ``b`` by position rather than zip(): zip would silently truncate
    # when ``b`` is shorter than ``a`` instead of raising.
    dot = sum((a_i * b[i] for i, a_i in enumerate(a)), 0.0)
    return dot / (norm(a) * norm(b) + 1e-100)
'''
def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):
# randomly split links/edges into training set and testing set
# *** note: we do not assume every node must be connected after removing links
@ -166,3 +154,4 @@ def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):
print("# training links {0}; # positive testing links {1}; # negative testing links {2},".format(
g.numDiEdges(), len(test_pos_sample), len(test_neg_sample)))
return g.G, test_edge_pair, test_edge_label
'''

View File

@ -1,20 +1,16 @@
# -*- coding: utf-8 -*-
"""
commonly used utils
by Chengbin Hou & Zeyu Dong
"""
import time
import numpy as np
from scipy import sparse
'''
#-----------------------------------------------------------------------------
# Chengbin Hou @ SUSTech 2018
# Email: Chengbin.Hou10@foxmail.com
#-----------------------------------------------------------------------------
'''
# ---------------------------------utils for calculation--------------------------------
def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
"""Make each row of matrix sums up to 1.0, i.e., a probability distribution.
Support both dense and sparse matrix.
@ -72,8 +68,8 @@ def pairwise_similarity(mat, type='cosine'):
return 'Not found!'
return result
# ---------------------------------utils for downstream tasks--------------------------------
# ---------------------------------utils for downstream tasks--------------------------------
def read_edge_label_downstream(filename):
fin = open(filename, 'r')
@ -133,6 +129,8 @@ def generate_edges_for_linkpred(graph, edges_removed, balance_ratio=1.0):
return test_node_pairs, test_edge_labels
# ---------------------------------others--------------------------------
def dim_reduction(mat, dim=128, method='pca'):
''' dimensionality reduction: PCA, SVD, etc...
dim = # of columns

View File

@ -7,28 +7,28 @@ STEP4: downstream evaluations
python src/main.py --method abrw
by Chengbin Hou 2018 <chengbin.hou10@foxmail.com>
by Chengbin HOU 2018 <chengbin.hou10@foxmail.com>
'''
import time
# import random
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from libnrl import abrw # ANE method; Attributed Biased Random Walk
from libnrl import aane # ANE method
from libnrl import tadw # ANE method
from libnrl import asne # ANE method
from libnrl.graphsage import graphsageAPI # ANE method
from libnrl import attrcomb # ANE method
from libnrl import attrpure # NE method simply use svd or pca for dim reduction
from libnrl import line # PNE method
from libnrl import tadw # ANE method
from libnrl.downstream import lpClassifier, ncClassifier
from libnrl import grarep # PNE method
from libnrl import node2vec # PNE method; including deepwalk and node2vec
from libnrl.graph import Graph
from libnrl.graphsage import graphsageAPI # ANE method
from libnrl.grarep import GraRep # PNE method
from libnrl.downstream import lpClassifier, ncClassifier
from libnrl.utils import generate_edges_for_linkpred, read_node_label_downstream
from sklearn.linear_model import LogisticRegression # to do... 1) put it in downstream.py; and 2) try SVM...
from libnrl import abrw # ANE method; Attributed Biased Random Walk
from libnrl import node2vec # PNE method; including deepwalk and node2vec
from sklearn.linear_model import LogisticRegression # to do... try SVM...
def parse_args():
@ -175,17 +175,16 @@ def main(args):
model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.dim,
workers=args.workers, window=args.window_size, p=args.Node2Vec_p, q=args.Node2Vec_q)
elif args.method == 'grarep':
model = GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
model = grarep.GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
elif args.method == 'line': # if auto_save, use label to justifiy the best embeddings by looking at micro / macro-F1 score
model = line.LINE(graph=g, epoch=args.epochs, rep_size=args.dim, order=args.LINE_order, batch_size=args.batch_size, negative_ratio=args.LINE_negative_ratio,
label_file=args.label_file, clf_ratio=args.label_reserved, auto_save=True, best='micro')
elif args.method == 'asne':
model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
elif args.method == 'sagemean': # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v
model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
elif args.method == 'sagegcn': # parameters for graphsage models are in 'graphsage' -> '__init__.py'
model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)
elif args.method == 'asne':
model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
else:
print('method not found...')
exit(0)
@ -193,10 +192,10 @@ def main(args):
print(f'STEP3: end learning embeddings; time cost: {(t2-t1):.2f}s')
if args.save_emb:
model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime()))
#model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime()))
model.save_embeddings(args.emb_file)
print(f'Save node embeddings in file: {args.emb_file}')
# ---------------------------------------STEP4: downstream task-----------------------------------------------
print('\nSTEP4: start evaluating ......: ')
t1 = time.time()
@ -205,14 +204,14 @@ def main(args):
# ------lp task
if args.task == 'lp' or args.task == 'lp_and_nc':
print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)')
clf = lpClassifier(vectors=vectors) # similarity/distance metric as clf; basically, lp is a binary clf probelm
clf.evaluate(test_node_pairs, test_edge_labels)
ds_task = lpClassifier(vectors=vectors) # similarity/distance metric as clf; basically, lp is a binary clf probelm
ds_task.evaluate(test_node_pairs, test_edge_labels)
# ------nc task
if args.task == 'nc' or args.task == 'lp_and_nc':
X, Y = read_node_label_downstream(args.label_file)
print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%')
clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) # use Logistic Regression as clf; we may choose SVM or more advanced ones
clf.split_train_evaluate(X, Y, args.label_reserved)
ds_task = ncClassifier(vectors=vectors, clf=LogisticRegression()) # use Logistic Regression as clf; we may choose SVM or more advanced ones
ds_task.split_train_evaluate(X, Y, args.label_reserved)
t2 = time.time()
print(f'STEP4: end evaluating; time cost: {(t2-t1):.2f}s')