diff --git a/src/libnrl/downstream.py b/src/libnrl/downstream.py
index 7a035e4..9199fff 100644
--- a/src/libnrl/downstream.py
+++ b/src/libnrl/downstream.py
@@ -1,26 +1,18 @@
-# -*- coding: utf-8 -*-
-from __future__ import print_function
+"""
+downstream tasks; each task is a class;
+by Chengbin Hou & Zeyu Dong
+"""

 import math
 import random
-import warnings

 import numpy as np
 from sklearn.metrics import f1_score, roc_auc_score
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.preprocessing import MultiLabelBinarizer

-warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn')
-
-'''
-#-----------------------------------------------------------------------------
-# by Chengbin Hou 2018
-# Email: Chengbin.Hou10@foxmail.com
-#-----------------------------------------------------------------------------
-'''
-
-# node classification classifier
+# ------------------node classification task---------------------------


 class ncClassifier(object):
@@ -68,8 +60,6 @@ class ncClassifier(object):
         results[average] = f1_score(Y, Y_, average=average)
         print(results)
         return results
-
-
 class TopKRanker(OneVsRestClassifier):  # orignal LR or SVM is for binary clf
     def predict(self, X, top_k_list):  # re-define predict func of OneVsRestClassifier
         probs = np.asarray(super(TopKRanker, self).predict_proba(X))
@@ -84,7 +74,7 @@ class TopKRanker(OneVsRestClassifier):  # orignal LR or SVM is for binary clf
         return np.asarray(all_labels)


-# link prediction binary classifier
+# ------------------link prediction task---------------------------
 class lpClassifier(object):

     def __init__(self, vectors):
@@ -110,21 +100,19 @@ class lpClassifier(object):
             roc = 1.0 - roc  # since lp is binary clf task, just predict the opposite if<0.5
         print("roc=", "{:.9f}".format(roc))

-
 def norm(a):
     sum = 0.0
     for i in range(len(a)):
         sum = sum + a[i] * a[i]
     return math.sqrt(sum)

-
 def cosine_similarity(a, b):
     sum = 0.0
     for i in range(len(a)):
         sum = sum + a[i] * b[i]
     return sum / (norm(a) * norm(b) + 1e-100)

-
+'''
 def lp_train_test_split(graph, ratio=0.8, neg_pos_link_ratio=1.0):
     # randomly split links/edges into training set and testing set
     # *** note: we do not assume every node must be connected after removing links
@@ -166,3 +154,4 @@
     print("# training links {0}; # positive testing links {1}; # negative testing links {2},".format(
         g.numDiEdges(), len(test_pos_sample), len(test_neg_sample)))
     return g.G, test_edge_pair, test_edge_label
+'''
\ No newline at end of file
diff --git a/src/libnrl/utils.py b/src/libnrl/utils.py
index 792d841..106b433 100644
--- a/src/libnrl/utils.py
+++ b/src/libnrl/utils.py
@@ -1,20 +1,16 @@
-# -*- coding: utf-8 -*-
+"""
+commonly used utils
+by Chengbin Hou & Zeyu Dong
+"""
+
 import time

 import numpy as np
 from scipy import sparse

-'''
-#-----------------------------------------------------------------------------
-# Chengbin Hou @ SUSTech 2018
-# Email: Chengbin.Hou10@foxmail.com
-#-----------------------------------------------------------------------------
-'''
-
 # ---------------------------------ulits for calculation--------------------------------

-
 def row_as_probdist(mat, dense_output=False, preserve_zeros=False):
     """Make each row of matrix sums up to 1.0, i.e., a probability distribution.
     Support both dense and sparse matrix.
@@ -72,8 +68,8 @@ def pairwise_similarity(mat, type='cosine'):
         return 'Not found!'
     return result


-# ---------------------------------ulits for downstream tasks--------------------------------
+# ---------------------------------utils for downstream tasks--------------------------------

 def read_edge_label_downstream(filename):
     fin = open(filename, 'r')
@@ -133,6 +129,8 @@ def generate_edges_for_linkpred(graph, edges_removed, balance_ratio=1.0):
     return test_node_pairs, test_edge_labels


+# ---------------------------------others--------------------------------
+
 def dim_reduction(mat, dim=128, method='pca'):
     ''' dimensionality reduction: PCA, SVD, etc...
         dim = # of columns
diff --git a/src/main.py b/src/main.py
index e55823d..325e40b 100644
--- a/src/main.py
+++ b/src/main.py
@@ -7,28 +7,28 @@ STEP4: downstream evaluations
 python src/main.py --method abrw

-by Chengbin Hou 2018
+by Chengbin HOU 2018
 '''

 import time
 # import random
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

+from libnrl import abrw  # ANE method; Attributed Biased Random Walk
 from libnrl import aane  # ANE method
+from libnrl import tadw  # ANE method
 from libnrl import asne  # ANE method
+from libnrl.graphsage import graphsageAPI  # ANE method
 from libnrl import attrcomb  # ANE method
 from libnrl import attrpure  # NE method simply use svd or pca for dim reduction
 from libnrl import line  # PNE method
-from libnrl import tadw  # ANE method
-from libnrl.downstream import lpClassifier, ncClassifier
+from libnrl import grarep  # PNE method
+from libnrl import node2vec  # PNE method; including deepwalk and node2vec
 from libnrl.graph import Graph
-from libnrl.graphsage import graphsageAPI  # ANE method
-from libnrl.grarep import GraRep  # PNE method
+from libnrl.downstream import lpClassifier, ncClassifier
 from libnrl.utils import generate_edges_for_linkpred, read_node_label_downstream
-from sklearn.linear_model import LogisticRegression  # to do... 1) put it in downstream.py; and 2) try SVM...
-from libnrl import abrw  # ANE method; Attributed Biased Random Walk
-from libnrl import node2vec  # PNE method; including deepwalk and node2vec
+from sklearn.linear_model import LogisticRegression  # to do... try SVM...

 def parse_args():
@@ -175,17 +175,16 @@ def main(args):
         model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.dim,
                                   workers=args.workers, window=args.window_size, p=args.Node2Vec_p, q=args.Node2Vec_q)
     elif args.method == 'grarep':
-        model = GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
+        model = grarep.GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
     elif args.method == 'line':  # if auto_save, use label to justifiy the best embeddings by looking at micro / macro-F1 score
         model = line.LINE(graph=g, epoch=args.epochs, rep_size=args.dim, order=args.LINE_order, batch_size=args.batch_size,
                           negative_ratio=args.LINE_negative_ratio, label_file=args.label_file, clf_ratio=args.label_reserved, auto_save=True, best='micro')
-
+    elif args.method == 'asne':
+        model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
     elif args.method == 'sagemean':  # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v
         model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
     elif args.method == 'sagegcn':  # parameters for graphsage models are in 'graphsage' -> '__init__.py'
         model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)
-    elif args.method == 'asne':
-        model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb, learning_rate=args.learning_rate, batch_size=args.batch_size, epoch=args.epochs, n_neg_samples=10)
     else:
         print('method not found...')
         exit(0)
@@ -193,10 +192,10 @@
     print(f'STEP3: end learning embeddings; time cost: {(t2-t1):.2f}s')

     if args.save_emb:
-        model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime()))
+        #model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime()))
+        model.save_embeddings(args.emb_file)
         print(f'Save node embeddings in file: {args.emb_file}')

-
     # ---------------------------------------STEP4: downstream task-----------------------------------------------
     print('\nSTEP4: start evaluating ......: ')
     t1 = time.time()
@@ -205,14 +204,14 @@
     # ------lp task
     if args.task == 'lp' or args.task == 'lp_and_nc':
         print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)')
-        clf = lpClassifier(vectors=vectors)  # similarity/distance metric as clf; basically, lp is a binary clf probelm
-        clf.evaluate(test_node_pairs, test_edge_labels)
+        ds_task = lpClassifier(vectors=vectors)  # similarity/distance metric as clf; basically, lp is a binary clf problem
+        ds_task.evaluate(test_node_pairs, test_edge_labels)
     # ------nc task
     if args.task == 'nc' or args.task == 'lp_and_nc':
         X, Y = read_node_label_downstream(args.label_file)
         print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%')
-        clf = ncClassifier(vectors=vectors, clf=LogisticRegression())  # use Logistic Regression as clf; we may choose SVM or more advanced ones
-        clf.split_train_evaluate(X, Y, args.label_reserved)
+        ds_task = ncClassifier(vectors=vectors, clf=LogisticRegression())  # use Logistic Regression as clf; we may choose SVM or more advanced ones
+        ds_task.split_train_evaluate(X, Y, args.label_reserved)

     t2 = time.time()
     print(f'STEP4: end evaluating; time cost: {(t2-t1):.2f}s')
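
Note: below is a minimal, hypothetical usage sketch of the renamed downstream API touched by this diff. Only the class and method names (lpClassifier.evaluate, ncClassifier.split_train_evaluate), the import paths, and the LogisticRegression choice come from the diff itself; the toy embeddings, node ids, labels, and the 0.5 split ratio are invented for illustration and are not part of this commit.

# usage_sketch.py -- hypothetical example, not part of this commit
import numpy as np
from sklearn.linear_model import LogisticRegression

from libnrl.downstream import lpClassifier, ncClassifier

# toy embeddings: 10 nodes, 8 dims, keyed by string node id as in main.py
rng = np.random.RandomState(0)
vectors = {str(i): rng.rand(8) for i in range(10)}

# ------lp task: cosine similarity between the two node vectors scores each pair
test_node_pairs = [('0', '1'), ('2', '3'), ('4', '5'), ('6', '7')]
test_edge_labels = [1, 0, 1, 0]  # 1 = true link, 0 = sampled non-link
ds_task = lpClassifier(vectors=vectors)
ds_task.evaluate(test_node_pairs, test_edge_labels)  # prints roc=...

# ------nc task: one-vs-rest logistic regression over the embeddings
X = [str(i) for i in range(10)]                          # node ids
Y = [['a'] if i % 2 == 0 else ['b'] for i in range(10)]  # multi-label lists
ds_task = ncClassifier(vectors=vectors, clf=LogisticRegression())
ds_task.split_train_evaluate(X, Y, 0.5)  # reserve 50% of labels for training; prints micro/macro F1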