diff --git a/src/libnrl/asne.py b/src/libnrl/asne.py index 7ce1a92..6302559 100644 --- a/src/libnrl/asne.py +++ b/src/libnrl/asne.py @@ -15,7 +15,6 @@ import numpy as np import tensorflow as tf from sklearn.base import BaseEstimator, TransformerMixin import time -#from .classify import ncClassifier, lpClassifier, read_node_label #from sklearn.linear_model import LogisticRegression class ASNE(BaseEstimator, TransformerMixin): diff --git a/src/libnrl/classify.py b/src/libnrl/classify.py deleted file mode 100644 index 29c7b76..0000000 --- a/src/libnrl/classify.py +++ /dev/null @@ -1,235 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import print_function -import numpy as np -import math -import random -import networkx as nx -import warnings -warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn') -from sklearn.multiclass import OneVsRestClassifier -from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report, roc_curve, auc -from sklearn.preprocessing import MultiLabelBinarizer - -''' -#----------------------------------------------------------------------------- -# part of code was originally forked from https://github.com/thunlp/OpenNE - -# modified by Chengbin Hou 2018 -# Email: Chengbin.Hou10@foxmail.com -#----------------------------------------------------------------------------- -''' - -# node classification classifier -class ncClassifier(object): - - def __init__(self, vectors, clf): - self.embeddings = vectors - self.clf = TopKRanker(clf) #here clf is LR - self.binarizer = MultiLabelBinarizer(sparse_output=True) - - def split_train_evaluate(self, X, Y, train_precent, seed=0): - state = np.random.get_state() - training_size = int(train_precent * len(X)) - #np.random.seed(seed) - shuffle_indices = np.random.permutation(np.arange(len(X))) - X_train = [X[shuffle_indices[i]] for i in range(training_size)] - Y_train = [Y[shuffle_indices[i]] for i in range(training_size)] - X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))] - Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))] - - self.train(X_train, Y_train, Y) - np.random.set_state(state) #why??? for binarizer.transform?? - return self.evaluate(X_test, Y_test) - - def train(self, X, Y, Y_all): - self.binarizer.fit(Y_all) #to support multi-labels, fit means dict mapping {orig cat: binarized vec} - X_train = [self.embeddings[x] for x in X] - Y = self.binarizer.transform(Y) #since we have use Y_all fitted, then we simply transform - self.clf.fit(X_train, Y) - - def predict(self, X, top_k_list): - X_ = np.asarray([self.embeddings[x] for x in X]) - # see TopKRanker(OneVsRestClassifier) - Y = self.clf.predict(X_, top_k_list=top_k_list) # the top k probs to be output... - return Y - - def evaluate(self, X, Y): - top_k_list = [len(l) for l in Y] #multi-labels, diff len of labels of each node - Y_ = self.predict(X, top_k_list) #pred val of X_test i.e. Y_pred - Y = self.binarizer.transform(Y) #true val i.e. Y_test - averages = ["micro", "macro", "samples", "weighted"] - results = {} - for average in averages: - results[average] = f1_score(Y, Y_, average=average) - # print('Results, using embeddings of dimensionality', len(self.embeddings[X[0]])) - print(results) - return results - -class TopKRanker(OneVsRestClassifier): #orignal LR or SVM is for binary clf - def predict(self, X, top_k_list): #re-define predict func of OneVsRestClassifier - probs = np.asarray(super(TopKRanker, self).predict_proba(X)) - all_labels = [] - for i, k in enumerate(top_k_list): - probs_ = probs[i, :] - labels = self.classes_[probs_.argsort()[-k:]].tolist() #denote labels - probs_[:] = 0 #reset probs_ to all 0 - probs_[labels] = 1 #reset probs_ to 1 if labels denoted... - all_labels.append(probs_) - return np.asarray(all_labels) - -''' -#note: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true labels -#see: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi -''' - -''' -import matplotlib.pyplot as plt -def plt_roc(y_test, y_score): - """ - calculate AUC value and plot the ROC curve - """ - fpr, tpr, threshold = roc_curve(y_test, y_score) - roc_auc = auc(fpr, tpr) - plt.figure() - plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black') - plt.plot(fpr, tpr, color='black', lw = 1) - plt.plot([0,1],[0,1], color = 'red', linestyle = '--') - plt.text(0.5,0.3,'ROC curve (area = %0.3f)' % roc_auc) - plt.xlabel('False Positive Rate') - plt.ylabel('True Positive Rate') - plt.show() - return roc_auc -''' - -# link prediction binary classifier -class lpClassifier(object): - - def __init__(self, vectors): - self.embeddings = vectors - - def evaluate(self, X_test, Y_test, seed=0): #clf here is simply a similarity/distance metric - state = np.random.get_state() - #np.random.seed(seed) - test_size = len(X_test) - #shuffle_indices = np.random.permutation(np.arange(test_size)) - #X_test = [X_test[shuffle_indices[i]] for i in range(test_size)] - #Y_test = [Y_test[shuffle_indices[i]] for i in range(test_size)] - - Y_true = [int(i) for i in Y_test] - Y_probs = [] - for i in range(test_size): - start_node_emb = np.array(self.embeddings[X_test[i][0]]).reshape(-1,1) - end_node_emb = np.array(self.embeddings[X_test[i][1]]).reshape(-1,1) - score = cosine_similarity(start_node_emb, end_node_emb) #ranging from [-1, +1] - Y_probs.append( (score+1)/2.0 ) #switch to prob... however, we may also directly y_score = score - #in sklearn roc... which yields the same reasult - roc = roc_auc_score(y_true = Y_true, y_score = Y_probs) - if roc < 0.5: - roc = 1.0 - roc #since lp is binary clf task, just predict the opposite if<0.5 - print("roc=", "{:.9f}".format(roc)) - #plt_roc(Y_true, Y_probs) #enable to plot roc curve and return auc value - -def norm(a): - sum = 0.0 - for i in range(len(a)): - sum = sum + a[i] * a[i] - return math.sqrt(sum) - -def cosine_similarity(a, b): - sum = 0.0 - for i in range(len(a)): - sum = sum + a[i] * b[i] - #return sum/(norm(a) * norm(b)) - return sum/(norm(a) * norm(b) + 1e-20) #fix numerical issue 1e-20 almost = 0! - -''' -#cosine_similarity realized by use... -#or try sklearn.... - from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances # we may try diff metrics - #ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise -''' - -def lp_train_test_split(graph, ratio=0.5, neg_pos_link_ratio=1.0, test_pos_links_ratio=0.1): - #randomly split links/edges into training set and testing set - #*** note: we do not assume every node must be connected after removing links - #*** hence, the resulting graph might have few single nodes --> more realistic scenario - #*** e.g. a user just sign in a website has no link to others - - #graph: OpenANE graph data strcture - #ratio: perc of links for training; ranging [0, 1] - #neg_pos_link_ratio: 1.0 means neg-links/pos-links = 1.0 i.e. balance case; raning [0, +inf) - g = graph - test_pos_links = int(nx.number_of_edges(g.G) * test_pos_links_ratio) - - print("test_pos_links_ratio {:.2f}, test_pos_links {:.2f}, neg_pos_link_ratio is {:.2f}, links for training {:.2f}%,".format(test_pos_links_ratio, test_pos_links, neg_pos_link_ratio, ratio*100)) - test_pos_sample = [] - test_neg_sample = [] - - #random.seed(2018) #generate testing set that contains both pos and neg samples - test_pos_sample = random.sample(g.G.edges(), test_pos_links) - #test_neg_sample = random.sample(list(nx.classes.function.non_edges(g.G)), int(test_size * neg_pos_link_ratio)) #using nx build-in func, not efficient, to do... - #more efficient way: - test_neg_sample = [] - num_neg_sample = int(test_pos_links * neg_pos_link_ratio) - num = 0 - while num < num_neg_sample: - pair_nodes = np.random.choice(g.look_back_list, size=2, replace=False) - if pair_nodes not in g.G.edges(): - num += 1 - test_neg_sample.append(list(pair_nodes)) - - test_edge_pair = test_pos_sample + test_neg_sample - test_edge_label = list(np.ones(len(test_pos_sample))) + list(np.zeros(len(test_neg_sample))) - - print('before removing, the # of links: ', nx.number_of_edges(g.G), '; the # of single nodes: ', g.numSingleNodes()) - g.G.remove_edges_from(test_pos_sample) #training set should NOT contain testing set i.e. delete testing pos samples - g.simulate_sparsely_linked_net(link_reserved = ratio) #simulate sparse net - print('after removing, the # of links: ', nx.number_of_edges(g.G), '; the # of single nodes: ', g.numSingleNodes()) - print("# training links {0}; # positive testing links {1}; # negative testing links {2},".format(nx.number_of_edges(g.G), len(test_pos_sample), len(test_neg_sample))) - return g.G, test_edge_pair, test_edge_label - -#---------------------------------ulits for downstream tasks-------------------------------- -def load_embeddings(filename): - fin = open(filename, 'r') - node_num, size = [int(x) for x in fin.readline().strip().split()] - vectors = {} - while 1: - l = fin.readline() - if l == '': - break - vec = l.strip().split(' ') - assert len(vec) == size+1 - vectors[vec[0]] = [float(x) for x in vec[1:]] - fin.close() - assert len(vectors) == node_num - return vectors - -def read_node_label(filename): - fin = open(filename, 'r') - X = [] - Y = [] - while 1: - l = fin.readline() - if l == '': - break - vec = l.strip().split(' ') - X.append(vec[0]) - Y.append(vec[1:]) - fin.close() - return X, Y - - -def read_edge_label(filename): - fin = open(filename, 'r') - X = [] - Y = [] - while 1: - l = fin.readline() - if l == '': - break - vec = l.strip().split(' ') - X.append(vec[:2]) - Y.append(vec[2]) - fin.close() - return X, Y - \ No newline at end of file diff --git a/src/libnrl/line.py b/src/libnrl/line.py index 0161b8a..1ca883b 100644 --- a/src/libnrl/line.py +++ b/src/libnrl/line.py @@ -13,7 +13,8 @@ import math import numpy as np from sklearn.linear_model import LogisticRegression import tensorflow as tf -from .classify import ncClassifier, lpClassifier, read_node_label, read_edge_label #to do... try use lpClassifier to choose best embeddings? +from .downstream import ncClassifier # to do... try use lpClassifier to choose best embeddings? +from .utils import read_node_label_downstream class _LINE(object): @@ -219,7 +220,7 @@ class LINE(object): self.model2.train_one_epoch() if label_file: self.get_embeddings() - X, Y = read_node_label(label_file) + X, Y = read_node_label_downstream(label_file) print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100)) clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression()) result = clf.split_train_evaluate(X, Y, clf_ratio) @@ -235,7 +236,7 @@ class LINE(object): self.model.train_one_epoch() if label_file: self.get_embeddings() - X, Y = read_node_label(label_file) + X, Y = read_node_label_downstream(label_file) print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100)) clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression()) result = clf.split_train_evaluate(X, Y, clf_ratio) diff --git a/src/main.py b/src/main.py index 35a30ae..5d03665 100644 --- a/src/main.py +++ b/src/main.py @@ -15,7 +15,7 @@ import random import numpy as np from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from sklearn.linear_model import LogisticRegression #to do... 1) put it in downstream.py; and 2) try SVM... -from libnrl.classify import ncClassifier, lpClassifier, read_node_label +from libnrl.downstream import ncClassifier, lpClassifier from libnrl.graph import * from libnrl.utils import * from libnrl import abrw #ANE method; Attributed Biased Random Walk @@ -225,14 +225,14 @@ def main(args): del model, g #------lp task if args.task == 'lp' or args.task == 'lp_and_nc': - #X_test_lp, Y_test_lp = read_edge_label(args.label_file) #if you want to load your own lp testing data + #X_test_lp, Y_test_lp = read_edge_label_downstream(args.label_file) #if you want to load your own lp testing data print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%' + ' (by default, also generate equal negative links for testing)') clf = lpClassifier(vectors=vectors) #similarity/distance metric as clf; basically, lp is a binary clf probelm clf.evaluate(test_node_pairs, test_edge_labels) #------nc task if args.task == 'nc' or args.task == 'lp_and_nc': - X, Y = read_node_label(args.label_file) + X, Y = read_node_label_downstream(args.label_file) print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%') clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) #use Logistic Regression as clf; we may choose SVM or more advanced ones clf.split_train_evaluate(X, Y, args.label_reserved)