move classify to downstream
This commit is contained in:
parent
a33442f74c
commit
7980429764
@ -15,7 +15,6 @@ import numpy as np
|
|||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from sklearn.base import BaseEstimator, TransformerMixin
|
from sklearn.base import BaseEstimator, TransformerMixin
|
||||||
import time
|
import time
|
||||||
#from .classify import ncClassifier, lpClassifier, read_node_label
|
|
||||||
#from sklearn.linear_model import LogisticRegression
|
#from sklearn.linear_model import LogisticRegression
|
||||||
|
|
||||||
class ASNE(BaseEstimator, TransformerMixin):
|
class ASNE(BaseEstimator, TransformerMixin):
|
||||||
|
@ -1,235 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
from __future__ import print_function
|
|
||||||
import numpy as np
|
|
||||||
import math
|
|
||||||
import random
|
|
||||||
import networkx as nx
|
|
||||||
import warnings
|
|
||||||
warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn')
|
|
||||||
from sklearn.multiclass import OneVsRestClassifier
|
|
||||||
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report, roc_curve, auc
|
|
||||||
from sklearn.preprocessing import MultiLabelBinarizer
|
|
||||||
|
|
||||||
'''
|
|
||||||
#-----------------------------------------------------------------------------
|
|
||||||
# part of code was originally forked from https://github.com/thunlp/OpenNE
|
|
||||||
|
|
||||||
# modified by Chengbin Hou 2018
|
|
||||||
# Email: Chengbin.Hou10@foxmail.com
|
|
||||||
#-----------------------------------------------------------------------------
|
|
||||||
'''
|
|
||||||
|
|
||||||
# node classification classifier
|
|
||||||
class ncClassifier(object):
    """Node-classification evaluator working on pre-computed node embeddings.

    The supplied base classifier is wrapped in ``TopKRanker`` and the labels
    are encoded with a ``MultiLabelBinarizer`` so that a node may carry
    several labels at once.
    """

    def __init__(self, vectors, clf):
        """Keep the embeddings and build the ranker and the label binarizer.

        vectors: lookup indexed by node id; ``vectors[node]`` is that node's
            embedding.
        clf: base estimator handed to ``TopKRanker``.
        """
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        """Shuffle the nodes, train on the first part, evaluate on the rest.

        X: sequence of node ids.
        Y: sequence of label lists, aligned with ``X``.
        train_precent: fraction of the samples used for training.
        seed: accepted for interface compatibility; currently unused because
            the seeding call is disabled.

        Returns the dict produced by ``evaluate`` for the held-out samples.
        """
        # Snapshot the global NumPy RNG; it is put back after training so the
        # permutation drawn here does not advance the caller's random stream.
        rng_snapshot = np.random.get_state()
        total = len(X)
        training_size = int(train_precent * total)
        order = np.random.permutation(np.arange(total))

        train_positions = range(training_size)
        test_positions = range(training_size, total)
        X_train = [X[order[pos]] for pos in train_positions]
        Y_train = [Y[order[pos]] for pos in train_positions]
        X_test = [X[order[pos]] for pos in test_positions]
        Y_test = [Y[order[pos]] for pos in test_positions]

        # The binarizer is fitted on ALL labels (Y), not only the training ones.
        self.train(X_train, Y_train, Y)
        np.random.set_state(rng_snapshot)
        return self.evaluate(X_test, Y_test)

    def train(self, X, Y, Y_all):
        """Fit the binarizer on ``Y_all`` and the classifier on ``(X, Y)``."""
        self.binarizer.fit(Y_all)
        features = [self.embeddings[node] for node in X]
        targets = self.binarizer.transform(Y)
        self.clf.fit(features, targets)

    def predict(self, X, top_k_list):
        """Predict labels for the nodes in ``X``.

        ``top_k_list[i]`` is the number of labels requested for ``X[i]``; it is
        forwarded to ``TopKRanker.predict``.
        """
        features = np.asarray([self.embeddings[node] for node in X])
        return self.clf.predict(features, top_k_list=top_k_list)

    def evaluate(self, X, Y):
        """Score predictions for ``X`` against the true label lists ``Y``.

        Each node is asked for as many labels as it truly has. The F1 scores
        for the "micro", "macro", "samples" and "weighted" averages are
        printed and returned as a dict keyed by the average name.
        """
        top_k_list = [len(node_labels) for node_labels in Y]
        predicted = self.predict(X, top_k_list)
        expected = self.binarizer.transform(Y)
        averages = ["micro", "macro", "samples", "weighted"]
        results = {
            average: f1_score(expected, predicted, average=average)
            for average in averages
        }
        print(results)
        return results
|
|
||||||
|
|
||||||
class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier whose ``predict`` returns the top-k labels per sample."""

    def predict(self, X, top_k_list):
        """Return one 0/1 indicator row per entry of ``top_k_list``.

        X: feature matrix passed to ``predict_proba``.
        top_k_list: ``top_k_list[i]`` is how many labels to switch on for
            sample ``i``; the ``k`` most probable columns are set to 1 and
            every other column to 0.

        Returns an array with ``len(top_k_list)`` rows whose columns line up
        with the columns of ``predict_proba``.
        """
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            sample_probs = probs[i, :]
            # Descending order, then take the first k. The previous form
            # ``argsort()[-k:]`` selected EVERY column when k == 0, because
            # ``[-0:]`` is the full slice.
            top_columns = sample_probs.argsort()[::-1][:k]
            # Build a fresh row instead of overwriting the probability buffer,
            # and mark by column position rather than by class label value.
            indicator = np.zeros_like(sample_probs)
            indicator[top_columns] = 1
            all_labels.append(indicator)
        return np.asarray(all_labels)
|
|
||||||
|
|
||||||
'''
|
|
||||||
#note: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true labels
|
|
||||||
#see: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi
|
|
||||||
'''
|
|
||||||
|
|
||||||
'''
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
def plt_roc(y_test, y_score):
|
|
||||||
"""
|
|
||||||
calculate AUC value and plot the ROC curve
|
|
||||||
"""
|
|
||||||
fpr, tpr, threshold = roc_curve(y_test, y_score)
|
|
||||||
roc_auc = auc(fpr, tpr)
|
|
||||||
plt.figure()
|
|
||||||
plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black')
|
|
||||||
plt.plot(fpr, tpr, color='black', lw = 1)
|
|
||||||
plt.plot([0,1],[0,1], color = 'red', linestyle = '--')
|
|
||||||
plt.text(0.5,0.3,'ROC curve (area = %0.3f)' % roc_auc)
|
|
||||||
plt.xlabel('False Positive Rate')
|
|
||||||
plt.ylabel('True Positive Rate')
|
|
||||||
plt.show()
|
|
||||||
return roc_auc
|
|
||||||
'''
|
|
||||||
|
|
||||||
# link prediction binary classifier
|
|
||||||
class lpClassifier(object):
    """Link-prediction evaluator that scores node pairs by cosine similarity."""

    def __init__(self, vectors):
        """Keep ``vectors``, a lookup from node id to that node's embedding."""
        self.embeddings = vectors

    def evaluate(self, X_test, Y_test, seed=0):
        """Compute, print and return the ROC-AUC of similarity-based link scores.

        X_test: sequence of node pairs; item ``[0]`` is the start node and
            item ``[1]`` the end node.
        Y_test: sequence of labels convertible with ``int`` (link / no link).
        seed: accepted for interface compatibility; unused.

        Returns the ROC-AUC as reported, i.e. after the flip applied when the
        raw value is below 0.5.
        """
        Y_true = [int(label) for label in Y_test]
        Y_probs = []
        for pair in X_test:
            # Flatten to 1-D so each score is a plain scalar rather than a
            # one-element array.
            start_node_emb = np.asarray(self.embeddings[pair[0]]).ravel()
            end_node_emb = np.asarray(self.embeddings[pair[1]]).ravel()
            score = cosine_similarity(start_node_emb, end_node_emb)  # in [-1, +1]
            # Map to [0, 1]; this is monotonic, so the ROC-AUC is the same as
            # it would be with the raw score.
            Y_probs.append((score + 1) / 2.0)

        roc = roc_auc_score(y_true=Y_true, y_score=Y_probs)
        if roc < 0.5:
            # Binary task: a ranking worse than chance is reported as its inverse.
            roc = 1.0 - roc
        print("roc=", "{:.9f}".format(roc))
        # plt_roc(Y_true, Y_probs) can be enabled here to plot the ROC curve.
        return roc
|
|
||||||
|
|
||||||
def norm(a):
    """Return the Euclidean (L2) norm of the iterable of numbers ``a``.

    An empty input yields ``0.0``.
    """
    # Start from 0.0 so the accumulation is done in floating point, and avoid
    # shadowing the builtin ``sum`` with a local name.
    squared_total = sum((x * x for x in a), 0.0)
    return math.sqrt(squared_total)
|
|
||||||
|
|
||||||
def cosine_similarity(a, b):
    """Return the cosine similarity of the equal-length vectors ``a`` and ``b``.

    A tiny constant (1e-20) is added to the denominator so that an all-zero
    vector yields 0 instead of a division by zero.

    Raises:
        ValueError: if ``a`` and ``b`` have different lengths.
    """
    if len(a) != len(b):
        # Previously a longer ``b`` silently gave a wrong value: the dot
        # product stopped at len(a) while norm(b) used all of ``b``.
        raise ValueError(
            "cosine_similarity needs vectors of equal length, got {} and {}".format(len(a), len(b))
        )
    dot_product = sum((x * y for x, y in zip(a, b)), 0.0)
    return dot_product / (norm(a) * norm(b) + 1e-20)
|
|
||||||
|
|
||||||
'''
|
|
||||||
#cosine_similarity realized by use...
|
|
||||||
#or try sklearn....
|
|
||||||
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances # we may try diff metrics
|
|
||||||
#ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
|
|
||||||
'''
|
|
||||||
|
|
||||||
def lp_train_test_split(graph, ratio=0.5, neg_pos_link_ratio=1.0, test_pos_links_ratio=0.1):
    """Randomly split the links of ``graph`` into a training graph and a test set.

    Nodes are not required to stay connected after links are removed, so the
    resulting graph may contain a few isolated nodes.

    graph: project graph object; this function uses ``graph.G`` (a networkx
        graph), ``graph.look_back_list`` (the pool that negative-sample
        endpoints are drawn from), ``graph.numSingleNodes()`` and
        ``graph.simulate_sparsely_linked_net()``.
    ratio: passed as ``link_reserved`` to ``simulate_sparsely_linked_net``
        after the test links have been removed.
    neg_pos_link_ratio: negative test links per positive test link; 1.0 is
        the balanced case.
    test_pos_links_ratio: fraction of the current edges taken as positive
        test links.

    Returns ``(graph.G, test_edge_pair, test_edge_label)``. ``graph.G`` is
    modified in place; the labels are 1.0 for positive and 0.0 for negative
    pairs, in the same order as ``test_edge_pair``.
    """
    g = graph
    test_pos_links = int(nx.number_of_edges(g.G) * test_pos_links_ratio)

    print("test_pos_links_ratio {:.2f}, test_pos_links {:.2f}, neg_pos_link_ratio is {:.2f}, links for training {:.2f}%,".format(test_pos_links_ratio, test_pos_links, neg_pos_link_ratio, ratio*100))

    # random.sample needs a real sequence: passing the set-like edge view is
    # rejected with TypeError on Python 3.11+, so materialise it first.
    test_pos_sample = random.sample(list(g.G.edges()), test_pos_links)

    # Rejection-sample node pairs that are not linked. Cheaper than listing
    # every non-edge of the graph.
    # NOTE(review): this loop cannot finish if the graph has no unlinked pair
    # left while negatives are still required - confirm callers never hit that.
    test_neg_sample = []
    num_neg_sample = int(test_pos_links * neg_pos_link_ratio)
    while len(test_neg_sample) < num_neg_sample:
        pair_nodes = np.random.choice(g.look_back_list, size=2, replace=False)
        if not g.G.has_edge(pair_nodes[0], pair_nodes[1]):
            test_neg_sample.append(list(pair_nodes))

    test_edge_pair = test_pos_sample + test_neg_sample
    test_edge_label = list(np.ones(len(test_pos_sample))) + list(np.zeros(len(test_neg_sample)))

    print('before removing, the # of links: ', nx.number_of_edges(g.G), ';   the # of single nodes: ', g.numSingleNodes())
    # The training graph must not contain the positive test links.
    g.G.remove_edges_from(test_pos_sample)
    # Thin out the remaining links to simulate a sparsely linked network.
    g.simulate_sparsely_linked_net(link_reserved = ratio)
    print('after removing,  the # of links: ', nx.number_of_edges(g.G), ';   the # of single nodes: ', g.numSingleNodes())
    print("# training links {0}; # positive testing links {1}; # negative testing links {2},".format(nx.number_of_edges(g.G), len(test_pos_sample), len(test_neg_sample)))
    return g.G, test_edge_pair, test_edge_label
|
|
||||||
|
|
||||||
#---------------------------------utils for downstream tasks--------------------------------
|
|
||||||
def load_embeddings(filename):
    """Load node embeddings from a whitespace-separated text file.

    The first line holds two integers: the number of nodes and the embedding
    size. Every following non-blank line is ``<node_id> <v1> ... <v_size>``.

    Returns a dict mapping each node id (str) to its list of floats.

    An ``assert`` fails if a line does not hold ``size + 1`` fields or if the
    number of nodes read differs from the header.
    """
    vectors = {}
    # ``with`` closes the file even when a line fails to parse.
    with open(filename, 'r') as fin:
        node_num, size = [int(x) for x in fin.readline().strip().split()]
        for line in fin:
            vec = line.split()
            if not vec:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            assert len(vec) == size + 1, \
                "expected {} fields per line, got {}".format(size + 1, len(vec))
            vectors[vec[0]] = [float(x) for x in vec[1:]]
    assert len(vectors) == node_num, \
        "header announces {} nodes, file holds {}".format(node_num, len(vectors))
    return vectors
|
|
||||||
|
|
||||||
def read_node_label(filename):
    """Read node labels from a whitespace-separated text file.

    Each non-blank line is ``<node_id> <label> [<label> ...]``.

    Returns ``(X, Y)``: ``X`` is the list of node ids and ``Y`` the list of
    label lists, both as strings and in file order.
    """
    X = []
    Y = []
    # ``with`` closes the file even if reading fails part-way.
    with open(filename, 'r') as fin:
        for line in fin:
            vec = line.split()
            if not vec:
                # A blank line used to add a bogus '' node with no labels.
                continue
            X.append(vec[0])
            Y.append(vec[1:])
    return X, Y
|
|
||||||
|
|
||||||
|
|
||||||
def read_edge_label(filename):
    """Read labelled node pairs from a whitespace-separated text file.

    Each non-blank line is ``<node_a> <node_b> <label>``.

    Returns ``(X, Y)``: ``X`` is the list of ``[node_a, node_b]`` pairs and
    ``Y`` the list of labels, both as strings and in file order.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
    """
    X = []
    Y = []
    # ``with`` closes the file even if a malformed line raises.
    with open(filename, 'r') as fin:
        for line in fin:
            vec = line.split()
            if not vec:
                # Skip blank lines (e.g. a trailing newline) instead of failing.
                continue
            X.append(vec[:2])
            Y.append(vec[2])
    return X, Y
|
|
||||||
|
|
@ -13,7 +13,8 @@ import math
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from .classify import ncClassifier, lpClassifier, read_node_label, read_edge_label #to do... try use lpClassifier to choose best embeddings?
|
from .downstream import ncClassifier # to do... try use lpClassifier to choose best embeddings?
|
||||||
|
from .utils import read_node_label_downstream
|
||||||
|
|
||||||
|
|
||||||
class _LINE(object):
|
class _LINE(object):
|
||||||
@ -219,7 +220,7 @@ class LINE(object):
|
|||||||
self.model2.train_one_epoch()
|
self.model2.train_one_epoch()
|
||||||
if label_file:
|
if label_file:
|
||||||
self.get_embeddings()
|
self.get_embeddings()
|
||||||
X, Y = read_node_label(label_file)
|
X, Y = read_node_label_downstream(label_file)
|
||||||
print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
|
print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
|
||||||
clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
|
clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
|
||||||
result = clf.split_train_evaluate(X, Y, clf_ratio)
|
result = clf.split_train_evaluate(X, Y, clf_ratio)
|
||||||
@ -235,7 +236,7 @@ class LINE(object):
|
|||||||
self.model.train_one_epoch()
|
self.model.train_one_epoch()
|
||||||
if label_file:
|
if label_file:
|
||||||
self.get_embeddings()
|
self.get_embeddings()
|
||||||
X, Y = read_node_label(label_file)
|
X, Y = read_node_label_downstream(label_file)
|
||||||
print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
|
print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
|
||||||
clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
|
clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
|
||||||
result = clf.split_train_evaluate(X, Y, clf_ratio)
|
result = clf.split_train_evaluate(X, Y, clf_ratio)
|
||||||
|
@ -15,7 +15,7 @@ import random
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
||||||
from sklearn.linear_model import LogisticRegression #to do... 1) put it in downstream.py; and 2) try SVM...
|
from sklearn.linear_model import LogisticRegression #to do... 1) put it in downstream.py; and 2) try SVM...
|
||||||
from libnrl.classify import ncClassifier, lpClassifier, read_node_label
|
from libnrl.downstream import ncClassifier, lpClassifier
|
||||||
from libnrl.graph import *
|
from libnrl.graph import *
|
||||||
from libnrl.utils import *
|
from libnrl.utils import *
|
||||||
from libnrl import abrw #ANE method; Attributed Biased Random Walk
|
from libnrl import abrw #ANE method; Attributed Biased Random Walk
|
||||||
@ -225,14 +225,14 @@ def main(args):
|
|||||||
del model, g
|
del model, g
|
||||||
#------lp task
|
#------lp task
|
||||||
if args.task == 'lp' or args.task == 'lp_and_nc':
|
if args.task == 'lp' or args.task == 'lp_and_nc':
|
||||||
#X_test_lp, Y_test_lp = read_edge_label(args.label_file) #if you want to load your own lp testing data
|
#X_test_lp, Y_test_lp = read_edge_label_downstream(args.label_file) #if you want to load your own lp testing data
|
||||||
print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%'
|
print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%'
|
||||||
+ ' (by default, also generate equal negative links for testing)')
|
+ ' (by default, also generate equal negative links for testing)')
|
||||||
clf = lpClassifier(vectors=vectors) #similarity/distance metric as clf; basically, lp is a binary clf probelm
|
clf = lpClassifier(vectors=vectors) #similarity/distance metric as clf; basically, lp is a binary clf probelm
|
||||||
clf.evaluate(test_node_pairs, test_edge_labels)
|
clf.evaluate(test_node_pairs, test_edge_labels)
|
||||||
#------nc task
|
#------nc task
|
||||||
if args.task == 'nc' or args.task == 'lp_and_nc':
|
if args.task == 'nc' or args.task == 'lp_and_nc':
|
||||||
X, Y = read_node_label(args.label_file)
|
X, Y = read_node_label_downstream(args.label_file)
|
||||||
print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%')
|
print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%')
|
||||||
clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) #use Logistic Regression as clf; we may choose SVM or more advanced ones
|
clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) #use Logistic Regression as clf; we may choose SVM or more advanced ones
|
||||||
clf.split_train_evaluate(X, Y, args.label_reserved)
|
clf.split_train_evaluate(X, Y, args.label_reserved)
|
||||||
|
Loading…
Reference in New Issue
Block a user