move classify to downstream
This commit is contained in:
parent
a33442f74c
commit
7980429764
@ -15,7 +15,6 @@ import numpy as np
|
||||
import tensorflow as tf
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
import time
|
||||
#from .classify import ncClassifier, lpClassifier, read_node_label
|
||||
#from sklearn.linear_model import LogisticRegression
|
||||
|
||||
class ASNE(BaseEstimator, TransformerMixin):
|
||||
|
@ -1,235 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function
|
||||
import numpy as np
|
||||
import math
|
||||
import random
|
||||
import networkx as nx
|
||||
import warnings
|
||||
warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn')
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report, roc_curve, auc
|
||||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
|
||||
'''
|
||||
#-----------------------------------------------------------------------------
|
||||
# part of code was originally forked from https://github.com/thunlp/OpenNE
|
||||
|
||||
# modified by Chengbin Hou 2018
|
||||
# Email: Chengbin.Hou10@foxmail.com
|
||||
#-----------------------------------------------------------------------------
|
||||
'''
|
||||
|
||||
# node classification classifier
|
||||
class ncClassifier(object):
    """Node-classification evaluator built on top of node embeddings.

    ``vectors`` is indexed by node id (``vectors[node]`` gives the embedding
    of ``node``); ``clf`` is the base estimator that gets wrapped in
    ``TopKRanker`` (the callers visible in this project pass a
    ``LogisticRegression``).
    """

    def __init__(self, vectors, clf):
        self.embeddings = vectors
        self.clf = TopKRanker(clf)  # one-vs-rest wrapper around the base estimator
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        """Randomly split (X, Y), train on one part and evaluate on the rest.

        X: sequence of node ids.
        Y: sequence of label lists, aligned with X.
        train_precent: fraction of the samples used for training.
        seed: accepted for backward compatibility but currently ignored;
              the split is drawn from the current global NumPy RNG state.

        Returns the dict produced by ``evaluate``.
        """
        # Remember the global RNG state so that drawing the permutation does
        # not advance NumPy's global random stream for the caller.
        state = np.random.get_state()
        try:
            training_size = int(train_precent * len(X))
            shuffle_indices = np.random.permutation(np.arange(len(X)))
            train_idx = shuffle_indices[:training_size]
            test_idx = shuffle_indices[training_size:]

            X_train = [X[i] for i in train_idx]
            Y_train = [Y[i] for i in train_idx]
            X_test = [X[i] for i in test_idx]
            Y_test = [Y[i] for i in test_idx]

            self.train(X_train, Y_train, Y)
        finally:
            # Restore the RNG state even if training fails.
            np.random.set_state(state)
        return self.evaluate(X_test, Y_test)

    def train(self, X, Y, Y_all):
        """Fit the label binarizer on ``Y_all`` and the classifier on (X, Y)."""
        # Fit on every label set so train and test share one label -> column mapping.
        self.binarizer.fit(Y_all)
        X_train = [self.embeddings[x] for x in X]
        Y = self.binarizer.transform(Y)  # binarizer is already fitted; only transform
        self.clf.fit(X_train, Y)

    def predict(self, X, top_k_list):
        """Predict labels for nodes ``X``; ``top_k_list[i]`` labels are kept for ``X[i]``."""
        X_ = np.asarray([self.embeddings[x] for x in X])
        # see TopKRanker.predict: keeps the k most probable labels per sample
        return self.clf.predict(X_, top_k_list=top_k_list)

    def evaluate(self, X, Y):
        """Print and return F1 scores (micro/macro/samples/weighted) on (X, Y)."""
        # Multi-label setting: each node is asked for as many labels as it truly has.
        top_k_list = [len(labels) for labels in Y]
        Y_ = self.predict(X, top_k_list)  # predicted indicator matrix
        Y = self.binarizer.transform(Y)   # ground-truth indicator matrix
        averages = ["micro", "macro", "samples", "weighted"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        print(results)
        return results
|
||||
|
||||
class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier whose ``predict`` keeps the top-k labels per sample.

    The wrapped binary estimator (e.g. LR or SVM) provides per-label
    probabilities; ``predict`` turns them into a 0/1 indicator matrix.
    """

    def predict(self, X, top_k_list):
        """Return a 0/1 matrix with the ``top_k_list[i]`` most probable labels set for row i.

        X: feature matrix passed to ``predict_proba``.
        top_k_list: number of labels to keep for each row of ``X``.

        A row with ``k == 0`` gets no label at all (the plain ``[-k:]`` slice
        would select *every* label in that case), and ``k`` larger than the
        number of labels selects all of them.
        """
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            row = probs[i, :]
            n_labels = row.shape[0]
            # Column positions of the k highest probabilities; the explicit
            # start index keeps k == 0 from degenerating into "everything".
            top = row.argsort()[max(n_labels - k, 0):]
            row[:] = 0    # reset the row ...
            row[top] = 1  # ... and mark only the selected columns
            all_labels.append(row)
        return np.asarray(all_labels)
|
||||
|
||||
'''
|
||||
#note: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true labels
|
||||
#see: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi
|
||||
'''
|
||||
|
||||
'''
|
||||
import matplotlib.pyplot as plt
|
||||
def plt_roc(y_test, y_score):
|
||||
"""
|
||||
calculate AUC value and plot the ROC curve
|
||||
"""
|
||||
fpr, tpr, threshold = roc_curve(y_test, y_score)
|
||||
roc_auc = auc(fpr, tpr)
|
||||
plt.figure()
|
||||
plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black')
|
||||
plt.plot(fpr, tpr, color='black', lw = 1)
|
||||
plt.plot([0,1],[0,1], color = 'red', linestyle = '--')
|
||||
plt.text(0.5,0.3,'ROC curve (area = %0.3f)' % roc_auc)
|
||||
plt.xlabel('False Positive Rate')
|
||||
plt.ylabel('True Positive Rate')
|
||||
plt.show()
|
||||
return roc_auc
|
||||
'''
|
||||
|
||||
# link prediction binary classifier
|
||||
class lpClassifier(object):
    """Link-prediction evaluator that scores node pairs by cosine similarity.

    ``vectors`` is indexed by node id (``vectors[node]`` gives the embedding).
    """

    def __init__(self, vectors):
        self.embeddings = vectors

    def evaluate(self, X_test, Y_test, seed=0):
        """Print and return the ROC-AUC of cosine-similarity link scores.

        X_test: sequence of node pairs; element 0 and 1 of each pair are node ids.
        Y_test: sequence of labels convertible with ``int`` (1 = link, 0 = no link).
        seed: accepted for backward compatibility; not used.

        Returns the AUC as computed by ``roc_auc_score``; a value below 0.5 is
        reported as ``1 - AUC`` (i.e. as if the opposite were predicted).
        """
        Y_true = [int(label) for label in Y_test]
        Y_probs = []
        for pair in X_test:
            # Flat 1-D vectors keep every intermediate value a scalar instead
            # of a 1-element array.
            start_node_emb = np.asarray(self.embeddings[pair[0]]).ravel()
            end_node_emb = np.asarray(self.embeddings[pair[1]]).ravel()
            score = float(cosine_similarity(start_node_emb, end_node_emb))  # in [-1, +1]
            # Map to [0, 1]; the ranking (and hence the AUC) is the same as
            # for the raw score.
            Y_probs.append((score + 1) / 2.0)
        roc = roc_auc_score(y_true=Y_true, y_score=Y_probs)
        if roc < 0.5:
            roc = 1.0 - roc  # binary task: predict the opposite if below chance
        print("roc=", "{:.9f}".format(roc))
        return roc
|
||||
#plt_roc(Y_true, Y_probs) #enable to plot roc curve and return auc value
|
||||
|
||||
def norm(a):
    """Return the Euclidean (L2) norm of ``a``.

    ``a`` may be any iterable of numbers (list, tuple, 1-D array, generator);
    an empty input gives 0.0.
    """
    squared = 0.0
    for x in a:
        squared = squared + x * x
    return math.sqrt(squared)
|
||||
|
||||
def cosine_similarity(a, b):
    """Return the cosine similarity of the equal-length vectors ``a`` and ``b``.

    A tiny constant (1e-20) is added to the denominator so that an all-zero
    vector yields 0 instead of a division by zero.

    Raises ValueError if the two vectors differ in length.
    """
    if len(a) != len(b):
        raise ValueError(
            "cosine_similarity needs vectors of equal length, got {} and {}".format(len(a), len(b)))
    dot = 0.0
    for x, y in zip(a, b):
        dot = dot + x * y
    return dot / (norm(a) * norm(b) + 1e-20)  # 1e-20 guards against a zero norm
|
||||
|
||||
'''
|
||||
#cosine_similarity realized by use...
|
||||
#or try sklearn....
|
||||
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances # we may try diff metrics
|
||||
#ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
|
||||
'''
|
||||
|
||||
def lp_train_test_split(graph, ratio=0.5, neg_pos_link_ratio=1.0, test_pos_links_ratio=0.1):
    """Randomly split the links of ``graph`` into a training graph and a test set.

    Connectivity is not preserved: after removing links the graph may contain
    isolated nodes (e.g. a user who just signed up and has no link yet).

    graph: OpenANE graph object; this function uses ``graph.G`` (a networkx
           graph), ``graph.look_back_list`` (presumably the list of node ids --
           confirm against the graph class), ``graph.numSingleNodes()`` and
           ``graph.simulate_sparsely_linked_net()``.
    ratio: passed as ``link_reserved`` to ``simulate_sparsely_linked_net``;
           fraction of links kept for training, in [0, 1].
    neg_pos_link_ratio: negative/positive test links; 1.0 is the balanced case,
           range [0, +inf).
    test_pos_links_ratio: fraction of all links sampled as positive test links.

    Returns ``(graph.G, test_edge_pair, test_edge_label)``. ``graph.G`` is
    modified in place; labels are 1.0 for positive and 0.0 for negative pairs.

    NOTE(review): negative pairs are drawn with replacement across iterations,
    so duplicates are possible, and the loop cannot terminate if the graph has
    no non-linked pair of nodes.
    """
    g = graph
    test_pos_links = int(nx.number_of_edges(g.G) * test_pos_links_ratio)

    print("test_pos_links_ratio {:.2f}, test_pos_links {:.2f}, neg_pos_link_ratio is {:.2f}, links for training {:.2f}%,".format(test_pos_links_ratio, test_pos_links, neg_pos_link_ratio, ratio*100))

    # Positive test samples. random.sample needs a real sequence (an edge view
    # is set-like, which newer Python versions reject), hence the list().
    test_pos_sample = random.sample(list(g.G.edges()), test_pos_links)

    # Negative test samples: rejection-sample node pairs that are not linked,
    # which is cheaper than enumerating all non-edges.
    test_neg_sample = []
    num_neg_sample = int(test_pos_links * neg_pos_link_ratio)
    while len(test_neg_sample) < num_neg_sample:
        pair_nodes = np.random.choice(g.look_back_list, size=2, replace=False)
        if not g.G.has_edge(pair_nodes[0], pair_nodes[1]):
            test_neg_sample.append(list(pair_nodes))

    test_edge_pair = test_pos_sample + test_neg_sample
    test_edge_label = list(np.ones(len(test_pos_sample))) + list(np.zeros(len(test_neg_sample)))

    print('before removing, the # of links: ', nx.number_of_edges(g.G), '; the # of single nodes: ', g.numSingleNodes())
    # The training graph must not contain the positive test links.
    g.G.remove_edges_from(test_pos_sample)
    g.simulate_sparsely_linked_net(link_reserved = ratio)  # simulate a sparse net
    print('after removing, the # of links: ', nx.number_of_edges(g.G), '; the # of single nodes: ', g.numSingleNodes())
    print("# training links {0}; # positive testing links {1}; # negative testing links {2},".format(nx.number_of_edges(g.G), len(test_pos_sample), len(test_neg_sample)))
    return g.G, test_edge_pair, test_edge_label
|
||||
|
||||
#---------------------------------utils for downstream tasks--------------------------------
|
||||
def load_embeddings(filename):
    """Load node embeddings from a text file.

    Expected layout: a header line ``<node_num> <size>``, then one line per
    node of the form ``<node_id> <v_1> ... <v_size>`` (whitespace separated).
    Blank lines are ignored.

    Returns a dict mapping node id (str) to a list of ``size`` floats.
    Raises AssertionError if a row has the wrong number of columns or the
    number of loaded nodes differs from ``node_num``.
    """
    vectors = {}
    # The context manager closes the file even when a check below fails.
    with open(filename, 'r') as fin:
        node_num, size = [int(x) for x in fin.readline().strip().split()]
        for line in fin:
            vec = line.split()
            if not vec:  # skip blank lines, e.g. a trailing newline
                continue
            assert len(vec) == size+1
            vectors[vec[0]] = [float(x) for x in vec[1:]]
    assert len(vectors) == node_num
    return vectors
|
||||
|
||||
def read_node_label(filename):
    """Read node labels from a text file.

    Each non-blank line is ``<node_id> <label_1> <label_2> ...`` (whitespace
    separated); a node may have several labels or none. Blank lines are ignored.

    Returns ``(X, Y)``: X is the list of node ids (str), Y the list of label
    lists (list of str), aligned with X.
    """
    X = []
    Y = []
    # The context manager closes the file even if reading fails.
    with open(filename, 'r') as fin:
        for line in fin:
            vec = line.split()
            if not vec:  # a blank line must not become a node named ''
                continue
            X.append(vec[0])
            Y.append(vec[1:])
    return X, Y
|
||||
|
||||
|
||||
def read_edge_label(filename):
    """Read labelled node pairs from a text file.

    Each non-blank line is ``<node_a> <node_b> <label>`` (whitespace
    separated). Blank lines are ignored.

    Returns ``(X, Y)``: X is the list of ``[node_a, node_b]`` pairs, Y the list
    of labels (str), aligned with X.
    Raises IndexError if a non-blank line has fewer than three fields.
    """
    X = []
    Y = []
    # The context manager closes the file even if a malformed line raises.
    with open(filename, 'r') as fin:
        for line in fin:
            vec = line.split()
            if not vec:  # skip blank lines, e.g. a trailing newline
                continue
            X.append(vec[:2])
            Y.append(vec[2])
    return X, Y
|
||||
|
@ -13,7 +13,8 @@ import math
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
import tensorflow as tf
|
||||
from .classify import ncClassifier, lpClassifier, read_node_label, read_edge_label #to do... try use lpClassifier to choose best embeddings?
|
||||
from .downstream import ncClassifier # to do... try use lpClassifier to choose best embeddings?
|
||||
from .utils import read_node_label_downstream
|
||||
|
||||
|
||||
class _LINE(object):
|
||||
@ -219,7 +220,7 @@ class LINE(object):
|
||||
self.model2.train_one_epoch()
|
||||
if label_file:
|
||||
self.get_embeddings()
|
||||
X, Y = read_node_label(label_file)
|
||||
X, Y = read_node_label_downstream(label_file)
|
||||
print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
|
||||
clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
|
||||
result = clf.split_train_evaluate(X, Y, clf_ratio)
|
||||
@ -235,7 +236,7 @@ class LINE(object):
|
||||
self.model.train_one_epoch()
|
||||
if label_file:
|
||||
self.get_embeddings()
|
||||
X, Y = read_node_label(label_file)
|
||||
X, Y = read_node_label_downstream(label_file)
|
||||
print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
|
||||
clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
|
||||
result = clf.split_train_evaluate(X, Y, clf_ratio)
|
||||
|
@ -15,7 +15,7 @@ import random
|
||||
import numpy as np
|
||||
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
||||
from sklearn.linear_model import LogisticRegression #to do... 1) put it in downstream.py; and 2) try SVM...
|
||||
from libnrl.classify import ncClassifier, lpClassifier, read_node_label
|
||||
from libnrl.downstream import ncClassifier, lpClassifier
|
||||
from libnrl.graph import *
|
||||
from libnrl.utils import *
|
||||
from libnrl import abrw #ANE method; Attributed Biased Random Walk
|
||||
@ -225,14 +225,14 @@ def main(args):
|
||||
del model, g
|
||||
#------lp task
|
||||
if args.task == 'lp' or args.task == 'lp_and_nc':
|
||||
#X_test_lp, Y_test_lp = read_edge_label(args.label_file) #if you want to load your own lp testing data
|
||||
#X_test_lp, Y_test_lp = read_edge_label_downstream(args.label_file) #if you want to load your own lp testing data
|
||||
print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%'
|
||||
+ ' (by default, also generate equal negative links for testing)')
|
||||
clf = lpClassifier(vectors=vectors) #similarity/distance metric as clf; basically, lp is a binary clf probelm
|
||||
clf.evaluate(test_node_pairs, test_edge_labels)
|
||||
#------nc task
|
||||
if args.task == 'nc' or args.task == 'lp_and_nc':
|
||||
X, Y = read_node_label(args.label_file)
|
||||
X, Y = read_node_label_downstream(args.label_file)
|
||||
print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%')
|
||||
clf = ncClassifier(vectors=vectors, clf=LogisticRegression()) #use Logistic Regression as clf; we may choose SVM or more advanced ones
|
||||
clf.split_train_evaluate(X, Y, args.label_reserved)
|
||||
|
Loading…
Reference in New Issue
Block a user