move classify to downstream

2018-11-30 17:24:31 +08:00 · 2018-11-30 17:24:31 +08:00 · 7980429764
commit 7980429764
parent a33442f74c
4 changed files with 7 additions and 242 deletions
--- a/src/libnrl/asne.py
+++ b/src/libnrl/asne.py
@ -15,7 +15,6 @@ import numpy as np
 import tensorflow as tf
 from sklearn.base import BaseEstimator, TransformerMixin
 import time
 #from .classify import ncClassifier, lpClassifier, read_node_label
 #from sklearn.linear_model import LogisticRegression
 class ASNE(BaseEstimator, TransformerMixin):
--- a/src/libnrl/classify.py
+++ b/src/libnrl/classify.py
@ -1,235 +0,0 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
 import numpy as np
 import math
 import random
 import networkx as nx
 import warnings
 warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn')
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report, roc_curve, auc
 from sklearn.preprocessing import MultiLabelBinarizer
 '''
 #-----------------------------------------------------------------------------
 # part of code was originally forked from https://github.com/thunlp/OpenNE
 # modified by Chengbin Hou 2018
 # Email: Chengbin.Hou10@foxmail.com
 #-----------------------------------------------------------------------------
 '''
 # node classification classifier
 class ncClassifier(object):
    """Node-classification evaluator working on precomputed node embeddings.

    The supplied base classifier is wrapped in ``TopKRanker`` and label lists
    are binarized with ``MultiLabelBinarizer`` so that nodes carrying several
    labels are supported.
    """

    def __init__(self, vectors, clf):
        """Keep the embeddings and wrap ``clf`` for top-k prediction.

        vectors: object indexable by node id, yielding that node's embedding.
        clf: base estimator handed to ``TopKRanker``.
        """
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        """Randomly split (X, Y), train on the first part, evaluate on the rest.

        X: node ids; Y: per-node label lists, aligned with X.
        train_precent: fraction of the samples used for training.
        seed: accepted for compatibility; it is not used by this method.
        Returns whatever ``evaluate`` returns for the held-out part.
        """
        saved_rng_state = np.random.get_state()
        n_train = int(train_precent * len(X))
        order = np.random.permutation(np.arange(len(X)))
        train_slots = range(n_train)
        test_slots = range(n_train, len(X))
        nodes_train = [X[order[slot]] for slot in train_slots]
        labels_train = [Y[order[slot]] for slot in train_slots]
        nodes_test = [X[order[slot]] for slot in test_slots]
        labels_test = [Y[order[slot]] for slot in test_slots]
        self.train(nodes_train, labels_train, Y)
        # Put the global NumPy RNG back to where it was before the shuffle.
        np.random.set_state(saved_rng_state)
        return self.evaluate(nodes_test, labels_test)

    def train(self, X, Y, Y_all):
        """Fit the binarizer on every label list, then fit the classifier on X/Y."""
        # Fitting on Y_all makes the label -> column mapping cover all labels,
        # not only those that happen to occur in the training part.
        self.binarizer.fit(Y_all)
        features = [self.embeddings[node] for node in X]
        targets = self.binarizer.transform(Y)
        self.clf.fit(features, targets)

    def predict(self, X, top_k_list):
        """Predict labels for nodes X, emitting top_k_list[i] labels for X[i]."""
        features = np.asarray([self.embeddings[node] for node in X])
        # The wrapped classifier is a TopKRanker, whose predict takes top_k_list.
        return self.clf.predict(features, top_k_list=top_k_list)

    def evaluate(self, X, Y):
        """Print and return F1 scores (micro/macro/samples/weighted) on (X, Y)."""
        # Each node is asked for as many labels as it truly has.
        top_k_list = [len(labels) for labels in Y]
        predicted = self.predict(X, top_k_list)
        expected = self.binarizer.transform(Y)
        averages = ["micro", "macro", "samples", "weighted"]
        results = {
            average: f1_score(expected, predicted, average=average)
            for average in averages
        }
        print(results)
        return results
 class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier that emits a requested number of labels per sample.

    The base estimator is binary; ``OneVsRestClassifier`` provides the
    per-class probabilities, and ``predict`` is redefined to keep only the
    k most probable classes of each sample.
    """

    def predict(self, X, top_k_list):
        """Return one binary indicator row per sample with its top-k classes set.

        X: feature matrix forwarded to ``predict_proba``.
        top_k_list: for each row of X (in order), how many labels to emit.
        Returns an array with one row per entry of ``top_k_list``; a row holds
        1 in the columns of the k highest-probability classes and 0 elsewhere.
        """
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            indicator = np.zeros_like(probs[i, :])
            # Guard k == 0: ``[-0:]`` would slice the whole array and mark
            # every class for a sample that should receive no label at all.
            if k > 0:
                # The output is positional (column j stands for the j-th
                # class), so the winning column indices are set directly
                # instead of indexing with class label values.
                top_columns = probs[i, :].argsort()[-k:]
                indicator[top_columns] = 1
            all_labels.append(indicator)
        return np.asarray(all_labels)
 '''
 #note: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true labels
 #see: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi
 '''
 '''
 import matplotlib.pyplot as plt
 def plt_roc(y_test, y_score):
    """
    calculate AUC value and plot the ROC curve
    """
    fpr, tpr, threshold = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black')
    plt.plot(fpr, tpr, color='black', lw = 1)
    plt.plot([0,1],[0,1], color = 'red', linestyle = '--')
    plt.text(0.5,0.3,'ROC curve (area = %0.3f)' % roc_auc)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
    return roc_auc
 '''
 # link prediction binary classifier
 class lpClassifier(object):
    """Link-prediction evaluator that scores node pairs by embedding similarity."""

    def __init__(self, vectors):
        """vectors: object indexable by node id, yielding that node's embedding."""
        self.embeddings = vectors

    def evaluate(self, X_test, Y_test, seed=0):
        """Score every test pair, print the ROC-AUC and return it.

        X_test: sequence of node pairs; element [0] and [1] are the end points.
        Y_test: labels aligned with X_test, each convertible with ``int``.
        seed: accepted for compatibility; it is not used by this method.
        Returns the ROC-AUC after the ``< 0.5`` flip described below.
        """
        Y_true = [int(label) for label in Y_test]
        Y_probs = []
        for pair in X_test:
            start_node_emb = np.array(self.embeddings[pair[0]]).reshape(-1, 1)
            end_node_emb = np.array(self.embeddings[pair[1]]).reshape(-1, 1)
            score = cosine_similarity(start_node_emb, end_node_emb)  # in [-1, +1]
            # Map the similarity to [0, 1]; ROC-AUC only depends on the ranking,
            # so the raw score would give the same result.
            Y_probs.append((score + 1) / 2.0)
        roc = roc_auc_score(y_true=Y_true, y_score=Y_probs)
        if roc < 0.5:
            # Binary task: a below-chance ranking is reported as its inverse.
            roc = 1.0 - roc
        print("roc=", "{:.9f}".format(roc))
        # Hand the score back so callers can use it, not only read it on stdout.
        return roc
 def norm(a):
    """Return the Euclidean (L2) norm of the sequence ``a``."""
    squared_total = 0.0
    for component in a:
        squared_total = squared_total + component * component
    return math.sqrt(squared_total)
 def cosine_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    The dot product runs over the indices of ``a``. A tiny constant is added
    to the denominator so that zero-length vectors do not divide by zero.
    """
    dot_product = 0.0
    for idx, left in enumerate(a):
        dot_product = dot_product + left * b[idx]
    denominator = norm(a) * norm(b) + 1e-20  # 1e-20 is effectively 0
    return dot_product / denominator
 '''
 #cosine_similarity realized by use...
 #or try sklearn....
        from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances  # we may try diff metrics
        #ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
 '''
 def lp_train_test_split(graph, ratio=0.5, neg_pos_link_ratio=1.0, test_pos_links_ratio=0.1):
    """Randomly split the links of ``graph`` into a training graph and a test set.

    Nodes are not required to stay connected after links are removed, so the
    resulting graph may contain isolated nodes (e.g. a user who has just
    signed up and has no link to anybody yet).

    graph: OpenANE graph structure; this function uses ``graph.G``,
        ``graph.look_back_list``, ``graph.numSingleNodes()`` and
        ``graph.simulate_sparsely_linked_net()``.
    ratio: fraction of links kept for training, in [0, 1].
    neg_pos_link_ratio: negative/positive test links; 1.0 is the balanced
        case, range [0, +inf).
    test_pos_links_ratio: fraction of the existing links sampled as positive
        test links.

    Returns ``(graph.G, test_edge_pair, test_edge_label)``. ``graph.G`` is
    modified in place: the positive test links are removed from it.
    """
    g = graph
    test_pos_links = int(nx.number_of_edges(g.G) * test_pos_links_ratio)
    print("test_pos_links_ratio {:.2f}, test_pos_links {:.2f}, neg_pos_link_ratio is {:.2f}, links for training {:.2f}%,".format(test_pos_links_ratio, test_pos_links, neg_pos_link_ratio, ratio*100))
    # random.sample needs a real sequence: passing the set-like edge view
    # raises TypeError on Python >= 3.11, so materialise the edges first.
    test_pos_sample = random.sample(list(g.G.edges()), test_pos_links)
    # Negative links: draw random node pairs and keep those that are not
    # linked; cheaper than enumerating every non-edge of the graph.
    test_neg_sample = []
    num_neg_sample = int(test_pos_links * neg_pos_link_ratio)
    while len(test_neg_sample) < num_neg_sample:
        pair_nodes = np.random.choice(g.look_back_list, size=2, replace=False)
        if pair_nodes not in g.G.edges():
            test_neg_sample.append(list(pair_nodes))
    test_edge_pair = test_pos_sample + test_neg_sample
    test_edge_label = list(np.ones(len(test_pos_sample))) + list(np.zeros(len(test_neg_sample)))
    print('before removing, the # of links: ', nx.number_of_edges(g.G), ';   the # of single nodes: ', g.numSingleNodes())
    # The training graph must not contain the positive test links.
    g.G.remove_edges_from(test_pos_sample)
    g.simulate_sparsely_linked_net(link_reserved = ratio)  # simulate a sparse net
    print('after removing,  the # of links: ', nx.number_of_edges(g.G), ';   the # of single nodes: ', g.numSingleNodes())
    print("# training links {0}; # positive testing links {1}; # negative testing links {2},".format(nx.number_of_edges(g.G), len(test_pos_sample), len(test_neg_sample)))
    return g.G, test_edge_pair, test_edge_label
 #---------------------------------ulits for downstream tasks--------------------------------
 def load_embeddings(filename):
    """Load node embeddings from a text file into ``{node_id: [float, ...]}``.

    The first line holds two integers: the number of nodes and the embedding
    size. Every following non-blank line is a node id followed by ``size``
    space-separated floats.

    Raises AssertionError when a row has the wrong width or when the number
    of loaded nodes differs from the header.
    """
    vectors = {}
    # ``with`` closes the file even when parsing or an assertion fails.
    with open(filename, 'r') as fin:
        node_num, size = [int(x) for x in fin.readline().strip().split()]
        for line in fin:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (typically one at the end of the file).
                continue
            vec = stripped.split(' ')
            assert len(vec) == size+1, 'unexpected number of fields in embedding row'
            vectors[vec[0]] = [float(x) for x in vec[1:]]
    assert len(vectors) == node_num, 'number of embeddings differs from header'
    return vectors
 def read_node_label(filename):
    """Read node labels from a text file.

    Every non-blank line is a node id followed by zero or more
    space-separated labels. Returns ``(X, Y)`` where ``X`` is the list of node
    ids and ``Y`` the list of label lists, in file order.
    """
    X = []
    Y = []
    # ``with`` closes the file even when reading fails part-way through.
    with open(filename, 'r') as fin:
        for line in fin:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise add a bogus '' node without labels.
                continue
            vec = stripped.split(' ')
            X.append(vec[0])
            Y.append(vec[1:])
    return X, Y
 def read_edge_label(filename):
    """Read labelled edges from a text file.

    Every non-blank line is ``<node> <node> <label>`` separated by single
    spaces. Returns ``(X, Y)`` where ``X`` is the list of ``[node, node]``
    pairs and ``Y`` the list of label strings, in file order.
    """
    X = []
    Y = []
    # ``with`` closes the file even when a malformed line raises.
    with open(filename, 'r') as fin:
        for line in fin:
            stripped = line.strip()
            if not stripped:
                # A blank line has no third field and would raise IndexError.
                continue
            vec = stripped.split(' ')
            X.append(vec[:2])
            Y.append(vec[2])
    return X, Y
--- a/src/libnrl/line.py
+++ b/src/libnrl/line.py
@ -13,7 +13,8 @@ import math
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import tensorflow as tf
-from .classify import ncClassifier, lpClassifier, read_node_label, read_edge_label #to do... try use lpClassifier to choose best embeddings?
+from .downstream import ncClassifier  # to do... try use lpClassifier to choose best embeddings?
 from .utils import read_node_label_downstream
 class _LINE(object):
@ -219,7 +220,7 @@ class LINE(object):
                self.model2.train_one_epoch()
                if label_file:
                    self.get_embeddings()
-                    X, Y = read_node_label(label_file)
+                    X, Y = read_node_label_downstream(label_file)
                    print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
                    clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)
@ -235,7 +236,7 @@ class LINE(object):
                self.model.train_one_epoch()
                if label_file:
                    self.get_embeddings()
-                    X, Y = read_node_label(label_file)
+                    X, Y = read_node_label_downstream(label_file)
                    print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
                    clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)
--- a/src/main.py
+++ b/src/main.py
@ -15,7 +15,7 @@ import random
 import numpy as np
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
 from sklearn.linear_model import LogisticRegression #to do... 1) put it in downstream.py; and 2) try SVM...
-from libnrl.classify import ncClassifier, lpClassifier, read_node_label
+from libnrl.downstream import ncClassifier, lpClassifier
 from libnrl.graph import *
 from libnrl.utils import *
 from libnrl import abrw #ANE method; Attributed Biased Random Walk
@ -225,14 +225,14 @@ def main(args):
    del model, g
    #------lp task
    if args.task == 'lp' or args.task == 'lp_and_nc':
-        #X_test_lp, Y_test_lp = read_edge_label(args.label_file)  #if you want to load your own lp testing data
+        #X_test_lp, Y_test_lp = read_edge_label_downstream(args.label_file)  #if you want to load your own lp testing data
        print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%'
                + ' (by default, also generate equal negative links for testing)')
        clf = lpClassifier(vectors=vectors)     #similarity/distance metric as clf; basically, lp is a binary clf probelm
        clf.evaluate(test_node_pairs, test_edge_labels)
    #------nc task
    if args.task == 'nc' or args.task == 'lp_and_nc':
-        X, Y = read_node_label(args.label_file)
+        X, Y = read_node_label_downstream(args.label_file)
        print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%')
        clf = ncClassifier(vectors=vectors, clf=LogisticRegression())   #use Logistic Regression as clf; we may choose SVM or more advanced ones
        clf.split_train_evaluate(X, Y, args.label_reserved)