move classify to downstream

2018-11-30 17:24:31 +08:00 · 2018-11-30 17:24:31 +08:00 · 7980429764
commit 7980429764
parent a33442f74c
4 changed files with 7 additions and 242 deletions
--- a/src/libnrl/asne.py
+++ b/src/libnrl/asne.py
@@ -15,7 +15,6 @@ import numpy as np
 import tensorflow as tf
 from sklearn.base import BaseEstimator, TransformerMixin
 import time
-#from .classify import ncClassifier, lpClassifier, read_node_label
 #from sklearn.linear_model import LogisticRegression

 class ASNE(BaseEstimator, TransformerMixin):
--- a/src/libnrl/classify.py
+++ b/src/libnrl/classify.py
@@ -1,235 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import print_function
-import numpy as np
-import math
-import random
-import networkx as nx
-import warnings
-warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn')
-from sklearn.multiclass import OneVsRestClassifier
-from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report, roc_curve, auc
-from sklearn.preprocessing import MultiLabelBinarizer
-
-'''
-#-----------------------------------------------------------------------------
-# part of code was originally forked from https://github.com/thunlp/OpenNE
-
-# modified by Chengbin Hou 2018
-# Email: Chengbin.Hou10@foxmail.com
-#-----------------------------------------------------------------------------
-'''
-
-# node classification classifier
-class ncClassifier(object):
-
-    def __init__(self, vectors, clf):
-        self.embeddings = vectors
-        self.clf = TopKRanker(clf)  #here clf is LR
-        self.binarizer = MultiLabelBinarizer(sparse_output=True)
-
-    def split_train_evaluate(self, X, Y, train_precent, seed=0):
-        state = np.random.get_state()
-        training_size = int(train_precent * len(X))
-        #np.random.seed(seed) 
-        shuffle_indices = np.random.permutation(np.arange(len(X)))
-        X_train = [X[shuffle_indices[i]] for i in range(training_size)]
-        Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
-        X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
-        Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]
-
-        self.train(X_train, Y_train, Y)
-        np.random.set_state(state)  #why??? for binarizer.transform?? 
-        return self.evaluate(X_test, Y_test)
-
-    def train(self, X, Y, Y_all):
-        self.binarizer.fit(Y_all)  #to support multi-labels, fit means dict mapping {orig cat: binarized vec}
-        X_train = [self.embeddings[x] for x in X]
-        Y = self.binarizer.transform(Y)  #since we have use Y_all fitted, then we simply transform
-        self.clf.fit(X_train, Y)
-
-    def predict(self, X, top_k_list):
-        X_ = np.asarray([self.embeddings[x] for x in X])
-        # see TopKRanker(OneVsRestClassifier)
-        Y = self.clf.predict(X_, top_k_list=top_k_list)  # the top k probs to be output...
-        return Y
-
-    def evaluate(self, X, Y):
-        top_k_list = [len(l) for l in Y]  #multi-labels, diff len of labels of each node
-        Y_ = self.predict(X, top_k_list)  #pred val of X_test i.e. Y_pred
-        Y = self.binarizer.transform(Y)   #true val i.e. Y_test
-        averages = ["micro", "macro", "samples", "weighted"]
-        results = {}
-        for average in averages:
-            results[average] = f1_score(Y, Y_, average=average)
-        # print('Results, using embeddings of dimensionality', len(self.embeddings[X[0]]))
-        print(results)
-        return results
-
-class TopKRanker(OneVsRestClassifier):  #orignal LR or SVM is for binary clf
-    def predict(self, X, top_k_list):   #re-define predict func of OneVsRestClassifier
-        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
-        all_labels = []
-        for i, k in enumerate(top_k_list):
-            probs_ = probs[i, :]
-            labels = self.classes_[probs_.argsort()[-k:]].tolist() #denote labels
-            probs_[:] = 0      #reset probs_ to all 0
-            probs_[labels] = 1 #reset probs_ to 1 if labels denoted...
-            all_labels.append(probs_)
-        return np.asarray(all_labels)
-
-'''
-#note: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true labels
-#see: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi
-'''
-
-'''
-import matplotlib.pyplot as plt
-def plt_roc(y_test, y_score):
-    """
-    calculate AUC value and plot the ROC curve
-    """
-    fpr, tpr, threshold = roc_curve(y_test, y_score)
-    roc_auc = auc(fpr, tpr)
-    plt.figure()
-    plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black')
-    plt.plot(fpr, tpr, color='black', lw = 1)
-    plt.plot([0,1],[0,1], color = 'red', linestyle = '--')
-    plt.text(0.5,0.3,'ROC curve (area = %0.3f)' % roc_auc)
-    plt.xlabel('False Positive Rate')
-    plt.ylabel('True Positive Rate')
-    plt.show()
-    return roc_auc
-'''
-
-# link prediction binary classifier
-class lpClassifier(object):
-
-    def __init__(self, vectors):
-        self.embeddings = vectors
-
-    def evaluate(self, X_test, Y_test, seed=0):  #clf here is simply a similarity/distance metric
-        state = np.random.get_state()
-        #np.random.seed(seed)
-        test_size = len(X_test)
-        #shuffle_indices = np.random.permutation(np.arange(test_size))
-        #X_test = [X_test[shuffle_indices[i]] for i in range(test_size)]
-        #Y_test = [Y_test[shuffle_indices[i]] for i in range(test_size)]
-
-        Y_true = [int(i) for i in Y_test]
-        Y_probs = []
-        for i in range(test_size):
-            start_node_emb = np.array(self.embeddings[X_test[i][0]]).reshape(-1,1)
-            end_node_emb = np.array(self.embeddings[X_test[i][1]]).reshape(-1,1)
-            score = cosine_similarity(start_node_emb, end_node_emb) #ranging from [-1, +1]
-            Y_probs.append( (score+1)/2.0 )     #switch to prob... however, we may also directly y_score = score 
-                                                #in sklearn roc... which yields the same reasult
-        roc = roc_auc_score(y_true = Y_true, y_score = Y_probs)
-        if roc < 0.5:
-            roc = 1.0 - roc    #since lp is binary clf task, just predict the opposite if<0.5
-        print("roc=", "{:.9f}".format(roc))
-        #plt_roc(Y_true, Y_probs) #enable to plot roc curve and return auc value
-
-def norm(a):
-    sum = 0.0
-    for i in range(len(a)):
-        sum = sum + a[i] * a[i]
-    return math.sqrt(sum)
-
-def cosine_similarity(a, b):
-    sum = 0.0
-    for i in range(len(a)):
-        sum = sum + a[i] * b[i]
-    #return sum/(norm(a) * norm(b))
-    return sum/(norm(a) * norm(b) + 1e-20)  #fix numerical issue 1e-20 almost = 0!
-
-'''
-#cosine_similarity realized by use...
-#or try sklearn....
-        from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances, euclidean_distances  # we may try diff metrics
-        #ref http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
-'''
-
-def lp_train_test_split(graph, ratio=0.5, neg_pos_link_ratio=1.0, test_pos_links_ratio=0.1):
-    #randomly split links/edges into training set and testing set
-    #*** note: we do not assume every node must be connected after removing links
-    #*** hence, the resulting graph might have few single nodes --> more realistic scenario
-    #*** e.g. a user just sign in a website has no link to others
-    
-    #graph: OpenANE graph data strcture
-    #ratio: perc of links for training; ranging [0, 1]
-    #neg_pos_link_ratio: 1.0 means neg-links/pos-links = 1.0 i.e. balance case; raning [0, +inf)
-    g = graph
-    test_pos_links = int(nx.number_of_edges(g.G) * test_pos_links_ratio)
-
-    print("test_pos_links_ratio {:.2f}, test_pos_links {:.2f}, neg_pos_link_ratio is {:.2f}, links for training {:.2f}%,".format(test_pos_links_ratio, test_pos_links, neg_pos_link_ratio, ratio*100))
-    test_pos_sample = []
-    test_neg_sample = []
-
-    #random.seed(2018) #generate testing set that contains both pos and neg samples
-    test_pos_sample = random.sample(g.G.edges(), test_pos_links)
-    #test_neg_sample = random.sample(list(nx.classes.function.non_edges(g.G)), int(test_size * neg_pos_link_ratio)) #using nx build-in func, not efficient, to do...
-    #more efficient way: 
-    test_neg_sample = []
-    num_neg_sample = int(test_pos_links * neg_pos_link_ratio)
-    num = 0
-    while num < num_neg_sample:
-        pair_nodes = np.random.choice(g.look_back_list, size=2, replace=False)
-        if pair_nodes not in g.G.edges():
-            num += 1
-            test_neg_sample.append(list(pair_nodes))
-    
-    test_edge_pair = test_pos_sample + test_neg_sample 
-    test_edge_label = list(np.ones(len(test_pos_sample))) + list(np.zeros(len(test_neg_sample)))
-
-    print('before removing, the # of links: ', nx.number_of_edges(g.G), ';   the # of single nodes: ', g.numSingleNodes())
-    g.G.remove_edges_from(test_pos_sample)  #training set should NOT contain testing set i.e. delete testing pos samples
-    g.simulate_sparsely_linked_net(link_reserved = ratio)  #simulate sparse net
-    print('after removing,  the # of links: ', nx.number_of_edges(g.G), ';   the # of single nodes: ', g.numSingleNodes())
-    print("# training links {0}; # positive testing links {1}; # negative testing links {2},".format(nx.number_of_edges(g.G), len(test_pos_sample), len(test_neg_sample)))
-    return g.G, test_edge_pair, test_edge_label
-
-#---------------------------------ulits for downstream tasks--------------------------------
-def load_embeddings(filename):   
-    fin = open(filename, 'r')
-    node_num, size = [int(x) for x in fin.readline().strip().split()]
-    vectors = {} 
-    while 1:
-        l = fin.readline()
-        if l == '':
-            break
-        vec = l.strip().split(' ')
-        assert len(vec) == size+1
-        vectors[vec[0]] = [float(x) for x in vec[1:]]
-    fin.close()
-    assert len(vectors) == node_num
-    return vectors
-
-def read_node_label(filename):
-    fin = open(filename, 'r')
-    X = []
-    Y = []
-    while 1:
-        l = fin.readline()
-        if l == '':
-            break
-        vec = l.strip().split(' ')
-        X.append(vec[0])
-        Y.append(vec[1:])
-    fin.close()
-    return X, Y
-
-
-def read_edge_label(filename):
-    fin = open(filename, 'r')
-    X = []
-    Y = []
-    while 1:
-        l = fin.readline()
-        if l == '':
-            break
-        vec = l.strip().split(' ')
-        X.append(vec[:2])
-        Y.append(vec[2])
-    fin.close()
-    return X, Y
-    
--- a/src/libnrl/line.py
+++ b/src/libnrl/line.py
@@ -13,7 +13,8 @@ import math
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import tensorflow as tf
-from .classify import ncClassifier, lpClassifier, read_node_label, read_edge_label #to do... try use lpClassifier to choose best embeddings?
+from .downstream import ncClassifier  # to do... try use lpClassifier to choose best embeddings?
+from .utils import read_node_label_downstream


 class _LINE(object):
@@ -219,7 +220,7 @@ class LINE(object):
                self.model2.train_one_epoch()
                if label_file:
                    self.get_embeddings()
-                    X, Y = read_node_label(label_file)
+                    X, Y = read_node_label_downstream(label_file)
                    print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
                    clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)
@@ -235,7 +236,7 @@ class LINE(object):
                self.model.train_one_epoch()
                if label_file:
                    self.get_embeddings()
-                    X, Y = read_node_label(label_file)
+                    X, Y = read_node_label_downstream(label_file)
                    print("Training classifier using {:.2f}% nodes...".format(clf_ratio*100))
                    clf = ncClassifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)
--- a/src/main.py
+++ b/src/main.py
@@ -15,7 +15,7 @@ import random
 import numpy as np
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
 from sklearn.linear_model import LogisticRegression #to do... 1) put it in downstream.py; and 2) try SVM...
-from libnrl.classify import ncClassifier, lpClassifier, read_node_label
+from libnrl.downstream import ncClassifier, lpClassifier
 from libnrl.graph import *
 from libnrl.utils import *
 from libnrl import abrw #ANE method; Attributed Biased Random Walk
@@ -225,14 +225,14 @@ def main(args):
    del model, g
    #------lp task
    if args.task == 'lp' or args.task == 'lp_and_nc':
-        #X_test_lp, Y_test_lp = read_edge_label(args.label_file)  #if you want to load your own lp testing data
+        #X_test_lp, Y_test_lp = read_edge_label_downstream(args.label_file)  #if you want to load your own lp testing data
        print(f'Link Prediction task; the percentage of positive links for testing: {(args.link_remove*100):.2f}%'
                + ' (by default, also generate equal negative links for testing)')
        clf = lpClassifier(vectors=vectors)     #similarity/distance metric as clf; basically, lp is a binary clf probelm
        clf.evaluate(test_node_pairs, test_edge_labels)
    #------nc task
    if args.task == 'nc' or args.task == 'lp_and_nc':
-        X, Y = read_node_label(args.label_file)
+        X, Y = read_node_label_downstream(args.label_file)
        print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%')
        clf = ncClassifier(vectors=vectors, clf=LogisticRegression())   #use Logistic Regression as clf; we may choose SVM or more advanced ones
        clf.split_train_evaluate(X, Y, args.label_reserved)