# ensemble.py
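# Overview: this script ensembles per-model cross-validation (CV) prediction
# files. In 'eval' mode it scores each model by class-weighted accuracy (WACC)
# across CV folds, searches for the best model combination (exhaustively over
# the top N models, or greedily in descending WACC order), and can save the
# chosen subset for later reuse. In prediction mode it averages the selected
# models' external predictions and writes an ISIC-style submission CSV.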
import numpy as np
import pickle
import re
import os
import sys
import itertools
from glob import glob
from sklearn.metrics import confusion_matrix, f1_score, auc, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from joblib import Parallel, delayed
import multiprocessing
import copy
# Assume a fixed number of CV folds for ensembling with evaluation
cvSize = 5
numClasses = 8
# First argument is a folder filled with CV results files
all_preds_path = sys.argv[1]
# Second argument indicates whether we only generate predictions or actually evaluate performance
if 'eval' in sys.argv[2]:
    evaluate = True
    # Determine whether voting or averaging is used
    if 'vote' in sys.argv[2]:
        evaluate_method = 'vote'
    else:
        evaluate_method = 'average'
    # Determine whether an exhaustive combination search or an ordered (greedy) search is used
    if 'exhaust' in sys.argv[2]:
        exhaustive_search = True
        # The last number in the argument sets how many top models to consider
        num_top_models = [int(s) for s in re.findall(r'\d+',sys.argv[2])][-1]
    else:
        exhaustive_search = False
    # Third argument indicates where the selected subset should be saved
    if 'subSet' in sys.argv[3]:
        subSetPath = sys.argv[3]
    else:
        subSetPath = None
else:
    evaluate = False
    acceptedList = []
    if 'last' in sys.argv[2]:
        acceptedList.append('last')
    if 'best' in sys.argv[2]:
        acceptedList.append('best')
    if 'meta' in sys.argv[2]:
        acceptedList.append('meta')
    # Third argument indicates whether some subset should be used
    if 'subSet' in sys.argv[3]:
        # Load subset file
        with open(sys.argv[3],'rb') as f:
            subSetDict = pickle.load(f)
        subSet = subSetDict['subSet']
    else:
        subSet = None
    # Fourth argument indicates the CSV path to save final results into
    if len(sys.argv) > 4 and 'csvFile' in sys.argv[4]:
        csvPath = sys.argv[4]
        origFilePath = sys.argv[5]
    else:
        csvPath = None
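# Example invocations (illustrative only; the paths below are hypothetical
# placeholders):
#   Evaluate with voting and an exhaustive search over the top 5 models,
#   saving the selected subset:
#     python ensemble.py ./cv_results evalvoteexhaust5 ./subSet_best.pkl
#   Generate predictions from the 'best' checkpoints of a saved subset and
#   write a submission CSV:
#     python ensemble.py ./ext_preds best ./subSet_best.pkl ./csvFile_out.csv ./orig_images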
# Helper function to compute and print a set of evaluation metrics
def get_metrics(predictions,targets):
    # Calculate metrics
    # Accuracy
    acc = np.mean(np.equal(np.argmax(predictions,1),np.argmax(targets,1)))
    # Confusion matrix
    conf = confusion_matrix(np.argmax(targets,1),np.argmax(predictions,1))
    # Class-weighted accuracy (per-class sensitivity)
    wacc = conf.diagonal()/conf.sum(axis=1)
    # AUC per class
    fpr = {}
    tpr = {}
    roc_auc = np.zeros([numClasses])
    for i in range(numClasses):
        fpr[i], tpr[i], _ = roc_curve(targets[:, i], predictions[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # F1 score (y_true must be the first argument to sklearn's f1_score)
    f1 = f1_score(np.argmax(targets,1),np.argmax(predictions,1),average='weighted')
    # Print
    print("Accuracy:",acc)
    print("F1-Score:",f1)
    print("WACC:",wacc)
    print("Mean WACC:",np.mean(wacc))
    print("AUC:",roc_auc)
    print("Mean AUC:",np.mean(roc_auc))
    return acc, f1, wacc, roc_auc
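# Minimal usage sketch for get_metrics (hypothetical data): both arguments are
# [numSamples, numClasses] arrays, with one-hot targets and e.g. softmax scores
# as predictions:
#   dummy_targets = np.eye(numClasses)[np.random.randint(0, numClasses, 100)]
#   dummy_preds = np.random.rand(100, numClasses)
#   acc, f1, wacc, roc_auc = get_metrics(dummy_preds, dummy_targets)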
# If it is an actual evaluation, evaluate each CV fold independently and show results both per CV set and over all folds together
if evaluate:
    # Go through all files
    files = sorted(glob(all_preds_path+'/*'))
    # Because the prediction size is unknown beforehand, don't use a preallocated matrix
    final_preds = {}
    final_targets = {}
    all_waccs = []
    accum_preds = {}
    # Each fold's prediction size is defined inside the loop
    firstLoaded = False
    for j in range(len(files)):
        # Skip if it is a directory; append a placeholder WACC so all_waccs stays
        # aligned with the row index j used below
        if os.path.isdir(files[j]):
            all_waccs.append(-1.0)
            continue
        # Stop on non-pkl files; the folder is assumed to contain only .pkl result files
        if '.pkl' not in files[j]:
            print("Remove non-pkl files")
            break
        # Load file
        with open(files[j],'rb') as f:
            allDataCurr = pickle.load(f)
        # Get predictions
        if not firstLoaded:
            # Define accumulated prediction size
            for i in range(cvSize):
                accum_preds[i] = np.zeros([len(files),len(allDataCurr['bestPred'][i]),numClasses])
            firstLoaded = True
        # Write preds into array
        #print(files[j],allDataCurr['bestPred'][0].shape)
        wacc_avg = 0
        for i in range(cvSize):
            accum_preds[i][j,:,:] = allDataCurr['bestPred'][i]
            final_targets[i] = allDataCurr['targets'][i]
            # Confusion matrix
            conf = confusion_matrix(np.argmax(allDataCurr['targets'][i],1),np.argmax(allDataCurr['bestPred'][i],1))
            # Class-weighted accuracy
            wacc_avg += np.mean(conf.diagonal()/conf.sum(axis=1))
        wacc_avg = wacc_avg/cvSize
        all_waccs.append(wacc_avg)
        # Print performance of model + name
        print("Model:",files[j],"WACC:",wacc_avg)
    # Accumulators for per-CV results; only the vote/average paths are
    # exercised by evalEnsemble below, the SVM/RF ones are left unused here
    # Averaging predictions
    f1_avg = 0
    acc_avg = 0
    auc_avg = np.zeros([numClasses])
    wacc_avg = np.zeros([numClasses])
    # Voting with predictions
    f1_vote = 0
    acc_vote = 0
    auc_vote = np.zeros([numClasses])
    wacc_vote = np.zeros([numClasses])
    # Linear SVM on predictions
    f1_linsvm = 0
    acc_linsvm = 0
    auc_linsvm = np.zeros([numClasses])
    wacc_linsvm = np.zeros([numClasses])
    # RF on predictions
    f1_rf = 0
    acc_rf = 0
    auc_rf = np.zeros([numClasses])
    wacc_rf = np.zeros([numClasses])
    # Helper function to evaluate a model combination across all CV folds
    def evalEnsemble(currComb,eval_auc=False):
        currWacc = np.zeros([cvSize])
        currAUC = np.zeros([cvSize])
        for i in range(cvSize):
            if evaluate_method == 'vote':
                # Majority vote: count argmax votes per sample
                pred_argmax = np.argmax(accum_preds[i][currComb,:,:],2)
                pred_eval = np.zeros([pred_argmax.shape[1],numClasses])
                for j in range(pred_eval.shape[0]):
                    pred_eval[j,:] = np.bincount(pred_argmax[:,j],minlength=numClasses)
            else:
                # Average the predicted probabilities over the selected models
                pred_eval = np.mean(accum_preds[i][currComb,:,:],0)
            # Confusion matrix
            conf = confusion_matrix(np.argmax(final_targets[i],1),np.argmax(pred_eval,1))
            # Class-weighted accuracy
            currWacc[i] = np.mean(conf.diagonal()/conf.sum(axis=1))
            if eval_auc:
                currAUC_ = np.zeros([numClasses])
                for j in range(numClasses):
                    fpr, tpr, _ = roc_curve(final_targets[i][:,j], pred_eval[:, j])
                    currAUC_[j] = auc(fpr, tpr)
                currAUC[i] = np.mean(currAUC_)
        if eval_auc:
            currAUCstd = np.std(currAUC)
            currAUC = np.mean(currAUC)
        currWaccStd = np.std(currWacc)
        currWacc = np.mean(currWacc)
        if eval_auc:
            return currWacc, currWaccStd, currAUC, currAUCstd
        else:
            return currWacc
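    # Usage sketch (hypothetical indices): currComb is a sequence of model row
    # indices into accum_preds, e.g.:
    #   mean_wacc = evalEnsemble([0, 2, 5])
    #   wacc, wacc_std, auc_val, auc_std = evalEnsemble([0, 2, 5], eval_auc=True)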
    if exhaustive_search:
        # First: determine the best subset based on the average CV WACC
        # Only take the top N models
        top_inds = np.argsort(-np.array(all_waccs))
        elements = top_inds[:num_top_models]
        # Build all 2^N - 1 non-empty combinations; start at size 1 to skip the
        # empty set, whose averaged prediction would be undefined
        allCombs = []
        for L in range(1, len(elements)+1):
            for subset in itertools.combinations(elements, L):
                allCombs.append(subset)
                #print(subset)
        print("Number of combinations",len(allCombs))
        print("Models considered")
        for i in range(len(elements)):
            print("ID",elements[i],files[elements[i]])
        #allWaccs = np.zeros([len(allCombs)])
        num_cores = multiprocessing.cpu_count()
        print("Cores available",num_cores)
        # Evaluate every combination in parallel
        allWaccs = Parallel(n_jobs=num_cores)(delayed(evalEnsemble)(comb) for comb in allCombs)
        # Sort by highest value; dtype=object because the combinations have different lengths
        allWaccsSrt = -np.sort(-np.array(allWaccs))
        srtInds = np.argsort(-np.array(allWaccs))
        allCombsSrt = np.array(allCombs,dtype=object)[srtInds]
        for i in range(5):
            print("Top",i+1)
            print("Best WACC",allWaccsSrt[i])
            wacc, wacc_std, auc_val, auc_val_std = evalEnsemble(allCombsSrt[i],eval_auc=True)
            print("Metrics WACC %.4f +- %.4f AUC %.4f +- %.4f"%(wacc,wacc_std,auc_val,auc_val_std))
            print("Best Combination:",allCombsSrt[i])
            print("Corresponding File Names")
            # Initialize the saved subset only for the best combination; resetting it
            # on every iteration would leave an empty subset after the loop
            if i == 0:
                subSetDict = {}
                subSetDict['subSet'] = []
            for j in allCombsSrt[i]:
                print("ID",j,files[j])
                # Remember the result-file names of the top-1 combination; the type
                # suffix ("best/last/meta/full") is handled when the subset is reloaded
                if i == 0:
                    subSetDict['subSet'].append(files[j])
            print("---------------------------------------------")
        bestComb = allCombsSrt[0]
    else:
        # Ordered (greedy) search: rank all models by WACC
        top_inds = np.argsort(-np.array(all_waccs))
        # Grow the ensemble one model at a time, in descending single-model WACC order
        allWaccs = np.zeros([len(top_inds)])
        allCombs = []
        for i in range(len(top_inds)):
            allCombs.append([])
            if i==0:
                allCombs[i].append(top_inds[0])
            else:
                allCombs[i] = copy.deepcopy(allCombs[i-1])
                allCombs[i].append(top_inds[i])
            # Test the combination
            allWaccs[i] = evalEnsemble(allCombs[i])
        # Sort by highest value; dtype=object because the combinations have different lengths
        allWaccsSrt = -np.sort(-np.array(allWaccs))
        srtInds = np.argsort(-np.array(allWaccs))
        allCombsSrt = np.array(allCombs,dtype=object)[srtInds]
        for i in range(len(top_inds)):
            print("Top",i+1)
            print("WACC",allWaccsSrt[i])
            wacc, wacc_std, auc_val, auc_val_std = evalEnsemble(allCombsSrt[i],eval_auc=True)
            print("Metrics WACC %.4f +- %.4f AUC %.4f +- %.4f"%(wacc,wacc_std,auc_val,auc_val_std))
            print("Combination:",allCombsSrt[i])
            if i == 0:
                subSetDict = {}
                subSetDict['subSet'] = []
                for j in allCombsSrt[i]:
                    print("ID",j,files[j])
                    # Remember the result-file names of the top-1 combination; the type
                    # suffix ("best/last/meta/full") is handled when the subset is reloaded
                    subSetDict['subSet'].append(files[j])
            print("---------------------------------------------")
            #print("Corresponding File Names")
            #for j in allCombs[-1]:
            #    print("ID",j,files[j])
        bestComb = allCombsSrt[0]
    # Save the selected subset for later reuse
    if subSetPath is not None:
        with open(subSetPath, 'wb') as f:
            pickle.dump(subSetDict, f, pickle.HIGHEST_PROTOCOL)
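    # The saved pickle holds {'subSet': [list of result-file paths]}; a later
    # prediction-only run passes this file as the third argument ('subSet' in
    # its name) to restrict ensembling to the selected models.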
else:
    # Only generate predictions. All models predict on the same set, so CV models are treated like full models here
    # Go through all files
    files = sorted(glob(all_preds_path+'/*'))
    # Because the prediction size is unknown beforehand, only determine it in the loop
    firstLoaded = False
    ind = 0
    for j in range(len(files)):
        # Skip if not a pkl file
        if '.pkl' not in files[j]:
            continue
        # Potentially check whether this file is among the selected subset
        if subSet is not None:
            # Search by the part of the name after 'ISIC'
            found = False
            for name in subSet:
                _, name_only = name.split('ISIC',1)
                if name_only in files[j]:
                    found = True
                    break
            if not found:
                # Additionally check with the trailing type suffix stripped, so the
                # base model name can match files of any type in acceptedList
                for name in subSet:
                    _, name_only = name.split('ISIC',1)
                    if name_only[:-13] in files[j]:
                        found = True
                        break
            if not found:
                continue
        # Then check whether this type ("best", "last", "meta", "full") is desired
        found = False
        for name in acceptedList:
            if name in files[j]:
                found = True
                break
        if not found:
            continue
        # Load file
        with open(files[j],'rb') as f:
            allDataCurr = pickle.load(f)
        # Get predictions
        if not firstLoaded:
            # Define final prediction/target size, assume fixed CV size
            final_preds = np.zeros([len(allDataCurr['extPred'][0]),numClasses])
            # Start the accumulated prediction array with the first model
            accum_preds = np.expand_dims(allDataCurr['extPred'][0],0)
            ind += 1
            if len(allDataCurr['extPred']) > 1:
                for i in range(1,len(allDataCurr['extPred'])):
                    accum_preds = np.concatenate((accum_preds,np.expand_dims(allDataCurr['extPred'][i],0)),0)
                    ind += 1
            else:
                # Just repeat the single model so it contributes five copies, matching the CV fold count
                for i in range(1,5):
                    accum_preds = np.concatenate((accum_preds,np.expand_dims(allDataCurr['extPred'][0],0)),0)
                    ind += 1
            firstLoaded = True
        else:
            # Write preds into the array
            if len(allDataCurr['extPred']) > 1:
                for i in range(len(allDataCurr['extPred'])):
                    accum_preds = np.concatenate((accum_preds,np.expand_dims(allDataCurr['extPred'][i],0)),0)
                    ind += 1
            else:
                # Just repeat the single model five times, matching the CV fold count
                for i in range(0,5):
                    accum_preds = np.concatenate((accum_preds,np.expand_dims(allDataCurr['extPred'][0],0)),0)
                    ind += 1
        print(files[j])
    # Resize the array to the actually used size
    print(accum_preds.shape)
    final_preds = accum_preds[:ind,:,:]
    print(final_preds.shape)
    # Average over models for the final predictions
    final_preds = np.mean(final_preds,0)
    class_pred = np.argmax(final_preds,1)
    # Print the mean probability per class as a sanity check
    print(np.mean(final_preds,0))
    # Write into a CSV file, following the ordered file list
    if csvPath is not None:
        # Get the ordered file names from the original image folder
        files = sorted(glob(origFilePath+'/*'))
        # Save into a formatted CSV file
        with open(csvPath, 'w') as csv_file:
            # Header line
            csv_file.write("image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK\n")
            ind = 0
            for file_name in files:
                if 'ISIC_' not in file_name:
                    continue
                # Take the base file name without extension (platform-independent)
                name = os.path.basename(file_name)
                name, _ = os.path.splitext(name)
                # One probability per class column (9 columns: 8 classes plus UNK)
                csv_file.write(name + "," + ",".join(str(final_preds[ind,k]) for k in range(9)) + "\n")
                ind += 1
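# For reference, the resulting CSV looks like this (hypothetical image name and
# probabilities, for illustration only):
#   image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
#   ISIC_0000000,0.02,0.85,0.01,0.01,0.05,0.01,0.01,0.02,0.02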