Performance

To measure the performance of the various methods implemented in clustimage, we can use the digits dataset to determine the match between the clustered samples and the true labels. The results below show that multiple different parameter settings still result in similarly good performance.

The following piece of code clusters the digit images, compares the detected cluster labels with the true labels, and finally computes the accuracy.

import itertools as it

import classeval as clf
import matplotlib.pyplot as plt
import numpy as np
from clustimage import Clustimage
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score

# Load example data: the digit samples and their true labels.
digits = load_digits(n_class=10)
X, y_true = digits.data, digits.target

# Every combination of these settings is evaluated below.
param_grid = {
    'method': ['pca', 'hog', None],
    'evaluate': ['silhouette', 'dbindex', 'derivative'],
    'cluster_space': ['low', 'high'],
}

scores = []
labels = []
allNames = param_grid.keys()
# Cartesian product of all parameter values; each tuple follows the key
# order of param_grid: (method, evaluate, cluster_space).
combinations = list(it.product(*(param_grid[Name] for Name in allNames)))

# Iterate over all combinations
for method, evaluate, cluster_space in combinations:
    # Initialize
    cl = Clustimage(method=method)

    # Preprocessing, feature extraction and cluster evaluation
    results = cl.fit_transform(X, cluster_space=cluster_space, evaluate=evaluate)

    # Confusion matrix between the true labels and the raw cluster labels.
    y_pred = results['labels']
    cm = clf.confmatrix.eval(y_true, y_pred, normalize=False)

    # Cluster ids are arbitrary, so translate each cluster id into the index
    # with the highest count along axis 0 of the confusion matrix. This makes
    # the predictions comparable with y_true.
    # NOTE(review): assumes cluster labels are integers usable as indices
    # into the confusion-matrix axis - confirm against clustimage output.
    cm_argmax = cm['confmat'].argmax(axis=0)
    y_pred_ = cm_argmax[np.asarray(y_pred)]

    # Compute the confusion matrix again, now on the translated labels.
    cm = clf.confmatrix.eval(y_true, y_pred_, normalize=False)
    fig, ax = clf.confmatrix.plot(cm)
    ax.set_title(
        'Feature extraction: [%s]\nCluster evaluation with [%s] in [%s] dimension'
        % (method, evaluate, cluster_space),
        fontsize=16,
    )
    # Short pause so the figure can be drawn before the next iteration.
    plt.pause(0.1)

    # Store scores and labels
    scores.append(accuracy_score(y_true, y_pred_))
    labels.append(str(method) + ' - ' + evaluate + ' - ' + cluster_space)

# Make plot: accuracy per parameter combination, sorted from worst to best.
scores = np.array(scores)
labels = np.array(labels)
isort = np.argsort(scores)
positions = np.arange(0, len(scores))
plt.figure(figsize=(12, 6))
plt.plot(positions, scores[isort])
plt.xticks(positions, labels[isort], rotation='vertical')
plt.margins(0.2)
plt.title("Comparison of various approaches.", fontsize=14)
plt.grid(True)

Comparison of the performance for the digits dataset using various methods.

Results of the best two approaches.