# Scientific Programming with Python
# Software Carpentry Exercise
# Author: Nicola Chiapolini <nchiapol _at_ physik _dot_ uzh _dot_ ch>


import numpy as np


def kmean_classify(data, means):
    """ label every data point with its nearest mean

    for each point the euclidean distance to all means is computed and
    the index of the smallest one is stored; when several means are
    equally close the one with the lowest index wins

    Parameters
    ----------
    data : NxM array of float
        N data points with M coordinates each
    means : KxM array of float
        coordinates of the different means

    Returns
    -------
    indices : array of N int
        index of the closest mean for each data point

    """
    n_points = data.shape[0]
    indices = np.empty(n_points, dtype=int)
    for row, point in enumerate(data):
        distances = [np.linalg.norm(point - center) for center in means]
        # linear scan with a strict comparison: the first of several equal
        # distances is kept and NaN distances are never selected
        closest, shortest = None, np.inf
        for candidate, distance in enumerate(distances):
            if distance < shortest:
                closest, shortest = candidate, distance
        indices[row] = closest
    return indices


def kmean_means(data, indices, k=None):
    """ calculate means from classified data points

    Parameters
    ----------
    data : NxM array of float
        N data points with M coordinates each
    indices : array of N int
        index of the group the n-th data point belongs to
        (an Nx1 column is flattened before use)
    k : int
        number of mean values to calculate
        (default: k = indices.max()+1)

    Returns
    -------
    means : KxM array of float
        coordinates of the k-means; the row of a group that contains
        no data point is filled with NaN

    """
    data = np.asarray(data)
    # accept both a flat array and an Nx1 column of group indices
    indices = np.asarray(indices).ravel()
    if k is None:
        k = indices.max() + 1
    # take the number of coordinates from the data instead of assuming 2
    means = np.empty((k, data.shape[1]))
    for i in range(k):
        members = data[indices == i]
        if len(members):
            means[i] = members.mean(0)
        else:
            # empty group: NaN is what the mean of an empty selection
            # yields, set explicitly to avoid the runtime warning
            means[i] = np.nan
    return means


def kmean(data, k, n_iter=20):
    """ run the kmean algorithm

    find k clusters in data by iteratively optimising the positions
    of the means

    Parameters
    ----------
    data : NxM array of float
        N data points with M coordinates each
    k : int
        number of mean values to calculate
    n_iter : int
        number of iterations to run; with n_iter <= 0 the initial means
        are returned together with the classification against them

    Returns
    -------
    indices : array of N int
        group index of each data point as assigned in the last
        iteration (i.e. before the final update of the means)
    means : KxM array of float
        coordinates of the k-means

    """
    # choose k first data points as initial values for means; copy them
    # so the returned means never alias the caller's data
    means = np.array(data[:k, :], dtype=float)
    indices = None
    for _ in range(n_iter):
        indices = kmean_classify(data, means)
        means = kmean_means(data, indices, k)
    if indices is None:
        # no iteration ran: still return a valid classification
        indices = kmean_classify(data, means)
    return indices, means


if __name__ == "__main__":
    # demo: 1000 normally distributed 2-D points, every second one
    # shifted by offset, then clustered into two groups
    offset = [10, 10]
    data = np.random.randn(1000, 2)
    data[::2] += offset
    indices, means = kmean(data, 2)
    # parenthesised form is valid on both Python 2 and Python 3
    print(means)
