/**
* 1-D kmeans
*
* This is a very efficient version of kmeans that is specialized for
* 1-dimensional data. We can run very quickly by keeping only the
* indices of class boundaries.
*
* class 1 = 0
* class 2 = n_1
* class 3 = n_2
* .
* .
* class k = n_k
*
* The data points which belong to a given class i are [n_(i-1) n_i-1].
*
* When the means are updated we can use a binary search to find the new
* boundary. By starting the search from the old boundary, we find the
* new index *very* quickly. Also, we need only update the means based on
* the data points which change clusters.
*
* Overall, this algorithm runs in time ~O( i k log n )
*
* Copyright 2005
* Lucas Scharenbroich
*/
#include
#include
/** Pointer to the data array */
static float *data = NULL;
/**
* This comparison function takes two integer values and compares the float
* values stored in the array at the given indices.
*/
int my_compar( const void *a, const void *b )
{
unsigned int i = *( (unsigned int *) a );
unsigned int j = *( (unsigned int *) b );
if ( data[i] < data[j] ) return -1;
if ( data[i] > data[j] ) return 1;
return 0;
}
/**
* Finds the index of the datapoint in the array which is the midpoint
* between the values of x and y. This is done in logarithmic time using
* a binary search.
*
* There are some subtle issues here. First, when moving from a midpoint,
* we need to keep the selected midpoint as the upper or lower bound, since
* we might otherwise overstep the proper value. At the end, we need to
* check whether the upper or lower value is the best.
*/
unsigned int find_boundary( unsigned int *a, unsigned int n, float x, float y )
{
unsigned int lower, upper, mid;
float w, z, lower_sum, upper_sum;
lower = 0;
upper = n - 1;
while ( lower < upper - 1 )
{
mid = ( upper + lower ) / 2;
z = data[a[mid]];
if ( y - z <= z - x )
upper = mid;
else
lower = mid;
}
/** Find the best one */
w = data[a[lower]];
z = data[a[upper]];
lower_sum = (w - x) * (w - x) + (y - w) * (y - w);
upper_sum = (z - x) * (z - x) + (y - z) * (y - z);
return ( lower_sum < upper_sum ) ? lower : upper;
}
static void swap( unsigned int **a, unsigned int **b )
{
unsigned int *c = *a;
*a = *b;
*b = c;
}
/**
* Compare two partitionings of the data and return true if they differ
* from each other or false if not. This is the convergence criteria for
* the kmeans clustering.
*/
static int changed( unsigned int *a, unsigned int *b, unsigned int n )
{
unsigned int i;
for ( i = 0; i < n; i++ )
if ( a[i] != b[i] )
return 1;
return 0;
}
/**
* Perform k-means clustering on a one-dimensional data set. Due to the
* ordering imposed by scalar data, we can make the code much more
* efficient. Also, some tricks are used in order to keep the results
* numerically robust. There are certainly improvements to be made, though.
*/
float *kmeans( unsigned int n, float *a, unsigned int k )
{
unsigned int i, j, *split, *prev, *perm, iteration;
float *mean, temp, mean_1, mean_2;
int step;
/**
* Allocate an array in order to hold the sorted permutation of the data
* without having to sort the actual data set. Initialize with the
* identity permuatation.
*/
perm = ( unsigned int * ) malloc( sizeof( unsigned int ) * n );
for ( i = 0; i < n; i++ )
perm[i] = i;
/**
* Set a global pointer to the data
*/
data = a;
/**
* Sort the data via the permutation
*/
qsort( perm, n, sizeof( unsigned int ), my_compar );
/**
* Allocate space for the means
*/
mean = ( float * ) malloc( sizeof( float ) * k );
split = ( unsigned int * ) malloc( sizeof( unsigned int ) * ( k + 1 ));
prev = ( unsigned int * ) malloc( sizeof( unsigned int ) * ( k + 1 ));
/**
* Initialize the split points
*/
split[0] = prev[0] = 0;
split[k] = prev[k] = n;
for ( i = 1; i < k; i++ )
{
prev[i] = 0;
split[i] = ( i * n ) / k;
}
/**
* Compute the initial mean sums
*/
for ( i = 0; i < k; i++ )
{
mean[i] = 0.0;
for ( j = split[i]; j < split[i+1]; j++ )
{
mean[i] += data[perm[j]];
}
}
/**
* Loop until the intervals do not change
*/
iteration = 1;
while ( changed( split, prev, k ))
{
/**
* Make the current partition the previous
*/
swap( &prev, &split );
/**
* Find the new partitions
*/
for ( i = 1; i < k; i++ )
{
mean_1 = mean[i-1] / ((float) ( prev[i] - prev[i-1] ));
mean_2 = mean[i] / ((float) ( prev[i+1] - prev[i] ));
split[i] = find_boundary( perm, n, mean_1, mean_2 );
}
/**
* Update the mean sums. Accumulate the difference in order to
* slightly improve the numerical robusteness. It also saves some
* work.
*/
for ( i = 1; i < k; i++ )
{
step = ( prev[i] < split[i] ) ? 1 : -1;
temp = 0.0;
for ( j = prev[i]; j != split[i]; j += step )
{
temp += data[perm[j]];
}
temp = ((float) step) * temp;
mean[i-1] += temp;
mean[i] -= temp;
}
iteration++;
}
printf( "Converged in %d interations\n", iteration );
/**
* Divide the mean sums by the proper denominator
*/
for ( i = 0; i < k; i++ )
{
mean[i] /= (float) (split[i+1] - split[i]);
}
free( prev );
free( split );
free( perm );
return mean;
}