/* * Copyright (C) 2012 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef KMEANS_H #define KMEANS_H #include #include // Helper functions template inline void sum(T values[], int len, int dimension, int stride, N dst[]) { int x, y; // zero out dst vector for (x = 0; x < dimension; x++) { dst[x] = 0; } for (x = 0; x < len; x+= stride) { for (y = 0; y < dimension; y++) { dst[y] += values[x + y]; } } } template inline void set(T val1[], N val2[], int dimension) { int x; for (x = 0; x < dimension; x++) { val1[x] = val2[x]; } } template inline void add(T val[], N dst[], int dimension) { int x; for (x = 0; x < dimension; x++) { dst[x] += val[x]; } } template inline void divide(T dst[], N divisor, int dimension) { int x; if (divisor == 0) { return; } for (x = 0; x < dimension; x++) { dst[x] /= divisor; } } /** * Calculates euclidean distance. */ template inline N euclideanDist(T val1[], T val2[], int dimension) { int x; N sum = 0; for (x = 0; x < dimension; x++) { N diff = (N) val1[x] - (N) val2[x]; sum += diff * diff; } return sqrt(sum); } // K-Means /** * Picks k random starting points from the data set. */ template void initialPickHeuristicRandom(int k, T values[], int len, int dimension, int stride, T dst[], unsigned int seed) { int x, z, num_vals, cntr; num_vals = len / stride; cntr = 0; srand(seed); unsigned int r_vals[k]; unsigned int r; for (x = 0; x < k; x++) { // ensure randomly chosen value is unique int r_check = 0; while (r_check == 0) { r = (unsigned int) rand() % num_vals; r_check = 1; for (z = 0; z < x; z++) { if (r == r_vals[z]) { r_check = 0; } } } r_vals[x] = r; r *= stride; // set dst to be randomly chosen value set(dst + cntr, values + r, dimension); cntr += stride; } } /** * Finds index of closet centroid to a value */ template inline int findClosest(T values[], T oldCenters[], int dimension, int stride, int pop_size) { int best_ind = 0; N best_len = euclideanDist (values, oldCenters, dimension); int y; for (y = stride; y < pop_size; y+=stride) { N l = euclideanDist (values, oldCenters + y, dimension); if (l < best_len) { best_len = l; best_ind = y; } } return best_ind; } /** * Calculates new centroids by averaging value clusters for old centroids. */ template int calculateNewCentroids(int k, T values[], int len, int dimension, int stride, T oldCenters[], T dst[]) { int x, pop_size; pop_size = k * stride; int popularities[k]; N tmp[pop_size]; //zero popularities memset(popularities, 0, sizeof(int) * k); // zero dst, and tmp for (x = 0; x < pop_size; x++) { tmp[x] = 0; } // put summation for each k in tmp for (x = 0; x < len; x+=stride) { int best = findClosest(values + x, oldCenters, dimension, stride, pop_size); add(values + x, tmp + best, dimension); popularities[best / stride]++; } int ret = 0; int y; // divide to get centroid and set dst to result for (x = 0; x < pop_size; x+=stride) { divide(tmp + x, popularities[x / stride], dimension); for (y = 0; y < dimension; y++) { if ((dst + x)[y] != (T) ((tmp + x)[y])) { ret = 1; } } set(dst + x, tmp + x, dimension); } return ret; } template void runKMeansWithPicks(int k, T finalCentroids[], T values[], int len, int dimension, int stride, int iterations, T initialPicks[]){ int k_len = k * stride; int x; // zero newCenters for (x = 0; x < k_len; x++) { finalCentroids[x] = 0; } T * c1 = initialPicks; T * c2 = finalCentroids; T * temp; int ret = 1; for (x = 0; x < iterations; x++) { ret = calculateNewCentroids(k, values, len, dimension, stride, c1, c2); temp = c1; c1 = c2; c2 = temp; if (ret == 0) { x = iterations; } } set(finalCentroids, c1, dimension); } /** * Runs the k-means algorithm on dataset values with some initial centroids. */ template void runKMeans(int k, T finalCentroids[], T values[], int len, int dimension, int stride, int iterations, unsigned int seed){ int k_len = k * stride; T initialPicks [k_len]; initialPickHeuristicRandom(k, values, len, dimension, stride, initialPicks, seed); runKMeansWithPicks(k, finalCentroids, values, len, dimension, stride, iterations, initialPicks); } /** * Sets each value in values to the closest centroid. */ template void applyCentroids(int k, T centroids[], T values[], int len, int dimension, int stride) { int x, pop_size; pop_size = k * stride; for (x = 0; x < len; x+= stride) { int best = findClosest(values + x, centroids, dimension, stride, pop_size); set(values + x, centroids + best, dimension); } } #endif // KMEANS_H