// This program implements the k-means clustering algorithm in three forms: // - sequential cpu // - parallel cpu // - gpu with conditional tasking // - gpu without conditional tasking #include #include #include #include #include #include #define L2(x1, y1, x2, y2) ((x1-x2)*(x1-x2) + (y1-y2)*(y1-y2)) // ---------------------------------------------------------------------------- // CPU (sequential) implementation // ---------------------------------------------------------------------------- // run k-means on cpu std::pair, std::vector> cpu_seq( const int N, const int K, const int M, const std::vector& px, const std::vector& py ) { std::vector c(K); std::vector sx(K), sy(K), mx(K), my(K); // initial centroids for(int i=0; i::max(); int best_k = 0; for (int k = 0; k < K; ++k) { const float d = L2(x, y, mx[k], my[k]); if (d < best_d) { best_d = d; best_k = k; } } sx[best_k] += x; sy[best_k] += y; c [best_k] += 1; } // update the centroid for(int k=0; k, std::vector> cpu_par( const int N, const int K, const int M, const std::vector& px, const std::vector& py ) { const auto num_threads = std::thread::hardware_concurrency(); tf::Executor executor; tf::Taskflow taskflow("K-Means"); std::vector c(K), best_ks(N); std::vector sx(K), sy(K), mx(K), my(K); // initial centroids auto init = taskflow.emplace([&](){ for(int i=0; i::max(); int best_k = 0; for (int k = 0; k < K; ++k) { const float d = L2(x, y, mx[k], my[k]); if (d < best_d) { best_d = d; best_k = k; } } best_ks[i] = best_k; }); pf.name("parallel-for"); auto update_cluster = taskflow.emplace([&](){ for(int i=0; i= N) { return; } // Make global loads once. const float x = px[index]; const float y = py[index]; float best_distance = FLT_MAX; int best_cluster = 0; for (int cluster = 0; cluster < k; ++cluster) { const float distance = L2(x, y, mx[cluster], my[cluster]); if (distance < best_distance) { best_distance = distance; best_cluster = cluster; } } atomicAdd(&sx[best_cluster], x); atomicAdd(&sy[best_cluster], y); atomicAdd(&c [best_cluster], 1); } // Each thread is one cluster, which just recomputes its coordinates as the mean // of all points assigned to it. __global__ void compute_new_means( float* mx, float* my, const float* sx, const float* sy, const int* c ) { const int cluster = threadIdx.x; const int count = max(1, c[cluster]); // turn 0/0 to 0/1 mx[cluster] = sx[cluster] / count; my[cluster] = sy[cluster] / count; } // Runs k-means on gpu std::pair, std::vector> gpu_predicate( const int N, const int K, const int M, const std::vector& h_px, const std::vector& h_py ) { std::vector h_mx, h_my; float *d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy, *d_c; for(int i=0; i= N) { throw std::runtime_error("k must be smaller than the number of points"); } if(M < 1) { throw std::runtime_error("num_iterations must be larger than 0"); } std::vector h_px, h_py, mx, my; // Randomly generate N points std::cout << "generating " << N << " random points ...\n"; for(int i=0; i(send-sbeg).count() << " ms\n"; std::cout << "k centroids found by cpu (sequential)\n"; for(int k=0; k(pend-pbeg).count() << " ms\n"; std::cout << "k centroids found by cpu (parallel)\n"; for(int k=0; k(rend-rbeg).count() << " ms\n"; std::cout << "k centroids found by gpu\n"; for(int k=0; k