#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN

#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
#include <algorithm>
#include <cfloat>
#include <cstdlib>
#include <limits>
#include <vector>

#define L2(x1, y1, x2, y2) ((x1-x2)*(x1-x2) + (y1-y2)*(y1-y2))

template <typename T>
void run_and_wait(T& cf) {
  tf::cudaStream stream;
  cf.run(stream);
  stream.synchronize();
}

// Each point (thread) computes its distance to each centroid
// and adds its x and y values to the sum of its closest
// centroid, as well as incrementing that centroid's count of assigned points.
__global__ void assign_clusters(
  const float* px,
  const float* py,
  int N,
  const float* mx,
  const float* my,
  float* sx,
  float* sy,
  int k,
  int* c
) {
  const int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= N) {
    return;
  }

  // Make global loads once.
  const float x = px[index];
  const float y = py[index];

  float best_distance = FLT_MAX;
  int best_cluster = 0;
  for (int cluster = 0; cluster < k; ++cluster) {
    const float distance = L2(x, y, mx[cluster], my[cluster]);
    if (distance < best_distance) {
      best_distance = distance;
      best_cluster = cluster;
    }
  }

  atomicAdd(&sx[best_cluster], x);
  atomicAdd(&sy[best_cluster], y);
  atomicAdd(&c[best_cluster], 1);
}

// Each thread is one cluster, which just recomputes its coordinates as the mean
// of all points assigned to it.
__global__ void compute_new_means(
  float* mx, float* my, const float* sx, const float* sy, const int* c
) {
  const int cluster = threadIdx.x;
  const int count = max(1, c[cluster]);  // turn 0/0 to 0/1
  mx[cluster] = sx[cluster] / count;
  my[cluster] = sy[cluster] / count;
}

// k-means clustering over N points, K clusters, and M iterations,
// run on num_cpus CPU workers and num_gpus GPU workers
void kmeans(int N, int K, int M, size_t num_cpus, size_t num_gpus) {

  std::vector<float> h_px, h_py, h_mx, h_my, mx, my;
  std::vector<int> c(K), best_ks(N);
  std::vector<float> sx(K), sy(K);

  float *d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy;
  int *d_c;

  // Randomly generate N points and use the first K of them
  // as the initial centroids
  for(int i=0; i<N; ++i) {
    h_px.push_back(rand()%1000 - 500);
    h_py.push_back(rand()%1000 - 500);
    if(i < K) {
      mx.push_back(h_px.back());
      my.push_back(h_py.back());
      h_mx.push_back(h_px.back());
      h_my.push_back(h_py.back());
    }
  }

  tf::Executor executor(num_cpus + num_gpus);
  tf::Taskflow taskflow;

  // CPU version: assign each point to its closest centroid
  auto assign = taskflow.for_each_index(0, N, 1, [&](int i){
    float x = h_px[i];
    float y = h_py[i];
    float best_d = std::numeric_limits<float>::max();
    int best_k = 0;
    for (int k = 0; k < K; ++k) {
      const float d = L2(x, y, mx[k], my[k]);
      if (d < best_d) {
        best_d = d;
        best_k = k;
      }
    }
    best_ks[i] = best_k;
  });

  // fold the assignments into per-cluster sums and recompute the centroids
  auto update_cluster = taskflow.emplace([&](){
    for(int i=0; i<N; ++i) {
      sx[best_ks[i]] += h_px[i];
      sy[best_ks[i]] += h_py[i];
      c[best_ks[i]] += 1;
    }
    for(int k=0; k<K; ++k) {
      const int count = std::max(1, c[k]);  // turn 0/0 to 0/1
      mx[k] = sx[k] / count;
      my[k] = sy[k] / count;
    }
  });
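
  // GPU path: a minimal sketch, assuming a Taskflow version whose standalone
  // tf::cudaFlow provides kernel(), copy(), and memset() alongside the
  // run(stream) used by run_and_wait() above. The task names, launch sizes,
  // and the strategy of offloading the cudaFlow once per k-means iteration
  // are illustrative choices, not necessarily the original wiring.
  auto allocate = taskflow.emplace([&](){
    REQUIRE(cudaMalloc(&d_px, N*sizeof(float)) == cudaSuccess);
    REQUIRE(cudaMalloc(&d_py, N*sizeof(float)) == cudaSuccess);
    REQUIRE(cudaMalloc(&d_mx, K*sizeof(float)) == cudaSuccess);
    REQUIRE(cudaMalloc(&d_my, K*sizeof(float)) == cudaSuccess);
    REQUIRE(cudaMalloc(&d_sx, K*sizeof(float)) == cudaSuccess);
    REQUIRE(cudaMalloc(&d_sy, K*sizeof(float)) == cudaSuccess);
    REQUIRE(cudaMalloc(&d_c,  K*sizeof(int))   == cudaSuccess);
  });

  auto gpu_kmeans = taskflow.emplace([&](){
    tf::cudaFlow cf;

    // move the points and the current centroids onto the device
    auto h2d_px = cf.copy(d_px, h_px.data(), N);
    auto h2d_py = cf.copy(d_py, h_py.data(), N);
    auto h2d_mx = cf.copy(d_mx, h_mx.data(), K);
    auto h2d_my = cf.copy(d_my, h_my.data(), K);

    // one iteration: clear the accumulators, assign every point to its
    // closest centroid, then recompute the centroids as the means
    auto zero_sx = cf.memset(d_sx, 0, K*sizeof(float));
    auto zero_sy = cf.memset(d_sy, 0, K*sizeof(float));
    auto zero_c  = cf.memset(d_c,  0, K*sizeof(int));
    auto gpu_assign = cf.kernel((N+511)/512, 512, 0,
      assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c
    );
    auto gpu_update = cf.kernel(1, K, 0,
      compute_new_means, d_mx, d_my, d_sx, d_sy, d_c
    );

    // bring the updated centroids back so the next offload sees them
    auto d2h_mx = cf.copy(h_mx.data(), d_mx, K);
    auto d2h_my = cf.copy(h_my.data(), d_my, K);

    gpu_assign.succeed(h2d_px, h2d_py, h2d_mx, h2d_my, zero_sx, zero_sy, zero_c)
              .precede(gpu_update);
    gpu_update.precede(d2h_mx, d2h_my);

    // offload the same cudaFlow once per k-means iteration
    for(int iter=0; iter<M; ++iter) {
      run_and_wait(cf);
    }
  });

  auto deallocate = taskflow.emplace([&](){
    REQUIRE(cudaFree(d_px) == cudaSuccess);
    REQUIRE(cudaFree(d_py) == cudaSuccess);
    REQUIRE(cudaFree(d_mx) == cudaSuccess);
    REQUIRE(cudaFree(d_my) == cudaSuccess);
    REQUIRE(cudaFree(d_sx) == cudaSuccess);
    REQUIRE(cudaFree(d_sy) == cudaSuccess);
    REQUIRE(cudaFree(d_c)  == cudaSuccess);
  });

  // CPU path: assign points before folding the sums;
  // GPU path: allocate, iterate, then release the device memory
  assign.precede(update_cluster);
  allocate.precede(gpu_kmeans);
  gpu_kmeans.precede(deallocate);

  executor.run(taskflow).wait();
}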
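
// A hypothetical doctest driver for the routine above; the original file
// presumably exercises kmeans() over several problem sizes and worker
// counts, and the numbers here are arbitrary examples.
TEST_CASE("kmeans.1000points.8clusters") {
  kmeans(1000, 8, 10, 2, 1);
}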