mesytec-mnode/external/taskflow-3.8.0/unittests/cuda/test_cuda_reduce.cu
2025-01-04 01:25:05 +01:00

180 lines
4.3 KiB
Text

#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
#include <taskflow/cuda/algorithm/reduce.hpp>
// ----------------------------------------------------------------------------
// cuda_reduce_bufsz
// ----------------------------------------------------------------------------
TEST_CASE("cuda_reduce.BufferSize") {
using P = tf::cudaExecutionPolicy<32, 3>;
// within one block
for(unsigned i=0; i<=P::nv; i++) {
REQUIRE(P::reduce_bufsz<int>(i) == 0);
}
// two blocks
for(unsigned i=P::nv+1; i<=2*P::nv; i++) {
REQUIRE(P::reduce_bufsz<int>(i) == 2*sizeof(int));
}
// three blocks
for(unsigned i=2*P::nv+1; i<=3*P::nv; i++) {
REQUIRE(P::reduce_bufsz<int>(i) == 3*sizeof(int));
}
REQUIRE(
P::reduce_bufsz<int>(P::nv*P::nv) == P::nv*sizeof(int)
);
REQUIRE(
P::reduce_bufsz<int>(P::nv*P::nv+1) == (P::nv + 3)*sizeof(int)
);
REQUIRE(
P::reduce_bufsz<int>(P::nv*P::nv*2) == (2*P::nv + 2)*sizeof(int)
);
}
// ----------------------------------------------------------------------------
// cuda_reduce
// ----------------------------------------------------------------------------
template <typename T>
void cuda_reduce() {
tf::Taskflow taskflow;
tf::Executor executor;
for(int n=0; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) {
taskflow.emplace([n](){
tf::cudaStream stream;
tf::cudaDefaultExecutionPolicy policy(stream);
unsigned bufsz = policy.reduce_bufsz<T>(n);
T gold {1000};
auto gpu = tf::cuda_malloc_shared<T>(n);
auto res = tf::cuda_malloc_shared<T>(1);
auto buf = tf::cuda_malloc_shared<T>(bufsz);
for(int i=0; i<n; i++) {
gpu[i] = i;
gold += i;
}
*res = T{1000}; // initial value
// reduce
tf::cuda_reduce(policy,
gpu, gpu + n, res, [] __device__ (T a, T b) { return a + b; }, buf
);
stream.synchronize();
REQUIRE(*res == gold);
// uninitialized reduce
tf::cuda_uninitialized_reduce(policy,
gpu, gpu + n, res, [] __device__ (T a, T b) { return a + b; }, buf
);
stream.synchronize();
if(n == 0) {
REQUIRE(*res == 1000);
}
else {
REQUIRE(*res == gold - 1000);
}
REQUIRE(cudaFree(gpu) == cudaSuccess);
REQUIRE(cudaFree(res) == cudaSuccess);
REQUIRE(cudaFree(buf) == cudaSuccess);
});
}
executor.run(taskflow).wait();
}
TEST_CASE("cuda_reduce.int" * doctest::timeout(300)) {
cuda_reduce<int>();
}
// ----------------------------------------------------------------------------
// cuda_transform_reduce
// ----------------------------------------------------------------------------
template <typename T>
void cuda_transform_reduce() {
tf::Taskflow taskflow;
tf::Executor executor;
for(int n=0; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) {
taskflow.emplace([n](){
tf::cudaStream stream;
tf::cudaDefaultExecutionPolicy policy(stream);
unsigned bufsz = policy.reduce_bufsz<T>(n);
T gold {1000};
auto gpu = tf::cuda_malloc_shared<T>(n);
auto res = tf::cuda_malloc_shared<T>(1);
auto buf = tf::cuda_malloc_shared<T>(bufsz);
for(int i=0; i<n; i++) {
gpu[i] = i;
gold += (-i);
}
*res = T{1000}; // initial value
// reduce
tf::cuda_transform_reduce(policy,
gpu, gpu + n, res,
[] __device__ (T a, T b) { return a + b; },
[] __device__ (T a) { return -a; },
buf
);
stream.synchronize();
REQUIRE(*res == gold);
// uninitialized reduce
tf::cuda_uninitialized_transform_reduce(policy,
gpu, gpu + n, res,
[] __device__ (T a, T b) { return a + b; },
[] __device__ (T a) { return -a; },
buf
);
stream.synchronize();
if(n == 0) {
REQUIRE(*res == 1000);
}
else {
REQUIRE(*res == gold - 1000);
}
REQUIRE(cudaFree(gpu) == cudaSuccess);
REQUIRE(cudaFree(res) == cudaSuccess);
REQUIRE(cudaFree(buf) == cudaSuccess);
});
}
executor.run(taskflow).wait();
}
TEST_CASE("cuda_transform_reduce.int" * doctest::timeout(300)) {
cuda_transform_reduce<int>();
}