// ----------------------------------------------------------------------------
// File overview
//
// doctest-based unit tests (DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN is defined so
// that doctest supplies main) for the reduce helpers reached through the tf::
// CUDA execution policy.
//
// TEST_CASE "cuda_reduce.BufferSize"
//   With P = tf::cudaExecutionPolicy<32, 3>, requires that P::reduce_bufsz(i)
//     - equals 0               for every i in [0, P::nv],
//     - equals 2*sizeof(int)   for every i in [P::nv+1, 2*P::nv],
//     - equals 3*sizeof(int)   for every i in [2*P::nv+1, 3*P::nv],
//   followed by three spot checks at P::nv*P::nv, P::nv*P::nv+1 and
//   P::nv*P::nv*2.
//
// cuda_reduce() / cuda_transform_reduce()
//   Both helpers build a tf::Taskflow next to a tf::Executor and loop over
//   problem sizes n from 0 up to 1234567; n advances by 1 while n <= 100 and
//   as n*2 + 1 afterwards. For each n a task capturing n by value is emplaced.
//   The visible part of that task creates a tf::cudaStream, a
//   tf::cudaDefaultExecutionPolicy on that stream, queries
//   policy.reduce_bufsz(n), initialises `gold` to 1000 and allocates `gpu`
//   (n), `res` (1) and `buf` (bufsz) through tf::cuda_malloc_shared.
//
// NOTE(review): this text looks damaged by extraction - confirm against the
// upstream file before building:
//   - the four #include directives have no header names;
//   - both `template` declarations have no parameter list although `T` is
//     used in the bodies;
//   - the fill loops stop at `for(int i=0; i` and resume at `(); }`, so the
//     rest of each helper (and presumably the TEST_CASEs that instantiate
//     them) is not visible here;
//   - calls such as reduce_bufsz(...) and cuda_malloc_shared(...) may have
//     lost explicit template arguments - TODO confirm;
//   - the original line breaks are gone, so the `//` separators below now
//     swallow the code that follows them on the same physical line.
// ----------------------------------------------------------------------------
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include #include #include // ---------------------------------------------------------------------------- // cuda_reduce_bufsz // ---------------------------------------------------------------------------- TEST_CASE("cuda_reduce.BufferSize") { using P = tf::cudaExecutionPolicy<32, 3>; // within one block for(unsigned i=0; i<=P::nv; i++) { REQUIRE(P::reduce_bufsz(i) == 0); } // two blocks for(unsigned i=P::nv+1; i<=2*P::nv; i++) { REQUIRE(P::reduce_bufsz(i) == 2*sizeof(int)); } // three blocks for(unsigned i=2*P::nv+1; i<=3*P::nv; i++) { REQUIRE(P::reduce_bufsz(i) == 3*sizeof(int)); } REQUIRE( P::reduce_bufsz(P::nv*P::nv) == P::nv*sizeof(int) ); REQUIRE( P::reduce_bufsz(P::nv*P::nv+1) == (P::nv + 3)*sizeof(int) ); REQUIRE( P::reduce_bufsz(P::nv*P::nv*2) == (2*P::nv + 2)*sizeof(int) ); } // ---------------------------------------------------------------------------- // cuda_reduce // ---------------------------------------------------------------------------- template void cuda_reduce() { tf::Taskflow taskflow; tf::Executor executor; for(int n=0; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) { taskflow.emplace([n](){ tf::cudaStream stream; tf::cudaDefaultExecutionPolicy policy(stream); unsigned bufsz = policy.reduce_bufsz(n); T gold {1000}; auto gpu = tf::cuda_malloc_shared(n); auto res = tf::cuda_malloc_shared(1); auto buf = tf::cuda_malloc_shared(bufsz); for(int i=0; i(); } // ---------------------------------------------------------------------------- // cuda_transform_reduce // ---------------------------------------------------------------------------- template void cuda_transform_reduce() { tf::Taskflow taskflow; tf::Executor executor; for(int n=0; n<=1234567; n = (n<=100) ? 
n+1 : n*2 + 1) { taskflow.emplace([n](){ tf::cudaStream stream; tf::cudaDefaultExecutionPolicy policy(stream); unsigned bufsz = policy.reduce_bufsz(n); T gold {1000}; auto gpu = tf::cuda_malloc_shared(n); auto res = tf::cuda_malloc_shared(1); auto buf = tf::cuda_malloc_shared(bufsz); for(int i=0; i(); }