#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include #include #include // ---------------------------------------------------------------------------- // cuda_scan // ---------------------------------------------------------------------------- template void cuda_scan() { tf::Taskflow taskflow; tf::Executor executor; for(int n=0; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) { taskflow.emplace([n](){ auto data1 = tf::cuda_malloc_shared(n); auto data2 = tf::cuda_malloc_shared(n); auto scan1 = tf::cuda_malloc_shared(n); auto scan2 = tf::cuda_malloc_shared(n); // -------------------------------------------------------------------------- // inclusive/exclusive scan // -------------------------------------------------------------------------- // initialize the data std::iota(data1, data1 + n, 0); std::iota(data2, data2 + n, 0); tf::cudaStream stream; tf::cudaDefaultExecutionPolicy policy(stream); // declare the buffer void* buff; cudaMalloc(&buff, policy.scan_bufsz(n)); // create inclusive and exclusive scan tasks tf::cuda_inclusive_scan(policy, data1, data1+n, scan1, tf::cuda_plus{}, buff); tf::cuda_exclusive_scan(policy, data2, data2+n, scan2, tf::cuda_plus{}, buff); stream.synchronize(); // inspect for(int i=1; i(); } // ---------------------------------------------------------------------------- // transform_scan // ---------------------------------------------------------------------------- template void cuda_transform_scan() { tf::Taskflow taskflow; tf::Executor executor; for(int n=0; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) { taskflow.emplace([n](){ auto data1 = tf::cuda_malloc_shared(n); auto data2 = tf::cuda_malloc_shared(n); auto scan1 = tf::cuda_malloc_shared(n); auto scan2 = tf::cuda_malloc_shared(n); // -------------------------------------------------------------------------- // inclusive/exclusive scan // -------------------------------------------------------------------------- tf::cudaStream stream; tf::cudaDefaultExecutionPolicy policy(stream); // declare the buffer void* buff; cudaMalloc(&buff, policy.scan_bufsz(n)); // initialize the data std::iota(data1, data1 + n, 0); std::iota(data2, data2 + n, 0); // transform inclusive scan tf::cuda_transform_inclusive_scan(policy, data1, data1+n, scan1, tf::cuda_plus{}, [] __device__ (int a) { return a*10; }, buff ); // transform exclusive scan tf::cuda_transform_exclusive_scan(policy, data2, data2+n, scan2, tf::cuda_plus{}, [] __device__ (int a) { return a*11; }, buff ); stream.synchronize(); // inspect for(int i=1; i(); }