// This program demonstrates how to perform a parallel scan // using cudaFlow. #include #include int main(int argc, char* argv[]) { if(argc != 2) { std::cerr << "usage: ./cuda_scan N\n"; std::exit(EXIT_FAILURE); } int N = std::atoi(argv[1]); auto data1 = tf::cuda_malloc_shared(N); auto data2 = tf::cuda_malloc_shared(N); auto scan1 = tf::cuda_malloc_shared(N); auto scan2 = tf::cuda_malloc_shared(N); // -------------------------------------------------------------------------- // inclusive/exclusive scan // -------------------------------------------------------------------------- // initialize the data std::iota(data1, data1 + N, 0); std::iota(data2, data2 + N, 0); tf::cudaStream stream; tf::cudaDefaultExecutionPolicy policy(stream); // declare the buffer void* buff; cudaMalloc(&buff, policy.scan_bufsz(N)); // create inclusive and exclusive scan tasks tf::cuda_inclusive_scan(policy, data1, data1+N, scan1, tf::cuda_plus{}, buff); tf::cuda_exclusive_scan(policy, data2, data2+N, scan2, tf::cuda_plus{}, buff); stream.synchronize(); // inspect for(int i=1; i{}, [] __device__ (int a) { return a*10; }, buff ); // transform exclusive scan tf::cuda_transform_exclusive_scan(policy, data2, data2+N, scan2, tf::cuda_plus{}, [] __device__ (int a) { return a*11; }, buff ); stream.synchronize(); // inspect for(int i=1; i