// This program demonstrates how to perform a parallel transform // using cudaFlow. #include #include int main(int argc, char* argv[]) { if(argc != 2) { std::cerr << "usage: ./cuda_transform num_items\n"; std::exit(EXIT_FAILURE); } size_t N = std::atoi(argv[1]); auto input = tf::cuda_malloc_shared(N); auto output = tf::cuda_malloc_shared(N); // initialize the data for(size_t i=0; i