namespace tf {
/** @page cudaFlowScan Parallel Scan
%cudaFlow provides template methods to create parallel scan tasks
on a CUDA GPU.
@tableofcontents
@section CUDAParallelScanIncludeTheHeader Include the Header
You need to include the header file, `%taskflow/cuda/algorithm/scan.hpp`,
for creating a parallel-scan task.
@section cudaFlowScanARangeOfItems Scan a Range of Items
tf::cudaFlow::inclusive_scan computes an inclusive prefix sum operation using
the given binary operator over a range of elements specified by [first, last).
The term "inclusive" means that the i-th input element is included
in the i-th sum.
The following code computes the inclusive prefix sum over an input array and
stores the result in an output array.
@code{.cpp}
const size_t N = 1000000;
int* input = tf::cuda_malloc_shared<int>(N); // input vector
int* output = tf::cuda_malloc_shared<int>(N); // output vector
// initializes the data
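for(size_t i=0; i<N; input[i++]=rand());
// creates a cudaFlow of one task to perform inclusive scan
tf::cudaFlow cf;
tf::cudaTask task = cf.inclusive_scan(
  input, input + N, output, [] __device__ (int a, int b) { return a + b; }
);
cf.offload();
@endcode
@section cudaFlowScanTransformedItems Scan a Range of Transformed Items
tf::cudaFlow::transform_inclusive_scan transforms each item in the range [first, last)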
and computes an inclusive prefix sum over these transformed items.
The following code multiplies each item by 10 and then computes the inclusive prefix sum
over 1000000 transformed items.
@code{.cpp}
const size_t N = 1000000;
int* input = tf::cuda_malloc_shared<int>(N); // input vector
int* output = tf::cuda_malloc_shared<int>(N); // output vector
// initializes the data
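for(size_t i=0; i<N; input[i++]=rand());
// creates a cudaFlow of one task to inclusively scan over transformed input
tf::cudaFlow cf;
tf::cudaTask task = cf.transform_inclusive_scan(
  input, input + N, output,
  [] __device__ (int a, int b) { return a + b; }, // binary scan operator
  [] __device__ (int a) { return a*10; } // unary transform operator
);
cf.offload();
@endcode
Similarly, tf::cudaFlow::transform_exclusive_scan performs an exclusive prefix sum
over a range of transformed items.
The following code computes the exclusive prefix sum over 1000000 transformed items,
each multiplied by 10.
@code{.cpp}
const size_t N = 1000000;
int* input = tf::cuda_malloc_shared<int>(N); // input vector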
int* output = tf::cuda_malloc_shared<int>(N); // output vector
// initializes the data
for(size_t i=0; i