137 lines
3.9 KiB
Text
137 lines
3.9 KiB
Text
|
namespace tf {
|
||
|
|
||
|
/** @page cudaFlowScan Parallel Scan
|
||
|
|
||
|
%cudaFlow provides template methods to create parallel scan tasks
|
||
|
on a CUDA GPU.
|
||
|
|
||
|
@tableofcontents
|
||
|
|
||
|
@section CUDAParallelScanIncludeTheHeader Include the Header
|
||
|
|
||
|
You need to include the header file, `%taskflow/cuda/algorithm/scan.hpp`,
|
||
|
for creating a parallel-scan task.
|
||
|
|
||
|
@section cudaFlowScanARangeOfItems Scan a Range of Items
|
||
|
|
||
|
tf::cudaFlow::inclusive_scan computes an inclusive prefix sum operation using
|
||
|
the given binary operator over a range of elements specified by <tt>[first, last)</tt>.
|
||
|
The term "inclusive" means that the i-th input element is included
|
||
|
in the i-th sum.
|
||
|
The following code computes the inclusive prefix sum over an input array and
|
||
|
stores the result in an output array.
|
||
|
|
||
|
@code{.cpp}
|
||
|
const size_t N = 1000000;
|
||
|
int* input = tf::cuda_malloc_shared<int>(N); // input vector
|
||
|
int* output = tf::cuda_malloc_shared<int>(N); // output vector
|
||
|
|
||
|
// initializes the data
|
||
|
for(size_t i=0; i<N; input[i++]=rand());
|
||
|
|
||
|
// creates a cudaFlow of one task to perform inclusive scan
|
||
|
tf::cudaFlow cf;
|
||
|
tf::cudaTask task = cf.inclusive_scan(
|
||
|
input, input + N, output, [] __device__ (int a, int b) { return a + b; }
|
||
|
);
|
||
|
cf.offload();
|
||
|
|
||
|
// verifies the result
|
||
|
for(size_t i=1; i<N; i++) {
|
||
|
assert(output[i] == output[i-1] + input[i]);
|
||
|
}
|
||
|
@endcode
|
||
|
|
||
|
On the other hand, tf::cudaFlow::exclusive_scan computes an exclusive prefix sum operation.
|
||
|
The term "exclusive" means that the i-th input element is @em NOT included
|
||
|
in the i-th sum.
|
||
|
|
||
|
@code{.cpp}
|
||
|
// creates a cudaFlow of one task to perform exclusive scan
|
||
|
tf::cudaFlow cf;
|
||
|
tf::cudaTask task = cf.exclusive_scan(
|
||
|
input, input + N, output, [] __device__ (int a, int b) { return a + b; }
|
||
|
);
|
||
|
cf.offload();
|
||
|
|
||
|
// verifies the result
|
||
|
for(size_t i=1; i<N; i++) {
|
||
|
assert(output[i] == output[i-1] + input[i-1]);
|
||
|
}
|
||
|
@endcode
|
||
|
|
||
|
@section cudaFlowScanTransformedItems Scan a Range of Transformed Items
|
||
|
|
||
|
tf::cudaFlow::transform_inclusive_scan transforms
|
||
|
each item in the range <tt>[first, last)</tt>
|
||
|
and computes an inclusive prefix sum over these transformed items.
|
||
|
The following code multiplies each item by 10 and then computes the inclusive prefix sum
|
||
|
over 1000000 transformed items.
|
||
|
|
||
|
@code{.cpp}
|
||
|
const size_t N = 1000000;
|
||
|
int* input = tf::cuda_malloc_shared<int>(N); // input vector
|
||
|
int* output = tf::cuda_malloc_shared<int>(N); // output vector
|
||
|
|
||
|
// initializes the data
|
||
|
for(size_t i=0; i<N; i++) {
|
||
|
input[i] = rand();
|
||
|
}
|
||
|
|
||
|
// creates a cudaFlow of one task to inclusively scan over transformed input
|
||
|
tf::cudaFlow cf;
|
||
|
tf::cudaTask task = cf.transform_inclusive_scan(
|
||
|
input, input + N, output,
|
||
|
[] __device__ (int a, int b) { return a + b; }, // binary scan operator
|
||
|
[] __device__ (int a) { return a*10; } // unary transform operator
|
||
|
);
|
||
|
cf.offload();
|
||
|
|
||
|
// verifies the result
|
||
|
for(size_t i=1; i<N; i++) {
|
||
|
assert(output[i] == output[i-1] + input[i] * 10);
|
||
|
}
|
||
|
@endcode
|
||
|
|
||
|
Similarly, tf::cudaFlow::transform_exclusive_scan performs an exclusive prefix sum
|
||
|
over a range of transformed items.
|
||
|
The following code computes the exclusive prefix sum over 1000000 transformed items
|
||
|
each multiplied by 10.
|
||
|
|
||
|
@code{.cpp}
|
||
|
const size_t N = 1000000;
|
||
|
int* input = tf::cuda_malloc_shared<int>(N); // input vector
|
||
|
int* output = tf::cuda_malloc_shared<int>(N); // output vector
|
||
|
|
||
|
// initializes the data
|
||
|
for(size_t i=0; i<N; input[i++]=rand());
|
||
|
|
||
|
// creates a cudaFlow of one task to exclusively scan over transformed input
|
||
|
tf::cudaFlow cf;
|
||
|
tf::cudaTask task = cf.transform_exclusive_scan(
|
||
|
input, input + N, output,
|
||
|
[] __device__ (int a, int b) { return a + b; }, // binary scan operator
|
||
|
[] __device__ (int a) { return a*10; } // unary transform operator
|
||
|
);
|
||
|
cf.offload();
|
||
|
|
||
|
// verifies the result
|
||
|
for(size_t i=1; i<N; i++) {
|
||
|
assert(output[i] == output[i-1] + input[i-1] * 10);
|
||
|
}
|
||
|
@endcode
|
||
|
|
||
|
@section cudaFlowScanMiscellaneousItems Miscellaneous Items
|
||
|
|
||
|
Parallel scan algorithms are also available in tf::cudaFlowCapturer
|
||
|
with the same API.
|
||
|
|
||
|
*/
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|