102 lines
2.7 KiB
Text
102 lines
2.7 KiB
Text
namespace tf {
|
|
|
|
/** @page cudaFlowMerge Parallel Merge
|
|
|
|
%cudaFlow provides template methods to create parallel merge tasks
|
|
on a CUDA GPU.
|
|
|
|
@tableofcontents
|
|
|
|
@section CUDAParallelMergeIncludeTheHeader Include the Header
|
|
|
|
You need to include the header file, `%taskflow/cuda/algorithm/merge.hpp`,
|
|
for creating a parallel-merge task.
|
|
|
|
@section cudaFlowMergeTwoRangesOfItems Merge two Sorted Ranges of Items
|
|
|
|
tf::cudaFlow::merge performs a parallel merge over two ranges of elements
|
|
into a sorted range of items.
|
|
The following code merges two sorted arrays @c input_1 and @c input_2,
|
|
each of 1000 items, into a sorted array @c output of 2000 items.
|
|
|
|
@code{.cpp}
|
|
const size_t N = 1000;
|
|
int* input_1 = tf::cuda_malloc_shared<int>(N); // input vector 1
|
|
int* input_2 = tf::cuda_malloc_shared<int>(N); // input vector 2
|
|
int* output = tf::cuda_malloc_shared<int>(2*N); // output vector
|
|
|
|
// initializes the data
|
|
for(size_t i=0; i<N; i++) {
|
|
input_1[i] = rand()%100;
|
|
input_2[i] = rand()%100;
|
|
}
|
|
std::sort(input_1, input1 + N);
|
|
std::sort(input_2, input2 + N);
|
|
|
|
// merge input_1 and input_2 to output
|
|
tf::cudaFlow cf;
|
|
tf::cudaTask task = cf.merge(
|
|
input_1, input_1 + N, input_2, input_2 + N, output,
|
|
[]__device__ (int a, int b) { return a < b; } // comparator
|
|
);
|
|
cf.offload();
|
|
@endcode
|
|
|
|
@section cudaFlowMergeTwoRangesOfKeyValueItems Merge two Sorted Ranges of Key-Value Items
|
|
|
|
tf::cudaFlow::merge_by_key performs key-value merge over two sorted ranges
|
|
in a similar way to tf::cuda_merge; additionally, it copies elements
|
|
from the two ranges of values associated with two input keys, respectively.
|
|
The following code performs key-value merge over @c a and @c b:
|
|
|
|
@code{.cpp}
|
|
const size_t N = 2;
|
|
int* a_keys = tf::cuda_malloc_shared<int>(N);
|
|
int* a_vals = tf::cuda_malloc_shared<int>(N);
|
|
int* b_keys = tf::cuda_malloc_shared<int>(N);
|
|
int* b_vals = tf::cuda_malloc_shared<int>(N);
|
|
int* c_keys = tf::cuda_malloc_shared<int>(2*N);
|
|
int* c_vals = tf::cuda_malloc_shared<int>(2*N);
|
|
|
|
// initializes the data
|
|
a_keys[0] = 8, a_keys[1] = 1;
|
|
a_vals[0] = 1, a_vals[1] = 2;
|
|
b_keys[0] = 3, b_keys[1] = 7;
|
|
b_vals[0] = 3, b_vals[1] = 4;
|
|
|
|
// performs key-value merge
|
|
tf::cudaFlow cf;
|
|
cf.merge_by_key(
|
|
a_keys, a_keys+N, a_vals,
|
|
b_keys, b_keys+N, b_vals,
|
|
c_keys, c_vals,
|
|
[] __device__ (int a, int b) { return a < b; },
|
|
);
|
|
cf.offload();
|
|
|
|
// now, c_keys = {1, 3, 7, 8}
|
|
// now, c_vals = {2, 3, 4, 1}
|
|
|
|
// delete the device memory
|
|
tf::cuda_free(buffer);
|
|
tf::cuda_free(a_keys);
|
|
tf::cuda_free(b_keys);
|
|
tf::cuda_free(c_keys);
|
|
tf::cuda_free(a_vals);
|
|
tf::cuda_free(b_vals);
|
|
tf::cuda_free(c_vals);
|
|
@endcode
|
|
|
|
@section cudaFlowMergeMiscellaneousItems Miscellaneous Items
|
|
|
|
Parallel merge algorithms are also available in tf::cudaFlowCapturer
|
|
with the same API.
|
|
|
|
*/
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|