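namespace tf {

/** @page cudaFlowMerge Parallel Merge

%cudaFlow provides template methods to create parallel merge tasks on a CUDA GPU.

@tableofcontents

@section CUDAParallelMergeIncludeTheHeader Include the Header

You need to include the header file, `%taskflow/cuda/algorithm/merge.hpp`,
for creating a parallel-merge task.

@section cudaFlowMergeTwoRangesOfItems Merge two Sorted Ranges of Items

tf::cudaFlow::merge performs a parallel merge over two sorted ranges of elements
into a single sorted range of items.
The following code merges two sorted arrays @c input_1 and @c input_2,
each of 1000 items, into a sorted array @c output of 2000 items.

@code{.cpp}
const size_t N = 1000;
int* input_1 = tf::cuda_malloc_shared<int>(N);    // input vector 1
int* input_2 = tf::cuda_malloc_shared<int>(N);    // input vector 2
int* output  = tf::cuda_malloc_shared<int>(2*N);  // output vector

// initializes the data
for(size_t i=0; i<N; i++) {
  input_1[i] = rand()%100;
  input_2[i] = rand()%100;
}

// both input ranges must be sorted before they are merged
std::sort(input_1, input_1 + N);
std::sort(input_2, input_2 + N);

// merges input_1 and input_2 into output
tf::cudaFlow cf;
tf::cudaTask task = cf.merge(
  input_1, input_1 + N, input_2, input_2 + N, output,
  [] __device__ (int a, int b) { return a < b; }  // comparator
);
cf.offload();
@endcode

@section cudaFlowMergeKeyValuePairs Merge two Sorted Ranges of Key-Value Pairs

tf::cudaFlow::merge_by_key performs a key-value merge over two sorted ranges of keys
in the same way as tf::cudaFlow::merge;
in addition, it copies the value associated with each key into the output range of values.
The following code merges two sorted ranges of key-value pairs,
@c a and @c b, each of two items, into @c c.

@code{.cpp}
const size_t N = 2;
int* a_keys = tf::cuda_malloc_shared<int>(N);
int* a_vals = tf::cuda_malloc_shared<int>(N);
int* b_keys = tf::cuda_malloc_shared<int>(N);
int* b_vals = tf::cuda_malloc_shared<int>(N);
int* c_keys = tf::cuda_malloc_shared<int>(2*N);
int* c_vals = tf::cuda_malloc_shared<int>(2*N);

// initializes the data (each range of keys must be sorted)
a_keys[0] = 1, a_keys[1] = 8;
a_vals[0] = 2, a_vals[1] = 1;
b_keys[0] = 3, b_keys[1] = 7;
b_vals[0] = 3, b_vals[1] = 4;

// performs key-value merge
tf::cudaFlow cf;
cf.merge_by_key(
  a_keys, a_keys+N, a_vals,
  b_keys, b_keys+N, b_vals,
  c_keys, c_vals,
  [] __device__ (int a, int b) { return a < b; }
);
cf.offload();

// now, c_keys = {1, 3, 7, 8}
// now, c_vals = {2, 3, 4, 1}

// delete the device memory
tf::cuda_free(a_keys);
tf::cuda_free(b_keys);
tf::cuda_free(c_keys);
tf::cuda_free(a_vals);
tf::cuda_free(b_vals);
tf::cuda_free(c_vals);
@endcode

@section cudaFlowMergeMiscellaneousItems Miscellaneous Items

Parallel merge algorithms are also available in tf::cudaFlowCapturer
with the same API.

*/

}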