// NOTE(review): this text looks like a lossy extraction of a doctest-based
// C++/CUDA test file rather than compilable source. Evidence visible below:
//   * all four `#include` directives have no header name;
//   * both `template` declarations have no parameter list, and
//     tf::cuda_malloc_shared, std::vector, std::unordered_map and the
//     cuda_merge() / cuda_merge_by_key() calls carry no template arguments;
//   * statements break off mid-expression (`for(int i=0; i{}, buf );` and
//     `for(int i=0; i(); }`), so whatever stood between those points is absent;
//   * the file is collapsed onto two physical lines, so the first `//` on the
//     next line turns the remainder of that line into a comment.
// It looks as if everything from a `<` up to a later `>` was dropped together
// with the line breaks -- TODO confirm against the pristine file and restore
// from there instead of repairing by hand.
//
// What the surviving text shows:
//   * cuda_merge(): for every pair (n1, n2), each size stepping 0, 1, 3, 7, ...
//     (n = n*2 + 1) while <= 123456, one task is emplaced into a tf::Taskflow.
//     The task allocates da, db, dc through tf::cuda_malloc_shared with n1, n2
//     and n1+n2 elements, plus host vectors ha, hb, hc of the same sizes. It
//     synchronizes `stream` after a call whose trailing arguments are a
//     `{}`-constructed object and `buf`, then runs std::merge on ha/hb into hc
//     under the "CPU merge" heading. The element-wise verification code is not
//     visible.
//   * cuda_merge_by_key(): same size sweep, with separate key (`*_k`) and
//     value (`*_v`) arrays for both inputs and the output, a
//     std::unordered_map named `map`, and a REQUIRE that dc_k[0 .. n1+n2) is
//     sorted. How `map` is filled and checked is not visible.
//   * Test cases "cuda_merge.float" and "cuda_merge_by_key.float" both carry a
//     doctest::timeout(300) decorator. Presumably sibling cases for other
//     element types sat in the missing spans -- unverified.
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include #include #include // ---------------------------------------------------------------------------- // cuda_merge // ---------------------------------------------------------------------------- template void cuda_merge() { tf::Taskflow taskflow; tf::Executor executor; for(int n1=0; n1<=123456; n1 = n1*2 + 1) { for(int n2=0; n2<=123456; n2 = n2*2 + 1) { taskflow.emplace([n1, n2](){ // gpu data auto da = tf::cuda_malloc_shared(n1); auto db = tf::cuda_malloc_shared(n2); auto dc = tf::cuda_malloc_shared(n1 + n2); // host data std::vector ha(n1), hb(n2), hc(n1 + n2); for(int i=0; i{}, buf ); stream.synchronize(); // -------------------------------------------------------------------------- // CPU merge // -------------------------------------------------------------------------- std::merge(ha.begin(), ha.end(), hb.begin(), hb.end(), hc.begin()); // -------------------------------------------------------------------------- // verify the result // -------------------------------------------------------------------------- for(int i=0; i(); } TEST_CASE("cuda_merge.float" * doctest::timeout(300)) { cuda_merge(); } // ---------------------------------------------------------------------------- // cuda_merge_by_key // ---------------------------------------------------------------------------- template void cuda_merge_by_key() { tf::Taskflow taskflow; tf::Executor executor; for(int n1=0; n1<=123456; n1 = n1*2 + 1) { for(int n2=0; n2<=123456; n2 = n2*2 + 1) { taskflow.emplace([n1, n2](){ // gpu data auto da_k = tf::cuda_malloc_shared(n1); auto da_v = tf::cuda_malloc_shared(n1); auto db_k = tf::cuda_malloc_shared(n2); auto db_v = tf::cuda_malloc_shared(n2); auto dc_k = tf::cuda_malloc_shared(n1 + n2); auto dc_v = tf::cuda_malloc_shared(n1 + n2); std::unordered_map map; for(int i=0; i{}, buf ); stream.synchronize(); // -------------------------------------------------------------------------- // verify the result // 
-------------------------------------------------------------------------- REQUIRE(std::is_sorted(dc_k, dc_k+n1+n2)); for(int i=0; i(); } TEST_CASE("cuda_merge_by_key.float" * doctest::timeout(300)) { cuda_merge_by_key(); }