#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include #include // ---------------------------------------------------------------------------- // amax, amin, and asum // ---------------------------------------------------------------------------- template void amax_amin_asum() { int N = 11111; T min_v = 100000, max_v = -1; T sum = 0, h_sum = -1; std::vector host(N); for(int i=0; i(N); auto min_i = tf::cuda_malloc_device(1); auto max_i = tf::cuda_malloc_device(1); auto gsum = tf::cuda_malloc_device(1); int h_min_i = -1, h_max_i = -1; tf::Taskflow taskflow; tf::Executor executor; taskflow.emplace([&](tf::cudaFlow& cf){ auto cublas = cf.capture([&](tf::cudaFlowCapturer& cap){ auto capturer = cap.make_capturer(); auto amax = capturer->amax(N, gpu, 1, max_i); auto amin = capturer->amin(N, gpu, 1, min_i); auto vset = capturer->vset(N, host.data(), 1, gpu, 1); auto back = cap.single_task([min_i, max_i] __device__ () { (*min_i)--; (*max_i)--; }); auto asum = capturer->asum(N, gpu, 1, gsum); vset.precede(amin, amax, asum); back.succeed(amin, amax); }); auto copy_min_i = cf.copy(&h_min_i, min_i, 1); auto copy_max_i = cf.copy(&h_max_i, max_i, 1); auto copy_sum = cf.copy(&h_sum, gsum, 1); cublas.precede(copy_min_i, copy_max_i, copy_sum); }); executor.run(taskflow).wait(); REQUIRE(std::abs(host[h_min_i]) == min_v); REQUIRE(std::abs(host[h_max_i]) == max_v); REQUIRE(std::abs(sum-h_sum) < 0.0001); taskflow.clear(); h_min_i = -1; h_max_i = -1; // pure capturer taskflow.emplace([&](tf::cudaFlowCapturer& cap){ auto capturer = cap.make_capturer(); auto amax = capturer->amax(N, gpu, 1, max_i); auto amin = capturer->amin(N, gpu, 1, min_i); auto vset = capturer->vset(N, host.data(), 1, gpu, 1); auto back = cap.single_task([min_i, max_i] __device__ () { (*min_i)--; (*max_i)--; }); auto asum = capturer->asum(N, gpu, 1, gsum); vset.precede(amin, amax, asum); back.succeed(amin, amax); auto copy_min_i = cap.copy(&h_min_i, min_i, 1); auto copy_max_i = cap.memcpy(&h_max_i, max_i, sizeof(h_max_i)); auto copy_sum = cap.copy(&h_sum, gsum, 1); back.precede(copy_min_i, copy_max_i, copy_sum); }); executor.run(taskflow).wait(); REQUIRE(std::abs(host[h_min_i]) == min_v); REQUIRE(std::abs(host[h_max_i]) == max_v); REQUIRE(std::abs(sum-h_sum) < 0.0001); tf::cuda_free(gpu); tf::cuda_free(min_i); tf::cuda_free(max_i); } TEST_CASE("amax-amin-asum.float" * doctest::timeout(300)) { amax_amin_asum(); } TEST_CASE("amax-amin-asum.double" * doctest::timeout(300)) { amax_amin_asum(); } // ---------------------------------------------------------------------------- // axpy // ---------------------------------------------------------------------------- template void axpy() { int N = 1745; std::vector hx(N), hy(N), golden(N), res(N); for(int i=0; i(N); auto dy = tf::cuda_malloc_device(N); auto alpha = tf::cuda_malloc_device(1); tf::Taskflow taskflow; tf::Executor executor; taskflow.emplace([&](tf::cudaFlow& cf){ cf.capture([&](tf::cudaFlowCapturer& cap){ auto capturer = cap.make_capturer(); auto vsetx = capturer->vset(N, hx.data(), 1, dx, 1); auto vsety = capturer->vset(N, hy.data(), 1, dy, 1); auto spar = cap.single_task([alpha] __device__ () { *alpha = 2; }); auto axpy = capturer->axpy(N, alpha, dx, 1, dy, 1); auto vgety = capturer->vget(N, dy, 1, res.data(), 1); axpy.succeed(vsetx, vsety, spar) .precede(vgety); }); }); executor.run(taskflow).wait(); for(int i=0; i(); } TEST_CASE("axpy.double" * doctest::timeout(300)) { axpy(); } // ---------------------------------------------------------------------------- // dot // ---------------------------------------------------------------------------- template void dot() { int N = 1745; T res = -1, golden = 0; std::vector hx(N), hy(N); for(int i=0; i(N); auto dy = tf::cuda_malloc_device(N); auto dr = tf::cuda_malloc_device(1); tf::Taskflow taskflow; tf::Executor executor; taskflow.emplace([&](tf::cudaFlow& cf){ cf.capture([&](tf::cudaFlowCapturer& cap){ auto capturer = cap.make_capturer(); auto vsetx = capturer->vset(N, hx.data(), 1, dx, 1); auto vsety = capturer->vset(N, hy.data(), 1, dy, 1); auto xydot = capturer->dot(N, dx, 1, dy, 1, dr); auto copyr = cap.memcpy(&res, dr, sizeof(T)); xydot.succeed(vsetx, vsety) .precede(copyr); }); }); executor.run(taskflow).wait(); REQUIRE(std::abs(res-golden) < 0.0001); tf::cuda_free(dx); tf::cuda_free(dy); tf::cuda_free(dr); } TEST_CASE("dot.float" * doctest::timeout(300)) { dot(); } TEST_CASE("dot.double" * doctest::timeout(300)) { dot(); } // ---------------------------------------------------------------------------- // swap // ---------------------------------------------------------------------------- template void swap() { int N = 1745; std::vector hx(N), hy(N), rx(N), ry(N); for(int i=0; i(N); auto dy = tf::cuda_malloc_device(N); tf::Taskflow taskflow; tf::Executor executor; taskflow.emplace([&](tf::cudaFlow& cf){ cf.capture([&](tf::cudaFlowCapturer& cap){ auto capturer = cap.make_capturer(); auto vsetx = capturer->vset(N, hx.data(), 1, dx, 1); auto vsety = capturer->vset(N, hy.data(), 1, dy, 1); auto xyswp = capturer->swap(N, dx, 1, dy, 1); auto copyx = cap.memcpy(rx.data(), dx, N*sizeof(T)); auto copyy = cap.memcpy(ry.data(), dy, N*sizeof(T)); xyswp.succeed(vsetx, vsety) .precede(copyx, copyy); }); }); executor.run(taskflow).wait(); for(int i=0; i(); } TEST_CASE("swap.double" * doctest::timeout(300)) { swap(); } // ---------------------------------------------------------------------------- // scal // ---------------------------------------------------------------------------- template void scal() { int N = 17; std::vector hx(N), rx(N); for(int i=0; i(N); auto alpha = tf::cuda_malloc_device(1); tf::Taskflow taskflow; tf::Executor executor; taskflow.emplace([&](tf::cudaFlow& cf){ cf.capture([&](tf::cudaFlowCapturer& cap){ auto capturer = cap.make_capturer(); auto vsetx = capturer->vset(N, hx.data(), 1, dx, 1); auto spar = cap.single_task([alpha] __device__ () { *alpha = 2; }); auto vgetx = capturer->vget(N, dx, 1, rx.data(), 1); auto scal = capturer->scal(N, alpha, dx, 1); scal.succeed(vsetx, spar) .precede(vgetx); }); }); executor.run(taskflow).wait(); for(int i=0; i(); } TEST_CASE("scal.double" * doctest::timeout(300)) { scal(); }