#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN

// NOTE(review): the four include targets were lost when this file was
// extracted (all <...> spans were stripped). The headers below are the ones
// expected to declare the doctest / Taskflow names used in this file --
// confirm against the original.
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
#include <taskflow/cuda/algorithm/transform.hpp>

// Absolute tolerance used by every result comparison in this file.
constexpr float eps = 0.0001f;

// Runs `cf` on a freshly created tf::cudaStream and blocks until that
// stream has been synchronized.
template <typename T>
void run_and_wait(T& cf) {
  tf::cudaStream stream;
  cf.run(stream);
  stream.synchronize();
}

// ----------------------------------------------------------------------------
// cuda transform
// ----------------------------------------------------------------------------

// Unary transform through the stream-based algorithm interface.
//
// One task is emplaced per problem size n: every n from 1 to 101, then
// n = 2n + 1 while n <= 1234567.
//
// NOTE(review): everything in the task body after the start of the fill loop,
// and the name of the first TEST_CASE below, were lost in extraction. They
// are reconstructed here by mirroring cudaflow_transform; confirm the exact
// calls and assertions against the original file.
template <typename T>
void cuda_transform() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(int n=1; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) {

    taskflow.emplace([n](){

      tf::cudaStream stream;
      tf::cudaDefaultExecutionPolicy policy(stream);

      T v1 = ::rand() % 100;
      T v2 = ::rand() % 100;

      T* dx = tf::cuda_malloc_shared<T>(n);
      T* dy = tf::cuda_malloc_shared<T>(n);

      for(int i=0; i<n; i++) {
        dx[i] = v1;
        dy[i] = v2;
      }

      // dy[i] = dx[i] + 2
      tf::cuda_transform(policy, dx, dx+n, dy,
        [] __device__ (T x) { return x + 2; }
      );
      stream.synchronize();

      // verify the result
      for (int i = 0; i < n; i++) {
        REQUIRE(std::fabs(dx[i] - v1) < eps);
        REQUIRE(std::fabs(dy[i] - (dx[i] + 2)) < eps);
      }

      // free memory
      REQUIRE(cudaFree(dx) == cudaSuccess);
      REQUIRE(cudaFree(dy) == cudaSuccess);
    });
  }

  executor.run(taskflow).wait();
}

TEST_CASE("cuda_transform.int" * doctest::timeout(300)) {
  cuda_transform<int>();
}

TEST_CASE("cuda_transform.float" * doctest::timeout(300)) {
  cuda_transform<float>();
}

TEST_CASE("cuda_transform.double" * doctest::timeout(300)) {
  cuda_transform<double>();
}

// ----------------------------------------------------------------------------
// cudaflow transform
// ----------------------------------------------------------------------------

// Unary transform expressed as a GPU task graph of type F (instantiated
// below with tf::cudaFlow and tf::cudaFlowCapturer).
//
// For each problem size the task:
//   1. copies hx/hy to the device, computes dy = dx + 2, copies both back
//      and checks hx == v1 and hy == hx + 2;
//   2. rebinds the same kernel task to compute dx = dy - 4, reruns the whole
//      graph and checks hx == v1 - 2 and hy == v1 + 2.
template <typename T, typename F>
void cudaflow_transform() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(int n=1; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) {

    taskflow.emplace([n](){

      T v1 = ::rand() % 100;
      T v2 = ::rand() % 100;

      std::vector<T> hx, hy;

      T* dx {nullptr};
      T* dy {nullptr};

      // allocate x
      hx.resize(n, v1);
      REQUIRE(cudaMalloc(&dx, n*sizeof(T)) == cudaSuccess);

      // allocate y
      hy.resize(n, v2);
      REQUIRE(cudaMalloc(&dy, n*sizeof(T)) == cudaSuccess);

      // build the graph: h2d copies -> transform -> d2h copies
      F cf;
      auto h2d_x = cf.copy(dx, hx.data(), n).name("h2d_x");
      auto h2d_y = cf.copy(dy, hy.data(), n).name("h2d_y");
      auto d2h_x = cf.copy(hx.data(), dx, n).name("d2h_x");
      auto d2h_y = cf.copy(hy.data(), dy, n).name("d2h_y");
      auto kernel = cf.transform(dx, dx+n, dy,
        [] __device__ (T x) { return x + 2; }
      );
      kernel.succeed(h2d_x, h2d_y)
            .precede(d2h_x, d2h_y);

      run_and_wait(cf);

      // verify the result
      for (int i = 0; i < n; i++) {
        REQUIRE(std::fabs(hx[i] - v1) < eps);
        REQUIRE(std::fabs(hy[i] - (hx[i] + 2)) < eps);
      }

      // update the kernel and run the cf again; the h2d copies now upload
      // the host values produced by the first run (hx = v1, hy = v1 + 2)
      cf.transform(kernel, dy, dy+n, dx,
        [] __device__ (T y) { return y - 4; }
      );

      run_and_wait(cf);

      // verify the result
      for (int i = 0; i < n; i++) {
        REQUIRE(std::fabs(hx[i] - (v1 - 2)) < eps);
        REQUIRE(std::fabs(hy[i] - (v1 + 2)) < eps);
      }

      // free memory
      REQUIRE(cudaFree(dx) == cudaSuccess);
      REQUIRE(cudaFree(dy) == cudaSuccess);
    });
  }

  executor.run(taskflow).wait();
}

TEST_CASE("cudaFlow.transform.int" * doctest::timeout(300)) {
  cudaflow_transform<int, tf::cudaFlow>();
}

TEST_CASE("cudaFlow.transform.float" * doctest::timeout(300)) {
  cudaflow_transform<float, tf::cudaFlow>();
}

TEST_CASE("cudaFlow.transform.double" * doctest::timeout(300)) {
  cudaflow_transform<double, tf::cudaFlow>();
}

TEST_CASE("cudaFlowCapturer.transform.int" * doctest::timeout(300)) {
  cudaflow_transform<int, tf::cudaFlowCapturer>();
}

TEST_CASE("cudaFlowCapturer.transform.float" * doctest::timeout(300)) {
  cudaflow_transform<float, tf::cudaFlowCapturer>();
}

TEST_CASE("cudaFlowCapturer.transform.double" * doctest::timeout(300)) {
  cudaflow_transform<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// cuda transform2
// ----------------------------------------------------------------------------

// Binary transform through the stream-based algorithm interface.
//
// NOTE(review): everything in the task body after the start of the fill loop,
// and the name of the first TEST_CASE below, were lost in extraction. They
// are reconstructed here by mirroring cudaflow_transform2; confirm the exact
// calls and assertions against the original file.
template <typename T>
void cuda_transform2() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(int n=1; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) {

    taskflow.emplace([n](){

      tf::cudaStream stream;
      tf::cudaDefaultExecutionPolicy policy(stream);

      T v1 = ::rand() % 100;
      T v2 = ::rand() % 100;
      T v3 = ::rand() % 1000;

      T* dx = tf::cuda_malloc_shared<T>(n);
      T* dy = tf::cuda_malloc_shared<T>(n);
      T* dz = tf::cuda_malloc_shared<T>(n);

      for(int i=0; i<n; i++) {
        dx[i] = v1;
        dy[i] = v2;
        dz[i] = v3;
      }

      // dz[i] = dx[i] + dy[i]
      tf::cuda_transform(policy, dx, dx+n, dy, dz,
        [] __device__ (T x, T y) { return x + y; }
      );
      stream.synchronize();

      // verify the result
      for (int i = 0; i < n; i++) {
        REQUIRE(std::fabs(dx[i] - v1) < eps);
        REQUIRE(std::fabs(dy[i] - v2) < eps);
        REQUIRE(std::fabs(dz[i] - dx[i] - dy[i]) < eps);
      }

      // free memory
      REQUIRE(cudaFree(dx) == cudaSuccess);
      REQUIRE(cudaFree(dy) == cudaSuccess);
      REQUIRE(cudaFree(dz) == cudaSuccess);
    });
  }

  executor.run(taskflow).wait();
}

TEST_CASE("cuda_transform2.int" * doctest::timeout(300)) {
  cuda_transform2<int>();
}

TEST_CASE("cuda_transform2.float" * doctest::timeout(300)) {
  cuda_transform2<float>();
}

TEST_CASE("cuda_transform2.double" * doctest::timeout(300)) {
  cuda_transform2<double>();
}

// ----------------------------------------------------------------------------
// cudaflow transform2
// ----------------------------------------------------------------------------

// Binary transform expressed as a GPU task graph of type F (instantiated
// below with tf::cudaFlow and tf::cudaFlowCapturer).
//
// For each problem size the task:
//   1. copies hx/hy/hz to the device, computes dz = dx + dy, copies all
//      three back and checks hx == v1, hy == v2, hz == v1 + v2;
//   2. rebinds the same kernel task to compute dy = dz + dx + 10, reruns the
//      whole graph and checks hy == v1 + v2 + v1 + 10.
template <typename T, typename F>
void cudaflow_transform2() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(int n=1; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) {

    taskflow.emplace([n](){

      T v1 = ::rand() % 100;
      T v2 = ::rand() % 100;
      T v3 = ::rand() % 100;

      std::vector<T> hx, hy, hz;

      T* dx {nullptr};
      T* dy {nullptr};
      T* dz {nullptr};

      // allocate x
      hx.resize(n, v1);
      REQUIRE(cudaMalloc(&dx, n*sizeof(T)) == cudaSuccess);

      // allocate y
      hy.resize(n, v2);
      REQUIRE(cudaMalloc(&dy, n*sizeof(T)) == cudaSuccess);

      // allocate z
      hz.resize(n, v3);
      REQUIRE(cudaMalloc(&dz, n*sizeof(T)) == cudaSuccess);

      // build the graph: h2d copies -> transform -> d2h copies
      F cf;
      auto h2d_x = cf.copy(dx, hx.data(), n).name("h2d_x");
      auto h2d_y = cf.copy(dy, hy.data(), n).name("h2d_y");
      auto h2d_z = cf.copy(dz, hz.data(), n).name("h2d_z");
      auto d2h_x = cf.copy(hx.data(), dx, n).name("d2h_x");
      auto d2h_y = cf.copy(hy.data(), dy, n).name("d2h_y");
      auto d2h_z = cf.copy(hz.data(), dz, n).name("d2h_z");
      auto kernel = cf.transform(dx, dx+n, dy, dz,
        [] __device__ (T x, T y) { return x + y; }
      );
      kernel.succeed(h2d_x, h2d_y, h2d_z)
            .precede(d2h_x, d2h_y, d2h_z);

      run_and_wait(cf);

      // verify the result
      for (int i = 0; i < n; i++) {
        REQUIRE(std::fabs(hx[i] - v1) < eps);
        REQUIRE(std::fabs(hy[i] - v2) < eps);
        REQUIRE(std::fabs(hz[i] - v1 - v2) < eps);
      }

      // update the kernel and run the cf again
      // dz = v1 + v2
      // dx = v1
      // dy = v2
      cf.transform(kernel, dz, dz+n, dx, dy,
        [] __device__ (T z, T x) { return z + x + T(10); }
      );

      run_and_wait(cf);

      // verify the result
      for (int i = 0; i < n; i++) {
        REQUIRE(std::fabs(hy[i] - (v1 + v2 + v1 + T(10))) < eps);
      }

      // free memory
      REQUIRE(cudaFree(dx) == cudaSuccess);
      REQUIRE(cudaFree(dy) == cudaSuccess);
      REQUIRE(cudaFree(dz) == cudaSuccess);
    });
  }

  executor.run(taskflow).wait();
}

TEST_CASE("cudaFlow.transform2.int" * doctest::timeout(300)) {
  cudaflow_transform2<int, tf::cudaFlow>();
}

TEST_CASE("cudaFlow.transform2.float" * doctest::timeout(300)) {
  cudaflow_transform2<float, tf::cudaFlow>();
}

TEST_CASE("cudaFlow.transform2.double" * doctest::timeout(300)) {
  cudaflow_transform2<double, tf::cudaFlow>();
}

TEST_CASE("cudaFlowCapturer.transform2.int" * doctest::timeout(300)) {
  cudaflow_transform2<int, tf::cudaFlowCapturer>();
}

TEST_CASE("cudaFlowCapturer.transform2.float" * doctest::timeout(300)) {
  cudaflow_transform2<float, tf::cudaFlowCapturer>();
}

TEST_CASE("cudaFlowCapturer.transform2.double" * doctest::timeout(300)) {
  cudaflow_transform2<double, tf::cudaFlowCapturer>();
}