#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include #include // ---------------------------------------------------------------------------- // Testcase: gemv and c_gemv // ---------------------------------------------------------------------------- template void gemv( bool row_major, const int M, const int N, const std::vector& hA, const std::vector& hx, const std::vector& golden, bool trans ) { for(size_t d=0; d(M*N, d); auto dAlpha = tf::cuda_malloc_device(1, d); auto dBeta = tf::cuda_malloc_device(1, d); T* hy; T* dx; T* dy; if(trans) { hy = new T[N]; dx = tf::cuda_malloc_device(M, d); dy = tf::cuda_malloc_device(N, d); } else { hy = new T[M]; dx = tf::cuda_malloc_device(N, d); dy = tf::cuda_malloc_device(M, d); } auto cudaflow = taskflow.emplace_on([=](tf::cudaFlow& cf){ REQUIRE(tf::cuda_get_device() == d); auto copyA = cf.copy(dA, hA.data(), M*N); tf::cudaTask copyx; (trans) ? copyx = cf.copy(dx, hx.data(), M) : copyx = cf.copy(dx, hx.data(), N); auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; }); auto beta = cf.single_task([=] __device__ () { *dBeta = 0; }); tf::cudaTask gemv; if(trans) { if(row_major) { // C = A^T * x (r-major) gemv = cf.capture([&](tf::cudaFlowCapturer& cap){ cap.make_capturer()->c_gemv( CUBLAS_OP_T, M, N, dAlpha, dA, N, dx, 1, dBeta, dy, 1 ); }); } else { gemv = cf.capture([&](tf::cudaFlowCapturer& cap){ cap.make_capturer()->gemv( CUBLAS_OP_N, N, M, dAlpha, dA, N, dx, 1, dBeta, dy, 1 ); }); } } else { if(row_major) { // C = A * x (r-major) gemv = cf.capture([&](tf::cudaFlowCapturer& cap){ cap.make_capturer()->c_gemv( CUBLAS_OP_N, M, N, dAlpha, dA, N, dx, 1, dBeta, dy, 1 ); }); } else { gemv = cf.capture([&](tf::cudaFlowCapturer& cap){ cap.make_capturer()->gemv( CUBLAS_OP_T, N, M, dAlpha, dA, N, dx, 1, dBeta, dy, 1 ); }); } } tf::cudaTask copyy; (trans) ? copyy = cf.copy(hy, dy, N) : copyy = cf.copy(hy, dy, M); gemv.precede(copyy) .succeed(copyA, copyx, alpha, beta); }, d); auto verify = taskflow.emplace([=, &golden](){ for(size_t i=0; i void gemv_test(bool row_major, bool trans) { int M = 3, N = 4; const std::vector hA = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 }; // 3x4 std::vector hx; std::vector golden; //ha.T * hx if(trans) { hx = {11, 12, 13}; golden = {548, 584, 620, 656}; } else { hx = {11, 12, 13, 14}; golden = {630, 830, 1030}; } gemv(row_major, M, N, hA, hx, golden, trans); } // gemv (column-major) TEST_CASE("gemv_n.float" * doctest::timeout(300)) { gemv_test(false, false); } TEST_CASE("gemv_n.double" * doctest::timeout(300)) { gemv_test(false, false); } TEST_CASE("gemv_t.float" * doctest::timeout(300)) { gemv_test(false, true); } TEST_CASE("gemv_t.double" * doctest::timeout(300)) { gemv_test(false, true); } // gemv (row-major) TEST_CASE("c_gemv_n.float" * doctest::timeout(300)) { gemv_test(true, false); } TEST_CASE("c_gemv_n.double" * doctest::timeout(300)) { gemv_test(true, false); } TEST_CASE("c_gemv_t.float" * doctest::timeout(300)) { gemv_test(true, true); } TEST_CASE("c_gemv_t.double" * doctest::timeout(300)) { gemv_test(true, true); } // ---------------------------------------------------------------------------- // trsv // ---------------------------------------------------------------------------- template void c_trsv_test() { int N = 3; int L = 6; const std::vector hA = { -1, -1, -1, -1, -1, -1, -1, 2, 0, 0, -1, -1, -1, 1, 2, 0, -1, -1, -1, 1, 1, 2, -1, -1 }; const std::vector hB = { 5, 4, 7 }; const std::vector sol = { 2.5, 0.75, 1.875 }; std::vector res(N, 0); tf::Taskflow taskflow; tf::Executor executor; auto dA = tf::cuda_malloc_device(hA.size()); auto dB = tf::cuda_malloc_device(hB.size()); taskflow.emplace([&](tf::cudaFlowCapturer& capturer){ auto blas = capturer.make_capturer(); auto h2dA = capturer.copy(dA, hA.data(), hA.size()); auto h2dB = capturer.copy(dB, hB.data(), hB.size()); auto trsv = blas->c_trsv( CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, dA + 7, L, dB, 1 ); auto d2h = capturer.copy(res.data(), dB, res.size()); trsv.succeed(h2dA, h2dB) .precede(d2h); }); executor.run(taskflow).wait(); for(size_t i=0; i(); } TEST_CASE("c_trsv.double" * doctest::timeout(300)) { c_trsv_test(); } // ---------------------------------------------------------------------------- // trmv // ---------------------------------------------------------------------------- template void c_trmv_test() { int N = 3; int L = 6; const std::vector hA = { -1, -1, -1, -1, -1, -1, -1, 2, 1, 1, -1, -1, -1, 0, 2, 1, -1, -1, -1, 0, 0, 2, -1, -1 }; const std::vector hB = { 5, -1, 4, -1, 7, -1 }; const std::vector sol = { 21, -1, 15, -1, 14, -1 }; std::vector res(hB.size(), 0); tf::Taskflow taskflow; tf::Executor executor; auto dA = tf::cuda_malloc_device(hA.size()); auto dB = tf::cuda_malloc_device(hB.size()); taskflow.emplace([&](tf::cudaFlowCapturer& capturer){ auto blas = capturer.make_capturer(); auto h2dA = capturer.copy(dA, hA.data(), hA.size()); auto h2dB = capturer.copy(dB, hB.data(), hB.size()); auto trmv = blas->c_trmv( CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, dA + 7, L, dB, 2 ); auto d2h = capturer.copy(res.data(), dB, res.size()); trmv.succeed(h2dA, h2dB) .precede(d2h); }); executor.run(taskflow).wait(); for(size_t i=0; i(); } TEST_CASE("c_trmv.double" * doctest::timeout(300)) { c_trmv_test(); } // ---------------------------------------------------------------------------- // symv: y = alpha * A * x + beta * y // ---------------------------------------------------------------------------- template void c_symv_test() { int N = 3; int L = 6; const std::vector hA = { -1, -1, -1, -1, -1, -1, -1, 2, 1, 1, -1, -1, -1, 1, 2, 1, -1, -1, -1, 1, 1, 2, -1, -1 }; const std::vector hx = { 5, -1, 4, -1, 7, -1 }; std::vector hy = { 2, -1, 3, -1, 45, -1 }; const std::vector sol = { 23, -1, 23, -1, 68, -1 }; tf::Taskflow taskflow; tf::Executor executor; auto dA = tf::cuda_malloc_device(hA.size()); auto dx = tf::cuda_malloc_device(hx.size()); auto dy = tf::cuda_malloc_device(hy.size()); auto dalpha = tf::cuda_malloc_device(1); auto dbeta = tf::cuda_malloc_device(1); taskflow.emplace([&](tf::cudaFlowCapturer& capturer){ auto blas = capturer.make_capturer(); auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; }); auto beta = capturer.single_task([=] __device__ () { *dbeta = 1; }); auto h2dA = capturer.copy(dA, hA.data(), hA.size()); auto h2dx = capturer.copy(dx, hx.data(), hx.size()); auto h2dy = capturer.copy(dy, hy.data(), hy.size()); auto symv = blas->c_symv(CUBLAS_FILL_MODE_UPPER, N, dalpha, dA + 7, L, dx, 2, dbeta, dy, 2 ); auto d2h = capturer.copy(hy.data(), dy, hy.size()); symv.succeed(h2dA, h2dx, h2dy, alpha, beta) .precede(d2h); }); executor.run(taskflow).wait(); for(size_t i=0; i(); } TEST_CASE("c_symv.double" * doctest::timeout(300)) { c_symv_test(); } // ---------------------------------------------------------------------------- // syr: A = alpha * x * x^T + A // ---------------------------------------------------------------------------- template void c_syr_test() { int N = 3; int L = 6; std::vector hA = { -1, -1, -1, -1, -1, -1, -1, 2, 1, 1, -1, -1, -1, 0, 2, 1, -1, -1, -1, 0, 0, 2, -1, -1 }; const std::vector hx = { 5, -1, 4, -1, 7, -1 }; const std::vector sol = { -1, -1, -1, -1, -1, -1, -1, 27, 21, 36, -1, -1, -1, 0, 18, 29, -1, -1, -1, 0, 0, 51, -1, -1 }; tf::Taskflow taskflow; tf::Executor executor; auto dA = tf::cuda_malloc_device(hA.size()); auto dx = tf::cuda_malloc_device(hx.size()); auto dalpha = tf::cuda_malloc_device(1); taskflow.emplace([&](tf::cudaFlowCapturer& capturer){ auto blas = capturer.make_capturer(); auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; }); auto h2dA = capturer.copy(dA, hA.data(), hA.size()); auto h2dx = capturer.copy(dx, hx.data(), hx.size()); auto syr = blas->c_syr(CUBLAS_FILL_MODE_UPPER, N, dalpha, dx, 2, dA + 7, L ); auto d2h = capturer.copy(hA.data(), dA, hA.size()); syr.succeed(h2dA, h2dx, alpha) .precede(d2h); }); executor.run(taskflow).wait(); for(size_t i=0; i(); } TEST_CASE("c_syr.double" * doctest::timeout(300)) { c_syr_test(); } // ---------------------------------------------------------------------------- // syr2: A = alpha * x * y^T + y * x^T + A // ---------------------------------------------------------------------------- template void c_syr2_test() { int N = 3; int L = 6; const std::vector hA = { -1, -1, -1, -1, -1, -1, -1, 2, 0, 0, -1, -1, -1, 1, 2, 0, -1, -1, -1, 1, 1, 2, -1, -1 }; const std::vector hx = { 5, -1, 4, -1, 7, -1 }; const std::vector hy = { 2, -1, 3, -1, 1, -1 }; const std::vector sol = { -1, -1, -1, -1, -1, -1, -1, 22, 0, 0, -1, -1, -1, 24, 26, 0, -1, -1, -1, 20, 26, 16, -1, -1 }; std::vector res(hA.size(), 0); tf::Taskflow taskflow; tf::Executor executor; auto dA = tf::cuda_malloc_device(hA.size()); auto dx = tf::cuda_malloc_device(hx.size()); auto dy = tf::cuda_malloc_device(hy.size()); auto dalpha = tf::cuda_malloc_device(1); taskflow.emplace([&](tf::cudaFlowCapturer& capturer){ auto blas = capturer.make_capturer(); auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; }); auto h2dA = capturer.copy(dA, hA.data(), hA.size()); auto h2dx = capturer.copy(dx, hx.data(), hx.size()); auto h2dy = capturer.copy(dy, hy.data(), hy.size()); auto syr2 = blas->c_syr2(CUBLAS_FILL_MODE_LOWER, N, dalpha, dx, 2, dy, 2, dA + 7, L ); auto d2h = capturer.copy(res.data(), dA, res.size()); syr2.succeed(h2dA, h2dx, h2dy, alpha) .precede(d2h); }); executor.run(taskflow).wait(); //for(int i=0; i<=N; i++) { // for(int l=0; l(); } TEST_CASE("c_syr2.double" * doctest::timeout(300)) { c_syr2_test(); }