#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN

// NOTE(review): the original include targets were destroyed by text
// extraction; restored from the test's visible dependencies
// (doctest macros + tf::cudaFlow/cublasFlowCapturer APIs).
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cublasflow.hpp>

// ----------------------------------------------------------------------------
// utilities
// ----------------------------------------------------------------------------

// Returns the transpose of the M-by-N row-major matrix `in`.
// NOTE(review): loop body lost in extraction — reconstructed; unused by the
// visible tests (they hard-code transposed inputs), confirm against history.
template <typename T>
std::vector<T> transpose(int M, int N, std::vector<T>& in) {
  std::vector<T> out(in.size());
  for(int i=0; i<M; i++) {
    for(int j=0; j<N; j++) {
      out[j*M + i] = in[i*N + j];
    }
  }
  return out;
}

// Prints an M-by-N row-major matrix (debugging helper).
template <typename T>
void print_matrix(int M, int N, const std::vector<T>& mat) {
  for(int i=0; i<M; i++) {
    for(int j=0; j<N; j++) {
      std::cout << mat[i*N + j] << ' ';
    }
    std::cout << '\n';
  }
}

// ----------------------------------------------------------------------------
// Testcase: geam and c_geam
// ----------------------------------------------------------------------------

// Runs C = 1*op(A) + 2*op(B) on every available CUDA device and checks the
// M-by-N result against `golden`.
//   row_major : use the row-major front-end (c_geam) vs column-major geam
//   tranA/B   : whether op(A)/op(B) is a transpose
template <typename T>
void geam(
  bool row_major,
  const int M,
  const int N,
  const std::vector<T>& hA,
  const std::vector<T>& hB,
  const std::vector<T>& golden,
  bool tranA,
  bool tranB
) {

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(size_t d=0; d<tf::cuda_get_num_devices(); d++) {

    auto dA     = tf::cuda_malloc_device<T>(M*N, d);
    auto dB     = tf::cuda_malloc_device<T>(M*N, d);
    auto dC     = tf::cuda_malloc_device<T>(M*N, d);
    auto dAlpha = tf::cuda_malloc_device<T>(1, d);
    auto dBeta  = tf::cuda_malloc_device<T>(1, d);

    T* hC = new T[N*M];

    auto cudaflow = taskflow.emplace_on([=, &hA, &hB](tf::cudaFlow& cf){

      REQUIRE(tf::cuda_get_device() == d);

      auto copyA = cf.copy(dA, hA.data(), M*N);
      auto copyB = cf.copy(dB, hB.data(), M*N);
      auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; });
      auto beta  = cf.single_task([=] __device__ () { *dBeta  = 2; });

      tf::cudaTask geam;

      if(tranA && !tranB) {        // C = A^T + B (r-major)
        if(row_major) {
          geam = cf.capture([&](tf::cudaFlowCapturer& cap){
            cap.make_capturer<tf::cublasFlowCapturer>()->c_geam(
              CUBLAS_OP_T, CUBLAS_OP_N,
              M, N, dAlpha, dA, M, dBeta, dB, N, dC, N
            );
          });
        }
        else {
          geam = cf.capture([&](tf::cudaFlowCapturer& cap){
            cap.make_capturer<tf::cublasFlowCapturer>()->geam(
              CUBLAS_OP_T, CUBLAS_OP_N,
              N, M, dAlpha, dA, M, dBeta, dB, N, dC, N
            );
          });
        }
      }
      else if(!tranA && !tranB) {  // C = A + B (r-major)
        if(row_major) {
          geam = cf.capture([&](tf::cudaFlowCapturer& cap){
            cap.make_capturer<tf::cublasFlowCapturer>()->c_geam(
              CUBLAS_OP_N, CUBLAS_OP_N,
              M, N, dAlpha, dA, N, dBeta, dB, N, dC, N
            );
          });
        }
        else {
          geam = cf.capture([&](tf::cudaFlowCapturer& cap){
            cap.make_capturer<tf::cublasFlowCapturer>()->geam(
              CUBLAS_OP_N, CUBLAS_OP_N,
              N, M, dAlpha, dA, N, dBeta, dB, N, dC, N
            );
          });
        }
      }
      else if(!tranA && tranB) {   // C = A + B^T (r-major)
        if(row_major) {
          geam = cf.capture([&](tf::cudaFlowCapturer& cap){
            cap.make_capturer<tf::cublasFlowCapturer>()->c_geam(
              CUBLAS_OP_N, CUBLAS_OP_T,
              M, N, dAlpha, dA, N, dBeta, dB, M, dC, N
            );
          });
        }
        else {
          geam = cf.capture([&](tf::cudaFlowCapturer& cap){
            cap.make_capturer<tf::cublasFlowCapturer>()->geam(
              CUBLAS_OP_N, CUBLAS_OP_T,
              N, M, dAlpha, dA, N, dBeta, dB, M, dC, N
            );
          });
        }
      }
      else {                       // C = A^T + B^T (r-major)
        if(row_major) {
          geam = cf.capture([&](tf::cudaFlowCapturer& cap){
            cap.make_capturer<tf::cublasFlowCapturer>()->c_geam(
              CUBLAS_OP_T, CUBLAS_OP_T,
              M, N, dAlpha, dA, M, dBeta, dB, M, dC, N
            );
          });
        }
        else {
          geam = cf.capture([&](tf::cudaFlowCapturer& cap){
            cap.make_capturer<tf::cublasFlowCapturer>()->geam(
              CUBLAS_OP_T, CUBLAS_OP_T,
              N, M, dAlpha, dA, M, dBeta, dB, M, dC, N
            );
          });
        }
      }

      auto copyC = cf.copy(hC, dC, M*N);

      geam.precede(copyC)
          .succeed(copyA, copyB, alpha, beta);
    }, d);

    // NOTE(review): verification/cleanup body was lost in extraction;
    // restored as element-wise comparison plus release of all device buffers.
    auto verify = taskflow.emplace([=, &golden](){
      for(size_t i=0; i<golden.size(); ++i) {
        REQUIRE(std::fabs(hC[i] - golden[i]) < 0.0001);
      }
      tf::cuda_free(dA);
      tf::cuda_free(dB);
      tf::cuda_free(dC);
      tf::cuda_free(dAlpha);
      tf::cuda_free(dBeta);
      delete [] hC;
    });

    cudaflow.precede(verify);
  }

  executor.run(taskflow).wait();
}

// C = A^T + B
template <typename T>
void geam_tn(bool row_major) {
  int M = 2, N = 3;
  const std::vector<T> hA = {
    11, 14,
    12, 15,
    13, 16
  };  // 3x2
  const std::vector<T> hB = {
     1,  1,  1,
    -1, -1, -1
  };  // 2x3
  const std::vector<T> golden = {
    13, 14, 15,
    12, 13, 14
  };  // 2x3
  geam<T>(row_major, M, N, hA, hB, golden, true, false);
}

// C = A + B
template <typename T>
void geam_nn(bool row_major) {
  int M = 2, N = 3;
  const std::vector<T> hA = {
    11, 12, 13,
    14, 15, 16
  };  // 2x3
  const std::vector<T> hB = {
     1,  1,  1,
    -1, -1, -1
  };  // 2x3
  const std::vector<T> golden = {
    13, 14, 15,
    12, 13, 14
  };  // 2x3
  geam<T>(row_major, M, N, hA, hB, golden, false, false);
}

// C = A + B^T
template <typename T>
void geam_nt(bool row_major) {
  int M = 2, N = 3;
  const std::vector<T> hA = {
    11, 12, 13,
    14, 15, 16
  };  // 2x3
  const std::vector<T> hB = {
     1, -1,
     1, -1,
     1, -1
  };  // 3x2
  const std::vector<T> golden = {
    13, 14, 15,
    12, 13, 14
  };  // 2x3
  geam<T>(row_major, M, N, hA, hB, golden, false, true);
}

// C = A^T + B^T
template <typename T>
void geam_tt(bool row_major) {
  int M = 2, N = 3;
  const std::vector<T> hA = {
    11, 14,
    12, 15,
    13, 16
  };  // 3x2
  const std::vector<T> hB = {
     1, -1,
     1, -1,
     1, -1
  };  // 3x2
  const std::vector<T> golden = {
    13, 14, 15,
    12, 13, 14
  };  // 2x3
  geam<T>(row_major, M, N, hA, hB, golden, true, true);
}

// geam (column major)
TEST_CASE("geam_tn.float" * doctest::timeout(300)) { geam_tn<float>(false); }
TEST_CASE("geam_nn.float" * doctest::timeout(300)) { geam_nn<float>(false); }
TEST_CASE("geam_nt.float" * doctest::timeout(300)) { geam_nt<float>(false); }
TEST_CASE("geam_tt.float" * doctest::timeout(300)) { geam_tt<float>(false); }
TEST_CASE("geam_tn.double" * doctest::timeout(300)) { geam_tn<double>(false); }
TEST_CASE("geam_nn.double" * doctest::timeout(300)) { geam_nn<double>(false); }
TEST_CASE("geam_nt.double" * doctest::timeout(300)) { geam_nt<double>(false); }
TEST_CASE("geam_tt.double" * doctest::timeout(300)) { geam_tt<double>(false); }

// c_geam (row major)
TEST_CASE("c_geam_tn.float" * doctest::timeout(300)) { geam_tn<float>(true); }
TEST_CASE("c_geam_nn.float" * doctest::timeout(300)) { geam_nn<float>(true); }
TEST_CASE("c_geam_nt.float" * doctest::timeout(300)) { geam_nt<float>(true); }
TEST_CASE("c_geam_tt.float" * doctest::timeout(300)) { geam_tt<float>(true); }
TEST_CASE("c_geam_tn.double" * doctest::timeout(300)) { geam_tn<double>(true); }
TEST_CASE("c_geam_nn.double" * doctest::timeout(300)) { geam_nn<double>(true); }
TEST_CASE("c_geam_nt.double" * doctest::timeout(300)) { geam_nt<double>(true); }
TEST_CASE("c_geam_tt.double" * doctest::timeout(300)) { geam_tt<double>(true); }

// ----------------------------------------------------------------------------
// Testcase: gemm and c_gemm
// ----------------------------------------------------------------------------

// Runs C = 1*op(A)*op(B) + 0*C on every available CUDA device and checks the
// M-by-N result against `golden`. Column-major calls swap the operand order
// (B*A) to realize a row-major product with cublas<t>gemm.
template <typename T>
void gemm(
  bool row_major,
  const int M,
  const int N,
  const int K,
  const std::vector<T>& hA,
  const std::vector<T>& hB,
  const std::vector<T>& golden,
  bool tranA,
  bool tranB
) {

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(size_t d=0; d<tf::cuda_get_num_devices(); d++) {

    auto dA     = tf::cuda_malloc_device<T>(K*M, d);
    auto dB     = tf::cuda_malloc_device<T>(K*N, d);
    auto dC     = tf::cuda_malloc_device<T>(M*N, d);
    auto dAlpha = tf::cuda_malloc_device<T>(1, d);
    auto dBeta  = tf::cuda_malloc_device<T>(1, d);

    T* hC = new T[N*M];

    auto cudaflow = taskflow.emplace_on([=, &hA, &hB](tf::cudaFlow& cf){

      REQUIRE(tf::cuda_get_device() == d);

      auto copyA = cf.copy(dA, hA.data(), K*M);
      auto copyB = cf.copy(dB, hB.data(), K*N);
      auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; });
      auto beta  = cf.single_task([=] __device__ () { *dBeta  = 0; });

      tf::cudaTask gemm;

      if(tranA && !tranB) {        // C = A^T * B (r-major)
        if(row_major) {
          gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
            flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm(
              CUBLAS_OP_T, CUBLAS_OP_N,
              M, N, K, dAlpha, dA, M, dB, N, dBeta, dC, N
            );
          });
        }
        else {
          gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
            flow.make_capturer<tf::cublasFlowCapturer>()->gemm(
              CUBLAS_OP_N, CUBLAS_OP_T,
              N, M, K, dAlpha, dB, N, dA, M, dBeta, dC, N
            );
          });
        }
      }
      else if(!tranA && !tranB) {  // C = A * B (r-major)
        if(row_major) {
          gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
            flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm(
              CUBLAS_OP_N, CUBLAS_OP_N,
              M, N, K, dAlpha, dA, K, dB, N, dBeta, dC, N
            );
          });
        }
        else {
          gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
            flow.make_capturer<tf::cublasFlowCapturer>()->gemm(
              CUBLAS_OP_N, CUBLAS_OP_N,
              N, M, K, dAlpha, dB, N, dA, K, dBeta, dC, N
            );
          });
        }
      }
      else if(!tranA && tranB) {   // C = A * B^T (r-major)
        if(row_major) {
          gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
            flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm(
              CUBLAS_OP_N, CUBLAS_OP_T,
              M, N, K, dAlpha, dA, K, dB, K, dBeta, dC, N
            );
          });
        }
        else {
          gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
            flow.make_capturer<tf::cublasFlowCapturer>()->gemm(
              CUBLAS_OP_T, CUBLAS_OP_N,
              N, M, K, dAlpha, dB, K, dA, K, dBeta, dC, N
            );
          });
        }
      }
      else {                       // C = A^T * B^T (r-major)
        if(row_major) {
          gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
            flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm(
              CUBLAS_OP_T, CUBLAS_OP_T,
              M, N, K, dAlpha, dA, M, dB, K, dBeta, dC, N
            );
          });
        }
        else {
          gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
            flow.make_capturer<tf::cublasFlowCapturer>()->gemm(
              CUBLAS_OP_T, CUBLAS_OP_T,
              N, M, K, dAlpha, dB, K, dA, M, dBeta, dC, N
            );
          });
        }
      }

      auto copyC = cf.copy(hC, dC, M*N);

      gemm.precede(copyC)
          .succeed(copyA, copyB, alpha, beta);
    }, d);

    // NOTE(review): verification/cleanup body was lost in extraction;
    // restored in the same style as geam's verify task.
    auto verify = taskflow.emplace([=, &golden](){
      for(size_t i=0; i<golden.size(); ++i) {
        REQUIRE(std::fabs(hC[i] - golden[i]) < 0.0001);
      }
      tf::cuda_free(dA);
      tf::cuda_free(dB);
      tf::cuda_free(dC);
      tf::cuda_free(dAlpha);
      tf::cuda_free(dBeta);
      delete [] hC;
    });

    cudaflow.precede(verify);
  }

  executor.run(taskflow).wait();
}

// C = A^T * B
template <typename T>
void gemm_tn(bool row_major) {
  int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 14,
    12, 15,
    13, 16
  };  // 3x2
  const std::vector<T> hB = {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };  // 3x4
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // 2x4
  gemm<T>(row_major, M, N, K, hA, hB, golden, true, false);
}

// C = A * B
template <typename T>
void gemm_nn(bool row_major) {
  int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 12, 13,
    14, 15, 16
  };  // MxK
  const std::vector<T> hB = {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };  // KxN
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  gemm<T>(row_major, M, N, K, hA, hB, golden, false, false);
}

// C = A * B^T
template <typename T>
void gemm_nt(bool row_major) {
  int M = 2, N = 4, K = 3;
  const std::vector<T> hA = {
    11, 12, 13,
    14, 15, 16
  };  // MxK
  const std::vector<T> hB = {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };  // NxK
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  gemm<T>(row_major, M, N, K, hA, hB, golden, false, true);
}

// C = A^T * B^T
template <typename T>
void gemm_tt(bool row_major) {
  int M = 2, N = 4, K = 3;
  const std::vector<T> hA = {
    11, 14,
    12, 15,
    13, 16
  };  // KxM
  const std::vector<T> hB = {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };  // NxK
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  gemm<T>(row_major, M, N, K, hA, hB, golden, true, true);
}

// gemm (column-major)
TEST_CASE("gemm_nn.float" * doctest::timeout(300)) { gemm_nn<float>(false); }
TEST_CASE("gemm_nn.double" * doctest::timeout(300)) { gemm_nn<double>(false); }
TEST_CASE("gemm_tn.float" * doctest::timeout(300)) { gemm_tn<float>(false); }
TEST_CASE("gemm_tn.double" * doctest::timeout(300)) { gemm_tn<double>(false); }
TEST_CASE("gemm_nt.float" * doctest::timeout(300)) { gemm_nt<float>(false); }
TEST_CASE("gemm_nt.double" * doctest::timeout(300)) { gemm_nt<double>(false); }
TEST_CASE("gemm_tt.float" * doctest::timeout(300)) { gemm_tt<float>(false); }
TEST_CASE("gemm_tt.double" * doctest::timeout(300)) { gemm_tt<double>(false); }

// c_gemm (row_major)
TEST_CASE("c_gemm_nn.float" * doctest::timeout(300)) { gemm_nn<float>(true); }
TEST_CASE("c_gemm_nn.double" * doctest::timeout(300)) { gemm_nn<double>(true); }
TEST_CASE("c_gemm_tn.float" * doctest::timeout(300)) { gemm_tn<float>(true); }
TEST_CASE("c_gemm_tn.double" * doctest::timeout(300)) { gemm_tn<double>(true); }
TEST_CASE("c_gemm_nt.float" * doctest::timeout(300)) { gemm_nt<float>(true); }
TEST_CASE("c_gemm_nt.double" * doctest::timeout(300)) { gemm_nt<double>(true); }
TEST_CASE("c_gemm_tt.float" * doctest::timeout(300)) { gemm_tt<float>(true); }
TEST_CASE("c_gemm_tt.double" * doctest::timeout(300)) { gemm_tt<double>(true); }

// ----------------------------------------------------------------------------
// Testcase: gemm_batched and c_gemm_batched
// ----------------------------------------------------------------------------

// number of matrices per batch
constexpr size_t S = 10;

// Runs a batch of S identical products C[s] = op(A[s]) * op(B[s]) on device 0
// and checks each result against `golden` (alpha=1, beta=0).
// hA/hB are arrays of S host pointers, one per batch entry.
template <typename T>
void gemm_batched(
  bool row_major,
  const int M,
  const int N,
  const int K,
  const T* hA[],
  const T* hB[],
  const std::vector<T>& golden,
  bool tranA,
  bool tranB
) {

  tf::Taskflow taskflow;
  tf::Executor executor;

  int d = 0;

  auto dA     = tf::cuda_malloc_device<T>(S*K*M, d);
  auto dB     = tf::cuda_malloc_device<T>(S*K*N, d);
  auto dC     = tf::cuda_malloc_device<T>(S*M*N, d);
  auto dAlpha = tf::cuda_malloc_device<T>(1, d);
  auto dBeta  = tf::cuda_malloc_device<T>(1, d);
  auto hC     = new T[S*M*N];
  auto dAs    = tf::cuda_malloc_device<T*>(S, d);
  auto dBs    = tf::cuda_malloc_device<T*>(S, d);
  auto dCs    = tf::cuda_malloc_device<T*>(S, d);

  auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf){

    tf::cudaTask copyA[S], copyB[S];

    // NOTE(review): the batch set-up between the copy-task array and the
    // first capture was lost in extraction; restored as per-entry H2D
    // copies, the scalar single_tasks, and a task filling the device
    // pointer arrays consumed by cublas<t>gemmBatched.
    for(size_t s=0; s<S; s++) {
      copyA[s] = cf.copy(dA + s*K*M, hA[s], K*M);
      copyB[s] = cf.copy(dB + s*K*N, hB[s], K*N);
    }

    auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; });
    auto beta  = cf.single_task([=] __device__ () { *dBeta  = 0; });

    auto array = cf.single_task([=] __device__ () {
      for(size_t s=0; s<S; s++) {
        dAs[s] = dA + s*K*M;
        dBs[s] = dB + s*K*N;
        dCs[s] = dC + s*M*N;
      }
    });

    tf::cudaTask gemm;

    if(!tranA && !tranB) {       // C = A * B (r-major)
      if(row_major) {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_batched(
            CUBLAS_OP_N, CUBLAS_OP_N,
            M, N, K,
            dAlpha, (const T**)dAs, K, (const T**)dBs, N,
            dBeta, dCs, N, S
          );
        });
      }
      else {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->gemm_batched(
            CUBLAS_OP_N, CUBLAS_OP_N,
            N, M, K,
            dAlpha, (const T**)dBs, N, (const T**)dAs, K,
            dBeta, dCs, N, S
          );
        });
      }
    }
    else if(tranA && !tranB) {   // C = A^T * B (r-major)
      if(row_major) {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_batched(
            CUBLAS_OP_T, CUBLAS_OP_N,
            M, N, K,
            dAlpha, (const T**)dAs, M, (const T**)dBs, N,
            dBeta, dCs, N, S
          );
        });
      }
      else {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->gemm_batched(
            CUBLAS_OP_N, CUBLAS_OP_T,
            N, M, K,
            dAlpha, (const T**)dBs, N, (const T**)dAs, M,
            dBeta, dCs, N, S
          );
        });
      }
    }
    else if(!tranA && tranB) {   // C = A * B^T (r-major)
      if(row_major) {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_batched(
            CUBLAS_OP_N, CUBLAS_OP_T,
            M, N, K,
            dAlpha, (const T**)dAs, K, (const T**)dBs, K,
            dBeta, dCs, N, S
          );
        });
      }
      else {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->gemm_batched(
            CUBLAS_OP_T, CUBLAS_OP_N,
            N, M, K,
            dAlpha, (const T**)dBs, K, (const T**)dAs, K,
            dBeta, dCs, N, S
          );
        });
      }
    }
    else {                       // C = A^T * B^T (r-major)
      if(row_major) {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_batched(
            CUBLAS_OP_T, CUBLAS_OP_T,
            M, N, K,
            dAlpha, (const T**)dAs, M, (const T**)dBs, K,
            dBeta, dCs, N, S
          );
        });
      }
      else {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->gemm_batched(
            CUBLAS_OP_T, CUBLAS_OP_T,
            N, M, K,
            dAlpha, (const T**)dBs, K, (const T**)dAs, M,
            dBeta, dCs, N, S
          );
        });
      }
    }

    gemm.succeed(alpha, beta, array);

    for(size_t s=0; s<S; s++) {
      gemm.succeed(copyA[s], copyB[s]);
    }

    auto copyC = cf.copy(hC, dC, S*M*N);
    gemm.precede(copyC);
  });

  // NOTE(review): verification/cleanup body was lost in extraction;
  // restored as a per-entry comparison against `golden` plus cleanup.
  auto verify = taskflow.emplace([&](){
    for(size_t s=0; s<S; s++) {
      for(size_t i=0; i<golden.size(); ++i) {
        REQUIRE(std::fabs(hC[s*M*N + i] - golden[i]) < 0.0001);
      }
    }
    tf::cuda_free(dA);
    tf::cuda_free(dB);
    tf::cuda_free(dC);
    tf::cuda_free(dAlpha);
    tf::cuda_free(dBeta);
    tf::cuda_free(dAs);
    tf::cuda_free(dBs);
    tf::cuda_free(dCs);
    delete [] hC;
  });

  cudaflow.precede(verify);

  executor.run(taskflow).wait();
}

// C = A * B
template <typename T>
void gemm_batched_nn(bool row_major) {
  const int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 12, 13,
    14, 15, 16
  };  // MxK
  const std::vector<T> hB = {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };  // KxN
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  const T* hAs[S];
  const T* hBs[S];
  // every batch entry points at the same host matrices
  for(size_t s=0; s<S; s++) {
    hAs[s] = hA.data();
    hBs[s] = hB.data();
  }
  gemm_batched<T>(row_major, M, N, K, hAs, hBs, golden, false, false);
}

// C = A^T * B
template <typename T>
void gemm_batched_tn(bool row_major) {
  const int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 14,
    12, 15,
    13, 16
  };  // KxM
  const std::vector<T> hB = {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };  // KxN
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  const T* hAs[S];
  const T* hBs[S];
  for(size_t s=0; s<S; s++) {
    hAs[s] = hA.data();
    hBs[s] = hB.data();
  }
  gemm_batched<T>(row_major, M, N, K, hAs, hBs, golden, true, false);
}

// C = A * B^T
template <typename T>
void gemm_batched_nt(bool row_major) {
  const int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 12, 13,
    14, 15, 16
  };  // MxK
  const std::vector<T> hB = {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };  // NxK
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  const T* hAs[S];
  const T* hBs[S];
  // every batch entry points at the same host matrices
  for(size_t s=0; s<S; s++) {
    hAs[s] = hA.data();
    hBs[s] = hB.data();
  }
  gemm_batched<T>(row_major, M, N, K, hAs, hBs, golden, false, true);
}

// C = A^T * B^T
template <typename T>
void gemm_batched_tt(bool row_major) {
  const int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 14,
    12, 15,
    13, 16
  };  // KxM
  const std::vector<T> hB = {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };  // NxK
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  const T* hAs[S];
  const T* hBs[S];
  for(size_t s=0; s<S; s++) {
    hAs[s] = hA.data();
    hBs[s] = hB.data();
  }
  gemm_batched<T>(row_major, M, N, K, hAs, hBs, golden, true, true);
}

// gemm_batched (column-major)
TEST_CASE("gemm_batched_nn.float" * doctest::timeout(300)) { gemm_batched_nn<float>(false); }
TEST_CASE("gemm_batched_tn.float" * doctest::timeout(300)) { gemm_batched_tn<float>(false); }
TEST_CASE("gemm_batched_nt.float" * doctest::timeout(300)) { gemm_batched_nt<float>(false); }
TEST_CASE("gemm_batched_tt.float" * doctest::timeout(300)) { gemm_batched_tt<float>(false); }
TEST_CASE("gemm_batched_nn.double" * doctest::timeout(300)) { gemm_batched_nn<double>(false); }
TEST_CASE("gemm_batched_tn.double" * doctest::timeout(300)) { gemm_batched_tn<double>(false); }
TEST_CASE("gemm_batched_nt.double" * doctest::timeout(300)) { gemm_batched_nt<double>(false); }
TEST_CASE("gemm_batched_tt.double" * doctest::timeout(300)) { gemm_batched_tt<double>(false); }

// c_gemm_batched (row-major)
TEST_CASE("c_gemm_batched_nn.float" * doctest::timeout(300)) { gemm_batched_nn<float>(true); }
TEST_CASE("c_gemm_batched_tn.float" * doctest::timeout(300)) { gemm_batched_tn<float>(true); }
TEST_CASE("c_gemm_batched_nt.float" * doctest::timeout(300)) { gemm_batched_nt<float>(true); }
TEST_CASE("c_gemm_batched_tt.float" * doctest::timeout(300)) { gemm_batched_tt<float>(true); }
TEST_CASE("c_gemm_batched_nn.double" * doctest::timeout(300)) { gemm_batched_nn<double>(true); }
TEST_CASE("c_gemm_batched_tn.double" * doctest::timeout(300)) { gemm_batched_tn<double>(true); }
TEST_CASE("c_gemm_batched_nt.double" * doctest::timeout(300)) { gemm_batched_nt<double>(true); }
TEST_CASE("c_gemm_batched_tt.double" * doctest::timeout(300)) { gemm_batched_tt<double>(true); }

// ----------------------------------------------------------------------------
// Testcase: gemm_strided_batched
// ----------------------------------------------------------------------------

// Runs a strided batch of S products C[s] = op(A[s]) * op(B[s]) on device 0
// and checks every slice of the result against `golden` (alpha=1, beta=0).
// hA/hB hold S matrices laid out back-to-back with strides K*M and K*N.
template <typename T>
void gemm_strided_batched(
  bool row_major,
  const int M,
  const int N,
  const int K,
  const T* hA,
  const T* hB,
  const std::vector<T>& golden,
  bool tranA,
  bool tranB
) {

  tf::Taskflow taskflow;
  tf::Executor executor;

  int d = 0;

  auto dA     = tf::cuda_malloc_device<T>(S*K*M, d);
  auto dB     = tf::cuda_malloc_device<T>(S*K*N, d);
  auto dC     = tf::cuda_malloc_device<T>(S*M*N, d);
  auto dAlpha = tf::cuda_malloc_device<T>(1, d);
  auto dBeta  = tf::cuda_malloc_device<T>(1, d);
  auto hC     = new T[S*M*N];

  // strides between consecutive batch entries
  int sA = K*M;
  int sB = K*N;
  int sC = M*N;

  auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf){

    auto copyA = cf.copy(dA, hA, S*K*M);
    auto copyB = cf.copy(dB, hB, S*K*N);
    auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; });
    auto beta  = cf.single_task([=] __device__ () { *dBeta  = 0; });

    tf::cudaTask gemm;

    if(!tranA && !tranB) {       // C = A * B (r-major)
      if(row_major) {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_sbatched(
            CUBLAS_OP_N, CUBLAS_OP_N,
            M, N, K,
            dAlpha, dA, K, sA, dB, N, sB, dBeta, dC, N, sC, S
          );
        });
      }
      else {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->gemm_sbatched(
            CUBLAS_OP_N, CUBLAS_OP_N,
            N, M, K,
            dAlpha, dB, N, sB, dA, K, sA, dBeta, dC, N, sC, S
          );
        });
      }
    }
    else if(tranA && !tranB) {   // C = A^T * B (r-major)
      if(row_major) {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_sbatched(
            CUBLAS_OP_T, CUBLAS_OP_N,
            M, N, K,
            dAlpha, dA, M, sA, dB, N, sB, dBeta, dC, N, sC, S
          );
        });
      }
      else {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->gemm_sbatched(
            CUBLAS_OP_N, CUBLAS_OP_T,
            N, M, K,
            dAlpha, dB, N, sB, dA, M, sA, dBeta, dC, N, sC, S
          );
        });
      }
    }
    else if(!tranA && tranB) {   // C = A * B^T (r-major)
      if(row_major) {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_sbatched(
            CUBLAS_OP_N, CUBLAS_OP_T,
            M, N, K,
            dAlpha, dA, K, sA, dB, K, sB, dBeta, dC, N, sC, S
          );
        });
      }
      else {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->gemm_sbatched(
            CUBLAS_OP_T, CUBLAS_OP_N,
            N, M, K,
            dAlpha, dB, K, sB, dA, K, sA, dBeta, dC, N, sC, S
          );
        });
      }
    }
    else {                       // C = A^T * B^T (r-major)
      if(row_major) {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_sbatched(
            CUBLAS_OP_T, CUBLAS_OP_T,
            M, N, K,
            dAlpha, dA, M, sA, dB, K, sB, dBeta, dC, N, sC, S
          );
        });
      }
      else {
        gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
          flow.make_capturer<tf::cublasFlowCapturer>()->gemm_sbatched(
            CUBLAS_OP_T, CUBLAS_OP_T,
            N, M, K,
            dAlpha, dB, K, sB, dA, M, sA, dBeta, dC, N, sC, S
          );
        });
      }
    }

    auto copyC = cf.copy(hC, dC, S*M*N);

    gemm.succeed(alpha, beta, copyA, copyB)
        .precede(copyC);
  });

  // NOTE(review): verification/cleanup body was lost in extraction;
  // restored as a per-slice comparison against `golden` plus cleanup.
  auto verify = taskflow.emplace([&](){
    for(size_t s=0; s<S; s++) {
      for(size_t i=0; i<golden.size(); ++i) {
        REQUIRE(std::fabs(hC[s*M*N + i] - golden[i]) < 0.0001);
      }
    }
    tf::cuda_free(dA);
    tf::cuda_free(dB);
    tf::cuda_free(dC);
    tf::cuda_free(dAlpha);
    tf::cuda_free(dBeta);
    delete [] hC;
  });

  cudaflow.precede(verify);

  executor.run(taskflow).wait();
}

// C = A * B
template <typename T>
void gemm_strided_batched_nn(bool row_major) {
  const int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 12, 13,
    14, 15, 16
  };  // MxK
  const std::vector<T> hB = {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };  // KxN
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  // replicate the matrices S times back-to-back
  std::vector<T> hAs, hBs;
  for(size_t s=0; s<S; s++) {
    hAs.insert(hAs.end(), hA.begin(), hA.end());
    hBs.insert(hBs.end(), hB.begin(), hB.end());
  }
  gemm_strided_batched<T>(
    row_major, M, N, K, hAs.data(), hBs.data(), golden, false, false
  );
}

// C = A^T * B
template <typename T>
void gemm_strided_batched_tn(bool row_major) {
  const int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 14,
    12, 15,
    13, 16
  };  // KxM
  const std::vector<T> hB = {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };  // KxN
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  std::vector<T> hAs, hBs;
  for(size_t s=0; s<S; s++) {
    hAs.insert(hAs.end(), hA.begin(), hA.end());
    hBs.insert(hBs.end(), hB.begin(), hB.end());
  }
  gemm_strided_batched<T>(
    row_major, M, N, K, hAs.data(), hBs.data(), golden, true, false
  );
}

// C = A * B^T
template <typename T>
void gemm_strided_batched_nt(bool row_major) {
  const int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 12, 13,
    14, 15, 16
  };  // MxK
  const std::vector<T> hB = {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };  // NxK
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  std::vector<T> hAs, hBs;
  for(size_t s=0; s<S; s++) {
    hAs.insert(hAs.end(), hA.begin(), hA.end());
    hBs.insert(hBs.end(), hB.begin(), hB.end());
  }
  gemm_strided_batched<T>(
    row_major, M, N, K, hAs.data(), hBs.data(), golden, false, true
  );
}

// C = A^T * B^T
template <typename T>
void gemm_strided_batched_tt(bool row_major) {
  const int N = 4, M = 2, K = 3;
  const std::vector<T> hA = {
    11, 14,
    12, 15,
    13, 16
  };  // KxM
  const std::vector<T> hB = {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };  // NxK
  const std::vector<T> golden = {
    548, 584, 620, 656,
    683, 728, 773, 818
  };  // MxN
  std::vector<T> hAs, hBs;
  for(size_t s=0; s<S; s++) {
    hAs.insert(hAs.end(), hA.begin(), hA.end());
    hBs.insert(hBs.end(), hB.begin(), hB.end());
  }
  gemm_strided_batched<T>(
    row_major, M, N, K, hAs.data(), hBs.data(), golden, true, true
  );
}

// gemm_strided_batched (column-major)
TEST_CASE("gemm_strided_batched_nn.float" * doctest::timeout(300)) {
  gemm_strided_batched_nn<float>(false);
}
TEST_CASE("gemm_strided_batched_tn.float" * doctest::timeout(300)) {
  gemm_strided_batched_tn<float>(false);
}
TEST_CASE("gemm_strided_batched_nt.float" * doctest::timeout(300)) {
  gemm_strided_batched_nt<float>(false);
}
TEST_CASE("gemm_strided_batched_tt.float" * doctest::timeout(300)) {
  gemm_strided_batched_tt<float>(false);
}
TEST_CASE("gemm_strided_batched_nn.double" * doctest::timeout(300)) {
  gemm_strided_batched_nn<double>(false);
}
TEST_CASE("gemm_strided_batched_tn.double" * doctest::timeout(300)) {
  gemm_strided_batched_tn<double>(false);
}
TEST_CASE("gemm_strided_batched_nt.double" * doctest::timeout(300)) {
  gemm_strided_batched_nt<double>(false);
}
TEST_CASE("gemm_strided_batched_tt.double" * doctest::timeout(300)) {
  gemm_strided_batched_tt<double>(false);
}

// c_gemm_strided_batched (row-major)
TEST_CASE("c_gemm_strided_batched_nn.float" * doctest::timeout(300)) {
  gemm_strided_batched_nn<float>(true);
}
TEST_CASE("c_gemm_strided_batched_tn.float" * doctest::timeout(300)) {
  gemm_strided_batched_tn<float>(true);
}
TEST_CASE("c_gemm_strided_batched_nt.float" * doctest::timeout(300)) {
  gemm_strided_batched_nt<float>(true);
}
TEST_CASE("c_gemm_strided_batched_tt.float" * doctest::timeout(300)) {
  gemm_strided_batched_tt<float>(true);
}
TEST_CASE("c_gemm_strided_batched_nn.double" * doctest::timeout(300)) {
  gemm_strided_batched_nn<double>(true);
}
TEST_CASE("c_gemm_strided_batched_tn.double" * doctest::timeout(300)) {
  gemm_strided_batched_tn<double>(true);
}
TEST_CASE("c_gemm_strided_batched_nt.double" * doctest::timeout(300)) {
  gemm_strided_batched_nt<double>(true);
}
TEST_CASE("c_gemm_strided_batched_tt.double" * doctest::timeout(300)) {
  gemm_strided_batched_tt<double>(true);
}

// ----------------------------------------------------------------------------
// symm
// ----------------------------------------------------------------------------

// Row-major symmetric multiply C = 1*A*B + 0*C with A the lower-triangular
// symmetric 3x3 block embedded (offset 7) in a 4x6 padded buffer.
template <typename T>
void symm_test() {

  int M = 3;
  int N = 4;
  int LA = 6, LB = 6, LC = N;

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  const std::vector<T> hB = {
    -1, -1, -1, -1, -1, -1,
    -1,  1,  1,  3,  1, -1,
    -1,  1,  4,  1,  1, -1,
    -1,  1,  1,  7,  1, -1
  };

  const std::vector<T> gold = {
     4,  7, 14,  4,
     4, 10, 12,  4,
     4,  7, 18,  4
  };

  std::vector<T> hC(M*N);

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA     = tf::cuda_malloc_device<T>(hA.size());
  auto dB     = tf::cuda_malloc_device<T>(hB.size());
  auto dC     = tf::cuda_malloc_device<T>(hC.size());
  auto dalpha = tf::cuda_malloc_device<T>(1);
  auto dbeta  = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas  = capturer.make_capturer<tf::cublasFlowCapturer>();
    auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
    auto beta  = capturer.single_task([=] __device__ () { *dbeta  = 0; });
    auto h2dA  = capturer.copy(dA, hA.data(), hA.size());
    auto h2dB  = capturer.copy(dB, hB.data(), hB.size());
    auto symm  = blas->c_symm(
      CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
      M, N,
      dalpha, dA + 7, LA, dB + 7, LB, dbeta, dC, LC
    );
    auto d2hC  = capturer.copy(hC.data(), dC, hC.size());
    symm.succeed(h2dA, h2dB, alpha, beta)
        .precede(d2hC);
  });

  executor.run(taskflow).wait();

  // NOTE(review): verification tail was lost in extraction; restored as
  // element-wise comparison plus release of the device buffers.
  for(size_t i=0; i<hC.size(); ++i) {
    REQUIRE(std::fabs(hC[i] - gold[i]) < 0.0001);
  }

  tf::cuda_free(dA);
  tf::cuda_free(dB);
  tf::cuda_free(dC);
  tf::cuda_free(dalpha);
  tf::cuda_free(dbeta);
}

// NOTE(review): the "c_symm.float" case was swallowed by the garbling;
// restored for symmetry with every other test pair.
TEST_CASE("c_symm.float" * doctest::timeout(300)) { symm_test<float>(); }
TEST_CASE("c_symm.double" * doctest::timeout(300)) { symm_test<double>(); }

// ----------------------------------------------------------------------------
// syrk
// ----------------------------------------------------------------------------

// Row-major rank-K update C = 1*A*A^T + 1*C on padded buffers (offset 7).
template <typename T>
void syrk_test() {

  int N = 3;
  int K = 4;
  int LA = 6, LC = 6;

  std::vector<T> hC = {
    -1, -1, -1, -1, -1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  1,  1,  3,  1, -1,
    -1,  1,  4,  1,  1, -1,
    -1,  1,  1,  7,  1, -1
  };

  const std::vector<T> gold = {
    -1, -1, -1, -1, -1, -1,
    -1, 14,  0,  0, -1, -1,
    -1, 10, 21,  0, -1, -1,
    -1, 25, 14, 54, -1, -1
  };

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA     = tf::cuda_malloc_device<T>(hA.size());
  auto dC     = tf::cuda_malloc_device<T>(hC.size());
  auto dalpha = tf::cuda_malloc_device<T>(1);
  auto dbeta  = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas  = capturer.make_capturer<tf::cublasFlowCapturer>();
    auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
    auto beta  = capturer.single_task([=] __device__ () { *dbeta  = 1; });
    auto h2dA  = capturer.copy(dA, hA.data(), hA.size());
    auto h2dC  = capturer.copy(dC, hC.data(), hC.size());
    auto syrk  = blas->c_syrk(
      CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
      N, K,
      dalpha, dA + 7, LA, dbeta, dC + 7, LC
    );
    auto d2hC  = capturer.copy(hC.data(), dC, hC.size());
    syrk.succeed(h2dA, h2dC, alpha, beta)
        .precede(d2hC);
  });

  executor.run(taskflow).wait();

  //print_matrix(4, 6, hC);

  for(size_t i=0; i<hC.size(); ++i) {
    REQUIRE(std::fabs(hC[i] - gold[i]) < 0.0001);
  }

  tf::cuda_free(dA);
  tf::cuda_free(dC);
  tf::cuda_free(dalpha);
  tf::cuda_free(dbeta);
}

// NOTE(review): the "c_syrk.float" case was swallowed by the garbling;
// restored for symmetry with every other test pair.
TEST_CASE("c_syrk.float" * doctest::timeout(300)) { syrk_test<float>(); }
TEST_CASE("c_syrk.double" * doctest::timeout(300)) { syrk_test<double>(); }

// ----------------------------------------------------------------------------
// syr2k
// ----------------------------------------------------------------------------

// Row-major rank-2K update C = 1*(A*B^T + B*A^T) + 1*C on padded buffers.
template <typename T>
void syr2k_test() {

  int N = 3;
  int K = 4;
  int LA = 6, LC = 6, LB = 6;

  std::vector<T> hC = {
    -1, -1, -1, -1, -1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  1,  1,  3,  1, -1,
    -1,  1,  4,  1,  1, -1,
    -1,  1,  1,  7,  1, -1
  };

  const std::vector<T> hB = {
    -1, -1, -1, -1, -1, -1,
    -1,  1, 10,  2,  9, -1,
    -1,  8, 14,  2,  1, -1,
    -1, 13,  3,  1,  4, -1
  };

  const std::vector<T> gold = {
    -1, -1, -1, -1, -1, -1,
    -1, 54,  0,  0, -1, -1,
    -1, 82, 136, 0, -1, -1,
    -1, 58, 68, 56, -1, -1
  };

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA     = tf::cuda_malloc_device<T>(hA.size());
  auto dB     = tf::cuda_malloc_device<T>(hB.size());
  auto dC     = tf::cuda_malloc_device<T>(hC.size());
  auto dalpha = tf::cuda_malloc_device<T>(1);
  auto dbeta  = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas  = capturer.make_capturer<tf::cublasFlowCapturer>();
    auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
    auto beta  = capturer.single_task([=] __device__ () { *dbeta  = 1; });
    auto h2dA  = capturer.copy(dA, hA.data(), hA.size());
    auto h2dB  = capturer.copy(dB, hB.data(), hB.size());
    auto h2dC  = capturer.copy(dC, hC.data(), hC.size());
    auto syr2k = blas->c_syr2k(
      CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
      N, K,
      dalpha, dA + 7, LA, dB + 7, LB, dbeta, dC + 7, LC
    );
    auto d2hC  = capturer.copy(hC.data(), dC, hC.size());
    syr2k.succeed(h2dA, h2dC, h2dB, alpha, beta)
         .precede(d2hC);
  });

  executor.run(taskflow).wait();

  //print_matrix(4, 6, hC);

  for(size_t i=0; i<hC.size(); ++i) {
    REQUIRE(std::fabs(hC[i] - gold[i]) < 0.0001);
  }

  tf::cuda_free(dA);
  tf::cuda_free(dB);
  tf::cuda_free(dC);
  tf::cuda_free(dalpha);
  tf::cuda_free(dbeta);
}

// NOTE(review): the "c_syr2k.float" case was swallowed by the garbling;
// restored for symmetry with every other test pair.
TEST_CASE("c_syr2k.float" * doctest::timeout(300)) { syr2k_test<float>(); }
TEST_CASE("c_syr2k.double" * doctest::timeout(300)) { syr2k_test<double>(); }

// ----------------------------------------------------------------------------
// trmm
// ----------------------------------------------------------------------------

// Row-major triangular multiply C = 1*A*B with A lower-triangular; C is
// pre-filled with -1 on device so the padding can be checked afterwards.
template <typename T>
void trmm_test() {

  int N = 4;
  int M = 3;
  int LA = 6, LC = 6, LB = 6;

  std::vector<T> hC = {
    -1,  1,  1,  1,  1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  1,  0,  0, -1, -1,
    -1,  1,  4,  0, -1, -1,
    -1,  1,  1,  7, -1, -1
  };

  const std::vector<T> hB = {
    -1, -1, -1, -1, -1, -1,
    -1,  1, 10,  2,  9, -1,
    -1,  8, 14,  2,  1, -1,
    -1, 13,  3,  1,  4, -1
  };

  const std::vector<T> gold = {
    -1, -1, -1, -1, -1, -1,
    -1,  1, 10,  2,  9, -1,
    -1, 33, 66, 10, 13, -1,
    -1, 100, 45, 11, 38, -1
  };

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA     = tf::cuda_malloc_device<T>(hA.size());
  auto dB     = tf::cuda_malloc_device<T>(hB.size());
  auto dC     = tf::cuda_malloc_device<T>(hC.size());
  auto dalpha = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas  = capturer.make_capturer<tf::cublasFlowCapturer>();
    auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
    auto h2dA  = capturer.copy(dA, hA.data(), hA.size());
    auto h2dB  = capturer.copy(dB, hB.data(), hB.size());
    // initialize the whole C buffer so the untouched padding is -1
    auto setC  = capturer.for_each(
      dC, dC + hC.size(), [] __device__ (T& v) { v = -1; }
    );
    auto trmm  = blas->c_trmm(
      CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
      CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
      M, N,
      dalpha, dA + 7, LA, dB + 7, LB, dC + 7, LC
    );
    auto d2hC  = capturer.copy(hC.data(), dC, hC.size());
    trmm.succeed(h2dA, h2dB, alpha, setC)
        .precede(d2hC);
  });

  executor.run(taskflow).wait();

  //print_matrix(4, 6, hC);

  for(size_t i=0; i<hC.size(); ++i) {
    REQUIRE(std::fabs(hC[i] - gold[i]) < 0.0001);
  }

  tf::cuda_free(dA);
  tf::cuda_free(dB);
  tf::cuda_free(dC);
  tf::cuda_free(dalpha);
}

// NOTE(review): the "c_trmm.float" case was swallowed by the garbling;
// restored for symmetry with every other test pair.
TEST_CASE("c_trmm.float" * doctest::timeout(300)) { trmm_test<float>(); }
TEST_CASE("c_trmm.double" * doctest::timeout(300)) { trmm_test<double>(); }

// ----------------------------------------------------------------------------
// trsm
// ----------------------------------------------------------------------------

// Row-major triangular solve A*X = 1*B with A lower-triangular 3x3
// (embedded at offset 7); B is overwritten with the solution X.
template <typename T>
void trsm_test() {

  int N = 2;
  int M = 3;
  int LA = 6;
  int LB = 2;

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  std::vector<T> hB = {
    5, 10,
    4,  8,
    7, 14
  };

  const std::vector<T> sol = {
    2.5,   5,
    0.75,  1.5,
    1.875, 3.75
  };

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA     = tf::cuda_malloc_device<T>(hA.size());
  auto dB     = tf::cuda_malloc_device<T>(hB.size());
  auto dAlpha = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas  = capturer.make_capturer<tf::cublasFlowCapturer>();
    auto alpha = capturer.single_task([=] __device__ () { *dAlpha = 1; });
    auto h2dA  = capturer.copy(dA, hA.data(), hA.size());
    auto h2dB  = capturer.copy(dB, hB.data(), hB.size());
    auto trsm  = blas->c_trsm(
      CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
      CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
      M, N,
      dAlpha, dA + 7, LA, dB, LB
    );
    auto d2h   = capturer.copy(hB.data(), dB, hB.size());
    trsm.succeed(h2dA, h2dB, alpha)
        .precede(d2h);
  });

  executor.run(taskflow).wait();

  //print_matrix(3, 2, hB);

  for(size_t i=0; i<hB.size(); ++i) {
    REQUIRE(std::fabs(hB[i] - sol[i]) < 0.0001);
  }

  tf::cuda_free(dA);
  tf::cuda_free(dB);
  tf::cuda_free(dAlpha);
}

// NOTE(review): the "c_trsm.float" case was swallowed by the garbling;
// restored for symmetry with every other test pair.
TEST_CASE("c_trsm.float" * doctest::timeout(300)) { trsm_test<float>(); }
TEST_CASE("c_trsm.double" * doctest::timeout(300)) { trsm_test<double>(); }