mesytec-mnode/external/taskflow-3.8.0/sandbox/cublas_unittests/level3.cu

1732 lines
41 KiB
Text
Raw Normal View History

2025-01-04 01:25:05 +01:00
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cublasflow.hpp>
// ----------------------------------------------------------------------------
// utilities
// ----------------------------------------------------------------------------
// Returns the M x N row-major transpose of `in`.
//
// `in` is read as an N x M row-major matrix (N rows of M values) and the
// result satisfies out(i, j) = in(j, i) for 0 <= i < M, 0 <= j < N.
// The source row stride is therefore M; using N there only works for square
// matrices and reads past the end of `in` whenever N > M.
//
// The returned vector has the same number of elements as `in`.
template <typename T>
std::vector<T> transpose(int M, int N, const std::vector<T>& in) {
  std::vector<T> out(in.size());
  for(int i=0; i<M; i++) {
    for(int j=0; j<N; j++) {
      out[i*N + j] = in[j*M + i];
    }
  }
  return out;
}
// Prints mat[0 .. M*N) to std::cout as M lines holding N values each.
// Every value is followed by one space and every line is closed with '\n'.
template <typename T>
void print_matrix(int M, int N, const std::vector<T>& mat) {
  for(int row=0; row<M; ++row) {
    const int base = row*N;   // index of the first value on this line
    for(int col=0; col<N; ++col) {
      std::cout << mat[base + col] << ' ';
    }
    std::cout << '\n';
  }
}
// ----------------------------------------------------------------------------
template <typename T>
void geam(
bool row_major,
const int M,
const int N,
const std::vector<T>& hA,
const std::vector<T>& hB,
const std::vector<T>& golden,
bool tranA,
bool tranB
) {
tf::Taskflow taskflow;
tf::Executor executor;
for(size_t d=0; d<tf::cuda_get_num_devices(); d++) {
auto dA = tf::cuda_malloc_device<T>(M*N, d);
auto dB = tf::cuda_malloc_device<T>(M*N, d);
auto dC = tf::cuda_malloc_device<T>(M*N, d);
auto dAlpha = tf::cuda_malloc_device<T>(1, d);
auto dBeta = tf::cuda_malloc_device<T>(1, d);
T* hC = new T[N*M];
auto cudaflow = taskflow.emplace_on([=, &hA, &hB](tf::cudaFlow& cf){
REQUIRE(tf::cuda_get_device() == d);
auto copyA = cf.copy(dA, hA.data(), M*N);
auto copyB = cf.copy(dB, hB.data(), M*N);
auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; });
auto beta = cf.single_task([=] __device__ () { *dBeta = 2; });
tf::cudaTask geam;
if(tranA && !tranB) { // C = A^T + B
if (row_major) {
geam = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->c_geam(
CUBLAS_OP_T, CUBLAS_OP_N,
M, N, dAlpha, dA, M, dBeta, dB, N, dC, N
);
});
}
else {
geam = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->geam(
CUBLAS_OP_T, CUBLAS_OP_N,
N, M, dAlpha, dA, M, dBeta, dB, N, dC, N
);
});
}
}
else if(!tranA && !tranB) { // C = A + B (r-major)
if (row_major) {
geam = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->c_geam(
CUBLAS_OP_N, CUBLAS_OP_N,
M, N, dAlpha, dA, N, dBeta, dB, N, dC, N
);
});
}
else {
geam = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->geam(
CUBLAS_OP_N, CUBLAS_OP_N,
N, M, dAlpha, dA, N, dBeta, dB, N, dC, N
);
});
}
}
else if(!tranA && tranB) { // C = A + B^T (r-major)
if(row_major) {
geam = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->c_geam(
CUBLAS_OP_N, CUBLAS_OP_T,
M, N, dAlpha, dA, N, dBeta, dB, M, dC, N
);
});
}
else {
geam = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->geam(
CUBLAS_OP_N, CUBLAS_OP_T,
N, M, dAlpha, dA, N, dBeta, dB, M, dC, N
);
});
}
}
else { // C = A^T * B^T (r-major)
if (row_major) {
geam = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->c_geam(
CUBLAS_OP_T, CUBLAS_OP_T,
M, N, dAlpha, dA, M, dBeta, dB, M, dC, N
);
});
}
else {
geam = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->geam(
CUBLAS_OP_T, CUBLAS_OP_T,
N, M, dAlpha, dA, M, dBeta, dB, M, dC, N
);
});
}
}
auto copyC = cf.copy(hC, dC, M*N);
geam.precede(copyC)
.succeed(copyA, copyB, alpha, beta);
}, d);
auto verify = taskflow.emplace([=, &golden](){
for(size_t i=0; i<golden.size(); i++) {
REQUIRE(std::fabs(hC[i]-golden[i]) < 0.0001);
}
tf::cuda_free(dA);
tf::cuda_free(dB);
tf::cuda_free(dC);
tf::cuda_free(dAlpha);
tf::cuda_free(dBeta);
delete [] hC;
});
cudaflow.precede(verify);
}
executor.run(taskflow).wait();
}
// C = A^T + B
template <typename T>
void geam_tn(bool row_major) {
int M = 2, N = 3;
const std::vector<T> hA = {
11, 14,
12, 15,
13, 16
}; // 3x2
const std::vector<T> hB = {
1, 1, 1,
-1, -1, -1
}; // 2x3
const std::vector<T> golden = {
13, 14, 15,
12, 13, 14
}; // 2x3
geam<T>(row_major, M, N, hA, hB, golden, true, false);
}
// C = A + B
template <typename T>
void geam_nn(bool row_major) {
int M = 2, N = 3;
const std::vector<T> hA = {
11, 12, 13,
14, 15, 16
}; // 2x3
const std::vector<T> hB = {
1, 1, 1,
-1, -1, -1
}; // 2x3
const std::vector<T> golden = {
13, 14, 15,
12, 13, 14
}; // 2x3
geam<T>(row_major, M, N, hA, hB, golden, false, false);
}
// C = A + B^T
template <typename T>
void geam_nt(bool row_major) {
int M = 2, N = 3;
const std::vector<T> hA = {
11, 12, 13,
14, 15, 16
}; // 2x3
const std::vector<T> hB = {
1, -1,
1, -1,
1, -1
}; // 3x2
const std::vector<T> golden = {
13, 14, 15,
12, 13, 14
}; // 2x3
geam<T>(row_major, M, N, hA, hB, golden, false, true);
}
// C = A^T + B^T
template <typename T>
void geam_tt(bool row_major) {
int M = 2, N = 3;
const std::vector<T> hA = {
11, 14,
12, 15,
13, 16
}; // 3x2
const std::vector<T> hB = {
1, -1,
1, -1,
1, -1
}; // 3x2
const std::vector<T> golden = {
13, 14, 15,
12, 13, 14
}; // 2x3
geam<T>(row_major, M, N, hA, hB, golden, true, true);
}
// column major: row_major == false makes geam() capture
// tf::cublasFlowCapturer::geam. Each case has a 300 second doctest timeout.
TEST_CASE("geam_tn.float" * doctest::timeout(300)) {
geam_tn<float>(false);
}
TEST_CASE("geam_nn.float" * doctest::timeout(300)) {
geam_nn<float>(false);
}
TEST_CASE("geam_nt.float" * doctest::timeout(300)) {
geam_nt<float>(false);
}
TEST_CASE("geam_tt.float" * doctest::timeout(300)) {
geam_tt<float>(false);
}
TEST_CASE("geam_tn.double" * doctest::timeout(300)) {
geam_tn<double>(false);
}
TEST_CASE("geam_nn.double" * doctest::timeout(300)) {
geam_nn<double>(false);
}
TEST_CASE("geam_nt.double" * doctest::timeout(300)) {
geam_nt<double>(false);
}
TEST_CASE("geam_tt.double" * doctest::timeout(300)) {
geam_tt<double>(false);
}
// row major: row_major == true makes geam() capture
// tf::cublasFlowCapturer::c_geam with the same fixtures.
TEST_CASE("c_geam_tn.float" * doctest::timeout(300)) {
geam_tn<float>(true);
}
TEST_CASE("c_geam_nn.float" * doctest::timeout(300)) {
geam_nn<float>(true);
}
TEST_CASE("c_geam_nt.float" * doctest::timeout(300)) {
geam_nt<float>(true);
}
TEST_CASE("c_geam_tt.float" * doctest::timeout(300)) {
geam_tt<float>(true);
}
TEST_CASE("c_geam_tn.double" * doctest::timeout(300)) {
geam_tn<double>(true);
}
TEST_CASE("c_geam_nn.double" * doctest::timeout(300)) {
geam_nn<double>(true);
}
TEST_CASE("c_geam_nt.double" * doctest::timeout(300)) {
geam_nt<double>(true);
}
TEST_CASE("c_geam_tt.double" * doctest::timeout(300)) {
geam_tt<double>(true);
}
// ----------------------------------------------------------------------------
// Testcase: gemm and c_gemm
// ----------------------------------------------------------------------------
template <typename T>
void gemm(
bool row_major,
const int M,
const int N,
const int K,
const std::vector<T>& hA,
const std::vector<T>& hB,
const std::vector<T>& golden,
bool tranA,
bool tranB
) {
tf::Taskflow taskflow;
tf::Executor executor;
for(size_t d=0; d<tf::cuda_get_num_devices(); d++) {
auto dA = tf::cuda_malloc_device<T>(K*M, d);
auto dB = tf::cuda_malloc_device<T>(K*N, d);
auto dC = tf::cuda_malloc_device<T>(M*N, d);
auto dAlpha = tf::cuda_malloc_device<T>(1, d);
auto dBeta = tf::cuda_malloc_device<T>(1, d);
T* hC = new T[N*M];
auto cudaflow = taskflow.emplace_on([=, &hA, &hB](tf::cudaFlow& cf){
REQUIRE(tf::cuda_get_device() == d);
auto copyA = cf.copy(dA, hA.data(), K*M);
auto copyB = cf.copy(dB, hB.data(), K*N);
auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; });
auto beta = cf.single_task([=] __device__ () { *dBeta = 0; });
tf::cudaTask gemm;
if(tranA && !tranB) { // C = A^T * B (r-major)
if (row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm(
CUBLAS_OP_T, CUBLAS_OP_N,
M, N, K, dAlpha, dA, M, dB, N, dBeta, dC, N
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm(
CUBLAS_OP_N, CUBLAS_OP_T,
N, M, K, dAlpha, dB, N, dA, M, dBeta, dC, N
);
});
}
}
else if(!tranA && !tranB) { // C = A * B (r-major)
if (row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm(
CUBLAS_OP_N, CUBLAS_OP_N,
M, N, K, dAlpha, dA, K, dB, N, dBeta, dC, N
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm(
CUBLAS_OP_N, CUBLAS_OP_N,
N, M, K, dAlpha, dB, N, dA, K, dBeta, dC, N
);
});
}
}
else if(!tranA && tranB) { // C = A * B^T (r-major)
if(row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm(
CUBLAS_OP_N, CUBLAS_OP_T,
M, N, K, dAlpha, dA, K, dB, K, dBeta, dC, N
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm(
CUBLAS_OP_T, CUBLAS_OP_N,
N, M, K, dAlpha, dB, K, dA, K, dBeta, dC, N
);
});
}
}
else { // C = A^T * B^T (r-major)
if (row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm(
CUBLAS_OP_T, CUBLAS_OP_T,
M, N, K, dAlpha, dA, M, dB, K, dBeta, dC, N
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm(
CUBLAS_OP_T, CUBLAS_OP_T,
N, M, K, dAlpha, dB, K, dA, M, dBeta, dC, N
);
});
}
}
auto copyC = cf.copy(hC, dC, M*N);
gemm.precede(copyC)
.succeed(copyA, copyB, alpha, beta);
}, d);
auto verify = taskflow.emplace([=, &golden](){
for(size_t i=0; i<golden.size(); i++) {
REQUIRE(std::fabs(hC[i]-golden[i]) < 0.0001);
}
tf::cuda_free(dA);
tf::cuda_free(dB);
tf::cuda_free(dC);
tf::cuda_free(dAlpha);
tf::cuda_free(dBeta);
delete [] hC;
});
cudaflow.precede(verify);
}
executor.run(taskflow).wait();
}
// gemm fixture: C = A^T * B (gemm() uses alpha = 1, beta = 0).
template <typename T>
void gemm_tn(bool row_major) {

  const int M = 2;
  const int N = 4;
  const int K = 3;

  // 3x2, passed with tranA = true
  const std::vector<T> a {
    11, 14,
    12, 15,
    13, 16
  };

  // 3x4
  const std::vector<T> b {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };

  // 2x4
  const std::vector<T> expected {
    548, 584, 620, 656,
    683, 728, 773, 818
  };

  gemm<T>(row_major, M, N, K, a, b, expected, true, false);
}
// gemm fixture: C = A * B (gemm() uses alpha = 1, beta = 0).
template <typename T>
void gemm_nn(bool row_major) {

  const int M = 2;
  const int N = 4;
  const int K = 3;

  // MxK
  const std::vector<T> a {
    11, 12, 13,
    14, 15, 16
  };

  // KxN
  const std::vector<T> b {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };

  // MxN
  const std::vector<T> expected {
    548, 584, 620, 656,
    683, 728, 773, 818
  };

  gemm<T>(row_major, M, N, K, a, b, expected, false, false);
}
// gemm fixture: C = A * B^T (gemm() uses alpha = 1, beta = 0).
template <typename T>
void gemm_nt(bool row_major) {

  const int M = 2;
  const int N = 4;
  const int K = 3;

  // MxK
  const std::vector<T> a {
    11, 12, 13,
    14, 15, 16
  };

  // NxK, passed with tranB = true
  const std::vector<T> b {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };

  // MxN
  const std::vector<T> expected {
    548, 584, 620, 656,
    683, 728, 773, 818
  };

  gemm<T>(row_major, M, N, K, a, b, expected, false, true);
}
// gemm fixture: C = A^T * B^T (gemm() uses alpha = 1, beta = 0).
template <typename T>
void gemm_tt(bool row_major) {

  const int M = 2;
  const int N = 4;
  const int K = 3;

  // KxM, passed with tranA = true
  const std::vector<T> a {
    11, 14,
    12, 15,
    13, 16
  };

  // NxK, passed with tranB = true
  const std::vector<T> b {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };

  // MxN
  const std::vector<T> expected {
    548, 584, 620, 656,
    683, 728, 773, 818
  };

  gemm<T>(row_major, M, N, K, a, b, expected, true, true);
}
// gemm (column-major): row_major == false makes gemm() capture
// tf::cublasFlowCapturer::gemm. Each case has a 300 second doctest timeout.
TEST_CASE("gemm_nn.float" * doctest::timeout(300)) {
gemm_nn<float>(false);
}
TEST_CASE("gemm_nn.double" * doctest::timeout(300)) {
gemm_nn<double>(false);
}
TEST_CASE("gemm_tn.float" * doctest::timeout(300)) {
gemm_tn<float>(false);
}
TEST_CASE("gemm_tn.double" * doctest::timeout(300)) {
gemm_tn<double>(false);
}
TEST_CASE("gemm_nt.float" * doctest::timeout(300)) {
gemm_nt<float>(false);
}
TEST_CASE("gemm_nt.double" * doctest::timeout(300)) {
gemm_nt<double>(false);
}
TEST_CASE("gemm_tt.float" * doctest::timeout(300)) {
gemm_tt<float>(false);
}
TEST_CASE("gemm_tt.double" * doctest::timeout(300)) {
gemm_tt<double>(false);
}
// c_gemm (row_major): row_major == true makes gemm() capture
// tf::cublasFlowCapturer::c_gemm with the same fixtures.
TEST_CASE("c_gemm_nn.float" * doctest::timeout(300)) {
gemm_nn<float>(true);
}
TEST_CASE("c_gemm_nn.double" * doctest::timeout(300)) {
gemm_nn<double>(true);
}
TEST_CASE("c_gemm_tn.float" * doctest::timeout(300)) {
gemm_tn<float>(true);
}
TEST_CASE("c_gemm_tn.double" * doctest::timeout(300)) {
gemm_tn<double>(true);
}
TEST_CASE("c_gemm_nt.float" * doctest::timeout(300)) {
gemm_nt<float>(true);
}
TEST_CASE("c_gemm_nt.double" * doctest::timeout(300)) {
gemm_nt<double>(true);
}
TEST_CASE("c_gemm_tt.float" * doctest::timeout(300)) {
gemm_tt<float>(true);
}
TEST_CASE("c_gemm_tt.double" * doctest::timeout(300)) {
gemm_tt<double>(true);
}
// ----------------------------------------------------------------------------
// Testcase: gemm_batched and c_gemm_batched
// ----------------------------------------------------------------------------
// Number of matrices per batch. Used by the batched and strided-batched gemm
// tests to size the device/host buffers and as the batch count passed to the
// cuBLAS capturer calls.
constexpr size_t S = 10;
template <typename T>
void gemm_batched(
bool row_major,
const int M,
const int N,
const int K,
const T* hA[],
const T* hB[],
const std::vector<T>& golden,
bool tranA,
bool tranB
) {
tf::Taskflow taskflow;
tf::Executor executor;
int d = 0;
auto dA = tf::cuda_malloc_device<T>(S*K*M, d);
auto dB = tf::cuda_malloc_device<T>(S*K*N, d);
auto dC = tf::cuda_malloc_device<T>(S*M*N, d);
auto dAlpha = tf::cuda_malloc_device<T>(1, d);
auto dBeta = tf::cuda_malloc_device<T>(1, d);
auto hC = new T[S*M*N];
auto dAs = tf::cuda_malloc_device<T*>(S, d);
auto dBs = tf::cuda_malloc_device<T*>(S, d);
auto dCs = tf::cuda_malloc_device<T*>(S, d);
auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf){
tf::cudaTask copyA[S], copyB[S];
for(size_t s=0; s<S; s++) {
copyA[s] = cf.copy(dA + s*K*M, hA[s], K*M);
copyB[s] = cf.copy(dB + s*K*N, hB[s], K*N);
}
auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; });
auto beta = cf.single_task([=] __device__ () { *dBeta = 0; });
auto array = cf.single_task([=] __device__ () {
for(size_t s=0; s<S; s++) {
dAs[s] = dA + s*K*M;
dBs[s] = dB + s*K*N;
dCs[s] = dC + s*M*N;
}
});
tf::cudaTask gemm;
if(!tranA && !tranB) { // C = A * B (r-major)
if (row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_batched(CUBLAS_OP_N, CUBLAS_OP_N,
M, N, K, dAlpha, (const T**)dAs, K, (const T**)dBs, N, dBeta, dCs, N, S
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm_batched(CUBLAS_OP_N, CUBLAS_OP_N,
N, M, K, dAlpha, (const T**)dBs, N, (const T**)dAs, K, dBeta, dCs, N, S
);
});
}
}
else if(tranA && !tranB) { // C = A^T * B (r-major)
if (row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_batched(CUBLAS_OP_T, CUBLAS_OP_N,
M, N, K, dAlpha, (const T**)dAs, M, (const T**)dBs, N, dBeta, dCs, N, S
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm_batched(CUBLAS_OP_N, CUBLAS_OP_T,
N, M, K, dAlpha, (const T**)dBs, N, (const T**)dAs, M, dBeta, dCs, N, S
);
});
}
}
else if(!tranA && tranB) { // C = A * B^T (r-major)
if(row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_batched(CUBLAS_OP_N, CUBLAS_OP_T,
M, N, K, dAlpha, (const T**)dAs, K, (const T**)dBs, K, dBeta, dCs, N, S
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm_batched(CUBLAS_OP_T, CUBLAS_OP_N,
N, M, K, dAlpha, (const T**)dBs, K, (const T**)dAs, K, dBeta, dCs, N, S
);
});
}
}
else { // C = A^T * B^T (r-major)
if (row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_batched(CUBLAS_OP_T, CUBLAS_OP_T,
M, N, K, dAlpha, (const T**)dAs, M, (const T**)dBs, K, dBeta, dCs, N, S
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm_batched(CUBLAS_OP_T, CUBLAS_OP_T,
N, M, K, dAlpha, (const T**)dBs, K, (const T**)dAs, M, dBeta, dCs, N, S
);
});
}
}
gemm.succeed(alpha, beta, array);
for(size_t s=0; s<S; s++) {
auto copyC = cf.copy(hC, dC, S*M*N);
gemm.succeed(copyA[s], copyB[s])
.precede(copyC);
}
});
auto verify = taskflow.emplace([&](){
for(size_t s=0; s<S; s++) {
auto p = hC + s*M*N;
for(size_t i=0; i<golden.size(); i++) {
REQUIRE(std::fabs(p[i]-golden[i]) < 0.0001);
}
}
tf::cuda_free(dA);
tf::cuda_free(dB);
tf::cuda_free(dC);
tf::cuda_free(dAlpha);
tf::cuda_free(dBeta);
tf::cuda_free(dAs);
tf::cuda_free(dBs);
tf::cuda_free(dCs);
delete [] hC;
});
cudaflow.precede(verify);
executor.run(taskflow).wait();
}
// C = A * B
template <typename T>
void gemm_batched_nn(bool row_major) {
const int N = 4, M = 2, K = 3;
const std::vector<T> hA = {
11, 12, 13,
14, 15, 16
};
const std::vector<T> hB = {
11, 12, 13, 14,
15, 16, 17, 18,
19, 20, 21, 22
};
const std::vector<T> golden = {
548, 584, 620, 656,
683, 728, 773, 818
}; // MxN
const T* hAs[S];
const T* hBs[S];
for(size_t s=0; s<S; s++) {
hAs[s] = hA.data();
hBs[s] = hB.data();
}
gemm_batched<T>(row_major, M, N, K, hAs, hBs, golden, false, false);
}
// C = A^T * B
template <typename T>
void gemm_batched_tn(bool row_major) {
const int N = 4, M = 2, K = 3;
const std::vector<T> hA = {
11, 14,
12, 15,
13, 16
};
const std::vector<T> hB = {
11, 12, 13, 14,
15, 16, 17, 18,
19, 20, 21, 22
};
const std::vector<T> golden = {
548, 584, 620, 656,
683, 728, 773, 818
}; // MxN
const T* hAs[S];
const T* hBs[S];
for(size_t s=0; s<S; s++) {
hAs[s] = hA.data();
hBs[s] = hB.data();
}
gemm_batched<T>(row_major, M, N, K, hAs, hBs, golden, true, false);
}
// C = A * B^T
template <typename T>
void gemm_batched_nt(bool row_major) {
const int N = 4, M = 2, K = 3;
const std::vector<T> hA = {
11, 12, 13,
14, 15, 16
};
const std::vector<T> hB = {
11, 15, 19,
12, 16, 20,
13, 17, 21,
14, 18, 22
};
const std::vector<T> golden = {
548, 584, 620, 656,
683, 728, 773, 818
}; // MxN
const T* hAs[S];
const T* hBs[S];
for(size_t s=0; s<S; s++) {
hAs[s] = hA.data();
hBs[s] = hB.data();
}
gemm_batched<T>(row_major, M, N, K, hAs, hBs, golden, false, true);
}
// C = A^T * B^T
template <typename T>
void gemm_batched_tt(bool row_major) {
const int N = 4, M = 2, K = 3;
const std::vector<T> hA = {
11, 14,
12, 15,
13, 16
};
const std::vector<T> hB = {
11, 15, 19,
12, 16, 20,
13, 17, 21,
14, 18, 22
};
const std::vector<T> golden = {
548, 584, 620, 656,
683, 728, 773, 818
}; // MxN
const T* hAs[S];
const T* hBs[S];
for(size_t s=0; s<S; s++) {
hAs[s] = hA.data();
hBs[s] = hB.data();
}
gemm_batched<T>(row_major, M, N, K, hAs, hBs, golden, true, true);
}
// gemm_batched (column-major): row_major == false makes gemm_batched() capture
// tf::cublasFlowCapturer::gemm_batched. Each case has a 300 second doctest
// timeout.
TEST_CASE("gemm_batched_nn.float" * doctest::timeout(300)) {
gemm_batched_nn<float>(false);
}
TEST_CASE("gemm_batched_tn.float" * doctest::timeout(300)) {
gemm_batched_tn<float>(false);
}
TEST_CASE("gemm_batched_nt.float" * doctest::timeout(300)) {
gemm_batched_nt<float>(false);
}
TEST_CASE("gemm_batched_tt.float" * doctest::timeout(300)) {
gemm_batched_tt<float>(false);
}
TEST_CASE("gemm_batched_nn.double" * doctest::timeout(300)) {
gemm_batched_nn<double>(false);
}
TEST_CASE("gemm_batched_tn.double" * doctest::timeout(300)) {
gemm_batched_tn<double>(false);
}
TEST_CASE("gemm_batched_nt.double" * doctest::timeout(300)) {
gemm_batched_nt<double>(false);
}
TEST_CASE("gemm_batched_tt.double" * doctest::timeout(300)) {
gemm_batched_tt<double>(false);
}
// c_gemm_batched (row-major): row_major == true makes gemm_batched() capture
// tf::cublasFlowCapturer::c_gemm_batched with the same fixtures.
TEST_CASE("c_gemm_batched_nn.float" * doctest::timeout(300)) {
gemm_batched_nn<float>(true);
}
TEST_CASE("c_gemm_batched_tn.float" * doctest::timeout(300)) {
gemm_batched_tn<float>(true);
}
TEST_CASE("c_gemm_batched_nt.float" * doctest::timeout(300)) {
gemm_batched_nt<float>(true);
}
TEST_CASE("c_gemm_batched_tt.float" * doctest::timeout(300)) {
gemm_batched_tt<float>(true);
}
TEST_CASE("c_gemm_batched_nn.double" * doctest::timeout(300)) {
gemm_batched_nn<double>(true);
}
TEST_CASE("c_gemm_batched_tn.double" * doctest::timeout(300)) {
gemm_batched_tn<double>(true);
}
TEST_CASE("c_gemm_batched_nt.double" * doctest::timeout(300)) {
gemm_batched_nt<double>(true);
}
TEST_CASE("c_gemm_batched_tt.double" * doctest::timeout(300)) {
gemm_batched_tt<double>(true);
}
// ----------------------------------------------------------------------------
// Testcase: gemm_strided_batched
// ----------------------------------------------------------------------------
template <typename T>
void gemm_strided_batched(
bool row_major,
const int M,
const int N,
const int K,
const T* hA,
const T* hB,
const std::vector<T>& golden,
bool tranA,
bool tranB
) {
tf::Taskflow taskflow;
tf::Executor executor;
int d = 0;
auto dA = tf::cuda_malloc_device<T>(S*K*M, d);
auto dB = tf::cuda_malloc_device<T>(S*K*N, d);
auto dC = tf::cuda_malloc_device<T>(S*M*N, d);
auto dAlpha = tf::cuda_malloc_device<T>(1, d);
auto dBeta = tf::cuda_malloc_device<T>(1, d);
auto hC = new T[S*M*N];
int sA = K*M;
int sB = K*N;
int sC = M*N;
auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf){
auto copyA = cf.copy(dA, hA, S*K*M);
auto copyB = cf.copy(dB, hB, S*K*N);
auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; });
auto beta = cf.single_task([=] __device__ () { *dBeta = 0; });
tf::cudaTask gemm;
if(!tranA && !tranB) { // C = A * B (r-major)
if (row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_sbatched(
CUBLAS_OP_N, CUBLAS_OP_N,
M, N, K, dAlpha, dA, K, sA, dB, N, sB, dBeta, dC, N, sC, S
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm_sbatched(
CUBLAS_OP_N, CUBLAS_OP_N,
N, M, K, dAlpha, dB, N, sB, dA, K, sA, dBeta, dC, N, sC, S
);
});
}
}
else if(tranA && !tranB) { // C = A^T * B (r-major)
if (row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_sbatched(
CUBLAS_OP_T, CUBLAS_OP_N,
M, N, K, dAlpha, dA, M, sA, dB, N, sB, dBeta, dC, N, sC, S
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm_sbatched(
CUBLAS_OP_N, CUBLAS_OP_T,
N, M, K, dAlpha, dB, N, sB, dA, M, sA, dBeta, dC, N, sC, S
);
});
}
}
else if(!tranA && tranB) { // C = A * B^T (r-major)
if(row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_sbatched(
CUBLAS_OP_N, CUBLAS_OP_T,
M, N, K, dAlpha, dA, K, sA, dB, K, sB, dBeta, dC, N, sC, S
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm_sbatched(
CUBLAS_OP_T, CUBLAS_OP_N,
N, M, K, dAlpha, dB, K, sB, dA, K, sA, dBeta, dC, N, sC, S
);
});
}
}
else { // C = A^T * B^T (r-major)
if (row_major) {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->c_gemm_sbatched(
CUBLAS_OP_T, CUBLAS_OP_T,
M, N, K, dAlpha, dA, M, sA, dB, K, sB, dBeta, dC, N, sC, S
);
});
}
else {
gemm = cf.capture([&](tf::cudaFlowCapturer& flow){
flow.make_capturer<tf::cublasFlowCapturer>()->gemm_sbatched(
CUBLAS_OP_T, CUBLAS_OP_T,
N, M, K, dAlpha, dB, K, sB, dA, M, sA, dBeta, dC, N, sC, S
);
});
}
}
auto copyC = cf.copy(hC, dC, S*M*N);
gemm.succeed(alpha, beta, copyA, copyB)
.precede(copyC);
});
auto verify = taskflow.emplace([&](){
for(size_t s=0; s<S; s++) {
auto p = hC + s*M*N;
for(size_t i=0; i<golden.size(); i++) {
REQUIRE(std::fabs(p[i]-golden[i]) < 0.0001);
}
}
tf::cuda_free(dA);
tf::cuda_free(dB);
tf::cuda_free(dC);
tf::cuda_free(dAlpha);
tf::cuda_free(dBeta);
delete [] hC;
});
cudaflow.precede(verify);
executor.run(taskflow).wait();
}
// Strided-batched gemm fixture: C = A * B, with A and B repeated S times
// back to back.
template <typename T>
void gemm_strided_batched_nn(bool row_major) {

  const int M = 2;
  const int N = 4;
  const int K = 3;

  // MxK
  const std::vector<T> a {
    11, 12, 13,
    14, 15, 16
  };

  // KxN
  const std::vector<T> b {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };

  // MxN
  const std::vector<T> expected {
    548, 584, 620, 656,
    683, 728, 773, 818
  };

  // concatenate S copies of each input
  std::vector<T> batchA, batchB;
  for(size_t s=0; s<S; ++s) {
    batchA.insert(batchA.end(), a.begin(), a.end());
    batchB.insert(batchB.end(), b.begin(), b.end());
  }

  gemm_strided_batched<T>(
    row_major, M, N, K, batchA.data(), batchB.data(), expected, false, false
  );
}
// Strided-batched gemm fixture: C = A^T * B, with A and B repeated S times
// back to back.
template <typename T>
void gemm_strided_batched_tn(bool row_major) {

  const int M = 2;
  const int N = 4;
  const int K = 3;

  // KxM, passed with tranA = true
  const std::vector<T> a {
    11, 14,
    12, 15,
    13, 16
  };

  // KxN
  const std::vector<T> b {
    11, 12, 13, 14,
    15, 16, 17, 18,
    19, 20, 21, 22
  };

  // MxN
  const std::vector<T> expected {
    548, 584, 620, 656,
    683, 728, 773, 818
  };

  // concatenate S copies of each input
  std::vector<T> batchA, batchB;
  for(size_t s=0; s<S; ++s) {
    batchA.insert(batchA.end(), a.begin(), a.end());
    batchB.insert(batchB.end(), b.begin(), b.end());
  }

  gemm_strided_batched<T>(
    row_major, M, N, K, batchA.data(), batchB.data(), expected, true, false
  );
}
// Strided-batched gemm fixture: C = A * B^T, with A and B repeated S times
// back to back.
template <typename T>
void gemm_strided_batched_nt(bool row_major) {

  const int M = 2;
  const int N = 4;
  const int K = 3;

  // MxK
  const std::vector<T> a {
    11, 12, 13,
    14, 15, 16
  };

  // NxK, passed with tranB = true
  const std::vector<T> b {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };

  // MxN
  const std::vector<T> expected {
    548, 584, 620, 656,
    683, 728, 773, 818
  };

  // concatenate S copies of each input
  std::vector<T> batchA, batchB;
  for(size_t s=0; s<S; ++s) {
    batchA.insert(batchA.end(), a.begin(), a.end());
    batchB.insert(batchB.end(), b.begin(), b.end());
  }

  gemm_strided_batched<T>(
    row_major, M, N, K, batchA.data(), batchB.data(), expected, false, true
  );
}
// Strided-batched gemm fixture: C = A^T * B^T, with A and B repeated S times
// back to back.
template <typename T>
void gemm_strided_batched_tt(bool row_major) {

  const int M = 2;
  const int N = 4;
  const int K = 3;

  // KxM, passed with tranA = true
  const std::vector<T> a {
    11, 14,
    12, 15,
    13, 16
  };

  // NxK, passed with tranB = true
  const std::vector<T> b {
    11, 15, 19,
    12, 16, 20,
    13, 17, 21,
    14, 18, 22
  };

  // MxN
  const std::vector<T> expected {
    548, 584, 620, 656,
    683, 728, 773, 818
  };

  // concatenate S copies of each input
  std::vector<T> batchA, batchB;
  for(size_t s=0; s<S; ++s) {
    batchA.insert(batchA.end(), a.begin(), a.end());
    batchB.insert(batchB.end(), b.begin(), b.end());
  }

  gemm_strided_batched<T>(
    row_major, M, N, K, batchA.data(), batchB.data(), expected, true, true
  );
}
// gemm_strided_batched (column-major): row_major == false makes
// gemm_strided_batched() capture tf::cublasFlowCapturer::gemm_sbatched.
// Each case has a 300 second doctest timeout.
TEST_CASE("gemm_strided_batched_nn.float" * doctest::timeout(300)) {
gemm_strided_batched_nn<float>(false);
}
TEST_CASE("gemm_strided_batched_tn.float" * doctest::timeout(300)) {
gemm_strided_batched_tn<float>(false);
}
TEST_CASE("gemm_strided_batched_nt.float" * doctest::timeout(300)) {
gemm_strided_batched_nt<float>(false);
}
TEST_CASE("gemm_strided_batched_tt.float" * doctest::timeout(300)) {
gemm_strided_batched_tt<float>(false);
}
TEST_CASE("gemm_strided_batched_nn.double" * doctest::timeout(300)) {
gemm_strided_batched_nn<double>(false);
}
TEST_CASE("gemm_strided_batched_tn.double" * doctest::timeout(300)) {
gemm_strided_batched_tn<double>(false);
}
TEST_CASE("gemm_strided_batched_nt.double" * doctest::timeout(300)) {
gemm_strided_batched_nt<double>(false);
}
TEST_CASE("gemm_strided_batched_tt.double" * doctest::timeout(300)) {
gemm_strided_batched_tt<double>(false);
}
// gemm_strided_batched (row-major): row_major == true makes
// gemm_strided_batched() capture tf::cublasFlowCapturer::c_gemm_sbatched
// with the same fixtures.
TEST_CASE("c_gemm_strided_batched_nn.float" * doctest::timeout(300)) {
gemm_strided_batched_nn<float>(true);
}
TEST_CASE("c_gemm_strided_batched_tn.float" * doctest::timeout(300)) {
gemm_strided_batched_tn<float>(true);
}
TEST_CASE("c_gemm_strided_batched_nt.float" * doctest::timeout(300)) {
gemm_strided_batched_nt<float>(true);
}
TEST_CASE("c_gemm_strided_batched_tt.float" * doctest::timeout(300)) {
gemm_strided_batched_tt<float>(true);
}
TEST_CASE("c_gemm_strided_batched_nn.double" * doctest::timeout(300)) {
gemm_strided_batched_nn<double>(true);
}
TEST_CASE("c_gemm_strided_batched_tn.double" * doctest::timeout(300)) {
gemm_strided_batched_tn<double>(true);
}
TEST_CASE("c_gemm_strided_batched_nt.double" * doctest::timeout(300)) {
gemm_strided_batched_nt<double>(true);
}
TEST_CASE("c_gemm_strided_batched_tt.double" * doctest::timeout(300)) {
gemm_strided_batched_tt<double>(true);
}
// ----------------------------------------------------------------------------
// symm
// ----------------------------------------------------------------------------
// Tests tf::cublasFlowCapturer::c_symm (CUBLAS_SIDE_LEFT,
// CUBLAS_FILL_MODE_LOWER) with alpha = 1 and beta = 0.
//
// A and B live inside 4x6 host arrays padded with -1; the operands handed to
// c_symm start at offset 7 (row 1, column 1 of a row of length 6) and use a
// leading dimension of 6. The result C is a dense M x N (3x4) matrix with
// leading dimension N. `gold` equals the product of the symmetric 3x3 matrix
// described by A's lower triangle with the 3x4 block of B.
//
// All device buffers are released once the taskflow has finished, before the
// result is checked, so a failing REQUIRE does not leak device memory.
template <typename T>
void symm_test() {

  int M = 3;
  int N = 4;
  int LA = 6, LB = 6, LC = N;

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  const std::vector<T> hB = {
    -1, -1, -1, -1, -1, -1,
    -1,  1,  1,  3,  1, -1,
    -1,  1,  4,  1,  1, -1,
    -1,  1,  1,  7,  1, -1
  };

  const std::vector<T> gold = {
    4,  7, 14, 4,
    4, 10, 12, 4,
    4,  7, 18, 4
  };

  std::vector<T> hC(M*N);

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA = tf::cuda_malloc_device<T>(hA.size());
  auto dB = tf::cuda_malloc_device<T>(hB.size());
  auto dC = tf::cuda_malloc_device<T>(hC.size());
  auto dalpha = tf::cuda_malloc_device<T>(1);
  auto dbeta  = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();

    auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
    auto beta  = capturer.single_task([=] __device__ () { *dbeta  = 0; });
    auto h2dA = capturer.copy(dA, hA.data(), hA.size());
    auto h2dB = capturer.copy(dB, hB.data(), hB.size());

    // + 7 skips the padding row and the padding column
    auto symm = blas->c_symm(
      CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
      M, N, dalpha, dA + 7, LA, dB + 7, LB, dbeta, dC, LC
    );

    auto d2hC = capturer.copy(hC.data(), dC, hC.size());

    symm.succeed(h2dA, h2dB, alpha, beta)
        .precede(d2hC);
  });

  executor.run(taskflow).wait();

  // the result is on the host now; the device buffers are no longer needed
  tf::cuda_free(dA);
  tf::cuda_free(dB);
  tf::cuda_free(dC);
  tf::cuda_free(dalpha);
  tf::cuda_free(dbeta);

  for(size_t i=0; i<hC.size(); i++) {
    REQUIRE(std::fabs(hC[i] - gold[i]) < 0.0001);
  }
}
// c_symm in single and double precision; 300 second doctest timeout each.
TEST_CASE("c_symm.float" * doctest::timeout(300)) {
symm_test<float>();
}
TEST_CASE("c_symm.double" * doctest::timeout(300)) {
symm_test<double>();
}
// ----------------------------------------------------------------------------
// syrk
// ----------------------------------------------------------------------------
// Tests tf::cublasFlowCapturer::c_syrk (CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N)
// with alpha = 1 and beta = 1.
//
// A and C live inside 4x6 host arrays padded with -1; the operands handed to
// c_syrk start at offset 7 (row 1, column 1 of a row of length 6) and use a
// leading dimension of 6. The whole padded C buffer is copied back and
// compared with `gold`, so the test also checks that the padding and the
// entries above the diagonal of the 3x3 block keep their initial values.
//
// All device buffers are released once the taskflow has finished, before the
// result is checked, so a failing REQUIRE does not leak device memory.
template <typename T>
void syrk_test() {

  int N = 3;
  int K = 4;
  int LA = 6, LC = 6;

  std::vector<T> hC = {
    -1, -1, -1, -1, -1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  1,  1,  3,  1, -1,
    -1,  1,  4,  1,  1, -1,
    -1,  1,  1,  7,  1, -1
  };

  const std::vector<T> gold = {
    -1, -1, -1, -1, -1, -1,
    -1, 14,  0,  0, -1, -1,
    -1, 10, 21,  0, -1, -1,
    -1, 25, 14, 54, -1, -1
  };

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA = tf::cuda_malloc_device<T>(hA.size());
  auto dC = tf::cuda_malloc_device<T>(hC.size());
  auto dalpha = tf::cuda_malloc_device<T>(1);
  auto dbeta  = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();

    auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
    auto beta  = capturer.single_task([=] __device__ () { *dbeta  = 1; });
    auto h2dA = capturer.copy(dA, hA.data(), hA.size());
    auto h2dC = capturer.copy(dC, hC.data(), hC.size());

    // + 7 skips the padding row and the padding column
    auto syrk = blas->c_syrk(
      CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
      N, K, dalpha, dA + 7, LA, dbeta, dC + 7, LC
    );

    auto d2hC = capturer.copy(hC.data(), dC, hC.size());

    syrk.succeed(h2dA, h2dC, alpha, beta)
        .precede(d2hC);
  });

  executor.run(taskflow).wait();

  // the result is on the host now; the device buffers are no longer needed
  tf::cuda_free(dA);
  tf::cuda_free(dC);
  tf::cuda_free(dalpha);
  tf::cuda_free(dbeta);

  //print_matrix(4, 6, hC);

  for(size_t i=0; i<hC.size(); i++) {
    REQUIRE(std::fabs(hC[i] - gold[i]) < 0.0001);
  }
}
// c_syrk in single and double precision; 300 second doctest timeout each.
TEST_CASE("c_syrk.float" * doctest::timeout(300)) {
syrk_test<float>();
}
TEST_CASE("c_syrk.double" * doctest::timeout(300)) {
syrk_test<double>();
}
// ----------------------------------------------------------------------------
// syr2k
// ----------------------------------------------------------------------------
// Tests tf::cublasFlowCapturer::c_syr2k (CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N)
// with alpha = 1 and beta = 1.
//
// A, B and C live inside 4x6 host arrays padded with -1; the operands handed
// to c_syr2k start at offset 7 (row 1, column 1 of a row of length 6) and use
// a leading dimension of 6. The whole padded C buffer is copied back and
// compared with `gold`, so the test also checks that the padding and the
// entries above the diagonal of the 3x3 block keep their initial values.
//
// All device buffers are released once the taskflow has finished, before the
// result is checked, so a failing REQUIRE does not leak device memory.
template <typename T>
void syr2k_test() {

  int N = 3;
  int K = 4;
  int LA = 6, LC = 6, LB = 6;

  std::vector<T> hC = {
    -1, -1, -1, -1, -1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  1,  1,  3,  1, -1,
    -1,  1,  4,  1,  1, -1,
    -1,  1,  1,  7,  1, -1
  };

  const std::vector<T> hB = {
    -1, -1, -1, -1, -1, -1,
    -1,  1, 10,  2,  9, -1,
    -1,  8, 14,  2,  1, -1,
    -1, 13,  3,  1,  4, -1
  };

  const std::vector<T> gold = {
    -1, -1,  -1, -1, -1, -1,
    -1, 54,   0,  0, -1, -1,
    -1, 82, 136,  0, -1, -1,
    -1, 58,  68, 56, -1, -1
  };

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA = tf::cuda_malloc_device<T>(hA.size());
  auto dB = tf::cuda_malloc_device<T>(hB.size());
  auto dC = tf::cuda_malloc_device<T>(hC.size());
  auto dalpha = tf::cuda_malloc_device<T>(1);
  auto dbeta  = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();

    auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
    auto beta  = capturer.single_task([=] __device__ () { *dbeta  = 1; });
    auto h2dA = capturer.copy(dA, hA.data(), hA.size());
    auto h2dB = capturer.copy(dB, hB.data(), hB.size());
    auto h2dC = capturer.copy(dC, hC.data(), hC.size());

    // + 7 skips the padding row and the padding column
    auto syr2k = blas->c_syr2k(
      CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
      N, K, dalpha, dA + 7, LA, dB + 7, LB, dbeta, dC + 7, LC
    );

    auto d2hC = capturer.copy(hC.data(), dC, hC.size());

    syr2k.succeed(h2dA, h2dC, h2dB, alpha, beta)
         .precede(d2hC);
  });

  executor.run(taskflow).wait();

  // the result is on the host now; the device buffers are no longer needed
  tf::cuda_free(dA);
  tf::cuda_free(dB);
  tf::cuda_free(dC);
  tf::cuda_free(dalpha);
  tf::cuda_free(dbeta);

  //print_matrix(4, 6, hC);

  for(size_t i=0; i<hC.size(); i++) {
    REQUIRE(std::fabs(hC[i] - gold[i]) < 0.0001);
  }
}
// Single-precision instantiation of the c_syr2k check, registered with a
// doctest timeout decorator of 300.
TEST_CASE("c_syr2k.float" * doctest::timeout(300)) {
  syr2k_test<float>();
}
// Double-precision instantiation of the c_syr2k check, registered with a
// doctest timeout decorator of 300.
TEST_CASE("c_syr2k.double" * doctest::timeout(300)) {
  syr2k_test<double>();
}
// ----------------------------------------------------------------------------
// trmm
// ----------------------------------------------------------------------------
// Checks tf::cublasFlowCapturer::c_trmm against a hand-computed result.
//
// All host matrices are 4x6 arrays; the operands live inside a -1 border
// starting at flat index 7 (row 1, column 1 for a row length of 6), which is
// why every device pointer passed to c_trmm is offset by 7 and every leading
// dimension is 6.
//
// With alpha = 1, `gold` holds A*B for the 3x3 lower-triangular A and the
// 3x4 B read row by row, surrounded by the -1 border.
//
// Template parameter T is the element type (instantiated with float and
// double by the test cases in this file).
template <typename T>
void trmm_test() {

  int N = 4;                    // number of columns of B and C
  int M = 3;                    // number of rows of B and C (order of A)
  int LA = 6, LC = 6, LB = 6;   // row length of the padded host arrays

  // Only the size of hC matters here: its contents are never uploaded, and
  // the device-to-host copy below overwrites every element.
  std::vector<T> hC = {
    -1,  1,  1,  1,  1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  1,  0,  0, -1, -1,
    -1,  1,  4,  0, -1, -1,
    -1,  1,  1,  7, -1, -1
  };

  const std::vector<T> hB = {
    -1, -1, -1, -1, -1, -1,
    -1,  1, 10,  2,  9, -1,
    -1,  8, 14,  2,  1, -1,
    -1, 13,  3,  1,  4, -1
  };

  const std::vector<T> gold = {
    -1,  -1, -1, -1, -1, -1,
    -1,   1, 10,  2,  9, -1,
    -1,  33, 66, 10, 13, -1,
    -1, 100, 45, 11, 38, -1
  };

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA = tf::cuda_malloc_device<T>(hA.size());
  auto dB = tf::cuda_malloc_device<T>(hB.size());
  auto dC = tf::cuda_malloc_device<T>(hC.size());
  auto dalpha = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();

    // The scalar is passed to c_trmm as a device pointer, so it is written
    // by a single-thread device task.
    auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });

    auto h2dA = capturer.copy(dA, hA.data(), hA.size());
    auto h2dB = capturer.copy(dB, hB.data(), hB.size());

    // Fill the whole output buffer with -1 on the device so that the border
    // around the product matches `gold`.
    auto setC = capturer.for_each(dC, dC + hC.size(),
      []__device__(T& v) { v = -1; });

    auto trmm = blas->c_trmm(
      CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
      CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
      M, N, dalpha, dA + 7, LA, dB + 7, LB, dC + 7, LC
    );

    auto d2hC = capturer.copy(hC.data(), dC, hC.size());

    // Inputs and the -1 fill must be in place before the product is
    // computed; the result is copied back only afterwards.
    trmm.succeed(h2dA, h2dB, alpha, setC)
        .precede(d2hC);
  });

  executor.run(taskflow).wait();

  // Release the device buffers before asserting: a failing REQUIRE ends the
  // test case early, which would otherwise skip the cleanup and leak GPU
  // memory into the remaining test cases of this process.
  tf::cuda_free(dA);
  tf::cuda_free(dB);
  tf::cuda_free(dC);
  tf::cuda_free(dalpha);

  //print_matrix(4, 6, hC);

  for(size_t i=0; i<hC.size(); i++) {
    REQUIRE(std::fabs(hC[i] - gold[i]) < 0.0001);
  }
}
// Single-precision instantiation of the c_trmm check, registered with a
// doctest timeout decorator of 300.
TEST_CASE("c_trmm.float" * doctest::timeout(300)) {
  trmm_test<float>();
}
// Double-precision instantiation of the c_trmm check, registered with a
// doctest timeout decorator of 300.
TEST_CASE("c_trmm.double" * doctest::timeout(300)) {
  trmm_test<double>();
}
// ----------------------------------------------------------------------------
// trsm
// ----------------------------------------------------------------------------
// Checks tf::cublasFlowCapturer::c_trsm against a hand-computed result.
//
// A is a 3x3 lower-triangular matrix stored inside a 4x6 array with a -1
// border, starting at flat index 7 (row 1, column 1 for a row length of 6),
// hence the `dA + 7` pointer and LA = 6. B is a tightly packed 3x2 array
// (LB = 2) and is overwritten on the device with the result, which is then
// copied back into hB.
//
// With alpha = 1, `sol` holds the X that satisfies A*X = B for the matrices
// read row by row.
//
// Template parameter T is the element type (instantiated with float and
// double by the test cases in this file).
template <typename T>
void trsm_test() {

  int N = 2;    // number of columns of B
  int M = 3;    // number of rows of B (order of A)
  int LA = 6;   // row length of the padded array holding A
  int LB = 2;   // row length of B

  const std::vector<T> hA = {
    -1, -1, -1, -1, -1, -1,
    -1,  2,  0,  0, -1, -1,
    -1,  1,  2,  0, -1, -1,
    -1,  1,  1,  2, -1, -1
  };

  // Right-hand side on input, solution on output.
  std::vector<T> hB = {
    5, 10,
    4,  8,
    7, 14
  };

  const std::vector<T> sol = {
    2.5,   5,
    0.75,  1.5,
    1.875, 3.75
  };

  tf::Taskflow taskflow;
  tf::Executor executor;

  auto dA = tf::cuda_malloc_device<T>(hA.size());
  auto dB = tf::cuda_malloc_device<T>(hB.size());
  auto dAlpha = tf::cuda_malloc_device<T>(1);

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();

    // The scalar is passed to c_trsm as a device pointer, so it is written
    // by a single-thread device task.
    auto alpha = capturer.single_task([=] __device__ () { *dAlpha = 1; });

    auto h2dA = capturer.copy(dA, hA.data(), hA.size());
    auto h2dB = capturer.copy(dB, hB.data(), hB.size());

    auto trsm = blas->c_trsm(
      CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
      CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
      M, N, dAlpha, dA + 7, LA, dB, LB
    );

    auto d2h = capturer.copy(hB.data(), dB, hB.size());

    // Inputs must be on the device before the solve; the solution is copied
    // back only afterwards.
    trsm.succeed(h2dA, h2dB, alpha)
        .precede(d2h);
  });

  executor.run(taskflow).wait();

  // Release the device buffers before asserting: a failing REQUIRE ends the
  // test case early, which would otherwise skip the cleanup and leak GPU
  // memory into the remaining test cases of this process.
  tf::cuda_free(dA);
  tf::cuda_free(dB);
  tf::cuda_free(dAlpha);

  //print_matrix(3, 2, hB);

  for(size_t i=0; i<hB.size(); ++i) {
    //std::cout << hB[i] << '\n';
    REQUIRE(std::fabs(hB[i] - sol[i]) < 0.0001);
  }
}
// Single-precision instantiation of the c_trsm check, registered with a
// doctest timeout decorator of 300.
TEST_CASE("c_trsm.float" * doctest::timeout(300)) {
  trsm_test<float>();
}
// Double-precision instantiation of the c_trsm check, registered with a
// doctest timeout decorator of 300.
TEST_CASE("c_trsm.double" * doctest::timeout(300)) {
  trsm_test<double>();
}