mesytec-mnode/external/taskflow-3.8.0/sandbox/cublas_unittests/level2.cu
2025-01-04 01:25:05 +01:00

572 lines
13 KiB
Text

#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cublasflow.hpp>
// ----------------------------------------------------------------------------
// Testcase: gemv and c_gemv
// ----------------------------------------------------------------------------
template <typename T>
void gemv(
bool row_major,
const int M,
const int N,
const std::vector<T>& hA,
const std::vector<T>& hx,
const std::vector<T>& golden,
bool trans
) {
for(size_t d=0; d<tf::cuda_get_num_devices(); d++) {
tf::Taskflow taskflow;
tf::Executor executor;
auto dA = tf::cuda_malloc_device<T>(M*N, d);
auto dAlpha = tf::cuda_malloc_device<T>(1, d);
auto dBeta = tf::cuda_malloc_device<T>(1, d);
T* hy;
T* dx;
T* dy;
if(trans) {
hy = new T[N];
dx = tf::cuda_malloc_device<T>(M, d);
dy = tf::cuda_malloc_device<T>(N, d);
}
else {
hy = new T[M];
dx = tf::cuda_malloc_device<T>(N, d);
dy = tf::cuda_malloc_device<T>(M, d);
}
auto cudaflow = taskflow.emplace_on([=](tf::cudaFlow& cf){
REQUIRE(tf::cuda_get_device() == d);
auto copyA = cf.copy(dA, hA.data(), M*N);
tf::cudaTask copyx;
(trans) ? copyx = cf.copy(dx, hx.data(), M)
: copyx = cf.copy(dx, hx.data(), N);
auto alpha = cf.single_task([=] __device__ () { *dAlpha = 1; });
auto beta = cf.single_task([=] __device__ () { *dBeta = 0; });
tf::cudaTask gemv;
if(trans) {
if(row_major) { // C = A^T * x (r-major)
gemv = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->c_gemv(
CUBLAS_OP_T,
M, N, dAlpha, dA, N, dx, 1, dBeta, dy, 1
);
});
}
else {
gemv = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->gemv(
CUBLAS_OP_N,
N, M, dAlpha, dA, N, dx, 1, dBeta, dy, 1
);
});
}
}
else {
if(row_major) { // C = A * x (r-major)
gemv = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->c_gemv(
CUBLAS_OP_N,
M, N, dAlpha, dA, N, dx, 1, dBeta, dy, 1
);
});
}
else {
gemv = cf.capture([&](tf::cudaFlowCapturer& cap){
cap.make_capturer<tf::cublasFlowCapturer>()->gemv(
CUBLAS_OP_T,
N, M, dAlpha, dA, N, dx, 1, dBeta, dy, 1
);
});
}
}
tf::cudaTask copyy;
(trans) ? copyy = cf.copy(hy, dy, N)
: copyy = cf.copy(hy, dy, M);
gemv.precede(copyy)
.succeed(copyA, copyx, alpha, beta);
}, d);
auto verify = taskflow.emplace([=, &golden](){
for(size_t i=0; i<golden.size(); i++) {
//std::cerr << hy[i] << " ";
REQUIRE(std::fabs(hy[i]-golden[i]) < 0.0001);
}
//std::cerr << '\n';
tf::cuda_free(dA);
tf::cuda_free(dx);
tf::cuda_free(dy);
tf::cuda_free(dAlpha);
tf::cuda_free(dBeta);
delete [] hy;
});
cudaflow.precede(verify);
executor.run(taskflow).wait();
}
}
template <typename T>
void gemv_test(bool row_major, bool trans) {
int M = 3, N = 4;
const std::vector<T> hA = {
11, 12, 13, 14,
15, 16, 17, 18,
19, 20, 21, 22
}; // 3x4
std::vector<T> hx;
std::vector<T> golden;
//ha.T * hx
if(trans) {
hx = {11, 12, 13};
golden = {548, 584, 620, 656};
}
else {
hx = {11, 12, 13, 14};
golden = {630, 830, 1030};
}
gemv<T>(row_major, M, N, hA, hx, golden, trans);
}
// gemv (column-major)
TEST_CASE("gemv_n.float" * doctest::timeout(300)) {
gemv_test<float>(false, false);
}
TEST_CASE("gemv_n.double" * doctest::timeout(300)) {
gemv_test<double>(false, false);
}
TEST_CASE("gemv_t.float" * doctest::timeout(300)) {
gemv_test<float>(false, true);
}
TEST_CASE("gemv_t.double" * doctest::timeout(300)) {
gemv_test<double>(false, true);
}
// gemv (row-major)
TEST_CASE("c_gemv_n.float" * doctest::timeout(300)) {
gemv_test<float>(true, false);
}
TEST_CASE("c_gemv_n.double" * doctest::timeout(300)) {
gemv_test<double>(true, false);
}
TEST_CASE("c_gemv_t.float" * doctest::timeout(300)) {
gemv_test<float>(true, true);
}
TEST_CASE("c_gemv_t.double" * doctest::timeout(300)) {
gemv_test<double>(true, true);
}
// ----------------------------------------------------------------------------
// trsv
// ----------------------------------------------------------------------------
template <typename T>
void c_trsv_test() {
int N = 3;
int L = 6;
const std::vector<T> hA = {
-1, -1, -1, -1, -1, -1,
-1, 2, 0, 0, -1, -1,
-1, 1, 2, 0, -1, -1,
-1, 1, 1, 2, -1, -1
};
const std::vector<T> hB = {
5,
4,
7
};
const std::vector<T> sol = {
2.5,
0.75,
1.875
};
std::vector<T> res(N, 0);
tf::Taskflow taskflow;
tf::Executor executor;
auto dA = tf::cuda_malloc_device<T>(hA.size());
auto dB = tf::cuda_malloc_device<T>(hB.size());
taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();
auto h2dA = capturer.copy(dA, hA.data(), hA.size());
auto h2dB = capturer.copy(dB, hB.data(), hB.size());
auto trsv = blas->c_trsv(
CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
N, dA + 7, L, dB, 1
);
auto d2h = capturer.copy(res.data(), dB, res.size());
trsv.succeed(h2dA, h2dB)
.precede(d2h);
});
executor.run(taskflow).wait();
for(size_t i=0; i<res.size(); ++i) {
//std::cout << res[i] << '\n';
REQUIRE(std::fabs(res[i] - sol[i]) < 0.0001);
}
}
TEST_CASE("c_trsv.float" * doctest::timeout(300)) {
c_trsv_test<float>();
}
TEST_CASE("c_trsv.double" * doctest::timeout(300)) {
c_trsv_test<double>();
}
// ----------------------------------------------------------------------------
// trmv
// ----------------------------------------------------------------------------
template <typename T>
void c_trmv_test() {
int N = 3;
int L = 6;
const std::vector<T> hA = {
-1, -1, -1, -1, -1, -1,
-1, 2, 1, 1, -1, -1,
-1, 0, 2, 1, -1, -1,
-1, 0, 0, 2, -1, -1
};
const std::vector<T> hB = {
5,
-1,
4,
-1,
7,
-1
};
const std::vector<T> sol = {
21,
-1,
15,
-1,
14,
-1
};
std::vector<T> res(hB.size(), 0);
tf::Taskflow taskflow;
tf::Executor executor;
auto dA = tf::cuda_malloc_device<T>(hA.size());
auto dB = tf::cuda_malloc_device<T>(hB.size());
taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();
auto h2dA = capturer.copy(dA, hA.data(), hA.size());
auto h2dB = capturer.copy(dB, hB.data(), hB.size());
auto trmv = blas->c_trmv(
CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
N, dA + 7, L, dB, 2
);
auto d2h = capturer.copy(res.data(), dB, res.size());
trmv.succeed(h2dA, h2dB)
.precede(d2h);
});
executor.run(taskflow).wait();
for(size_t i=0; i<res.size(); ++i) {
REQUIRE(std::fabs(res[i] - sol[i]) < 0.0001);
}
}
TEST_CASE("c_trmv.float" * doctest::timeout(300)) {
c_trmv_test<float>();
}
TEST_CASE("c_trmv.double" * doctest::timeout(300)) {
c_trmv_test<double>();
}
// ----------------------------------------------------------------------------
// symv: y = alpha * A * x + beta * y
// ----------------------------------------------------------------------------
template <typename T>
void c_symv_test() {
int N = 3;
int L = 6;
const std::vector<T> hA = {
-1, -1, -1, -1, -1, -1,
-1, 2, 1, 1, -1, -1,
-1, 1, 2, 1, -1, -1,
-1, 1, 1, 2, -1, -1
};
const std::vector<T> hx = {
5,
-1,
4,
-1,
7,
-1
};
std::vector<T> hy = {
2,
-1,
3,
-1,
45,
-1
};
const std::vector<T> sol = {
23,
-1,
23,
-1,
68,
-1
};
tf::Taskflow taskflow;
tf::Executor executor;
auto dA = tf::cuda_malloc_device<T>(hA.size());
auto dx = tf::cuda_malloc_device<T>(hx.size());
auto dy = tf::cuda_malloc_device<T>(hy.size());
auto dalpha = tf::cuda_malloc_device<T>(1);
auto dbeta = tf::cuda_malloc_device<T>(1);
taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();
auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
auto beta = capturer.single_task([=] __device__ () { *dbeta = 1; });
auto h2dA = capturer.copy(dA, hA.data(), hA.size());
auto h2dx = capturer.copy(dx, hx.data(), hx.size());
auto h2dy = capturer.copy(dy, hy.data(), hy.size());
auto symv = blas->c_symv(CUBLAS_FILL_MODE_UPPER,
N, dalpha, dA + 7, L, dx, 2, dbeta, dy, 2
);
auto d2h = capturer.copy(hy.data(), dy, hy.size());
symv.succeed(h2dA, h2dx, h2dy, alpha, beta)
.precede(d2h);
});
executor.run(taskflow).wait();
for(size_t i=0; i<hy.size(); ++i) {
//std::cout << hy[i] << '\n';
REQUIRE(std::fabs(hy[i] - sol[i]) < 0.0001);
}
}
TEST_CASE("c_symv.float" * doctest::timeout(300)) {
c_symv_test<float>();
}
TEST_CASE("c_symv.double" * doctest::timeout(300)) {
c_symv_test<double>();
}
// ----------------------------------------------------------------------------
// syr: A = alpha * x * x^T + A
// ----------------------------------------------------------------------------
template <typename T>
void c_syr_test() {
int N = 3;
int L = 6;
std::vector<T> hA = {
-1, -1, -1, -1, -1, -1,
-1, 2, 1, 1, -1, -1,
-1, 0, 2, 1, -1, -1,
-1, 0, 0, 2, -1, -1
};
const std::vector<T> hx = {
5,
-1,
4,
-1,
7,
-1
};
const std::vector<T> sol = {
-1, -1, -1, -1, -1, -1,
-1, 27, 21, 36, -1, -1,
-1, 0, 18, 29, -1, -1,
-1, 0, 0, 51, -1, -1
};
tf::Taskflow taskflow;
tf::Executor executor;
auto dA = tf::cuda_malloc_device<T>(hA.size());
auto dx = tf::cuda_malloc_device<T>(hx.size());
auto dalpha = tf::cuda_malloc_device<T>(1);
taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();
auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
auto h2dA = capturer.copy(dA, hA.data(), hA.size());
auto h2dx = capturer.copy(dx, hx.data(), hx.size());
auto syr = blas->c_syr(CUBLAS_FILL_MODE_UPPER,
N, dalpha, dx, 2, dA + 7, L
);
auto d2h = capturer.copy(hA.data(), dA, hA.size());
syr.succeed(h2dA, h2dx, alpha)
.precede(d2h);
});
executor.run(taskflow).wait();
for(size_t i=0; i<hA.size(); ++i) {
REQUIRE(std::fabs(hA[i] - sol[i]) < 0.0001);
}
}
TEST_CASE("c_syr.float" * doctest::timeout(300)) {
c_syr_test<float>();
}
TEST_CASE("c_syr.double" * doctest::timeout(300)) {
c_syr_test<double>();
}
// ----------------------------------------------------------------------------
// syr2: A = alpha * x * y^T + y * x^T + A
// ----------------------------------------------------------------------------
template <typename T>
void c_syr2_test() {
int N = 3;
int L = 6;
const std::vector<T> hA = {
-1, -1, -1, -1, -1, -1,
-1, 2, 0, 0, -1, -1,
-1, 1, 2, 0, -1, -1,
-1, 1, 1, 2, -1, -1
};
const std::vector<T> hx = {
5,
-1,
4,
-1,
7,
-1
};
const std::vector<T> hy = {
2,
-1,
3,
-1,
1,
-1
};
const std::vector<T> sol = {
-1, -1, -1, -1, -1, -1,
-1, 22, 0, 0, -1, -1,
-1, 24, 26, 0, -1, -1,
-1, 20, 26, 16, -1, -1
};
std::vector<T> res(hA.size(), 0);
tf::Taskflow taskflow;
tf::Executor executor;
auto dA = tf::cuda_malloc_device<T>(hA.size());
auto dx = tf::cuda_malloc_device<T>(hx.size());
auto dy = tf::cuda_malloc_device<T>(hy.size());
auto dalpha = tf::cuda_malloc_device<T>(1);
taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();
auto alpha = capturer.single_task([=] __device__ () { *dalpha = 1; });
auto h2dA = capturer.copy(dA, hA.data(), hA.size());
auto h2dx = capturer.copy(dx, hx.data(), hx.size());
auto h2dy = capturer.copy(dy, hy.data(), hy.size());
auto syr2 = blas->c_syr2(CUBLAS_FILL_MODE_LOWER,
N, dalpha, dx, 2, dy, 2, dA + 7, L
);
auto d2h = capturer.copy(res.data(), dA, res.size());
syr2.succeed(h2dA, h2dx, h2dy, alpha)
.precede(d2h);
});
executor.run(taskflow).wait();
//for(int i=0; i<=N; i++) {
// for(int l=0; l<L; l++) {
// std::cout << res[i*L+l] << ' ';
// }
// std::cout << '\n';
//}
for(size_t i=0; i<res.size(); ++i) {
REQUIRE(std::fabs(res[i] - sol[i]) < 0.0001);
}
}
TEST_CASE("c_syr2.float" * doctest::timeout(300)) {
c_syr2_test<float>();
}
TEST_CASE("c_syr2.double" * doctest::timeout(300)) {
c_syr2_test<double>();
}