// mesytec-mnode/external/taskflow-3.8.0/sandbox/cublas_unittests/level1.cu
// 2025-01-04 01:25:05 +01:00
//
// 326 lines
// 8.4 KiB
// Text

#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cublasflow.hpp>
// ----------------------------------------------------------------------------
// amax, amin, and asum
// ----------------------------------------------------------------------------
// Checks the cublasFlowCapturer amax, amin and asum tasks against host-side
// reference values computed over a random vector of N integers in [-50, 49].
//
// The same task graph is built and run twice:
//   1. as a capture nested inside a tf::cudaFlow, and
//   2. directly inside a tf::cudaFlowCapturer ("pure capturer").
//
// The single_task `back` decrements both index results on the device before
// they are copied to the host and used to subscript `host` (cuBLAS amax/amin
// report 1-based positions - see the cuBLAS level-1 reference).
template <typename T>
void amax_amin_asum() {

  int N = 11111;
  T min_v = 100000, max_v = -1;
  T sum = 0, h_sum = -1;

  std::vector<T> host(N);

  // Host reference: smallest/largest absolute value and the absolute sum.
  for(int i=0; i<N; i++) {
    host[i] = rand() % 100 - 50;
    min_v = std::min(min_v, std::abs(host[i]));
    max_v = std::max(max_v, std::abs(host[i]));
    sum += std::abs(host[i]);
  }

  auto gpu   = tf::cuda_malloc_device<T>(N);
  auto min_i = tf::cuda_malloc_device<int>(1);
  auto max_i = tf::cuda_malloc_device<int>(1);
  auto gsum  = tf::cuda_malloc_device<T>(1);

  // Sentinels: a copy-back that never ran leaves these at -1.
  int h_min_i = -1, h_max_i = -1;

  tf::Taskflow taskflow;
  tf::Executor executor;

  // --- run 1: capturer nested inside a cudaFlow -----------------------------
  taskflow.emplace([&](tf::cudaFlow& cf){
    auto cublas = cf.capture([&](tf::cudaFlowCapturer& cap){
      auto capturer = cap.make_capturer<tf::cublasFlowCapturer>();
      auto amax = capturer->amax(N, gpu, 1, max_i);
      auto amin = capturer->amin(N, gpu, 1, min_i);
      auto vset = capturer->vset(N, host.data(), 1, gpu, 1);
      auto back = cap.single_task([min_i, max_i] __device__ () {
        (*min_i)--;
        (*max_i)--;
      });
      auto asum = capturer->asum(N, gpu, 1, gsum);
      // The device vector must be populated before any reduction reads it.
      vset.precede(amin, amax, asum);
      back.succeed(amin, amax);
    });
    auto copy_min_i = cf.copy(&h_min_i, min_i, 1);
    auto copy_max_i = cf.copy(&h_max_i, max_i, 1);
    auto copy_sum   = cf.copy(&h_sum, gsum, 1);
    // The whole captured subgraph (including asum) finishes before any copy.
    cublas.precede(copy_min_i, copy_max_i, copy_sum);
  });

  executor.run(taskflow).wait();

  // Guard the subscripts below: an index still at its sentinel (or otherwise
  // out of range) must fail the test instead of reading outside `host`.
  REQUIRE(h_min_i >= 0);
  REQUIRE(h_min_i < N);
  REQUIRE(h_max_i >= 0);
  REQUIRE(h_max_i < N);
  REQUIRE(std::abs(host[h_min_i]) == min_v);
  REQUIRE(std::abs(host[h_max_i]) == max_v);
  REQUIRE(std::abs(sum-h_sum) < 0.0001);

  taskflow.clear();
  h_min_i = -1;
  h_max_i = -1;
  // Reset the sum as well so a stale value from run 1 cannot satisfy run 2.
  h_sum = -1;

  // --- run 2: pure capturer -------------------------------------------------
  taskflow.emplace([&](tf::cudaFlowCapturer& cap){
    auto capturer = cap.make_capturer<tf::cublasFlowCapturer>();
    auto amax = capturer->amax(N, gpu, 1, max_i);
    auto amin = capturer->amin(N, gpu, 1, min_i);
    auto vset = capturer->vset(N, host.data(), 1, gpu, 1);
    auto back = cap.single_task([min_i, max_i] __device__ () {
      (*min_i)--;
      (*max_i)--;
    });
    auto asum = capturer->asum(N, gpu, 1, gsum);
    vset.precede(amin, amax, asum);
    back.succeed(amin, amax);
    auto copy_min_i = cap.copy(&h_min_i, min_i, 1);
    auto copy_max_i = cap.memcpy(&h_max_i, max_i, sizeof(h_max_i));
    auto copy_sum   = cap.copy(&h_sum, gsum, 1);
    // Index copies wait for the decrement performed by `back`.
    back.precede(copy_min_i, copy_max_i);
    // The sum copy must wait for asum itself; ordering it only after `back`
    // leaves it unordered with respect to the reduction that writes gsum.
    asum.precede(copy_sum);
  });

  executor.run(taskflow).wait();

  REQUIRE(h_min_i >= 0);
  REQUIRE(h_min_i < N);
  REQUIRE(h_max_i >= 0);
  REQUIRE(h_max_i < N);
  REQUIRE(std::abs(host[h_min_i]) == min_v);
  REQUIRE(std::abs(host[h_max_i]) == max_v);
  REQUIRE(std::abs(sum-h_sum) < 0.0001);

  tf::cuda_free(gpu);
  tf::cuda_free(min_i);
  tf::cuda_free(max_i);
  // gsum was allocated above and must be released too.
  tf::cuda_free(gsum);
}
// Runs the amax/amin/asum check with T = float, decorated with
// doctest::timeout(300).
TEST_CASE("amax-amin-asum.float" * doctest::timeout(300)) {
amax_amin_asum<float>();
}
// Runs the amax/amin/asum check with T = double, decorated with
// doctest::timeout(300).
TEST_CASE("amax-amin-asum.double" * doctest::timeout(300)) {
amax_amin_asum<double>();
}
// ----------------------------------------------------------------------------
// axpy
// ----------------------------------------------------------------------------
// Checks the cublasFlowCapturer axpy task: with alpha set to 2 on the device,
// the vector read back from the device is expected to equal 2*hx + hy.
template <typename T>
void axpy() {

  int N = 1745;

  std::vector<T> hx(N), hy(N), golden(N), res(N);

  // Integer-valued inputs in [-50, 49]. `res` is pre-filled with unrelated
  // random values so that a missing copy-back cannot pass the comparison.
  for(int k = 0; k < N; ++k) {
    hx[k] = rand() % 100 - 50;
    hy[k] = rand() % 100 - 50;
    golden[k] = 2 * hx[k] + hy[k];
    res[k] = rand();
  }

  auto dev_x = tf::cuda_malloc_device<T>(N);
  auto dev_y = tf::cuda_malloc_device<T>(N);
  auto alpha = tf::cuda_malloc_device<T>(1);

  tf::Taskflow taskflow;
  tf::Executor executor;

  taskflow.emplace([&](tf::cudaFlow& cf){
    cf.capture([&](tf::cudaFlowCapturer& cap){
      auto blas = cap.make_capturer<tf::cublasFlowCapturer>();

      auto upload_x  = blas->vset(N, hx.data(), 1, dev_x, 1);
      auto upload_y  = blas->vset(N, hy.data(), 1, dev_y, 1);
      // The scalar lives in device memory, so it is written by a device task.
      auto set_alpha = cap.single_task([alpha] __device__ () {
        *alpha = 2;
      });
      auto run_axpy   = blas->axpy(N, alpha, dev_x, 1, dev_y, 1);
      auto download_y = blas->vget(N, dev_y, 1, res.data(), 1);

      // inputs and scalar -> axpy -> read-back
      run_axpy.succeed(upload_x, upload_y, set_alpha);
      run_axpy.precede(download_y);
    });
  });

  executor.run(taskflow).wait();

  for(int i=0; i<N; i++) {
    REQUIRE(std::abs(res[i] - golden[i]) < 0.0001);
  }

  tf::cuda_free(dev_x);
  tf::cuda_free(dev_y);
  tf::cuda_free(alpha);
}
// Runs the axpy check with T = float, decorated with doctest::timeout(300).
TEST_CASE("axpy.float" * doctest::timeout(300)) {
axpy<float>();
}
// Runs the axpy check with T = double, decorated with doctest::timeout(300).
TEST_CASE("axpy.double" * doctest::timeout(300)) {
axpy<double>();
}
// ----------------------------------------------------------------------------
// dot
// ----------------------------------------------------------------------------
// Checks the cublasFlowCapturer dot task: the scalar copied back from the
// device is compared with a dot product accumulated on the host.
template <typename T>
void dot() {

  int N = 1745;

  T res = -1;     // overwritten by the device-to-host copy
  T golden = 0;   // host reference

  std::vector<T> hx(N), hy(N);

  for(int k = 0; k < N; ++k) {
    hx[k] = rand() % 100 - 50;
    hy[k] = rand() % 100 - 50;
    golden += hx[k] * hy[k];
  }

  auto dev_x   = tf::cuda_malloc_device<T>(N);
  auto dev_y   = tf::cuda_malloc_device<T>(N);
  auto dev_res = tf::cuda_malloc_device<T>(1);

  tf::Taskflow taskflow;
  tf::Executor executor;

  taskflow.emplace([&](tf::cudaFlow& cf){
    cf.capture([&](tf::cudaFlowCapturer& cap){
      auto blas = cap.make_capturer<tf::cublasFlowCapturer>();

      auto upload_x = blas->vset(N, hx.data(), 1, dev_x, 1);
      auto upload_y = blas->vset(N, hy.data(), 1, dev_y, 1);
      auto run_dot  = blas->dot(N, dev_x, 1, dev_y, 1, dev_res);
      auto fetch    = cap.memcpy(&res, dev_res, sizeof(T));

      // both uploads -> dot -> copy the scalar back
      run_dot.succeed(upload_x, upload_y);
      run_dot.precede(fetch);
    });
  });

  executor.run(taskflow).wait();

  REQUIRE(std::abs(res-golden) < 0.0001);

  tf::cuda_free(dev_x);
  tf::cuda_free(dev_y);
  tf::cuda_free(dev_res);
}
// Runs the dot check with T = float, decorated with doctest::timeout(300).
TEST_CASE("dot.float" * doctest::timeout(300)) {
dot<float>();
}
// Runs the dot check with T = double, decorated with doctest::timeout(300).
TEST_CASE("dot.double" * doctest::timeout(300)) {
dot<double>();
}
// ----------------------------------------------------------------------------
// swap
// ----------------------------------------------------------------------------
// Checks the cublasFlowCapturer swap task: after swapping the two device
// vectors, the data read back from the x buffer must equal the original y
// input and vice versa.
template <typename T>
void swap() {

  int N = 1745;

  std::vector<T> hx(N), hy(N), rx(N), ry(N);

  for(int k = 0; k < N; ++k) {
    hx[k] = rand() % 100 - 50;
    hy[k] = rand() % 100 - 50;
  }

  auto dev_x = tf::cuda_malloc_device<T>(N);
  auto dev_y = tf::cuda_malloc_device<T>(N);

  tf::Taskflow taskflow;
  tf::Executor executor;

  taskflow.emplace([&](tf::cudaFlow& cf){
    cf.capture([&](tf::cudaFlowCapturer& cap){
      auto blas = cap.make_capturer<tf::cublasFlowCapturer>();

      auto upload_x = blas->vset(N, hx.data(), 1, dev_x, 1);
      auto upload_y = blas->vset(N, hy.data(), 1, dev_y, 1);
      auto exchange = blas->swap(N, dev_x, 1, dev_y, 1);
      auto fetch_x  = cap.memcpy(rx.data(), dev_x, N*sizeof(T));
      auto fetch_y  = cap.memcpy(ry.data(), dev_y, N*sizeof(T));

      // both uploads -> swap -> both read-backs
      exchange.succeed(upload_x, upload_y);
      exchange.precede(fetch_x, fetch_y);
    });
  });

  executor.run(taskflow).wait();

  for(int i=0; i<N; i++) {
    REQUIRE(rx[i] == hy[i]);
    REQUIRE(ry[i] == hx[i]);
  }

  tf::cuda_free(dev_x);
  tf::cuda_free(dev_y);
}
// Runs the swap check with T = float, decorated with doctest::timeout(300).
TEST_CASE("swap.float" * doctest::timeout(300)) {
swap<float>();
}
// Runs the swap check with T = double, decorated with doctest::timeout(300).
TEST_CASE("swap.double" * doctest::timeout(300)) {
swap<double>();
}
// ----------------------------------------------------------------------------
// scal
// ----------------------------------------------------------------------------
// Checks the cublasFlowCapturer scal task: with alpha set to 2 on the device,
// the vector read back from the device is expected to equal 2*hx.
template <typename T>
void scal() {

  int N = 17;

  std::vector<T> hx(N), rx(N);

  // `rx` starts with unrelated random values so that a missing read-back
  // cannot pass the comparison below.
  for(int k = 0; k < N; ++k) {
    hx[k] = rand() % 100 - 50;
    rx[k] = rand() % 100 - 50;
  }

  auto dev_x = tf::cuda_malloc_device<T>(N);
  auto alpha = tf::cuda_malloc_device<T>(1);

  tf::Taskflow taskflow;
  tf::Executor executor;

  taskflow.emplace([&](tf::cudaFlow& cf){
    cf.capture([&](tf::cudaFlowCapturer& cap){
      auto blas = cap.make_capturer<tf::cublasFlowCapturer>();

      auto upload_x  = blas->vset(N, hx.data(), 1, dev_x, 1);
      // The scalar lives in device memory, so it is written by a device task.
      auto set_alpha = cap.single_task([alpha] __device__ () {
        *alpha = 2;
      });
      auto download_x = blas->vget(N, dev_x, 1, rx.data(), 1);
      auto run_scal   = blas->scal(N, alpha, dev_x, 1);

      // upload and scalar -> scal -> read-back
      run_scal.succeed(upload_x, set_alpha);
      run_scal.precede(download_x);
    });
  });

  executor.run(taskflow).wait();

  for(int i=0; i<N; i++) {
    REQUIRE(std::abs(rx[i] - 2.0*hx[i]) < 0.0001);
  }

  tf::cuda_free(dev_x);
  tf::cuda_free(alpha);
}
// Runs the scal check with T = float, decorated with doctest::timeout(300).
TEST_CASE("scal.float" * doctest::timeout(300)) {
scal<float>();
}
// Runs the scal check with T = double, decorated with doctest::timeout(300).
TEST_CASE("scal.double" * doctest::timeout(300)) {
scal<double>();
}