mesytec-mnode/external/taskflow-3.8.0/unittests/cuda/test_cuda_matrix.cu
2025-01-04 01:25:05 +01:00

263 lines
6.7 KiB
Text

#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
template <typename T>
void run_and_wait(T& cf) {
tf::cudaStream stream;
cf.run(stream);
stream.synchronize();
}
// ----------------------------------------------------------------------------
// Matrix Multiplication Kernel
// ----------------------------------------------------------------------------
__global__ void k_multiplication(
int *a, int *b, int *c, int m, int n, int k
) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
if(col < k && row < m) {
for(int i = 0; i < n; i++) {
sum += a[row * n + i] * b[i * k + col];
}
c[row * k + col] = sum;
}
}
TEST_CASE("multiply" * doctest::timeout(300)) {
tf::Taskflow taskflow;
tf::Executor executor;
std::vector<int> a, b, c;
const unsigned B = 16;
for(int m=1; m<=256; m<<=1) {
for(int n=1; n<=256; n<<=1) {
for(int k=1; k<=256; k<<=1) {
taskflow.clear();
int* ha {nullptr};
int* hb {nullptr};
int* hc {nullptr};
int* da {nullptr};
int* db {nullptr};
int* dc {nullptr};
dim3 grid ((k+B-1)/B, (m+B-1)/B);
dim3 block (B, B);
auto hosta = taskflow.emplace([&](){
a.resize(m*n);
std::fill_n(a.begin(), m*n, m+n);
ha = a.data();
REQUIRE(cudaMalloc(&da, m*n*sizeof(int)) == cudaSuccess);
}).name("ha");
auto hostb = taskflow.emplace([&](){
b.resize(n*k);
std::fill_n(b.begin(), n*k, n+k);
hb = b.data();
REQUIRE(cudaMalloc(&db, n*k*sizeof(int)) == cudaSuccess);
}).name("hb");
auto hostc = taskflow.emplace([&](){
c.resize(m*k);
hc = c.data();
REQUIRE(cudaMalloc(&dc, m*k*sizeof(int)) == cudaSuccess);
}).name("hc");
auto cuda = taskflow.emplace([&](){
tf::cudaFlow cf;
auto pa = cf.copy(da, ha, m*n);
auto pb = cf.copy(db, hb, n*k);
auto op = cf.kernel(
grid, block, 0, k_multiplication, da, db, dc, m, n, k
).name("op");
auto cc = cf.copy(hc, dc, m*k)
.name("cc");
op.precede(cc).succeed(pa, pb);
run_and_wait(cf);
});
cuda.succeed(hosta, hostb, hostc);
executor.run(taskflow).wait();
for(const auto& x : c) {
REQUIRE(x == (int)(m+n)*(n+k)*n);
}
REQUIRE(cudaFree(da) == cudaSuccess);
REQUIRE(cudaFree(db) == cudaSuccess);
REQUIRE(cudaFree(dc) == cudaSuccess);
}
}
}
}
// ----------------------------------------------------------------------------
// Matrix Transpose
// ----------------------------------------------------------------------------
__global__ void k_transpose(int *mat_in, int *mat_out, int rows, int cols) {
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < cols && idy < rows) {
unsigned int pos = idy * cols + idx;
unsigned int trans_pos = idx * rows + idy;
mat_out[trans_pos] = mat_in[pos];
}
}
TEST_CASE("transpose" * doctest::timeout(300)) {
std::vector<int> in, out;
tf::Taskflow taskflow;
tf::Executor executor;
const unsigned B = 16;
for(int m=1; m<=256; m<<=1) {
for(int n=1; n<=256; n<<=1) {
taskflow.clear();
int* ptr_in {nullptr};
int* ptr_out {nullptr};
int* sin {nullptr};
int* sout {nullptr};
dim3 grid ((n+B-1)/B, (m+B-1)/B);
dim3 block (B, B);
auto hin = taskflow.emplace([&](){
in.resize(m*n);
out.resize(m*n);
for(auto& item : in) {
item = ::rand()%100;
}
ptr_in = in.data();
ptr_out = out.data();
REQUIRE(cudaMalloc(&sin, m*n*sizeof(int)) == cudaSuccess);
REQUIRE(cudaMalloc(&sout, m*n*sizeof(int)) == cudaSuccess);
}).name("ha");
auto op = taskflow.emplace([&](){
tf::cudaFlow cf;
auto copyin = cf.copy(sin, ptr_in, m*n);
auto copyout = cf.copy(ptr_out, sout, m*n);
auto trans = cf.kernel(grid, block, 0, k_transpose, sin, sout, m, n);
trans.succeed(copyin).precede(copyout);
run_and_wait(cf);
});
hin.precede(op);
executor.run(taskflow).wait();
for(int x=0; x<m; x++) {
for(int y=0; y<n; ++y) {
REQUIRE(in[x*n+y] == out[y*m+x]);
}
}
REQUIRE(cudaFree(sin) == cudaSuccess);
REQUIRE(cudaFree(sout) == cudaSuccess);
}
}
}
// ----------------------------------------------------------------------------
// vector product
// ----------------------------------------------------------------------------
__global__ void k_product(int *a, int *b, int *c, int N) {
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) {
c[idx] = a[idx] * b[idx];
}
}
TEST_CASE("product" * doctest::timeout(300)) {
tf::Taskflow taskflow;
tf::Executor executor;
const unsigned num_batches = 1024;
const unsigned N = 1024;
const unsigned B = 128;
dim3 grid ((N+B-1)/B);
dim3 block (B);
std::vector<int*> hA(num_batches);
std::vector<int*> hB(num_batches);
std::vector<int*> hC(num_batches);
std::vector<int*> dA(num_batches);
std::vector<int*> dB(num_batches);
std::vector<int*> dC(num_batches);
for(unsigned i=0; i<num_batches; ++i) {
int v1 = ::rand()%10;
int v2 = ::rand()%10;
auto allocate = taskflow.emplace([&, i, v1, v2](){
hA[i] = new int [N];
hB[i] = new int [N];
hC[i] = new int [N];
REQUIRE(cudaMalloc(&dA[i], N*sizeof(int)) == cudaSuccess);
REQUIRE(cudaMalloc(&dB[i], N*sizeof(int)) == cudaSuccess);
REQUIRE(cudaMalloc(&dC[i], N*sizeof(int)) == cudaSuccess);
for(unsigned j=0; j<N; ++j) {
hA[i][j] = v1;
hB[i][j] = v2;
}
});
auto kernel = taskflow.emplace([&, i](){
tf::cudaFlow cf;
auto copyA = cf.copy(dA[i], hA[i], N);
auto copyB = cf.copy(dB[i], hB[i], N);
auto op = cf.kernel(grid, block, 0, k_product, dA[i], dB[i], dC[i], N);
auto copyC = cf.copy(hC[i], dC[i], N);
op.succeed(copyA, copyB).precede(copyC);
run_and_wait(cf);
});
auto deallocate = taskflow.emplace([&, i, v1, v2](){
for(unsigned j=0; j<N; ++j) {
REQUIRE(hC[i][j] == v1*v2);
}
delete hA[i];
delete hB[i];
delete hC[i];
REQUIRE(cudaFree(dA[i]) == cudaSuccess);
REQUIRE(cudaFree(dB[i]) == cudaSuccess);
REQUIRE(cudaFree(dC[i]) == cudaSuccess);
});
kernel.precede(deallocate).succeed(allocate);
}
executor.run(taskflow).wait();
}