#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN

// NOTE(review): the original `#include <...>` targets were stripped by the tool
// that flattened this file (everything between '<' and '>' was dropped).  The
// headers below are reconstructed from usage — doctest's TEST_CASE/REQUIRE,
// tf::Taskflow/tf::Executor, and tf::cudaFlow/tf::cudaStream — confirm against
// version control.
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>

// Launches the given cudaFlow on a private stream and blocks the caller until
// all of its GPU work has completed.
// NOTE(review): the template parameter list was stripped in the source;
// `<typename T>` is reconstructed from the use of `T&` below.
template <typename T>
void run_and_wait(T& cf) {
  tf::cudaStream stream;
  cf.run(stream);
  stream.synchronize();
}

// ----------------------------------------------------------------------------
// Matrix Multiplication Kernel
// ----------------------------------------------------------------------------

// c = a * b for row-major int matrices: a is m x n, b is n x k, c is m x k.
// Expects a 2D launch whose x dimension covers the k columns and whose y
// dimension covers the m rows; threads outside the matrix exit via the guard.
__global__ void k_multiplication(
  int *a, int *b, int *c, int m, int n, int k
) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int sum = 0;
  if(col < k && row < m) {
    for(int i = 0; i < n; i++) {
      sum += a[row * n + i] * b[i * k + col];
    }
    c[row * k + col] = sum;
  }
}

// Multiplies an m x n matrix filled with (m+n) by an n x k matrix filled with
// (n+k), for every power-of-two size up to 256, and checks each output element
// against the closed-form result (m+n)*(n+k)*n.
TEST_CASE("multiply" * doctest::timeout(300)) {

  tf::Taskflow taskflow;
  tf::Executor executor;

  // NOTE(review): the element type was stripped in the source; <int> is
  // reconstructed from the kernel signature and the sizeof(int) allocations.
  std::vector<int> a, b, c;

  const unsigned B = 16;

  for(int m=1; m<=256; m<<=1) {
    for(int n=1; n<=256; n<<=1) {
      for(int k=1; k<=256; k<<=1) {

        taskflow.clear();

        int* ha {nullptr};
        int* hb {nullptr};
        int* hc {nullptr};
        int* da {nullptr};
        int* db {nullptr};
        int* dc {nullptr};

        // one thread per element of the m x k output (ceil-div grid)
        dim3 grid  ((k+B-1)/B, (m+B-1)/B);
        dim3 block (B, B);

        // host task: fill a with (m+n) and allocate its device buffer
        auto hosta = taskflow.emplace([&](){
          a.resize(m*n);
          std::fill_n(a.begin(), m*n, m+n);
          ha = a.data();
          REQUIRE(cudaMalloc(&da, m*n*sizeof(int)) == cudaSuccess);
        }).name("ha");

        // host task: fill b with (n+k) and allocate its device buffer
        auto hostb = taskflow.emplace([&](){
          b.resize(n*k);
          std::fill_n(b.begin(), n*k, n+k);
          hb = b.data();
          REQUIRE(cudaMalloc(&db, n*k*sizeof(int)) == cudaSuccess);
        }).name("hb");

        // host task: size the result vector and allocate its device buffer
        auto hostc = taskflow.emplace([&](){
          c.resize(m*k);
          hc = c.data();
          REQUIRE(cudaMalloc(&dc, m*k*sizeof(int)) == cudaSuccess);
        }).name("hc");

        // GPU task: copy a and b in, multiply, copy the result out
        auto cuda = taskflow.emplace([&](){
          tf::cudaFlow cf;

          auto pa = cf.copy(da, ha, m*n);
          auto pb = cf.copy(db, hb, n*k);

          auto op = cf.kernel(
            grid, block, 0, k_multiplication, da, db, dc, m, n, k
          ).name("op");

          auto cc = cf.copy(hc, dc, m*k)
                      .name("cc");

          op.precede(cc).succeed(pa, pb);

          run_and_wait(cf);
        });

        cuda.succeed(hosta, hostb, hostc);

        executor.run(taskflow).wait();

        // every element of c is a dot product of a row of (m+n)'s with a
        // column of (n+k)'s: (m+n)*(n+k) summed n times
        for(const auto& x : c) {
          REQUIRE(x == (int)(m+n)*(n+k)*n);
        }

        REQUIRE(cudaFree(da) == cudaSuccess);
        REQUIRE(cudaFree(db) == cudaSuccess);
        REQUIRE(cudaFree(dc) == cudaSuccess);
      }
    }
  }
}

// ----------------------------------------------------------------------------
// Matrix Transpose
// ----------------------------------------------------------------------------

// Writes the transpose of the rows x cols matrix mat_in into mat_out
// (mat_out[idx*rows + idy] = mat_in[idy*cols + idx]).  Expects a 2D launch
// whose x dimension covers cols and whose y dimension covers rows; threads
// outside the matrix exit via the guard.
__global__ void k_transpose(int *mat_in, int *mat_out, int rows, int cols) {
  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
  if (idx < cols && idy < rows) {
    unsigned int pos = idy * cols + idx;
    unsigned int trans_pos = idx * rows + idy;
    mat_out[trans_pos] = mat_in[pos];
  }
}

// Transposes random m x n matrices for every power-of-two size up to 256 and
// verifies out[y][x] == in[x][y] on the host.
TEST_CASE("transpose" * doctest::timeout(300)) {

  // NOTE(review): element type stripped in the source; <int> reconstructed
  // from the kernel signature and the sizeof(int) allocations.
  std::vector<int> in, out;

  tf::Taskflow taskflow;
  tf::Executor executor;

  const unsigned B = 16;

  for(int m=1; m<=256; m<<=1) {
    for(int n=1; n<=256; n<<=1) {

      taskflow.clear();

      int* ptr_in {nullptr};
      int* ptr_out {nullptr};
      int* sin {nullptr};
      int* sout {nullptr};

      // one thread per element of the m x n input (ceil-div grid)
      dim3 grid  ((n+B-1)/B, (m+B-1)/B);
      dim3 block (B, B);

      // host task: fill the input with random values and allocate both
      // device buffers
      auto hin = taskflow.emplace([&](){
        in.resize(m*n);
        out.resize(m*n);
        for(auto& item : in) {
          item = ::rand()%100;
        }
        ptr_in = in.data();
        ptr_out = out.data();
        REQUIRE(cudaMalloc(&sin, m*n*sizeof(int)) == cudaSuccess);
        REQUIRE(cudaMalloc(&sout, m*n*sizeof(int)) == cudaSuccess);
      }).name("ha");

      // GPU task: copy in, transpose, copy out
      auto op = taskflow.emplace([&](){
        tf::cudaFlow cf;
        auto copyin = cf.copy(sin, ptr_in, m*n);
        auto copyout = cf.copy(ptr_out, sout, m*n);
        auto trans = cf.kernel(grid, block, 0, k_transpose, sin, sout, m, n);
        trans.succeed(copyin).precede(copyout);
        run_and_wait(cf);
      });

      hin.precede(op);

      executor.run(taskflow).wait();

      // NOTE(review): the verification below was destroyed by the same '<...>'
      // stripping that mangled the includes (the source jumps from
      // `for(int x=0; x` straight into the next test).  It is reconstructed
      // from the kernel's contract, mat_out[x*m + y] == mat_in[y*... — i.e.
      // out[y*m + x] == in[x*n + y] — and mirrors the multiply test's
      // check-then-free structure.  Confirm against version control.
      for(int x=0; x<m; ++x) {
        for(int y=0; y<n; ++y) {
          REQUIRE(in[x*n+y] == out[y*m+x]);
        }
      }

      REQUIRE(cudaFree(sin) == cudaSuccess);
      REQUIRE(cudaFree(sout) == cudaSuccess);
    }
  }
}

// ----------------------------------------------------------------------------
// NOTE(review): the remainder of the file is truncated in the source we were
// given — the text between the transpose verification and `hA(num_batches)`
// was eaten by the '<...>' stripping (test header, kernel, TEST_CASE opening,
// and the vectors' element types are all gone), and the file ends
// mid-statement at `for(unsigned i=0; i`.  The surviving fragment of what
// appears to be a batched test with per-batch host/device pointer arrays is
// preserved below, disabled, until the original can be restored from version
// control.  Do not guess at the missing body.
// ----------------------------------------------------------------------------
#if 0
std::vector hA(num_batches);
std::vector hB(num_batches);
std::vector hC(num_batches);
std::vector dA(num_batches);
std::vector dB(num_batches);
std::vector dC(num_batches);
for(unsigned i=0; i
#endif