#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN

#include <doctest.h>

#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
#include <taskflow/cuda/algorithm/transform.hpp>
#include <taskflow/cuda/algorithm/reduce.hpp>
#include <taskflow/cuda/algorithm/scan.hpp>
#include <taskflow/cuda/algorithm/merge.hpp>
#include <taskflow/cuda/algorithm/sort.hpp>
#include <taskflow/cuda/algorithm/find.hpp>

constexpr float eps = 0.0001f;

// runs a cudaFlow/cudaFlowCapturer graph on a private stream and waits
template <typename T>
void run_and_wait(T& cf) {
  tf::cudaStream stream;
  cf.run(stream);
  stream.synchronize();
}

// --------------------------------------------------------
// Testcase: add2
// --------------------------------------------------------
template <typename T, typename F>
void add2() {

  //const unsigned N = 1<<20;

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(size_t N=1; N<=(1<<20); N <<= 1) {

    taskflow.clear();

    T v1 = ::rand() % 100;
    T v2 = ::rand() % 100;

    std::vector<T> hx, hy;

    T* dx {nullptr};
    T* dy {nullptr};

    // allocate x
    auto allocate_x = taskflow.emplace([&]() {
      hx.resize(N, v1);
      REQUIRE(cudaMalloc(&dx, N*sizeof(T)) == cudaSuccess);
    }).name("allocate_x");

    // allocate y
    auto allocate_y = taskflow.emplace([&]() {
      hy.resize(N, v2);
      REQUIRE(cudaMalloc(&dy, N*sizeof(T)) == cudaSuccess);
    }).name("allocate_y");

    // axpy
    auto cudaflow = taskflow.emplace([&]() {
      F cf;
      auto h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x");
      auto h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y");
      auto d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x");
      auto d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y");
      //auto kernel = cf.add(dx, N, dx, dy);
      auto kernel = cf.transform(
        dx, dx+N, dy, [] __device__ (T x) { return x + 2; }
      );
      kernel.succeed(h2d_x, h2d_y)
            .precede(d2h_x, d2h_y);
      run_and_wait(cf);
    }).name("saxpy");

    cudaflow.succeed(allocate_x, allocate_y);

    // Add a verification task
    auto verifier = taskflow.emplace([&](){
      for (size_t i = 0; i < N; i++) {
        REQUIRE(std::fabs(hx[i] - v1) < eps);
        REQUIRE(std::fabs(hy[i] - (hx[i] + 2)) < eps);
      }
    }).succeed(cudaflow).name("verify");

    // free memory
    auto deallocate_x = taskflow.emplace([&](){
      REQUIRE(cudaFree(dx) == cudaSuccess);
    }).name("deallocate_x");

    auto deallocate_y = taskflow.emplace([&](){
      REQUIRE(cudaFree(dy) == cudaSuccess);
    }).name("deallocate_y");

    verifier.precede(deallocate_x, deallocate_y);

    executor.run(taskflow).wait();

    // standalone transform
    tf::cudaDefaultExecutionPolicy p;
    auto input  = tf::cuda_malloc_shared<T>(N);
    auto output = tf::cuda_malloc_shared<T>(N);
    for(size_t n=0; n<N; n++) {
      input[n] = 1;
    }
    tf::cuda_transform(
      p, input, input+N, output, [] __device__ (T x) { return x + 2; }
    );
    cudaStreamSynchronize(0);
    for(size_t n=0; n<N; n++) {
      REQUIRE(std::fabs(output[n] - 3) < eps);
    }
    REQUIRE(cudaFree(input) == cudaSuccess);
    REQUIRE(cudaFree(output) == cudaSuccess);
  }
}

TEST_CASE("add2.int" * doctest::timeout(300)) {
  add2<int, tf::cudaFlow>();
}

TEST_CASE("add2.float" * doctest::timeout(300)) {
  add2<float, tf::cudaFlow>();
}

TEST_CASE("add2.double" * doctest::timeout(300)) {
  add2<double, tf::cudaFlow>();
}

TEST_CASE("capture_add2.int" * doctest::timeout(300)) {
  add2<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture_add2.float" * doctest::timeout(300)) {
  add2<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture_add2.double" * doctest::timeout(300)) {
  add2<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// transform
// ----------------------------------------------------------------------------
template <typename F>
void transform() {

  F cudaflow;

  for(unsigned n=1; n<=1234567; n = n*2 + 1) {

    cudaflow.clear();

    auto src1 = tf::cuda_malloc_shared<int>(n);
    auto src2 = tf::cuda_malloc_shared<int>(n);
    auto dest = tf::cuda_malloc_shared<int>(n);

    for(unsigned i=0; i<n; i++) {
      src1[i] = 10;
      src2[i] = 90;
      dest[i] = 0;
    }

    cudaflow.transform(
      src1, src1+n, src2, dest,
      [] __device__ (int s1, int s2) { return s1 + s2; }
    );

    run_and_wait(cudaflow);

    for(unsigned i=0; i<n; i++) {
      REQUIRE(dest[i] == 100);
    }

    REQUIRE(cudaFree(src1) == cudaSuccess);
    REQUIRE(cudaFree(src2) == cudaSuccess);
    REQUIRE(cudaFree(dest) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.transform" * doctest::timeout(300) ) {
  transform<tf::cudaFlow>();
}

TEST_CASE("capture.transform" * doctest::timeout(300) ) {
  transform<tf::cudaFlowCapturer>();
}
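// ----------------------------------------------------------------------------
// note on the test harness (illustrative, not part of the test suite)
// ----------------------------------------------------------------------------
// Every algorithm below is instantiated twice through the template parameter
// F: once with tf::cudaFlow (explicit graph construction) and once with
// tf::cudaFlowCapturer (stream capture). Both types expose the same algorithm
// interface and both run through the run_and_wait helper defined above.
// A minimal sketch of that pattern; the element count and kernel body here
// are illustrative assumptions, not taken from any test in this file:
//
//   tf::cudaFlow cf;                                // or tf::cudaFlowCapturer
//   auto data = tf::cuda_malloc_shared<int>(1024);
//   cf.single_task([data] __device__ () { data[0] = 1; });
//   run_and_wait(cf);    // launches the graph on a private stream and blocks
//   REQUIRE(cudaFree(data) == cudaSuccess);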
// ----------------------------------------------------------------------------
// reduce
// ----------------------------------------------------------------------------
template <typename T, typename F>
void reduce() {

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;
    tf::Executor executor;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;
    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&]() {
      F cf;
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.reduce(
        gpu, gpu+n, res, [] __device__ (T a, T b) mutable {
          return a + b;
        }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
      run_and_wait(cf);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    // the initial 1000 written to *res participates in the reduction
    REQUIRE(std::fabs(sum-sol+1000) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.reduce.int" * doctest::timeout(300)) {
  reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.reduce.float" * doctest::timeout(300)) {
  reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.reduce.double" * doctest::timeout(300)) {
  reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.reduce.int" * doctest::timeout(300)) {
  reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.reduce.float" * doctest::timeout(300)) {
  reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.reduce.double" * doctest::timeout(300)) {
  reduce<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// uninitialized_reduce
// ----------------------------------------------------------------------------
template <typename T, typename F>
void uninitialized_reduce() {

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;
    tf::Executor executor;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;
    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&]() {
      F cf;
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.uninitialized_reduce(
        gpu, gpu+n, res, [] __device__ (T a, T b) {
          return a + b;
        }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
      run_and_wait(cf);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    // the initial 1000 in *res is overwritten, not reduced
    REQUIRE(std::fabs(sum-sol) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.uninitialized_reduce.int" * doctest::timeout(300)) {
  uninitialized_reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.uninitialized_reduce.float" * doctest::timeout(300)) {
  uninitialized_reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.uninitialized_reduce.double" * doctest::timeout(300)) {
  uninitialized_reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.uninitialized_reduce.int" * doctest::timeout(300)) {
  uninitialized_reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.uninitialized_reduce.float" * doctest::timeout(300)) {
  uninitialized_reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.uninitialized_reduce.double" * doctest::timeout(300)) {
  uninitialized_reduce<double, tf::cudaFlowCapturer>();
}
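// ----------------------------------------------------------------------------
// note on standalone reductions (illustrative, not part of the test suite)
// ----------------------------------------------------------------------------
// The two tests above drive the reductions through a cudaFlow graph, but both
// reductions also exist as standalone asynchronous algorithms, in the style
// of the standalone tf::cuda_merge/tf::cuda_sort calls used later in this
// file. A minimal sketch, assuming the buffer-size query follows the same
// <policy, value-type> pattern as tf::cuda_sort_buffer_size below:
//
//   tf::cudaStream s;
//   auto p   = tf::cudaDefaultExecutionPolicy{s};
//   auto gpu = tf::cuda_malloc_shared<int>(n);
//   auto res = tf::cuda_malloc_shared<int>(1);
//   tf::cudaDeviceVector<std::byte> buf(
//     tf::cuda_reduce_buffer_size<tf::cudaDefaultExecutionPolicy, int>(n)
//   );
//   tf::cuda_uninitialized_reduce(
//     p, gpu, gpu+n, res,
//     [] __device__ (int a, int b) { return a + b; }, buf.data()
//   );
//   s.synchronize();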
// ----------------------------------------------------------------------------
// transform_reduce
// ----------------------------------------------------------------------------
template <typename T, typename F>
void transform_reduce() {

  tf::Executor executor;

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;
    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&]() {
      F cf;
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.transform_reduce(
        gpu, gpu+n, res,
        [] __device__ (T a, T b) { return a + b; },
        [] __device__ (T a) { return a + 1; }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
      run_and_wait(cf);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    // each element is transformed by +1 before the reduction,
    // and the initial 1000 participates
    REQUIRE(std::fabs(sum+n+1000-sol) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.transform_reduce.int" * doctest::timeout(300)) {
  transform_reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_reduce.float" * doctest::timeout(300)) {
  transform_reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_reduce.double" * doctest::timeout(300)) {
  transform_reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.transform_reduce.int" * doctest::timeout(300)) {
  transform_reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_reduce.float" * doctest::timeout(300)) {
  transform_reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_reduce.double" * doctest::timeout(300)) {
  transform_reduce<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// transform_uninitialized_reduce
// ----------------------------------------------------------------------------
template <typename T, typename F>
void transform_uninitialized_reduce() {

  tf::Executor executor;

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;
    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&]() {
      F cf;
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.transform_uninitialized_reduce(
        gpu, gpu+n, res,
        [] __device__ (T a, T b) { return a + b; },
        [] __device__ (T a) { return a + 1; }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
      run_and_wait(cf);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    // transformed reduction without the initial value of *res
    REQUIRE(std::fabs(sum+n-sol) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.transform_uninitialized_reduce.int" * doctest::timeout(300)) {
  transform_uninitialized_reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_uninitialized_reduce.float" * doctest::timeout(300)) {
  transform_uninitialized_reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_uninitialized_reduce.double" * doctest::timeout(300)) {
  transform_uninitialized_reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.transform_uninitialized_reduce.int" * doctest::timeout(300)) {
  transform_uninitialized_reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_uninitialized_reduce.float" * doctest::timeout(300)) {
  transform_uninitialized_reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_uninitialized_reduce.double" * doctest::timeout(300)) {
  transform_uninitialized_reduce<double, tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// scan
// ----------------------------------------------------------------------------
template <typename T, typename F>
void scan() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=1; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto data1 = tf::cuda_malloc_shared<T>(N);
    auto data2 = tf::cuda_malloc_shared<T>(N);
    auto scan1 = tf::cuda_malloc_shared<T>(N);
    auto scan2 = tf::cuda_malloc_shared<T>(N);

    // initialize the data
    for(int i=0; i<N; i++) {
      data1[i] = T(i);
      data2[i] = T(i);
      scan1[i] = 0;
      scan2[i] = 0;
    }

    // perform the scans through a cudaFlow/capturer
    taskflow.emplace([&](){
      F cudaflow;
      cudaflow.inclusive_scan(
        data1, data1+N, scan1, [] __device__ (T a, T b){ return a+b; }
      );
      cudaflow.exclusive_scan(
        data2, data2+N, scan2, [] __device__ (T a, T b){ return a+b; }
      );
      run_and_wait(cudaflow);
    });

    executor.run(taskflow).wait();

    // inspect
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1] + data1[i]));
      REQUIRE(scan2[i] == (scan2[i-1] + data2[i-1]));
    }

    // standalone algorithms
    tf::cudaDeviceVector<std::byte> temp(
      tf::cuda_scan_buffer_size<tf::cudaDefaultExecutionPolicy, T>(N)
    );

    tf::cuda_inclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data1, data1+N, scan1, tf::cuda_plus<T>{}, temp.data()
    );
    cudaStreamSynchronize(0);

    tf::cuda_exclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data2, data2+N, scan2, tf::cuda_plus<T>{}, temp.data()
    );
    cudaStreamSynchronize(0);

    // inspect
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1] + data1[i]));
      REQUIRE(scan2[i] == (scan2[i-1] + data2[i-1]));
    }

    REQUIRE(cudaFree(data1) == cudaSuccess);
    REQUIRE(cudaFree(data2) == cudaSuccess);
    REQUIRE(cudaFree(scan1) == cudaSuccess);
    REQUIRE(cudaFree(scan2) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.scan.int" * doctest::timeout(300)) {
  scan<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.scan.size_t" * doctest::timeout(300)) {
  scan<size_t, tf::cudaFlow>();
}

TEST_CASE("capture.scan.int" * doctest::timeout(300)) {
  scan<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.scan.size_t" * doctest::timeout(300)) {
  scan<size_t, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// transform scan
// ----------------------------------------------------------------------------
template <typename T, typename F>
void transform_scan() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=1; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto data1 = tf::cuda_malloc_shared<T>(N);
    auto data2 = tf::cuda_malloc_shared<T>(N);
    auto scan1 = tf::cuda_malloc_shared<T>(N);
    auto scan2 = tf::cuda_malloc_shared<T>(N);

    // initialize the data
    for(int i=0; i<N; i++) {
      data1[i] = T(i);
      data2[i] = T(i);
      scan1[i] = 0;
      scan2[i] = 0;
    }

    // perform the transform scans through a cudaFlow/capturer
    taskflow.emplace([&](){
      F cudaflow;
      cudaflow.transform_inclusive_scan(
        data1, data1+N, scan1,
        [] __device__ (T a, T b){ return a+b; },
        [] __device__ (T a) { return a*10; }
      );
      cudaflow.transform_exclusive_scan(
        data2, data2+N, scan2,
        [] __device__ (T a, T b){ return a+b; },
        [] __device__ (T a) { return a*10; }
      );
      run_and_wait(cudaflow);
    });

    executor.run(taskflow).wait();

    // inspect
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1] + data1[i]*10));
      REQUIRE(scan2[i] == (scan2[i-1] + data2[i-1]*10));
    }

    // standalone algorithms
    tf::cudaDeviceVector<std::byte> temp(
      tf::cuda_scan_buffer_size<tf::cudaDefaultExecutionPolicy, T>(N)
    );

    tf::cuda_transform_inclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data1, data1+N, scan1,
      [] __device__ (T a, T b){ return a+b; },
      [] __device__ (T a) { return a*10; },
      temp.data()
    );
    cudaStreamSynchronize(0);

    tf::cuda_transform_exclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data2, data2+N, scan2,
      [] __device__ (T a, T b){ return a+b; },
      [] __device__ (T a) { return a*10; },
      temp.data()
    );
    cudaStreamSynchronize(0);

    // inspect
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1] + data1[i]*10));
      REQUIRE(scan2[i] == (scan2[i-1] + data2[i-1]*10));
    }

    REQUIRE(cudaFree(data1) == cudaSuccess);
    REQUIRE(cudaFree(data2) == cudaSuccess);
    REQUIRE(cudaFree(scan1) == cudaSuccess);
    REQUIRE(cudaFree(scan2) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.transform_scan.int" * doctest::timeout(300)) {
  transform_scan<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_scan.size_t" * doctest::timeout(300)) {
  transform_scan<size_t, tf::cudaFlow>();
}

TEST_CASE("capture.transform_scan.int" * doctest::timeout(300)) {
  transform_scan<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_scan.size_t" * doctest::timeout(300)) {
  transform_scan<size_t, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// merge
// ----------------------------------------------------------------------------
template <typename T, typename F>
void merge_keys() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto b = tf::cuda_malloc_shared<T>(N);
    auto c = tf::cuda_malloc_shared<T>(2*N);

    tf::cudaStream s;
    auto p = tf::cudaDefaultExecutionPolicy{s};

    // ----------------- standalone algorithms

    // initialize the data (both inputs sorted)
    for(int i=0; i<N; i++) {
      a[i] = T(i);
      b[i] = T(i);
    }

    auto bufsz = tf::cuda_merge_buffer_size<tf::cudaDefaultExecutionPolicy>(N, N);
    tf::cudaDeviceVector<std::byte> buf(bufsz);

    tf::cuda_merge(p, a, a+N, b, b+N, c, tf::cuda_less<T>{}, buf.data());
    s.synchronize();

    REQUIRE(std::is_sorted(c, c+2*N));

    /*// ----------------- cudaFlow capturer
    for(int i=0; i<N; i++) {
      a[i] = T(i);
      b[i] = T(i);
    }
    taskflow.emplace([&](){
      F cudaflow;
      cudaflow.merge(a, a+N, b, b+N, c, tf::cuda_less<T>{});
      run_and_wait(cudaflow);
    });
    executor.run(taskflow).wait();
    REQUIRE(std::is_sorted(c, c+2*N));*/

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(b) == cudaSuccess);
    REQUIRE(cudaFree(c) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.merge_keys.int" * doctest::timeout(300)) {
  merge_keys<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.merge_keys.float" * doctest::timeout(300)) {
  merge_keys<float, tf::cudaFlow>();
}

TEST_CASE("capture.merge_keys.int" * doctest::timeout(300)) {
  merge_keys<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.merge_keys.float" * doctest::timeout(300)) {
  merge_keys<float, tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// merge_by_keys
// ----------------------------------------------------------------------------
template <typename T, typename F>
void merge_keys_values() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a_k = tf::cuda_malloc_shared<T>(N);
    auto b_k = tf::cuda_malloc_shared<T>(N);
    auto c_k = tf::cuda_malloc_shared<T>(2*N);
    auto a_v = tf::cuda_malloc_shared<T>(N);
    auto b_v = tf::cuda_malloc_shared<T>(N);
    auto c_v = tf::cuda_malloc_shared<T>(2*N);

    tf::cudaStream s;
    auto p = tf::cudaDefaultExecutionPolicy{s};

    // ----------------- standalone algorithms

    // initialize the data: a holds the odd keys, b the even keys,
    // and each value is the negative of its key
    for(int i=0; i<N; i++) {
      a_k[i] = 2*i+1;
      a_v[i] = -(2*i+1);
      b_k[i] = 2*i+2;
      b_v[i] = -(2*i+2);
    }

    auto bufsz = tf::cuda_merge_buffer_size<tf::cudaDefaultExecutionPolicy>(N, N);
    tf::cudaDeviceVector<std::byte> buf(bufsz);

    tf::cuda_merge_by_key(
      p,
      a_k, a_k+N, a_v,
      b_k, b_k+N, b_v,
      c_k, c_v,
      tf::cuda_less<T>{},
      buf.data()
    );
    s.synchronize();

    for(int i=0; i<2*N; i++) {
      REQUIRE(c_k[i] == (i+1));
      REQUIRE(c_v[i] == -(i+1));
    }

    /* // ----------------- cudaFlow capturer

    // initialize the data
    for(int i=0; i<N; i++) {
      a_k[i] = 2*i+1;
      a_v[i] = -(2*i+1);
      b_k[i] = 2*i+2;
      b_v[i] = -(2*i+2);
    }
    taskflow.emplace([&](){
      F cudaflow;
      cudaflow.merge_by_key(
        a_k, a_k+N, a_v,
        b_k, b_k+N, b_v,
        c_k, c_v,
        tf::cuda_less<T>{}
      );
      run_and_wait(cudaflow);
    });
    executor.run(taskflow).wait();

    for(int i=0; i<2*N; i++) {
      REQUIRE(c_k[i] == (i+1));
      REQUIRE(c_v[i] == -(i+1));
    }*/

    REQUIRE(cudaFree(a_k) == cudaSuccess);
    REQUIRE(cudaFree(b_k) == cudaSuccess);
    REQUIRE(cudaFree(c_k) == cudaSuccess);
    REQUIRE(cudaFree(a_v) == cudaSuccess);
    REQUIRE(cudaFree(b_v) == cudaSuccess);
    REQUIRE(cudaFree(c_v) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.merge_keys_values.int" * doctest::timeout(300)) {
  merge_keys_values<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.merge_keys_values.float" * doctest::timeout(300)) {
  merge_keys_values<float, tf::cudaFlow>();
}

TEST_CASE("capturer.merge_keys_values.int" * doctest::timeout(300)) {
  merge_keys_values<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capturer.merge_keys_values.float" * doctest::timeout(300)) {
  merge_keys_values<float, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// sort
// ----------------------------------------------------------------------------
template <typename T, typename F>
void sort_keys() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);

    tf::cudaStream s;
    auto p = tf::cudaDefaultExecutionPolicy{s};

    // ----------------- standalone asynchronous algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = T(::rand()%1000);
    }

    auto bufsz = tf::cuda_sort_buffer_size<tf::cudaDefaultExecutionPolicy, T>(N);
    tf::cudaDeviceVector<std::byte> buf(bufsz);

    tf::cuda_sort(p, a, a+N, tf::cuda_less<T>{}, buf.data());
    s.synchronize();

    REQUIRE(std::is_sorted(a, a+N));

    /* // ----------------- cudaflow capturer
    for(int i=0; i<N; i++) {
      a[i] = T(::rand()%1000);
    }
    taskflow.emplace([&](){
      F cudaflow;
      cudaflow.sort(a, a+N, tf::cuda_less<T>{});
      run_and_wait(cudaflow);
    });
    executor.run(taskflow).wait();
    REQUIRE(std::is_sorted(a, a+N));
    */

    REQUIRE(cudaFree(a) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.sort_keys.int" * doctest::timeout(300)) {
  sort_keys<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.sort_keys.float" * doctest::timeout(300)) {
  sort_keys<float, tf::cudaFlow>();
}

TEST_CASE("capture.sort_keys.int" * doctest::timeout(300)) {
  sort_keys<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.sort_keys.float" * doctest::timeout(300)) {
  sort_keys<float, tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// sort key-value
// ----------------------------------------------------------------------------
template <typename T, typename F>
void sort_keys_values() {

  std::random_device rd;
  std::mt19937 g(rd());

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=1; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto b = tf::cuda_malloc_shared<int>(N);

    tf::cudaStream s;
    auto p = tf::cudaDefaultExecutionPolicy{s};

    std::vector<int> indices(N);

    // ----------------- standalone asynchronous algorithms

    // initialize the data: unique shuffled keys, b[i] tracks the original
    // position, and indices gives the expected value ordering
    for(int i=0; i<N; i++) {
      a[i] = T(i);
      b[i] = i;
      indices[i] = i;
    }
    std::shuffle(a, a+N, g);

    std::sort(indices.begin(), indices.end(), [&](int i, int j){
      return a[i] < a[j];
    });

    auto bufsz = tf::cuda_sort_buffer_size<tf::cudaDefaultExecutionPolicy, T, int>(N);
    tf::cudaDeviceVector<std::byte> buf(bufsz);

    tf::cuda_sort_by_key(p, a, a+N, b, tf::cuda_less<T>{}, buf.data());
    s.synchronize();

    REQUIRE(std::is_sorted(a, a+N));
    for(int i=0; i<N; i++) {
      REQUIRE(b[i] == indices[i]);
    }

    // ----------------- cudaflow
    for(int i=0; i<N; i++) {
      b[i] = i;
      indices[i] = i;
    }
    std::sort(indices.begin(), indices.end(), [&](int i, int j){
      return a[i] > a[j];
    });

    taskflow.emplace([&](){
      F cudaflow;
      cudaflow.sort_by_key(a, a+N, b, tf::cuda_greater<T>{});
      run_and_wait(cudaflow);
    });
    executor.run(taskflow).wait();

    REQUIRE(std::is_sorted(a, a+N, std::greater<T>{}));
    for(int i=0; i<N; i++) {
      REQUIRE(b[i] == indices[i]);
    }

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(b) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.sort_keys_values.int" * doctest::timeout(300)) {
  sort_keys_values<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.sort_keys_values.float" * doctest::timeout(300)) {
  sort_keys_values<float, tf::cudaFlow>();
}

TEST_CASE("capture.sort_keys_values.int" * doctest::timeout(300)) {
  sort_keys_values<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.sort_keys_values.float" * doctest::timeout(300)) {
  sort_keys_values<float, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// find-if
// ----------------------------------------------------------------------------
template <typename T, typename F>
void find_if() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N += std::max(N/100, 1)) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto r = tf::cuda_malloc_shared<unsigned>(1);

    tf::cudaStream s;
    auto p = tf::cudaDefaultExecutionPolicy{s};

    // initialize the data: a single zero at index 5000, if N is large enough
    for(int i=0; i<N; i++) {
      a[i] = (i == 5000) ? 0 : 1;
    }

    // ----------------- standalone asynchronous algorithms
    *r = 1234;
    tf::cuda_find_if(p, a, a+N, r, [] __device__ (T v) { return v == 0; });
    s.synchronize();

    if(N <= 5000) {
      REQUIRE(*r == N);
    }
    else {
      REQUIRE(*r == 5000);
    }

    // ----------------- cudaflow
    *r = 1234;
    taskflow.emplace([&](){
      F cudaflow;
      cudaflow.find_if(a, a+N, r, [] __device__ (T v) { return v == 0; });
      run_and_wait(cudaflow);
    });
    executor.run(taskflow).wait();

    if(N <= 5000) {
      REQUIRE(*r == N);
    }
    else {
      REQUIRE(*r == 5000);
    }

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(r) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.find_if.int" * doctest::timeout(300)) {
  find_if<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.find_if.float" * doctest::timeout(300)) {
  find_if<float, tf::cudaFlow>();
}

TEST_CASE("capture.find_if.int" * doctest::timeout(300)) {
  find_if<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.find_if.float" * doctest::timeout(300)) {
  find_if<float, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// min_element
// ----------------------------------------------------------------------------
template <typename T, typename F>
void min_element() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N += std::max(N/10, 1)) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto r = tf::cuda_malloc_shared<unsigned>(1);
    auto min = std::numeric_limits<T>::max();

    tf::cudaStream s;
    auto p = tf::cudaDefaultExecutionPolicy{s};

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = ::rand()%1000;
      min = std::min(min, a[i]);
    }

    // ----------------- standalone asynchronous algorithms
    *r = 1234;
    tf::cudaDeviceVector<std::byte> buf(
      tf::cuda_min_element_buffer_size<tf::cudaDefaultExecutionPolicy, T>(N)
    );
    tf::cuda_min_element(
      p, a, a+N, r, tf::cuda_less<T>{}, buf.data()
    );
    s.synchronize();

    if(min != std::numeric_limits<T>::max()) {
      REQUIRE(a[*r] == min);
    }
    else {
      REQUIRE(*r == N);
    }

    // ----------------- cudaflow
    *r = 1234;
    taskflow.emplace([&](){
      F cudaflow;
      cudaflow.min_element(a, a+N, r, tf::cuda_less<T>{});
      run_and_wait(cudaflow);
    });
    executor.run(taskflow).wait();

    if(min != std::numeric_limits<T>::max()) {
      REQUIRE(a[*r] == min);
    }
    else {
      REQUIRE(*r == N);
    }

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(r) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.min_element.int" * doctest::timeout(300)) {
  min_element<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.min_element.float" * doctest::timeout(300)) {
  min_element<float, tf::cudaFlow>();
}

TEST_CASE("capturer.min_element.int" * doctest::timeout(300)) {
  min_element<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capturer.min_element.float" * doctest::timeout(300)) {
  min_element<float, tf::cudaFlowCapturer>();
}
TEST_CASE("capturer.max_element.float" * doctest::timeout(300)) { max_element(); } /*// -------------------------------------------------------------------------- // row-major transpose // ---------------------------------------------------------------------------- // Disable for now - better to use cublasFlowCapturer template __global__ void verify(const T* din_mat, const T* dout_mat, bool* check, size_t rows, size_t cols) { size_t tid = blockDim.x * blockIdx.x + threadIdx.x; size_t size = rows * cols; for(; tid < size; tid += gridDim.x * blockDim.x) { if(din_mat[tid] != dout_mat[tid / cols + (tid % cols) * rows]) { *check = false; return; } } } template void transpose() { tf::Executor executor; for(size_t rows = 1; rows <= 7999; rows*=2+3) { for(size_t cols = 1; cols <= 8021; cols*=3+5) { tf::Taskflow taskflow; std::vector hinput_mat(rows * cols); std::generate_n(hinput_mat.begin(), rows * cols, [](){ return ::rand(); }); T* dinput_mat {nullptr}; T* doutput_mat {nullptr}; bool* check {nullptr}; //allocate auto allocate = taskflow.emplace([&]() { REQUIRE(cudaMalloc(&dinput_mat, (rows * cols) * sizeof(T)) == cudaSuccess); REQUIRE(cudaMalloc(&doutput_mat, (rows * cols) * sizeof(T)) == cudaSuccess); REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess); *check = true; }).name("allocate"); //transpose auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf) { auto h2d_input_t = cf.copy(dinput_mat, hinput_mat.data(), rows * cols).name("h2d"); auto kernel_t = tf::cudaBLAF(cf).transpose( dinput_mat, doutput_mat, rows, cols ); auto verify_t = cf.kernel( 32, 512, 0, verify, dinput_mat, doutput_mat, check, rows, cols ); h2d_input_t.precede(kernel_t); kernel_t.precede(verify_t); }).name("transpose"); //free memory auto deallocate = taskflow.emplace([&](){ REQUIRE(cudaFree(dinput_mat) == cudaSuccess); REQUIRE(cudaFree(doutput_mat) == cudaSuccess); }).name("deallocate"); allocate.precede(cudaflow); cudaflow.precede(deallocate); executor.run(taskflow).wait(); REQUIRE(*check); } } } TEST_CASE("transpose.int" * doctest::timeout(300) ) { transpose(); } TEST_CASE("transpose.float" * doctest::timeout(300) ) { transpose(); } TEST_CASE("transpose.double" * doctest::timeout(300) ) { transpose(); } // ---------------------------------------------------------------------------- // row-major matrix multiplication // ---------------------------------------------------------------------------- template void matmul() { tf::Taskflow taskflow; tf::Executor executor; std::vector a, b, c; for(int m=1; m<=1992; m=2*m+1) { for(int k=1; k<=1012; k=2*k+3) { for(int n=1; n<=1998; n=2*n+8) { taskflow.clear(); T* ha {nullptr}; T* hb {nullptr}; T* hc {nullptr}; T* da {nullptr}; T* db {nullptr}; T* dc {nullptr}; T val_a = ::rand()%5-1; T val_b = ::rand()%7-3; auto hosta = taskflow.emplace([&](){ a.resize(m*k); std::fill_n(a.begin(), m*k, val_a); ha = a.data(); REQUIRE(cudaMalloc(&da, m*k*sizeof(T)) == cudaSuccess); }).name("ha"); auto hostb = taskflow.emplace([&](){ b.resize(k*n); std::fill_n(b.begin(), k*n, val_b); hb = b.data(); REQUIRE(cudaMalloc(&db, k*n*sizeof(T)) == cudaSuccess); }).name("hb"); auto hostc = taskflow.emplace([&](){ c.resize(m*n); hc = c.data(); REQUIRE(cudaMalloc(&dc, m*n*sizeof(T)) == cudaSuccess); }).name("hc"); auto cuda = taskflow.emplace([&](tf::cudaFlow& cf){ auto pa = cf.copy(da, ha, m*k); auto pb = cf.copy(db, hb, k*n); auto op = tf::cudaBLAF(cf).matmul( da, db, dc, m, k, n ).name("op"); auto cc = cf.copy(hc, dc, m*n).name("cc"); op.precede(cc).succeed(pa, pb); }); cuda.succeed(hosta, hostb, 
// ----------------------------------------------------------------------------
// row-major matrix multiplication
// ----------------------------------------------------------------------------
template <typename T>
void matmul() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  std::vector<T> a, b, c;

  for(int m=1; m<=1992; m=2*m+1) {
    for(int k=1; k<=1012; k=2*k+3) {
      for(int n=1; n<=1998; n=2*n+8) {

        taskflow.clear();

        T* ha {nullptr};
        T* hb {nullptr};
        T* hc {nullptr};
        T* da {nullptr};
        T* db {nullptr};
        T* dc {nullptr};

        T val_a = ::rand()%5-1;
        T val_b = ::rand()%7-3;

        auto hosta = taskflow.emplace([&](){
          a.resize(m*k);
          std::fill_n(a.begin(), m*k, val_a);
          ha = a.data();
          REQUIRE(cudaMalloc(&da, m*k*sizeof(T)) == cudaSuccess);
        }).name("ha");

        auto hostb = taskflow.emplace([&](){
          b.resize(k*n);
          std::fill_n(b.begin(), k*n, val_b);
          hb = b.data();
          REQUIRE(cudaMalloc(&db, k*n*sizeof(T)) == cudaSuccess);
        }).name("hb");

        auto hostc = taskflow.emplace([&](){
          c.resize(m*n);
          hc = c.data();
          REQUIRE(cudaMalloc(&dc, m*n*sizeof(T)) == cudaSuccess);
        }).name("hc");

        auto cuda = taskflow.emplace([&](tf::cudaFlow& cf){
          auto pa = cf.copy(da, ha, m*k);
          auto pb = cf.copy(db, hb, k*n);

          auto op = tf::cudaBLAF(cf).matmul(
            da, db, dc, m, k, n
          ).name("op");

          auto cc = cf.copy(hc, dc, m*n).name("cc");

          op.precede(cc).succeed(pa, pb);
        });

        cuda.succeed(hosta, hostb, hostc);

        executor.run(taskflow).wait();

        int ans = val_a*val_b*k;

        for(const auto& x : c) {
          REQUIRE((int)x == ans);
        }

        REQUIRE(cudaFree(da) == cudaSuccess);
        REQUIRE(cudaFree(db) == cudaSuccess);
        REQUIRE(cudaFree(dc) == cudaSuccess);
      }
    }
  }
}

TEST_CASE("matmul.int" * doctest::timeout(300) ) {
  matmul<int>();
}

TEST_CASE("matmul.float" * doctest::timeout(300) ) {
  matmul<float>();
}

TEST_CASE("matmul.double" * doctest::timeout(300) ) {
  matmul<double>();
}*/