#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN

#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
#include <taskflow/cuda/algorithm/for_each.hpp>
#include <taskflow/cuda/algorithm/transform.hpp>

// run a cudaFlow/cudaFlowCapturer on a stream and block until it completes
template <typename T>
void run_and_wait(T& cf) {
  tf::cudaStream stream;
  cf.run(stream);
  stream.synchronize();
}

// verify: set *check to false if a and b differ anywhere in [0, size)
template <typename T>
__global__
void verify(const T* a, const T* b, bool* check, size_t size) {
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  for(; tid < size; tid += gridDim.x * blockDim.x) {
    if(a[tid] != b[tid]) {
      *check = false;
      return;
    }
  }
}

// k_add: add value to each of the N elements
template <typename T>
__global__
void k_add(T* ptr, size_t N, T value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < N) {
    ptr[i] += value;
  }
}

// add: element-wise c = a + b
template <typename T>
__global__
void add(const T* a, const T* b, T* c, size_t size) {
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  for(; tid < size; tid += gridDim.x * blockDim.x) {
    c[tid] = a[tid] + b[tid];
  }
}

// multiply: element-wise c = a * b
template <typename T>
__global__
void multiply(const T* a, const T* b, T* c, size_t size) {
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  for(; tid < size; tid += gridDim.x * blockDim.x) {
    c[tid] = a[tid] * b[tid];
  }
}

// ----------------------------------------------------------------------------
// Incrementality
// ----------------------------------------------------------------------------

TEST_CASE("cudaFlowCapturer.Incrementality") {

  unsigned N = 1024;

  tf::cudaFlowCapturer cf;

  // construct a cudaflow of three tasks
  auto cpu = static_cast<int*>(std::calloc(N, sizeof(int)));
  auto gpu = tf::cuda_malloc_device<int>(N);

  dim3 g = {(N+255)/256, 1, 1};
  dim3 b = {256, 1, 1};

  auto h2d    = cf.copy(gpu, cpu, N);
  auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, N, 17);
  auto d2h    = cf.copy(cpu, gpu, N);

  h2d.precede(kernel);
  kernel.precede(d2h);

  REQUIRE(cf.num_tasks() == 3);
  REQUIRE(cf.empty() == false);
  REQUIRE(cf.native_executable() == nullptr);

  // run
  cf.run(0);
  cudaStreamSynchronize(0);

  auto native_graph = cf.native_graph();
  auto native_executable = cf.native_executable();

  REQUIRE(native_graph != nullptr);
  REQUIRE(native_executable != nullptr);
  REQUIRE(cf.num_tasks() == 3);
  REQUIRE(cf.empty() == false);
  REQUIRE(cf.native_graph() != nullptr);
  REQUIRE(cf.native_executable() != nullptr);
  REQUIRE(tf::cuda_graph_get_num_nodes(cf.native_graph()) == cf.num_tasks());

  for(unsigned i=0; i<N; i++) {
    REQUIRE(cpu[i] == 17);
  }

  // rebind the kernel to a new value and run the same capturer again
  int j = 21;  // value for the rebound kernel; the exact value is immaterial to the checks below
  cf.kernel(kernel, g, b, 0, k_add<int>, gpu, N, j);

  cf.run(0);
  cudaStreamSynchronize(0);

  auto updated_native_graph = cf.native_graph();
  auto updated_native_executable = cf.native_executable();

  // the underlying graph is rebuilt, but the executable is updated in place
  REQUIRE(updated_native_graph != native_graph);
  REQUIRE(updated_native_executable == native_executable);
  REQUIRE(cf.num_tasks() == 3);
  REQUIRE(cf.empty() == false);
  REQUIRE(cf.native_graph() != nullptr);
  REQUIRE(cf.native_executable() != nullptr);
  REQUIRE(tf::cuda_graph_get_num_nodes(cf.native_graph()) == cf.num_tasks());

  for(unsigned i=0; i<N; i++) {
    REQUIRE(cpu[i] == 17 + j);
  }

  std::free(cpu);
  tf::cuda_free(gpu);
}

//----------------------------------------------------------------------
//rebind kernel
//----------------------------------------------------------------------

template <typename F, typename T>
void rebind_kernel() {

  tf::Executor executor;

  for(size_t N = 1; N < 65529; N = N * 2 + 1) {

    tf::Taskflow taskflow;

    std::vector<T*> operand(3, nullptr);
    std::vector<T*> ans_operand(3, nullptr);

    std::vector<int> ind(3);
    std::generate_n(ind.data(), 3, [&](){ return ::rand() % 3; });

    bool* check {nullptr};

    //allocate
    auto allocate_t = taskflow.emplace([&]() {
      for(int i = 0; i < 3; ++i) {
        REQUIRE(cudaMallocManaged(&operand[i], N * sizeof(T)) == cudaSuccess);
        REQUIRE(cudaMallocManaged(&ans_operand[i], N * sizeof(T)) == cudaSuccess);
      }
      REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
    }).name("allocate");

    //initialize
    auto initialize_t = taskflow.emplace([&](){
      for(int i = 0; i < 3; ++i) {
        std::generate_n(operand[i], N, [&](){ return ::rand() % N - N / 2 + i; });
        std::memcpy(ans_operand[i], operand[i], N * sizeof(T));
      }
      *check = true;
    }).name("initialize");

    //rebind_kernel
    auto add_t = taskflow.emplace([&]() {
      F cf;

      auto multi_t = cf.kernel(
        32, 512, 0,
        multiply<T>,
        operand[ind[0]], operand[ind[1]], operand[ind[2]], N
      );
      auto add_t = cf.kernel(
        32, 512, 0,
        add<T>,
        operand[ind[1]], operand[ind[2]], operand[ind[0]], N
      );
      multi_t.precede(add_t);

      run_and_wait(cf);

      // rebind both kernels to new launch configurations and operands, then rerun
      cf.kernel(
        multi_t, 64, 128, 0,
        multiply<T>,
        operand[ind[2]], operand[ind[0]], operand[ind[1]], N
      );
      cf.kernel(
        add_t, 16, 256, 0,
        add<T>,
        operand[ind[1]], operand[ind[0]], operand[ind[2]], N
      );

      run_and_wait(cf);

      // rebind once more
      cf.kernel(
        multi_t, 8, 1024, 0,
        multiply<T>,
        operand[ind[0]], operand[ind[2]], operand[ind[1]], N
      );
      cf.kernel(
        add_t, 64, 64, 0,
        add<T>,
        operand[ind[2]], operand[ind[1]], operand[ind[0]], N
      );

      run_and_wait(cf);
    }).name("add");

    //verify
    auto verify_t = taskflow.emplace([&]() {

      F cf;

      //auto multi1_t = cf.transform(
      //  ans_operand[ind[2]], ans_operand[ind[2]] + N,
      //  [] __device__ (T& v1, T& v2) { return v1 * v2; },
      //  ans_operand[ind[0]], ans_operand[ind[1]]
      //);
      auto multi1_t = cf.transform(
        ans_operand[ind[0]], ans_operand[ind[0]] + N,
        ans_operand[ind[1]], ans_operand[ind[2]],
        [] __device__ (T& v1, T& v2) { return v1 * v2; }
      );

      //auto add1_t = cf.transform(
      //  ans_operand[ind[0]], ans_operand[ind[0]] + N,
      //  [] __device__ (T& v1, T& v2) { return v1 + v2; },
      //  ans_operand[ind[1]], ans_operand[ind[2]]
      //);
      auto add1_t = cf.transform(
        ans_operand[ind[1]], ans_operand[ind[1]] + N,
        ans_operand[ind[2]], ans_operand[ind[0]],
        [] __device__ (T& v1, T& v2) { return v1 + v2; }
      );

      //auto multi2_t = cf.transform(
      //  ans_operand[ind[1]], ans_operand[ind[1]] + N,
      //  [] __device__ (T& v1, T& v2) { return v1 * v2; },
      //  ans_operand[ind[2]], ans_operand[ind[0]]
      //);
      auto multi2_t = cf.transform(
        ans_operand[ind[2]], ans_operand[ind[2]] + N,
        ans_operand[ind[0]], ans_operand[ind[1]],
        [] __device__ (T& v1, T& v2) { return v1 * v2; }
      );

      //auto add2_t = cf.transform(
      //  ans_operand[ind[2]], ans_operand[ind[2]] + N,
      //  [] __device__ (T& v1, T& v2) { return v1 + v2; },
      //  ans_operand[ind[1]], ans_operand[ind[0]]
      //);
      auto add2_t = cf.transform(
        ans_operand[ind[1]], ans_operand[ind[1]] + N,
        ans_operand[ind[0]], ans_operand[ind[2]],
        [] __device__ (T& v1, T& v2) { return v1 + v2; }
      );

      auto multi3_t = cf.transform(
        ans_operand[ind[0]], ans_operand[ind[0]] + N,
        ans_operand[ind[2]], ans_operand[ind[1]],
        [] __device__ (T& v1, T& v2) { return v1 * v2; }
      );

      auto add3_t = cf.transform(
        ans_operand[ind[2]], ans_operand[ind[2]] + N,
        ans_operand[ind[1]], ans_operand[ind[0]],
        [] __device__ (T& v1, T& v2) { return v1 + v2; }
      );

      auto verify1_t = cf.kernel(
        32, 512, 0, verify<T>, operand[ind[0]], ans_operand[ind[0]], check, N
      );
      auto verify2_t = cf.kernel(
        32, 512, 0, verify<T>, operand[ind[1]], ans_operand[ind[1]], check, N
      );
      auto verify3_t = cf.kernel(
        32, 512, 0, verify<T>, operand[ind[2]], ans_operand[ind[2]], check, N
      );

      multi1_t.precede(add1_t);
      add1_t.precede(multi2_t);
      multi2_t.precede(add2_t);
      add2_t.precede(multi3_t);
      multi3_t.precede(add3_t);
      add3_t.precede(verify1_t).precede(verify2_t).precede(verify3_t);

      run_and_wait(cf);

      REQUIRE(*check);
    }).name("verify");

    //free memory
    auto deallocate_t = taskflow.emplace([&]() {
      for(int i = 0; i < 3; ++i) {
        REQUIRE(cudaFree(operand[i]) == cudaSuccess);
        REQUIRE(cudaFree(ans_operand[i]) == cudaSuccess);
      }
      REQUIRE(cudaFree(check) == cudaSuccess);
    }).name("deallocate");

    allocate_t.precede(initialize_t);
    initialize_t.precede(add_t);
    add_t.precede(verify_t);
    verify_t.precede(deallocate_t);

    executor.run(taskflow).wait();
  }
}

// cudaflow
TEST_CASE("cudaFlow.rebind.kernel.int" * doctest::timeout(300)) {
  rebind_kernel<tf::cudaFlow, int>();
}
TEST_CASE("cudaFlow.rebind.kernel.float" * doctest::timeout(300)) { rebind_kernel(); } TEST_CASE("cudaFlow.rebind.kernel.double" * doctest::timeout(300)) { rebind_kernel(); } // capturer TEST_CASE("cudaFlowCapturer.rebind.kernel.int" * doctest::timeout(300)) { rebind_kernel(); } TEST_CASE("cudaFlowCapturer.rebind.kernel.float" * doctest::timeout(300)) { rebind_kernel(); } TEST_CASE("cudaFlowCapturer.rebind.kernel.double" * doctest::timeout(300)) { rebind_kernel(); } //---------------------------------------------------------------------- //rebind copy //---------------------------------------------------------------------- template void rebind_copy() { tf::Executor executor; for(int N = 1; N < 65459; N = N * 2 + 1) { tf::Taskflow taskflow; std::vector ha(N, N + 5); std::vector hb(N, N - 31); std::vector hc(N, N - 47); std::vector hz(N); T* da {nullptr}; T* db {nullptr}; T* dc {nullptr}; T* dz {nullptr}; //allocate auto allocate_t = taskflow.emplace([&]() { REQUIRE(cudaMalloc(&da, N * sizeof(T)) == cudaSuccess); REQUIRE(cudaMalloc(&db, N * sizeof(T)) == cudaSuccess); REQUIRE(cudaMalloc(&dc, N * sizeof(T)) == cudaSuccess); REQUIRE(cudaMalloc(&dz, N * sizeof(T)) == cudaSuccess); }).name("allocate"); //rebind_copy auto h2d_t = taskflow.emplace([&]() { F cf; auto h2d_t = cf.copy(da, ha.data(), N).name("h2d"); run_and_wait(cf); cf.copy(h2d_t, db, hb.data(), N); run_and_wait(cf); cf.copy(h2d_t, dc, hc.data(), N); run_and_wait(cf); }); auto kernel_t = taskflow.emplace([&]() { F cf; //auto add1_t = cf.transform( // dz, dz + N, // [] __device__ (T& v1, T& v2) { return v1 + v2; }, // da, db //); auto add1_t = cf.transform( da, da+N, db, dz, [] __device__ (T& v1, T& v2) { return v1 + v2; } ); //auto add2_t = cf.transform( // dc, dc + N, // [] __device__ (T& v1, T& v2) { return v1 - v2; }, // dc, dz //); auto add2_t = cf.transform( dc, dc + N, dz, dc, [] __device__ (T& v1, T& v2) { return v1 - v2; } ); add1_t.precede(add2_t); run_and_wait(cf); }); auto d2h_t = taskflow.emplace([&]() { F cf; auto d2h_t = cf.copy(hc.data(), dc, N).name("d2h"); run_and_wait(cf); cf.copy(d2h_t, hz.data(), dz, N); run_and_wait(cf); }); //verify auto verify_t = taskflow.emplace([&]() { for(auto& c: hc) { REQUIRE(c == -21 - N); } for(auto& z: hz) { REQUIRE(z == 2 * N - 26); } }); //free memory auto deallocate_t = taskflow.emplace([&]() { REQUIRE(cudaFree(da) == cudaSuccess); REQUIRE(cudaFree(db) == cudaSuccess); REQUIRE(cudaFree(dc) == cudaSuccess); REQUIRE(cudaFree(dz) == cudaSuccess); }).name("deallocate"); allocate_t.precede(h2d_t); h2d_t.precede(kernel_t); kernel_t.precede(d2h_t); d2h_t.precede(verify_t); verify_t.precede(deallocate_t); executor.run(taskflow).wait(); } } // cudaFlow TEST_CASE("cudaFlow.rebind.copy.int" * doctest::timeout(300)) { rebind_copy(); } TEST_CASE("cudaFlow.rebind.copy.float" * doctest::timeout(300)) { rebind_copy(); } TEST_CASE("cudaFlow.rebind.copy.double" * doctest::timeout(300)) { rebind_copy(); } // cudaFlowCapturer TEST_CASE("cudaFlowCapturer.rebind.copy.int" * doctest::timeout(300)) { rebind_copy(); } TEST_CASE("cudaFlowCapturer.rebind.copy.float" * doctest::timeout(300)) { rebind_copy(); } TEST_CASE("cudaFlowCapturer.rebind.copy.double" * doctest::timeout(300)) { rebind_copy(); } //---------------------------------------------------------------------- // rebind memcpy //---------------------------------------------------------------------- template void rebind_memcpy() { tf::Executor executor; for(int N = 1; N < 65459; N = N * 2 + 1) { tf::Taskflow taskflow; std::vector ha(N, N + 5); 
    std::vector<T> hb(N, N - 31);
    std::vector<T> hc(N, N - 47);
    std::vector<T> hz(N);

    T* da {nullptr};
    T* db {nullptr};
    T* dc {nullptr};
    T* dz {nullptr};

    //allocate
    auto allocate_t = taskflow.emplace([&]() {
      REQUIRE(cudaMalloc(&da, N * sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&db, N * sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&dc, N * sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&dz, N * sizeof(T)) == cudaSuccess);
    }).name("allocate");

    //rebind_memcpy
    auto h2d_t = taskflow.emplace([&]() {
      F cf;
      auto h2d_t = cf.memcpy(da, ha.data(), sizeof(T) * N).name("h2d");
      run_and_wait(cf);

      cf.memcpy(h2d_t, db, hb.data(), sizeof(T) * N);
      run_and_wait(cf);

      cf.memcpy(h2d_t, dc, hc.data(), sizeof(T) * N);
      run_and_wait(cf);
    });

    auto kernel_t = taskflow.emplace([&]() {
      F cf;

      auto add1_t = cf.transform(
        da, da + N, db, dz,
        [] __device__ (T& v1, T& v2) { return v1 + v2; }
      );

      auto add2_t = cf.transform(
        dc, dc + N, dz, dc,
        [] __device__ (T& v1, T& v2) { return v1 - v2; }
      );

      add1_t.precede(add2_t);

      run_and_wait(cf);
    });

    auto d2h_t = taskflow.emplace([&]() {
      F cf;
      auto d2h_t = cf.memcpy(hc.data(), dc, sizeof(T) * N).name("d2h");
      run_and_wait(cf);

      cf.memcpy(d2h_t, hz.data(), dz, sizeof(T) * N);
      run_and_wait(cf);
    });

    //verify
    auto verify_t = taskflow.emplace([&]() {
      for(auto& c: hc) {
        REQUIRE(c == -21 - N);
      }
      for(auto& z: hz) {
        REQUIRE(z == 2 * N - 26);
      }
    });

    //free memory
    auto deallocate_t = taskflow.emplace([&]() {
      REQUIRE(cudaFree(da) == cudaSuccess);
      REQUIRE(cudaFree(db) == cudaSuccess);
      REQUIRE(cudaFree(dc) == cudaSuccess);
      REQUIRE(cudaFree(dz) == cudaSuccess);
    }).name("deallocate");

    allocate_t.precede(h2d_t);
    h2d_t.precede(kernel_t);
    kernel_t.precede(d2h_t);
    d2h_t.precede(verify_t);
    verify_t.precede(deallocate_t);

    executor.run(taskflow).wait();
  }
}

// cudaflow
TEST_CASE("cudaFlow.rebind.memcpy.int" * doctest::timeout(300)) {
  rebind_memcpy<tf::cudaFlow, int>();
}

TEST_CASE("cudaFlow.rebind.memcpy.float" * doctest::timeout(300)) {
  rebind_memcpy<tf::cudaFlow, float>();
}

TEST_CASE("cudaFlow.rebind.memcpy.double" * doctest::timeout(300)) {
  rebind_memcpy<tf::cudaFlow, double>();
}

// capturer
TEST_CASE("cudaFlowCapturer.rebind.memcpy.int" * doctest::timeout(300)) {
  rebind_memcpy<tf::cudaFlowCapturer, int>();
}

TEST_CASE("cudaFlowCapturer.rebind.memcpy.float" * doctest::timeout(300)) {
  rebind_memcpy<tf::cudaFlowCapturer, float>();
}

TEST_CASE("cudaFlowCapturer.rebind.memcpy.double" * doctest::timeout(300)) {
  rebind_memcpy<tf::cudaFlowCapturer, double>();
}

//----------------------------------------------------------------------
//rebind memset
//----------------------------------------------------------------------

template <typename F, typename T>
void rebind_memset() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(size_t N = 1; N < 65199; N = N * 2 + 1) {

    taskflow.clear();

    T* a {nullptr};
    T* b {nullptr};
    T* ans_a {nullptr};
    T* ans_b {nullptr};
    bool* check {nullptr};

    //allocate
    auto allocate_t = taskflow.emplace([&]() {
      REQUIRE(cudaMallocManaged(&a, N * sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMallocManaged(&b, (N + 37) * sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMallocManaged(&ans_a, N * sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMallocManaged(&ans_b, (N + 37) * sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
    }).name("allocate");

    //initialize
    auto initialize_t = taskflow.emplace([&]() {
      std::generate_n(a, N, [&](){ return ::rand() % N - N / 2; });
      std::generate_n(b, N + 37, [&](){ return ::rand() % N + N / 2; });
      REQUIRE(cudaMemset(ans_a, 0, N * sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMemset(ans_b, 1, (N + 37) * sizeof(T)) == cudaSuccess);
      *check = true;
    }).name("initialize");

    //rebind_memset
    auto memset_t = taskflow.emplace([&]() {
      F cf;
      auto memset_t = cf.memset(ans_a, 0, N * sizeof(T));
      run_and_wait(cf);

      cf.memset(memset_t, a, 0, N * sizeof(T));
      run_and_wait(cf);

      cf.memset(memset_t, b, 1, (N + 37) * sizeof(T));
      run_and_wait(cf);
    }).name("memset");

    //verify
    auto verify_t = taskflow.emplace([&]() {
      F cf;
      cf.kernel(
        32, 512, 0, verify<T>, a, ans_a, check, N
      );
      cf.kernel(
        32, 512, 0, verify<T>, b, ans_b, check, N + 37
      );
      run_and_wait(cf);
      REQUIRE(*check);
    }).name("verify");

    //free memory
    auto deallocate_t = taskflow.emplace([&]() {
      REQUIRE(cudaFree(a) == cudaSuccess);
      REQUIRE(cudaFree(b) == cudaSuccess);
      REQUIRE(cudaFree(ans_a) == cudaSuccess);
      REQUIRE(cudaFree(ans_b) == cudaSuccess);
      REQUIRE(cudaFree(check) == cudaSuccess);
    }).name("deallocate");

    allocate_t.precede(initialize_t);
    initialize_t.precede(memset_t);
    memset_t.precede(verify_t);
    verify_t.precede(deallocate_t);

    executor.run(taskflow).wait();
  }
}

// cudaflow
TEST_CASE("cudaFlow.rebind.memset.int" * doctest::timeout(300)) {
  rebind_memset<tf::cudaFlow, int>();
}

TEST_CASE("cudaFlow.rebind.memset.float" * doctest::timeout(300)) {
  rebind_memset<tf::cudaFlow, float>();
}

TEST_CASE("cudaFlow.rebind.memset.double" * doctest::timeout(300)) {
  rebind_memset<tf::cudaFlow, double>();
}

// capturer
TEST_CASE("cudaFlowCapturer.rebind.memset.int" * doctest::timeout(300)) {
  rebind_memset<tf::cudaFlowCapturer, int>();
}

TEST_CASE("cudaFlowCapturer.rebind.memset.float" * doctest::timeout(300)) {
  rebind_memset<tf::cudaFlowCapturer, float>();
}

TEST_CASE("cudaFlowCapturer.rebind.memset.double" * doctest::timeout(300)) {
  rebind_memset<tf::cudaFlowCapturer, double>();
}

// ----------------------------------------------------------------------------
// rebind algorithms
// ----------------------------------------------------------------------------

TEST_CASE("cudaFlowCapturer.rebind.algorithms") {

  tf::cudaFlowCapturer capturer;

  auto data = tf::cuda_malloc_shared<int>(10000);
  auto res  = tf::cuda_malloc_shared<int>(1);

  auto task = capturer.for_each(
    data, data + 10000, [] __device__ (int& i) { i = 10; }
  );

  run_and_wait(capturer);

  for(int i=0; i<10000; i++) {
    REQUIRE(data[i] == 10);
  }
  REQUIRE(capturer.num_tasks() == 1);

  // rebind to single task
  capturer.single_task(task, [=] __device__ () { *data = 2; });

  run_and_wait(capturer);

  REQUIRE(*data == 2);
  for(int i=1; i<10000; i++) {
    REQUIRE(data[i] == 10);
  }
  REQUIRE(capturer.num_tasks() == 1);

  // rebind to for each index
  capturer.for_each_index(
    task, 0, 10000, 1, [=] __device__ (int i) { data[i] = -23; }
  );

  run_and_wait(capturer);

  for(int i=0; i<10000; i++) {
    REQUIRE(data[i] == -23);
  }
  REQUIRE(capturer.num_tasks() == 1);

  // rebind to single task
  capturer.single_task(task, [res] __device__ () { *res = 999; });

  run_and_wait(capturer);

  REQUIRE(*res == 999);
  REQUIRE(capturer.num_tasks() == 1);

  // clear the capturer
  capturer.clear();
  REQUIRE(capturer.num_tasks() == 0);

  run_and_wait(capturer);

  REQUIRE(*res == 999);
  for(int i=0; i<10000; i++) {
    REQUIRE(data[i] == -23);
  }

  // clear the memory
  tf::cuda_free(data);
  tf::cuda_free(res);
}
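
// ----------------------------------------------------------------------------
// usage sketch
// ----------------------------------------------------------------------------

// A minimal, self-contained sketch of the rebind pattern exercised by the
// tests above, assuming the same tf::cudaFlow interface they use (kernel task
// creation plus the task-taking kernel overload for rebinding). The function
// name and the constants below are illustrative only; this is not registered
// as a test case.
void rebind_sketch() {

  size_t N = 1024;
  auto data = tf::cuda_malloc_shared<int>(N);

  tf::cudaFlow cf;

  // first binding: 4x256 threads, add 1 to each of the N elements
  auto task = cf.kernel(4, 256, 0, k_add<int>, data, N, 1);
  run_and_wait(cf);

  // rebind the same task to a new launch configuration and argument,
  // then rerun the flow without rebuilding it from scratch
  cf.kernel(task, 8, 128, 0, k_add<int>, data, N, 2);
  run_and_wait(cf);

  tf::cuda_free(data);
}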