#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include #include // ---------------------------------------------------------------------------- // kernel helper // ---------------------------------------------------------------------------- template __global__ void k_set(T* ptr, size_t N, T value) { int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < N) { ptr[i] = value; } } template __global__ void k_single_set(T* ptr, int i, T value) { ptr[i] = value; } template __global__ void k_add(T* ptr, size_t N, T value) { int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < N) { ptr[i] += value; } } template __global__ void k_single_add(T* ptr, int i, T value) { ptr[i] += value; } template void run_and_wait(T& cf) { tf::cudaStream stream; cf.run(stream); stream.synchronize(); } // -------------------------------------------------------- // Testcase: Empty // -------------------------------------------------------- template void empty() { std::atomic counter{0}; tf::Taskflow taskflow; tf::Executor executor; taskflow.emplace([&](){ T tf; ++counter; }); taskflow.emplace([&](){ T tf; ++counter; }); taskflow.emplace([&](){ T tf; ++counter; }); executor.run_n(taskflow, 100).wait(); REQUIRE(counter == 300); } TEST_CASE("Empty" * doctest::timeout(300)) { empty(); } TEST_CASE("EmptyCapture" * doctest::timeout(300)) { empty(); } // ---------------------------------------------------------------------------- // Move Semantics // ---------------------------------------------------------------------------- template void move_semantics() { unsigned N = 1024; F rhs; REQUIRE(rhs.num_tasks() == 0); REQUIRE(rhs.empty()); REQUIRE(rhs.native_executable() == nullptr); // construct a cudaflow of three tasks auto cpu = static_cast(std::calloc(N, sizeof(int))); auto gpu = tf::cuda_malloc_device(N); dim3 g = {(N+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = rhs.copy(gpu, cpu, N); auto kernel = rhs.kernel(g, b, 0, k_add, gpu, N, 17); auto d2h = rhs.copy(cpu, gpu, N); h2d.precede(kernel); kernel.precede(d2h); REQUIRE(rhs.num_tasks() == 3); REQUIRE(rhs.empty() == false); REQUIRE(rhs.native_executable() == nullptr); // construct a rhs F lhs( std::move(rhs) ); REQUIRE(rhs.num_tasks() == 0); REQUIRE(rhs.empty()); REQUIRE(rhs.native_executable() == nullptr); REQUIRE(lhs.num_tasks() == 3); REQUIRE(lhs.empty() == false); REQUIRE(lhs.native_executable() == nullptr); // assign lhs to rhs using move semantics rhs = std::move(lhs); REQUIRE(lhs.num_tasks() == 0); REQUIRE(lhs.empty()); REQUIRE(lhs.native_executable() == nullptr); REQUIRE(rhs.num_tasks() == 3); REQUIRE(rhs.empty() == false); REQUIRE(rhs.native_executable() == nullptr); // run rhs.run(0); cudaStreamSynchronize(0); auto native_graph = rhs.native_graph(); auto native_executable = rhs.native_executable(); REQUIRE(native_graph != nullptr); REQUIRE(native_executable != nullptr); REQUIRE(rhs.num_tasks() == 3); REQUIRE(rhs.empty() == false); REQUIRE(rhs.native_graph() != nullptr); REQUIRE(rhs.native_executable() != nullptr); REQUIRE(tf::cuda_graph_get_num_nodes(rhs.native_graph()) == rhs.num_tasks()); for(unsigned i=0; i(); } TEST_CASE("cudaFlowCapturer.MoveSemantics" * doctest::timeout(300)) { move_semantics(); } // ---------------------------------------------------------------------------- // Standalone // ---------------------------------------------------------------------------- template void standalone() { T cf; tf::cudaStream stream; REQUIRE(cf.empty()); unsigned N = 1024; auto cpu = static_cast(std::calloc(N, sizeof(int))); auto gpu = tf::cuda_malloc_device(N); dim3 g = {(N+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu, cpu, N); auto kernel = cf.kernel(g, b, 0, k_add, gpu, N, 17); auto d2h = cf.copy(cpu, gpu, N); h2d.precede(kernel); kernel.precede(d2h); for(unsigned i=0; i(); } TEST_CASE("Standalone.cudaCapturer") { standalone(); } // -------------------------------------------------------- // Testcase: Set // -------------------------------------------------------- template void set() { tf::Executor executor; tf::Taskflow taskflow; for(unsigned n=1; n<=123456; n = n*2 + 1) { taskflow.clear(); T* cpu = nullptr; T* gpu = nullptr; auto cputask = taskflow.emplace([&](){ cpu = static_cast(std::calloc(n, sizeof(T))); REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); }); auto gputask = taskflow.emplace([&]() { tf::cudaFlow cf; auto h2d = cf.copy(gpu, cpu, n); auto kernel = cf.kernel((n+255)/256, 256, 0, k_set, gpu, n, (T)17); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); cputask.precede(gputask); executor.run(taskflow).wait(); for(unsigned i=0; i(); } TEST_CASE("Set.i16" * doctest::timeout(300)) { set(); } TEST_CASE("Set.i32" * doctest::timeout(300)) { set(); } // -------------------------------------------------------- // Testcase: Add // -------------------------------------------------------- template void add() { for(unsigned n=1; n<=123456; n = n*2 + 1) { tf::Taskflow taskflow; tf::Executor executor; T* cpu = nullptr; T* gpu = nullptr; auto cputask = taskflow.emplace([&](){ cpu = static_cast(std::calloc(n, sizeof(T))); REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); }); auto gputask = taskflow.emplace([&](){ tf::cudaFlow cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu, cpu, n); auto ad1 = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto ad2 = cf.kernel(g, b, 0, k_add, gpu, n, 2); auto ad3 = cf.kernel(g, b, 0, k_add, gpu, n, 3); auto ad4 = cf.kernel(g, b, 0, k_add, gpu, n, 4); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(ad1); ad1.precede(ad2); ad2.precede(ad3); ad3.precede(ad4); ad4.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); cputask.precede(gputask); executor.run(taskflow).wait(); for(unsigned i=0; i(); } TEST_CASE("Add.i16" * doctest::timeout(300)) { add(); } TEST_CASE("Add.i32" * doctest::timeout(300)) { add(); } // TODO: 64-bit fail? //TEST_CASE("Add.i64" * doctest::timeout(300)) { // add(); //} // -------------------------------------------------------- // Testcase: Binary Set // -------------------------------------------------------- template void bset() { const unsigned n = 10000; tf::Taskflow taskflow; tf::Executor executor; T* cpu = nullptr; T* gpu = nullptr; auto cputask = taskflow.emplace([&](){ cpu = static_cast(std::calloc(n, sizeof(T))); REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); }); auto gputask = taskflow.emplace([&]() { F cf; dim3 g = {1, 1, 1}; dim3 b = {1, 1, 1}; auto h2d = cf.copy(gpu, cpu, n); auto d2h = cf.copy(cpu, gpu, n); std::vector tasks(n+1); for(unsigned i=1; i<=n; ++i) { tasks[i] = cf.kernel(g, b, 0, k_single_set, gpu, i-1, (T)17); auto p = i/2; if(p != 0) { tasks[p].precede(tasks[i]); } tasks[i].precede(d2h); h2d.precede(tasks[i]); } run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); cputask.precede(gputask); executor.run(taskflow).wait(); for(unsigned i=0; i(); } TEST_CASE("BSet.i16" * doctest::timeout(300)) { bset(); } TEST_CASE("BSet.i32" * doctest::timeout(300)) { bset(); } TEST_CASE("CapturedBSet.i8" * doctest::timeout(300)) { bset(); } TEST_CASE("CapturedBSet.i16" * doctest::timeout(300)) { bset(); } TEST_CASE("CapturedBSet.i32" * doctest::timeout(300)) { bset(); } // -------------------------------------------------------- // Testcase: Memset // -------------------------------------------------------- template void memset() { tf::Taskflow taskflow; tf::Executor executor; const int N = 100; int* cpu = new int [N]; int* gpu = nullptr; REQUIRE(cudaMalloc(&gpu, N*sizeof(int)) == cudaSuccess); for(int r=1; r<=100; ++r) { int start = ::rand() % N; for(int i=0; i, gpu, N, 123); auto copy = cf.copy(cpu, gpu, N); auto zero = cf.memset(gpu+start, 0x3f, (N-start)*sizeof(int)); kset.precede(zero); zero.precede(copy); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); executor.run(taskflow).wait(); for(int i=0; i(); } TEST_CASE("CapturedMemset" * doctest::timeout(300)) { memset(); } // -------------------------------------------------------- // Testcase: Memset0 // -------------------------------------------------------- template void memset0() { tf::Taskflow taskflow; tf::Executor executor; const int N = 97; T* cpu = new T [N]; T* gpu = nullptr; REQUIRE(cudaMalloc(&gpu, N*sizeof(T)) == cudaSuccess); for(int r=1; r<=100; ++r) { int start = ::rand() % N; for(int i=0; i, gpu, N, (T)123); auto zero = cf.memset(gpu+start, (T)0, (N-start)*sizeof(T)); auto copy = cf.copy(cpu, gpu, N); kset.precede(zero); zero.precede(copy); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); executor.run(taskflow).wait(); for(int i=0; i(); } TEST_CASE("Memset0.i16") { memset0(); } TEST_CASE("Memset0.i32") { memset0(); } TEST_CASE("Memset0.f32") { memset0(); } TEST_CASE("Memset0.f64") { memset0(); } TEST_CASE("CapturedMemset0.i8") { memset0(); } TEST_CASE("CapturedMemset0.i16") { memset0(); } TEST_CASE("CapturedMemset0.i32") { memset0(); } TEST_CASE("CapturedMemset0.f32") { memset0(); } TEST_CASE("CapturedMemset0.f64") { memset0(); } // -------------------------------------------------------- // Testcase: Memcpy // -------------------------------------------------------- template void memcpy() { tf::Taskflow taskflow; tf::Executor executor; const int N = 97; T* cpu = new T [N]; T* gpu = nullptr; REQUIRE(cudaMalloc(&gpu, N*sizeof(T)) == cudaSuccess); for(int r=1; r<=100; ++r) { int start = ::rand() % N; for(int i=0; i, gpu, N, (T)123); auto zero = cf.memset(gpu+start, (T)0, (N-start)*sizeof(T)); auto copy = cf.memcpy(cpu, gpu, N*sizeof(T)); kset.precede(zero); zero.precede(copy); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); executor.run(taskflow).wait(); for(int i=0; i(); } TEST_CASE("Memcpy.i16") { memcpy(); } TEST_CASE("Memcpy.i32") { memcpy(); } TEST_CASE("Memcpy.f32") { memcpy(); } TEST_CASE("Memcpy.f64") { memcpy(); } TEST_CASE("CapturedMemcpy.i8") { memcpy(); } TEST_CASE("CapturedMemcpy.i16") { memcpy(); } TEST_CASE("CapturedMemcpy.i32") { memcpy(); } TEST_CASE("CapturedMemcpy.f32") { memcpy(); } TEST_CASE("CapturedMemcpy.f64") { memcpy(); } // -------------------------------------------------------- // Testcase: fill // -------------------------------------------------------- template void fill(T value) { tf::Taskflow taskflow; tf::Executor executor; const int N = 107; T* cpu = new T [N]; T* gpu = nullptr; REQUIRE(cudaMalloc(&gpu, N*sizeof(T)) == cudaSuccess); for(int r=1; r<=100; ++r) { int start = ::rand() % N; for(int i=0; i, gpu, N, (T)123); auto fill = cf.fill(gpu+start, value, (N-start)); auto copy = cf.copy(cpu, gpu, N); kset.precede(fill); fill.precede(copy); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); executor.run(taskflow).wait(); for(int i=0; i(+123); fill(-123); } TEST_CASE("Fill.i16") { fill(+12345); fill(-12345); } TEST_CASE("Fill.i32") { fill(+123456789); fill(-123456789); } TEST_CASE("Fill.f32") { fill(+123456789.0f); fill(-123456789.0f); } // -------------------------------------------------------- // Testcase: Zero // -------------------------------------------------------- template void zero() { tf::Taskflow taskflow; tf::Executor executor; const int N = 100; T* cpu = new T [N]; T* gpu = nullptr; REQUIRE(cudaMalloc(&gpu, N*sizeof(T)) == cudaSuccess); for(int r=1; r<=100; ++r) { int start = ::rand() % N; for(int i=0; i, gpu, N, (T)123); auto zero = cf.zero(gpu+start, (N-start)); auto copy = cf.copy(cpu, gpu, N); kset.precede(zero); zero.precede(copy); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); executor.run(taskflow).wait(); for(int i=0; i(); } TEST_CASE("Zero.i16") { zero(); } TEST_CASE("Zero.i32") { zero(); } TEST_CASE("Zero.f32") { zero(); } // -------------------------------------------------------- // Testcase: Barrier // -------------------------------------------------------- template void barrier() { const unsigned n = 1000; tf::Taskflow taskflow; tf::Executor executor; T* cpu = nullptr; T* gpu = nullptr; auto cputask = taskflow.emplace([&](){ cpu = static_cast(std::calloc(n, sizeof(T))); REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); }); auto gputask = taskflow.emplace([&]() { tf::cudaFlow cf; dim3 g = {1, 1, 1}; dim3 b = {1, 1, 1}; auto br1 = cf.noop(); auto br2 = cf.noop(); auto br3 = cf.noop(); auto h2d = cf.copy(gpu, cpu, n); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(br1); for(unsigned i=0; i, gpu, i, (T)17); k1.succeed(br1) .precede(br2); auto k2 = cf.kernel(g, b, 0, k_single_add, gpu, i, (T)3); k2.succeed(br2) .precede(br3); } br3.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); cputask.precede(gputask); executor.run(taskflow).wait(); for(unsigned i=0; i(); } TEST_CASE("Barrier.i16" * doctest::timeout(300)) { barrier(); } TEST_CASE("Barrier.i32" * doctest::timeout(300)) { barrier(); } // ---------------------------------------------------------------------------- // NestedRuns // ---------------------------------------------------------------------------- template void nested_runs() { int* cpu = nullptr; int* gpu = nullptr; constexpr unsigned n = 1000; cpu = static_cast(std::calloc(n, sizeof(int))); REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); struct A { tf::Executor executor; tf::Taskflow taskflow; void run(int* cpu, int* gpu, unsigned n) { taskflow.clear(); auto A1 = taskflow.emplace([&]() { F cf; cf.copy(gpu, cpu, n); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); auto A2 = taskflow.emplace([&]() { F cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; cf.kernel(g, b, 0, k_add, gpu, n, 1); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); auto A3 = taskflow.emplace([&] () { F cf; cf.copy(cpu, gpu, n); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); A1.precede(A2); A2.precede(A3); executor.run_n(taskflow, 10).wait(); } }; struct B { tf::Taskflow taskflow; tf::Executor executor; A a; void run(int* cpu, int* gpu, unsigned n) { taskflow.clear(); auto B0 = taskflow.emplace([] () {}); auto B1 = taskflow.emplace([&] () { F cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu, cpu, n); auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); auto B2 = taskflow.emplace([&] () { a.run(cpu, gpu, n); }); auto B3 = taskflow.emplace([&] () { for(unsigned i=0; i(); } TEST_CASE("CapturedNestedRuns" * doctest::timeout(300)) { nested_runs(); } /* // ---------------------------------------------------------------------------- // WorkerID // ---------------------------------------------------------------------------- void worker_id(unsigned N, unsigned M) { tf::Taskflow taskflow; tf::Executor executor(N + M); REQUIRE(executor.num_workers() == (N + M)); const unsigned s = 100; for(unsigned k=0; k= 0); REQUIRE(id < N+M); }); auto gputask = taskflow.emplace([&](tf::cudaFlow&) { auto id = executor.this_worker_id(); REQUIRE(id >= 0); REQUIRE(id < N+M); }); auto chktask = taskflow.emplace([&] () { auto id = executor.this_worker_id(); REQUIRE(id >= 0); REQUIRE(id < N+M); }); taskflow.emplace([&](tf::cudaFlow&) { auto id = executor.this_worker_id(); REQUIRE(id >= 0); REQUIRE(id < N+M); }); taskflow.emplace([&]() { auto id = executor.this_worker_id(); REQUIRE(id >= 0); REQUIRE(id < N+M); }); auto subflow = taskflow.emplace([&](tf::Subflow& sf){ auto id = executor.this_worker_id(); REQUIRE(id >= 0); REQUIRE(id < N+M); auto t1 = sf.emplace([&](){ auto id = executor.this_worker_id(); REQUIRE(id >= 0); REQUIRE(id < N+M); }); auto t2 = sf.emplace([&](tf::cudaFlow&){ auto id = executor.this_worker_id(); REQUIRE(id >= 0); REQUIRE(id < N+M); }); t1.precede(t2); }); cputask.precede(gputask); gputask.precede(chktask); chktask.precede(subflow); } executor.run_n(taskflow, 10).wait(); } TEST_CASE("WorkerID.1C1G") { worker_id(1, 1); } TEST_CASE("WorkerID.1C2G") { worker_id(1, 2); } TEST_CASE("WorkerID.1C3G") { worker_id(1, 3); } TEST_CASE("WorkerID.1C4G") { worker_id(1, 4); } TEST_CASE("WorkerID.2C1G") { worker_id(2, 1); } TEST_CASE("WorkerID.2C2G") { worker_id(2, 2); } TEST_CASE("WorkerID.2C3G") { worker_id(2, 3); } TEST_CASE("WorkerID.2C4G") { worker_id(2, 4); } TEST_CASE("WorkerID.3C1G") { worker_id(3, 1); } TEST_CASE("WorkerID.3C2G") { worker_id(3, 2); } TEST_CASE("WorkerID.3C3G") { worker_id(3, 3); } TEST_CASE("WorkerID.3C4G") { worker_id(3, 4); } TEST_CASE("WorkerID.4C1G") { worker_id(4, 1); } TEST_CASE("WorkerID.4C2G") { worker_id(4, 2); } TEST_CASE("WorkerID.4C3G") { worker_id(4, 3); } TEST_CASE("WorkerID.4C4G") { worker_id(4, 4); } */ // ---------------------------------------------------------------------------- // Multiruns // ---------------------------------------------------------------------------- void multiruns(unsigned N, unsigned M) { tf::Taskflow taskflow; tf::Executor executor(N + M); const unsigned n = 1000; const unsigned s = 100; int *cpu[s] = {0}; int *gpu[s] = {0}; for(unsigned k=0; k(std::calloc(n, sizeof(int))); REQUIRE(cudaMalloc(&gpu[k], n*sizeof(int)) == cudaSuccess); }); auto gputask = taskflow.emplace([&, k, number]() { tf::cudaFlow cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu[k], cpu[k], n); auto kernel = cf.kernel(g, b, 0, k_add, gpu[k], n, number); auto d2h = cf.copy(cpu[k], gpu[k], n); h2d.precede(kernel); kernel.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); auto chktask = taskflow.emplace([&, k, number] () { for(unsigned i=0; i void subflow() { tf::Taskflow taskflow; tf::Executor executor; int* cpu = nullptr; int* gpu = nullptr; const unsigned n = 1000; auto partask = taskflow.emplace([&](tf::Subflow& sf){ auto cputask = sf.emplace([&](){ cpu = static_cast(std::calloc(n, sizeof(int))); REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); }); auto gputask = sf.emplace([&]() { F cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu, cpu, n); auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); cputask.precede(gputask); }); auto chktask = taskflow.emplace([&](){ for(unsigned i=0; i(); } TEST_CASE("CapturedSubflow" * doctest::timeout(300)) { subflow(); } // ---------------------------------------------------------------------------- // NestedSubflow // ---------------------------------------------------------------------------- template void nested_subflow() { tf::Taskflow taskflow; tf::Executor executor; int* cpu = nullptr; int* gpu = nullptr; const unsigned n = 1000; auto cputask = taskflow.emplace([&](){ cpu = static_cast(std::calloc(n, sizeof(int))); REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); }); auto partask = taskflow.emplace([&](tf::Subflow& sf){ auto gputask1 = sf.emplace([&]() { F cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu, cpu, n); auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); auto subtask1 = sf.emplace([&](tf::Subflow& sf2) { auto gputask2 = sf2.emplace([&]() { F cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu, cpu, n); auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); auto subtask2 = sf2.emplace([&](tf::Subflow& sf3){ sf3.emplace([&]() { F cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu, cpu, n); auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); }); gputask2.precede(subtask2); }); gputask1.precede(subtask1); }); auto chktask = taskflow.emplace([&](){ for(unsigned i=0; i(); } TEST_CASE("CapturedNestedSubflow" * doctest::timeout(300) ) { nested_subflow(); } // ---------------------------------------------------------------------------- // DetachedSubflow // ---------------------------------------------------------------------------- template void detached_subflow() { tf::Taskflow taskflow; tf::Executor executor; int* cpu = nullptr; int* gpu = nullptr; const unsigned n = 1000; taskflow.emplace([&](tf::Subflow& sf){ auto cputask = sf.emplace([&](){ cpu = static_cast(std::calloc(n, sizeof(int))); REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); }); auto gputask = sf.emplace([&]() { F cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu, cpu, n); auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); cputask.precede(gputask); sf.detach(); }); executor.run(taskflow).wait(); for(unsigned i=0; i(); } TEST_CASE("CapturedDetachedSubflow" * doctest::timeout(300)) { detached_subflow(); } // ---------------------------------------------------------------------------- // Conditional GPU tasking // ---------------------------------------------------------------------------- template void loop() { tf::Taskflow taskflow; tf::Executor executor; const unsigned n = 1000; int* cpu = nullptr; int* gpu = nullptr; auto cputask = taskflow.emplace([&](){ cpu = static_cast(std::calloc(n, sizeof(int))); REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); }); auto gputask = taskflow.emplace([&]() { F cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto h2d = cf.copy(gpu, cpu, n); auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto d2h = cf.copy(cpu, gpu, n); h2d.precede(kernel); kernel.precede(d2h); run_and_wait(cf); REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph())); }); auto condition = taskflow.emplace([&cpu, round=0] () mutable { ++round; for(unsigned i=0; i= 100; }); auto freetask = taskflow.emplace([&](){ REQUIRE(cudaFree(gpu) == cudaSuccess); std::free(cpu); }); cputask.precede(gputask); gputask.precede(condition); condition.precede(gputask, freetask); executor.run(taskflow).wait(); } TEST_CASE("Loop" * doctest::timeout(300)) { loop(); } TEST_CASE("CapturedLoop" * doctest::timeout(300)) { loop(); } // ---------------------------------------------------------------------------- // Predicate // ---------------------------------------------------------------------------- TEST_CASE("Predicate") { tf::Taskflow taskflow; tf::Executor executor; const unsigned n = 1000; int* cpu = nullptr; int* gpu = nullptr; auto cputask = taskflow.emplace([&](){ cpu = static_cast(std::calloc(n, sizeof(int))); REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); REQUIRE(cudaMemcpy(gpu, cpu, n*sizeof(int), cudaMemcpyHostToDevice) == cudaSuccess); }); auto gputask = taskflow.emplace([&]() { tf::cudaFlow cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto copy = cf.copy(cpu, gpu, n); kernel.precede(copy); tf::cudaStream stream; for(int i=0; i<100; i++) { cf.run(stream); } stream.synchronize(); }); auto freetask = taskflow.emplace([&](){ for(unsigned i=0; i(std::calloc(n, sizeof(int))); REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess); REQUIRE(cudaMemcpy(gpu, cpu, n*sizeof(int), cudaMemcpyHostToDevice) == cudaSuccess); }); auto gputask = taskflow.emplace([&]() { tf::cudaFlow cf; dim3 g = {(n+255)/256, 1, 1}; dim3 b = {256, 1, 1}; auto kernel = cf.kernel(g, b, 0, k_add, gpu, n, 1); auto copy = cf.copy(cpu, gpu, n); kernel.precede(copy); tf::cudaStream stream; for(int i=0; i<100; i++) { cf.run(stream); } stream.synchronize(); }); auto freetask = taskflow.emplace([&](){ for(unsigned i=0; i