// mesytec-mnode/external/taskflow-3.8.0/unittests/cuda/test_cuda_basics.cu

#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
// ----------------------------------------------------------------------------
// kernel helper
// ----------------------------------------------------------------------------
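// k_set/k_add write or accumulate `value` over N elements, one thread per
// element; k_single_set/k_single_add touch exactly one element at index i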
template <typename T>
__global__ void k_set(T* ptr, size_t N, T value) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < N) {
    ptr[i] = value;
  }
}

template <typename T>
__global__ void k_single_set(T* ptr, int i, T value) {
  ptr[i] = value;
}

template <typename T>
__global__ void k_add(T* ptr, size_t N, T value) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < N) {
    ptr[i] += value;
  }
}

template <typename T>
__global__ void k_single_add(T* ptr, int i, T value) {
  ptr[i] += value;
}
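
// offloads a cudaFlow or cudaFlowCapturer to a freshly created stream and
// blocks until the stream has drained; used by most tests below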
template <typename T>
void run_and_wait(T& cf) {
  tf::cudaStream stream;
  cf.run(stream);
  stream.synchronize();
}
// --------------------------------------------------------
// Testcase: Empty
// --------------------------------------------------------
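// constructing a cudaFlow/cudaFlowCapturer without adding any task must be
// cheap and safe; three CPU tasks each default-construct one, run 100 times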
template <typename T>
void empty() {

  std::atomic<int> counter{0};

  tf::Taskflow taskflow;
  tf::Executor executor;

  taskflow.emplace([&](){
    T tf;
    ++counter;
  });

  taskflow.emplace([&](){
    T tf;
    ++counter;
  });

  taskflow.emplace([&](){
    T tf;
    ++counter;
  });

  executor.run_n(taskflow, 100).wait();

  REQUIRE(counter == 300);
}
TEST_CASE("Empty" * doctest::timeout(300)) {
  empty<tf::cudaFlow>();
}

TEST_CASE("EmptyCapture" * doctest::timeout(300)) {
  empty<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// Move Semantics
// ----------------------------------------------------------------------------
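// moves a populated cudaFlow back and forth between two objects and checks
// that task counts, native graph/executable handles, and computed results
// follow the moved-to object while the moved-from object becomes empty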
template <typename F>
void move_semantics() {

  unsigned N = 1024;

  F rhs;
  REQUIRE(rhs.num_tasks() == 0);
  REQUIRE(rhs.empty());
  REQUIRE(rhs.native_executable() == nullptr);

  // construct a cudaflow of three tasks
  auto cpu = static_cast<int*>(std::calloc(N, sizeof(int)));
  auto gpu = tf::cuda_malloc_device<int>(N);

  dim3 g = {(N+255)/256, 1, 1};
  dim3 b = {256, 1, 1};

  auto h2d = rhs.copy(gpu, cpu, N);
  auto kernel = rhs.kernel(g, b, 0, k_add<int>, gpu, N, 17);
  auto d2h = rhs.copy(cpu, gpu, N);
  h2d.precede(kernel);
  kernel.precede(d2h);

  REQUIRE(rhs.num_tasks() == 3);
  REQUIRE(rhs.empty() == false);
  REQUIRE(rhs.native_executable() == nullptr);

  // move-construct lhs from rhs
  F lhs( std::move(rhs) );

  REQUIRE(rhs.num_tasks() == 0);
  REQUIRE(rhs.empty());
  REQUIRE(rhs.native_executable() == nullptr);

  REQUIRE(lhs.num_tasks() == 3);
  REQUIRE(lhs.empty() == false);
  REQUIRE(lhs.native_executable() == nullptr);

  // move-assign lhs back to rhs
  rhs = std::move(lhs);

  REQUIRE(lhs.num_tasks() == 0);
  REQUIRE(lhs.empty());
  REQUIRE(lhs.native_executable() == nullptr);

  REQUIRE(rhs.num_tasks() == 3);
  REQUIRE(rhs.empty() == false);
  REQUIRE(rhs.native_executable() == nullptr);

  // run on the legacy default stream
  rhs.run(0);
  cudaStreamSynchronize(0);

  auto native_graph = rhs.native_graph();
  auto native_executable = rhs.native_executable();

  REQUIRE(native_graph != nullptr);
  REQUIRE(native_executable != nullptr);
  REQUIRE(rhs.num_tasks() == 3);
  REQUIRE(rhs.empty() == false);
  REQUIRE(rhs.native_graph() != nullptr);
  REQUIRE(rhs.native_executable() != nullptr);
  REQUIRE(tf::cuda_graph_get_num_nodes(rhs.native_graph()) == rhs.num_tasks());

  for(unsigned i=0; i<N; ++i) {
    REQUIRE(cpu[i] == 17);
  }

  // move-assign rhs back to lhs
  lhs = std::move(rhs);

  REQUIRE(lhs.num_tasks() == 3);
  REQUIRE(lhs.empty() == false);
  REQUIRE(lhs.native_graph() == native_graph);
  REQUIRE(lhs.native_executable() == native_executable);
  REQUIRE(tf::cuda_graph_get_num_nodes(lhs.native_graph()) == lhs.num_tasks());

  REQUIRE(rhs.num_tasks() == 0);
  REQUIRE(rhs.empty());
  REQUIRE(rhs.native_graph() == nullptr);
  REQUIRE(rhs.native_executable() == nullptr);

  // run the flow again
  for(size_t j=2; j<=10; j++) {

    lhs.run(0);
    cudaStreamSynchronize(0);

    for(unsigned i=0; i<N; ++i) {
      REQUIRE(cpu[i] == j*17);
    }

    REQUIRE(lhs.num_tasks() == 3);
    REQUIRE(lhs.empty() == false);
    REQUIRE(lhs.native_graph() == native_graph);
    REQUIRE(lhs.native_executable() == native_executable);
    REQUIRE(tf::cuda_graph_get_num_nodes(lhs.native_graph()) == lhs.num_tasks());

    REQUIRE(rhs.num_tasks() == 0);
    REQUIRE(rhs.empty());
    REQUIRE(rhs.native_graph() == nullptr);
    REQUIRE(rhs.native_executable() == nullptr);
  }

  // release host and device memory
  std::free(cpu);
  tf::cuda_free(gpu);
}
TEST_CASE("cudaFlow.MoveSemantics" * doctest::timeout(300)) {
  move_semantics<tf::cudaFlow>();
}

TEST_CASE("cudaFlowCapturer.MoveSemantics" * doctest::timeout(300)) {
  move_semantics<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// Standalone
// ----------------------------------------------------------------------------
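// a cudaFlow/cudaFlowCapturer can run standalone on a user-managed stream,
// without any tf::Executor; runs accumulate: 1 + 9 more runs of "+17" yield
// 170. The pattern, as used throughout this file:
//   T cf;
//   cf.kernel(...);
//   tf::cudaStream stream;
//   cf.run(stream);        // enqueue the graph on the stream
//   stream.synchronize();  // block until the graph finishes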
template <typename T>
void standalone() {

  T cf;
  tf::cudaStream stream;
  REQUIRE(cf.empty());

  unsigned N = 1024;
  auto cpu = static_cast<int*>(std::calloc(N, sizeof(int)));
  auto gpu = tf::cuda_malloc_device<int>(N);

  dim3 g = {(N+255)/256, 1, 1};
  dim3 b = {256, 1, 1};
  auto h2d = cf.copy(gpu, cpu, N);
  auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, N, 17);
  auto d2h = cf.copy(cpu, gpu, N);
  h2d.precede(kernel);
  kernel.precede(d2h);

  for(unsigned i=0; i<N; ++i) {
    REQUIRE(cpu[i] == 0);
  }

  cf.run(stream);
  stream.synchronize();

  for(unsigned i=0; i<N; ++i) {
    REQUIRE(cpu[i] == 17);
  }

  for(size_t i=0; i<9; i++) {
    cf.run(stream);
  }
  stream.synchronize();

  for(unsigned i=0; i<N; ++i) {
    REQUIRE(cpu[i] == 170);
  }

  std::free(cpu);
  tf::cuda_free(gpu);
}
TEST_CASE("Standalone.cudaFlow") {
  standalone<tf::cudaFlow>();
}

TEST_CASE("Standalone.cudaCapturer") {
  standalone<tf::cudaFlowCapturer>();
}
// --------------------------------------------------------
// Testcase: Set
// --------------------------------------------------------
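// builds h2d -> k_set -> d2h for geometrically growing n (n = 2n+1) and
// verifies every element equals 17; also checks that the task count matches
// the node count of the underlying CUDA graph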
template <typename T>
void set() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(unsigned n=1; n<=123456; n = n*2 + 1) {

    taskflow.clear();

    T* cpu = nullptr;
    T* gpu = nullptr;

    auto cputask = taskflow.emplace([&](){
      cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
    });

    auto gputask = taskflow.emplace([&]() {
      tf::cudaFlow cf;
      auto h2d = cf.copy(gpu, cpu, n);
      auto kernel = cf.kernel((n+255)/256, 256, 0, k_set<T>, gpu, n, (T)17);
      auto d2h = cf.copy(cpu, gpu, n);
      h2d.precede(kernel);
      kernel.precede(d2h);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    for(unsigned i=0; i<n; ++i) {
      REQUIRE(cpu[i] == (T)17);
    }

    std::free(cpu);
    REQUIRE(cudaFree(gpu) == cudaSuccess);
  }
}
TEST_CASE("Set.i8" * doctest::timeout(300)) {
  set<int8_t>();
}

TEST_CASE("Set.i16" * doctest::timeout(300)) {
  set<int16_t>();
}

TEST_CASE("Set.i32" * doctest::timeout(300)) {
  set<int32_t>();
}
// --------------------------------------------------------
// Testcase: Add
// --------------------------------------------------------
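// chains four k_add kernels (+1, +2, +3, +4) behind one h2d copy over the
// same growing-n sweep as Set, so every element ends up at 10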
template <typename T>
void add() {

  for(unsigned n=1; n<=123456; n = n*2 + 1) {

    tf::Taskflow taskflow;
    tf::Executor executor;

    T* cpu = nullptr;
    T* gpu = nullptr;

    auto cputask = taskflow.emplace([&](){
      cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
    });

    auto gputask = taskflow.emplace([&](){
      tf::cudaFlow cf;
      dim3 g = {(n+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto h2d = cf.copy(gpu, cpu, n);
      auto ad1 = cf.kernel(g, b, 0, k_add<T>, gpu, n, 1);
      auto ad2 = cf.kernel(g, b, 0, k_add<T>, gpu, n, 2);
      auto ad3 = cf.kernel(g, b, 0, k_add<T>, gpu, n, 3);
      auto ad4 = cf.kernel(g, b, 0, k_add<T>, gpu, n, 4);
      auto d2h = cf.copy(cpu, gpu, n);
      h2d.precede(ad1);
      ad1.precede(ad2);
      ad2.precede(ad3);
      ad3.precede(ad4);
      ad4.precede(d2h);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    for(unsigned i=0; i<n; ++i) {
      REQUIRE(cpu[i] == 10);
    }

    std::free(cpu);
    REQUIRE(cudaFree(gpu) == cudaSuccess);
  }
}
TEST_CASE("Add.i8" * doctest::timeout(300)) {
  add<int8_t>();
}

TEST_CASE("Add.i16" * doctest::timeout(300)) {
  add<int16_t>();
}

TEST_CASE("Add.i32" * doctest::timeout(300)) {
  add<int32_t>();
}
// TODO: 64-bit fail?
//TEST_CASE("Add.i64" * doctest::timeout(300)) {
// add<int64_t>();
//}
// --------------------------------------------------------
// Testcase: Binary Set
// --------------------------------------------------------
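// launches one single-element kernel per index and wires them into a binary
// tree: task i depends on task i/2; every task also succeeds the h2d copy
// and precedes the d2h copy, exercising large, fan-out-heavy graphs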
template <typename T, typename F>
void bset() {

  const unsigned n = 10000;

  tf::Taskflow taskflow;
  tf::Executor executor;

  T* cpu = nullptr;
  T* gpu = nullptr;

  auto cputask = taskflow.emplace([&](){
    cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
    REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
  });

  auto gputask = taskflow.emplace([&]() {
    F cf;
    dim3 g = {1, 1, 1};
    dim3 b = {1, 1, 1};
    auto h2d = cf.copy(gpu, cpu, n);
    auto d2h = cf.copy(cpu, gpu, n);
    std::vector<tf::cudaTask> tasks(n+1);
    for(unsigned i=1; i<=n; ++i) {
      tasks[i] = cf.kernel(g, b, 0, k_single_set<T>, gpu, i-1, (T)17);
      auto p = i/2;
      if(p != 0) {
        tasks[p].precede(tasks[i]);
      }
      tasks[i].precede(d2h);
      h2d.precede(tasks[i]);
    }
    run_and_wait(cf);
    REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
  });

  cputask.precede(gputask);

  executor.run(taskflow).wait();

  for(unsigned i=0; i<n; ++i) {
    REQUIRE(cpu[i] == (T)17);
  }

  std::free(cpu);
  REQUIRE(cudaFree(gpu) == cudaSuccess);
}
TEST_CASE("BSet.i8" * doctest::timeout(300)) {
  bset<int8_t, tf::cudaFlow>();
}

TEST_CASE("BSet.i16" * doctest::timeout(300)) {
  bset<int16_t, tf::cudaFlow>();
}

TEST_CASE("BSet.i32" * doctest::timeout(300)) {
  bset<int32_t, tf::cudaFlow>();
}

TEST_CASE("CapturedBSet.i8" * doctest::timeout(300)) {
  bset<int8_t, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedBSet.i16" * doctest::timeout(300)) {
  bset<int16_t, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedBSet.i32" * doctest::timeout(300)) {
  bset<int32_t, tf::cudaFlowCapturer>();
}
// --------------------------------------------------------
// Testcase: Memset
// --------------------------------------------------------
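// memset works on raw bytes: writing 0x3f into every byte of an int region
// yields 0x3f3f3f3f per element, while elements before the random start
// offset keep the kernel-set value 123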
template <typename F>
void memset() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  const int N = 100;

  int* cpu = new int [N];
  int* gpu = nullptr;
  REQUIRE(cudaMalloc(&gpu, N*sizeof(int)) == cudaSuccess);

  for(int r=1; r<=100; ++r) {

    // start each round from a fresh task graph; otherwise tasks accumulate
    // across rounds and race on the shared buffers
    taskflow.clear();

    int start = ::rand() % N;

    for(int i=0; i<N; ++i) {
      cpu[i] = 999;
    }

    taskflow.emplace([&](){
      F cf;
      dim3 g = {(unsigned)(N+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto kset = cf.kernel(g, b, 0, k_set<int>, gpu, N, 123);
      auto copy = cf.copy(cpu, gpu, N);
      auto zero = cf.memset(gpu+start, 0x3f, (N-start)*sizeof(int));
      kset.precede(zero);
      zero.precede(copy);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    executor.run(taskflow).wait();

    for(int i=0; i<start; ++i) {
      REQUIRE(cpu[i] == 123);
    }
    for(int i=start; i<N; ++i) {
      REQUIRE(cpu[i] == 0x3f3f3f3f);
    }
  }

  delete [] cpu;
  REQUIRE(cudaFree(gpu) == cudaSuccess);
}
TEST_CASE("Memset" * doctest::timeout(300)) {
  memset<tf::cudaFlow>();
}

TEST_CASE("CapturedMemset" * doctest::timeout(300)) {
  memset<tf::cudaFlowCapturer>();
}
// --------------------------------------------------------
// Testcase: Memset0
// --------------------------------------------------------
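// same round trip as Memset but sets the tail to byte value 0, which is the
// all-zero bit pattern for both the integral and floating-point types tested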
template <typename T, typename F>
void memset0() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  const int N = 97;

  T* cpu = new T [N];
  T* gpu = nullptr;
  REQUIRE(cudaMalloc(&gpu, N*sizeof(T)) == cudaSuccess);

  for(int r=1; r<=100; ++r) {

    taskflow.clear();

    int start = ::rand() % N;

    for(int i=0; i<N; ++i) {
      cpu[i] = (T)999;
    }

    taskflow.emplace([&](){
      F cf;
      dim3 g = {(unsigned)(N+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto kset = cf.kernel(g, b, 0, k_set<T>, gpu, N, (T)123);
      auto zero = cf.memset(gpu+start, (T)0, (N-start)*sizeof(T));
      auto copy = cf.copy(cpu, gpu, N);
      kset.precede(zero);
      zero.precede(copy);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    executor.run(taskflow).wait();

    for(int i=0; i<start; ++i) {
      REQUIRE(std::fabs(cpu[i] - (T)123) < 1e-4);
    }
    for(int i=start; i<N; ++i) {
      REQUIRE(std::fabs(cpu[i] - (T)0) < 1e-4);
    }
  }

  delete [] cpu;
  REQUIRE(cudaFree(gpu) == cudaSuccess);
}
TEST_CASE("Memset0.i8") {
  memset0<int8_t, tf::cudaFlow>();
}

TEST_CASE("Memset0.i16") {
  memset0<int16_t, tf::cudaFlow>();
}

TEST_CASE("Memset0.i32") {
  memset0<int32_t, tf::cudaFlow>();
}

TEST_CASE("Memset0.f32") {
  memset0<float, tf::cudaFlow>();
}

TEST_CASE("Memset0.f64") {
  memset0<double, tf::cudaFlow>();
}

TEST_CASE("CapturedMemset0.i8") {
  memset0<int8_t, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedMemset0.i16") {
  memset0<int16_t, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedMemset0.i32") {
  memset0<int32_t, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedMemset0.f32") {
  memset0<float, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedMemset0.f64") {
  memset0<double, tf::cudaFlowCapturer>();
}
// --------------------------------------------------------
// Testcase: Memcpy
// --------------------------------------------------------
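// like Memset0 but copies back with the byte-based memcpy instead of the
// typed copy, so the size argument must be N*sizeof(T) rather than N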
template <typename T, typename F>
void memcpy() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  const int N = 97;

  T* cpu = new T [N];
  T* gpu = nullptr;
  REQUIRE(cudaMalloc(&gpu, N*sizeof(T)) == cudaSuccess);

  for(int r=1; r<=100; ++r) {

    taskflow.clear();

    int start = ::rand() % N;

    for(int i=0; i<N; ++i) {
      cpu[i] = (T)999;
    }

    taskflow.emplace([&](){
      F cf;
      dim3 g = {(unsigned)(N+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto kset = cf.kernel(g, b, 0, k_set<T>, gpu, N, (T)123);
      auto zero = cf.memset(gpu+start, (T)0, (N-start)*sizeof(T));
      auto copy = cf.memcpy(cpu, gpu, N*sizeof(T));
      kset.precede(zero);
      zero.precede(copy);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    executor.run(taskflow).wait();

    for(int i=0; i<start; ++i) {
      REQUIRE(std::fabs(cpu[i] - (T)123) < 1e-4);
    }
    for(int i=start; i<N; ++i) {
      REQUIRE(std::fabs(cpu[i] - (T)0) < 1e-4);
    }
  }

  delete [] cpu;
  REQUIRE(cudaFree(gpu) == cudaSuccess);
}
TEST_CASE("Memcpy.i8") {
  memcpy<int8_t, tf::cudaFlow>();
}

TEST_CASE("Memcpy.i16") {
  memcpy<int16_t, tf::cudaFlow>();
}

TEST_CASE("Memcpy.i32") {
  memcpy<int32_t, tf::cudaFlow>();
}

TEST_CASE("Memcpy.f32") {
  memcpy<float, tf::cudaFlow>();
}

TEST_CASE("Memcpy.f64") {
  memcpy<double, tf::cudaFlow>();
}

TEST_CASE("CapturedMemcpy.i8") {
  memcpy<int8_t, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedMemcpy.i16") {
  memcpy<int16_t, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedMemcpy.i32") {
  memcpy<int32_t, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedMemcpy.f32") {
  memcpy<float, tf::cudaFlowCapturer>();
}

TEST_CASE("CapturedMemcpy.f64") {
  memcpy<double, tf::cudaFlowCapturer>();
}
// --------------------------------------------------------
// Testcase: fill
// --------------------------------------------------------
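// fill is the typed counterpart of memset: it writes a complete T value
// (including signed values, which byte-wise memset cannot express) per
// element, and takes an element count rather than a byte count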
template <typename T>
void fill(T value) {

  tf::Taskflow taskflow;
  tf::Executor executor;

  const int N = 107;

  T* cpu = new T [N];
  T* gpu = nullptr;
  REQUIRE(cudaMalloc(&gpu, N*sizeof(T)) == cudaSuccess);

  for(int r=1; r<=100; ++r) {

    taskflow.clear();

    int start = ::rand() % N;

    for(int i=0; i<N; ++i) {
      cpu[i] = (T)999;
    }

    taskflow.emplace([&](){
      tf::cudaFlow cf;
      dim3 g = {(unsigned)(N+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto kset = cf.kernel(g, b, 0, k_set<T>, gpu, N, (T)123);
      auto fill = cf.fill(gpu+start, value, (N-start));
      auto copy = cf.copy(cpu, gpu, N);
      kset.precede(fill);
      fill.precede(copy);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    executor.run(taskflow).wait();

    for(int i=0; i<start; ++i) {
      REQUIRE(std::fabs(cpu[i] - (T)123) < 1e-4);
    }
    for(int i=start; i<N; ++i) {
      REQUIRE(std::fabs(cpu[i] - value) < 1e-4);
    }
  }

  delete [] cpu;
  REQUIRE(cudaFree(gpu) == cudaSuccess);
}
TEST_CASE("Fill.i8") {
  fill<int8_t>(+123);
  fill<int8_t>(-123);
}

TEST_CASE("Fill.i16") {
  fill<int16_t>(+12345);
  fill<int16_t>(-12345);
}

TEST_CASE("Fill.i32") {
  fill<int32_t>(+123456789);
  fill<int32_t>(-123456789);
}

TEST_CASE("Fill.f32") {
  fill<float>(+123456789.0f);
  fill<float>(-123456789.0f);
}
// --------------------------------------------------------
// Testcase: Zero
// --------------------------------------------------------
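// zero(ptr, count) zeroes `count` elements; it behaves like the typed fill
// above with a value of T(0)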
template <typename T>
void zero() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  const int N = 100;

  T* cpu = new T [N];
  T* gpu = nullptr;
  REQUIRE(cudaMalloc(&gpu, N*sizeof(T)) == cudaSuccess);

  for(int r=1; r<=100; ++r) {

    taskflow.clear();

    int start = ::rand() % N;

    for(int i=0; i<N; ++i) {
      cpu[i] = (T)999;
    }

    taskflow.emplace([&](){
      tf::cudaFlow cf;
      dim3 g = {(unsigned)(N+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto kset = cf.kernel(g, b, 0, k_set<T>, gpu, N, (T)123);
      auto zero = cf.zero(gpu+start, (N-start));
      auto copy = cf.copy(cpu, gpu, N);
      kset.precede(zero);
      zero.precede(copy);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    executor.run(taskflow).wait();

    for(int i=0; i<start; ++i) {
      REQUIRE(std::fabs(cpu[i] - (T)123) < 1e-4);
    }
    for(int i=start; i<N; ++i) {
      REQUIRE(std::fabs(cpu[i] - (T)0) < 1e-4);
    }
  }

  delete [] cpu;
  REQUIRE(cudaFree(gpu) == cudaSuccess);
}
TEST_CASE("Zero.i8") {
  zero<int8_t>();
}

TEST_CASE("Zero.i16") {
  zero<int16_t>();
}

TEST_CASE("Zero.i32") {
  zero<int32_t>();
}

TEST_CASE("Zero.f32") {
  zero<float>();
}
// --------------------------------------------------------
// Testcase: Barrier
// --------------------------------------------------------
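// three noop tasks act as barriers: h2d -> br1 -> n single-element set
// kernels -> br2 -> n single-element add kernels -> br3 -> d2h; every
// element is set to 17 and then incremented by 3, so the host reads 20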
template <typename T>
void barrier() {

  const unsigned n = 1000;

  tf::Taskflow taskflow;
  tf::Executor executor;

  T* cpu = nullptr;
  T* gpu = nullptr;

  auto cputask = taskflow.emplace([&](){
    cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
    REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
  });

  auto gputask = taskflow.emplace([&]() {
    tf::cudaFlow cf;
    dim3 g = {1, 1, 1};
    dim3 b = {1, 1, 1};
    auto br1 = cf.noop();
    auto br2 = cf.noop();
    auto br3 = cf.noop();
    auto h2d = cf.copy(gpu, cpu, n);
    auto d2h = cf.copy(cpu, gpu, n);
    h2d.precede(br1);
    for(unsigned i=0; i<n; ++i) {
      auto k1 = cf.kernel(g, b, 0, k_single_set<T>, gpu, i, (T)17);
      k1.succeed(br1)
        .precede(br2);
      auto k2 = cf.kernel(g, b, 0, k_single_add<T>, gpu, i, (T)3);
      k2.succeed(br2)
        .precede(br3);
    }
    br3.precede(d2h);
    run_and_wait(cf);
    REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
  });

  cputask.precede(gputask);

  executor.run(taskflow).wait();

  for(unsigned i=0; i<n; ++i) {
    REQUIRE(cpu[i] == (T)20);
  }

  std::free(cpu);
  REQUIRE(cudaFree(gpu) == cudaSuccess);
}
TEST_CASE("Barrier.i8" * doctest::timeout(300)) {
  barrier<int8_t>();
}

TEST_CASE("Barrier.i16" * doctest::timeout(300)) {
  barrier<int16_t>();
}

TEST_CASE("Barrier.i32" * doctest::timeout(300)) {
  barrier<int32_t>();
}
// ----------------------------------------------------------------------------
// NestedRuns
// ----------------------------------------------------------------------------
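// each outer run of B adds 1 on the device (B1), runs A's taskflow ten times
// for +10 (B2), and adds 1 on the host (B3); 100 outer runs leave every
// element at 100 * (1 + 10 + 1) = 1200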
template <typename F>
void nested_runs() {

  int* cpu = nullptr;
  int* gpu = nullptr;

  constexpr unsigned n = 1000;

  cpu = static_cast<int*>(std::calloc(n, sizeof(int)));
  REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess);

  struct A {

    tf::Executor executor;
    tf::Taskflow taskflow;

    void run(int* cpu, int* gpu, unsigned n) {

      taskflow.clear();

      auto A1 = taskflow.emplace([&]() {
        F cf;
        cf.copy(gpu, cpu, n);
        run_and_wait(cf);
        REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
      });

      auto A2 = taskflow.emplace([&]() {
        F cf;
        dim3 g = {(n+255)/256, 1, 1};
        dim3 b = {256, 1, 1};
        cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
        run_and_wait(cf);
        REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
      });

      auto A3 = taskflow.emplace([&] () {
        F cf;
        cf.copy(cpu, gpu, n);
        run_and_wait(cf);
        REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
      });

      A1.precede(A2);
      A2.precede(A3);

      executor.run_n(taskflow, 10).wait();
    }
  };

  struct B {

    tf::Taskflow taskflow;
    tf::Executor executor;

    A a;

    void run(int* cpu, int* gpu, unsigned n) {

      taskflow.clear();

      auto B0 = taskflow.emplace([] () {});

      auto B1 = taskflow.emplace([&] () {
        F cf;
        dim3 g = {(n+255)/256, 1, 1};
        dim3 b = {256, 1, 1};
        auto h2d = cf.copy(gpu, cpu, n);
        auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
        auto d2h = cf.copy(cpu, gpu, n);
        h2d.precede(kernel);
        kernel.precede(d2h);
        run_and_wait(cf);
        REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
      });

      auto B2 = taskflow.emplace([&] () { a.run(cpu, gpu, n); });

      auto B3 = taskflow.emplace([&] () {
        for(unsigned i=0; i<n; ++i) {
          cpu[i]++;
        }
      });

      B0.precede(B1);
      B1.precede(B2);
      B2.precede(B3);

      executor.run_n(taskflow, 100).wait();
    }
  };

  B b;
  b.run(cpu, gpu, n);

  for(unsigned i=0; i<n; i++) {
    REQUIRE(cpu[i] == 1200);
  }

  REQUIRE(cudaFree(gpu) == cudaSuccess);
  std::free(cpu);
}
TEST_CASE("NestedRuns" * doctest::timeout(300)) {
  nested_runs<tf::cudaFlow>();
}

TEST_CASE("CapturedNestedRuns" * doctest::timeout(300)) {
  nested_runs<tf::cudaFlowCapturer>();
}
/*
// ----------------------------------------------------------------------------
// WorkerID
// ----------------------------------------------------------------------------
void worker_id(unsigned N, unsigned M) {

  tf::Taskflow taskflow;
  tf::Executor executor(N + M);

  REQUIRE(executor.num_workers() == (N + M));

  const unsigned s = 100;

  for(unsigned k=0; k<s; ++k) {

    auto cputask = taskflow.emplace([&](){
      auto id = executor.this_worker_id();
      REQUIRE(id >= 0);
      REQUIRE(id < N+M);
    });

    auto gputask = taskflow.emplace([&](tf::cudaFlow&) {
      auto id = executor.this_worker_id();
      REQUIRE(id >= 0);
      REQUIRE(id < N+M);
    });

    auto chktask = taskflow.emplace([&] () {
      auto id = executor.this_worker_id();
      REQUIRE(id >= 0);
      REQUIRE(id < N+M);
    });

    taskflow.emplace([&](tf::cudaFlow&) {
      auto id = executor.this_worker_id();
      REQUIRE(id >= 0);
      REQUIRE(id < N+M);
    });

    taskflow.emplace([&]() {
      auto id = executor.this_worker_id();
      REQUIRE(id >= 0);
      REQUIRE(id < N+M);
    });

    auto subflow = taskflow.emplace([&](tf::Subflow& sf){
      auto id = executor.this_worker_id();
      REQUIRE(id >= 0);
      REQUIRE(id < N+M);
      auto t1 = sf.emplace([&](){
        auto id = executor.this_worker_id();
        REQUIRE(id >= 0);
        REQUIRE(id < N+M);
      });
      auto t2 = sf.emplace([&](tf::cudaFlow&){
        auto id = executor.this_worker_id();
        REQUIRE(id >= 0);
        REQUIRE(id < N+M);
      });
      t1.precede(t2);
    });

    cputask.precede(gputask);
    gputask.precede(chktask);
    chktask.precede(subflow);
  }

  executor.run_n(taskflow, 10).wait();
}

TEST_CASE("WorkerID.1C1G") {
  worker_id(1, 1);
}

TEST_CASE("WorkerID.1C2G") {
  worker_id(1, 2);
}

TEST_CASE("WorkerID.1C3G") {
  worker_id(1, 3);
}

TEST_CASE("WorkerID.1C4G") {
  worker_id(1, 4);
}

TEST_CASE("WorkerID.2C1G") {
  worker_id(2, 1);
}

TEST_CASE("WorkerID.2C2G") {
  worker_id(2, 2);
}

TEST_CASE("WorkerID.2C3G") {
  worker_id(2, 3);
}

TEST_CASE("WorkerID.2C4G") {
  worker_id(2, 4);
}

TEST_CASE("WorkerID.3C1G") {
  worker_id(3, 1);
}

TEST_CASE("WorkerID.3C2G") {
  worker_id(3, 2);
}

TEST_CASE("WorkerID.3C3G") {
  worker_id(3, 3);
}

TEST_CASE("WorkerID.3C4G") {
  worker_id(3, 4);
}

TEST_CASE("WorkerID.4C1G") {
  worker_id(4, 1);
}

TEST_CASE("WorkerID.4C2G") {
  worker_id(4, 2);
}

TEST_CASE("WorkerID.4C3G") {
  worker_id(4, 3);
}

TEST_CASE("WorkerID.4C4G") {
  worker_id(4, 4);
}
*/
// ----------------------------------------------------------------------------
// Multiruns
// ----------------------------------------------------------------------------
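// s = 100 independent alloc -> cudaFlow -> check chains go through a single
// executor sized for N "CPU" and M "GPU" workers (the C/G in the case names);
// each chain adds its own random constant, exercising concurrent runs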
void multiruns(unsigned N, unsigned M) {

  tf::Taskflow taskflow;
  tf::Executor executor(N + M);

  const unsigned n = 1000;
  const unsigned s = 100;

  int *cpu[s] = {0};
  int *gpu[s] = {0};

  for(unsigned k=0; k<s; ++k) {

    int number = ::rand()%100;

    auto cputask = taskflow.emplace([&, k](){
      cpu[k] = static_cast<int*>(std::calloc(n, sizeof(int)));
      REQUIRE(cudaMalloc(&gpu[k], n*sizeof(int)) == cudaSuccess);
    });

    auto gputask = taskflow.emplace([&, k, number]() {
      tf::cudaFlow cf;
      dim3 g = {(n+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto h2d = cf.copy(gpu[k], cpu[k], n);
      auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu[k], n, number);
      auto d2h = cf.copy(cpu[k], gpu[k], n);
      h2d.precede(kernel);
      kernel.precede(d2h);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    auto chktask = taskflow.emplace([&, k, number] () {
      for(unsigned i=0; i<n; ++i) {
        REQUIRE(cpu[k][i] == number);
      }
    });

    cputask.precede(gputask);
    gputask.precede(chktask);
  }

  executor.run(taskflow).wait();
}
TEST_CASE("Multiruns.1C1G") {
  multiruns(1, 1);
}

TEST_CASE("Multiruns.1C2G") {
  multiruns(1, 2);
}

TEST_CASE("Multiruns.1C3G") {
  multiruns(1, 3);
}

TEST_CASE("Multiruns.1C4G") {
  multiruns(1, 4);
}

TEST_CASE("Multiruns.2C1G") {
  multiruns(2, 1);
}

TEST_CASE("Multiruns.2C2G") {
  multiruns(2, 2);
}

TEST_CASE("Multiruns.2C3G") {
  multiruns(2, 3);
}

TEST_CASE("Multiruns.2C4G") {
  multiruns(2, 4);
}

TEST_CASE("Multiruns.3C1G") {
  multiruns(3, 1);
}

TEST_CASE("Multiruns.3C2G") {
  multiruns(3, 2);
}

TEST_CASE("Multiruns.3C3G") {
  multiruns(3, 3);
}

TEST_CASE("Multiruns.3C4G") {
  multiruns(3, 4);
}

TEST_CASE("Multiruns.4C1G") {
  multiruns(4, 1);
}

TEST_CASE("Multiruns.4C2G") {
  multiruns(4, 2);
}

TEST_CASE("Multiruns.4C3G") {
  multiruns(4, 3);
}

TEST_CASE("Multiruns.4C4G") {
  multiruns(4, 4);
}
// ----------------------------------------------------------------------------
// Subflow
// ----------------------------------------------------------------------------
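// allocation and the cudaFlow run happen inside a joined tf::Subflow; the
// parent task completes only after the subflow joins, so chktask can safely
// verify the result and free the buffers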
template <typename F>
void subflow() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  int* cpu = nullptr;
  int* gpu = nullptr;

  const unsigned n = 1000;

  auto partask = taskflow.emplace([&](tf::Subflow& sf){

    auto cputask = sf.emplace([&](){
      cpu = static_cast<int*>(std::calloc(n, sizeof(int)));
      REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess);
    });

    auto gputask = sf.emplace([&]() {
      F cf;
      dim3 g = {(n+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto h2d = cf.copy(gpu, cpu, n);
      auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
      auto d2h = cf.copy(cpu, gpu, n);
      h2d.precede(kernel);
      kernel.precede(d2h);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    cputask.precede(gputask);
  });

  auto chktask = taskflow.emplace([&](){
    for(unsigned i=0; i<n; ++i) {
      REQUIRE(cpu[i] == 1);
    }
    REQUIRE(cudaFree(gpu) == cudaSuccess);
    std::free(cpu);
  });

  partask.precede(chktask);

  executor.run(taskflow).wait();
}
TEST_CASE("Subflow" * doctest::timeout(300)) {
  subflow<tf::cudaFlow>();
}

TEST_CASE("CapturedSubflow" * doctest::timeout(300)) {
  subflow<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// NestedSubflow
// ----------------------------------------------------------------------------
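// three levels of nested subflows each run a "+1" cudaFlow over the same
// buffer, so the final host values must be 3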
template <typename F>
void nested_subflow() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  int* cpu = nullptr;
  int* gpu = nullptr;

  const unsigned n = 1000;

  auto cputask = taskflow.emplace([&](){
    cpu = static_cast<int*>(std::calloc(n, sizeof(int)));
    REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess);
  });

  auto partask = taskflow.emplace([&](tf::Subflow& sf){

    auto gputask1 = sf.emplace([&]() {
      F cf;
      dim3 g = {(n+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto h2d = cf.copy(gpu, cpu, n);
      auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
      auto d2h = cf.copy(cpu, gpu, n);
      h2d.precede(kernel);
      kernel.precede(d2h);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    auto subtask1 = sf.emplace([&](tf::Subflow& sf2) {

      auto gputask2 = sf2.emplace([&]() {
        F cf;
        dim3 g = {(n+255)/256, 1, 1};
        dim3 b = {256, 1, 1};
        auto h2d = cf.copy(gpu, cpu, n);
        auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
        auto d2h = cf.copy(cpu, gpu, n);
        h2d.precede(kernel);
        kernel.precede(d2h);
        run_and_wait(cf);
        REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
      });

      auto subtask2 = sf2.emplace([&](tf::Subflow& sf3){
        sf3.emplace([&]() {
          F cf;
          dim3 g = {(n+255)/256, 1, 1};
          dim3 b = {256, 1, 1};
          auto h2d = cf.copy(gpu, cpu, n);
          auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
          auto d2h = cf.copy(cpu, gpu, n);
          h2d.precede(kernel);
          kernel.precede(d2h);
          run_and_wait(cf);
          REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
        });
      });

      gputask2.precede(subtask2);
    });

    gputask1.precede(subtask1);
  });

  auto chktask = taskflow.emplace([&](){
    for(unsigned i=0; i<n; ++i) {
      REQUIRE(cpu[i] == 3);
    }
    REQUIRE(cudaFree(gpu) == cudaSuccess);
    std::free(cpu);
  });

  partask.precede(chktask)
         .succeed(cputask);

  executor.run(taskflow).wait();
}
TEST_CASE("NestedSubflow" * doctest::timeout(300) ) {
  nested_subflow<tf::cudaFlow>();
}

TEST_CASE("CapturedNestedSubflow" * doctest::timeout(300) ) {
  nested_subflow<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// DetachedSubflow
// ----------------------------------------------------------------------------
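// a detached subflow joins at the end of the taskflow run rather than at its
// parent task; executor.run(...).wait() still waits for all spawned tasks,
// so the verification after wait() is safe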
template <typename F>
void detached_subflow() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  int* cpu = nullptr;
  int* gpu = nullptr;

  const unsigned n = 1000;

  taskflow.emplace([&](tf::Subflow& sf){

    auto cputask = sf.emplace([&](){
      cpu = static_cast<int*>(std::calloc(n, sizeof(int)));
      REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess);
    });

    auto gputask = sf.emplace([&]() {
      F cf;
      dim3 g = {(n+255)/256, 1, 1};
      dim3 b = {256, 1, 1};
      auto h2d = cf.copy(gpu, cpu, n);
      auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
      auto d2h = cf.copy(cpu, gpu, n);
      h2d.precede(kernel);
      kernel.precede(d2h);
      run_and_wait(cf);
      REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
    });

    cputask.precede(gputask);

    sf.detach();
  });

  executor.run(taskflow).wait();

  for(unsigned i=0; i<n; ++i) {
    REQUIRE(cpu[i] == 1);
  }

  REQUIRE(cudaFree(gpu) == cudaSuccess);
  std::free(cpu);
}
TEST_CASE("DetachedSubflow" * doctest::timeout(300)) {
  detached_subflow<tf::cudaFlow>();
}

TEST_CASE("CapturedDetachedSubflow" * doctest::timeout(300)) {
  detached_subflow<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// Conditional GPU tasking
// ----------------------------------------------------------------------------
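// a condition task returns the index of the successor to run next: 0 loops
// back to gputask, 1 proceeds to freetask; the "+1" cudaFlow therefore runs
// exactly 100 times before the buffers are released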
template <typename F>
void loop() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  const unsigned n = 1000;

  int* cpu = nullptr;
  int* gpu = nullptr;

  auto cputask = taskflow.emplace([&](){
    cpu = static_cast<int*>(std::calloc(n, sizeof(int)));
    REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess);
  });

  auto gputask = taskflow.emplace([&]() {
    F cf;
    dim3 g = {(n+255)/256, 1, 1};
    dim3 b = {256, 1, 1};
    auto h2d = cf.copy(gpu, cpu, n);
    auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
    auto d2h = cf.copy(cpu, gpu, n);
    h2d.precede(kernel);
    kernel.precede(d2h);
    run_and_wait(cf);
    REQUIRE(cf.num_tasks() == tf::cuda_graph_get_num_nodes(cf.native_graph()));
  });

  auto condition = taskflow.emplace([&cpu, round=0] () mutable {
    ++round;
    for(unsigned i=0; i<n; ++i) {
      REQUIRE(cpu[i] == round);
    }
    return round >= 100;
  });

  auto freetask = taskflow.emplace([&](){
    REQUIRE(cudaFree(gpu) == cudaSuccess);
    std::free(cpu);
  });

  cputask.precede(gputask);
  gputask.precede(condition);
  condition.precede(gputask, freetask);

  executor.run(taskflow).wait();
}
TEST_CASE("Loop" * doctest::timeout(300)) {
  loop<tf::cudaFlow>();
}

TEST_CASE("CapturedLoop" * doctest::timeout(300)) {
  loop<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// Predicate
// ----------------------------------------------------------------------------
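// this case does not use a stop predicate despite its name: it enqueues the
// same "+1 then copy back" cudaFlow 100 times on one stream and synchronizes
// once, so the kernel's increments accumulate to 100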
TEST_CASE("Predicate") {

  tf::Taskflow taskflow;
  tf::Executor executor;

  const unsigned n = 1000;

  int* cpu = nullptr;
  int* gpu = nullptr;

  auto cputask = taskflow.emplace([&](){
    cpu = static_cast<int*>(std::calloc(n, sizeof(int)));
    REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess);
    REQUIRE(cudaMemcpy(gpu, cpu, n*sizeof(int), cudaMemcpyHostToDevice) == cudaSuccess);
  });

  auto gputask = taskflow.emplace([&]() {
    tf::cudaFlow cf;
    dim3 g = {(n+255)/256, 1, 1};
    dim3 b = {256, 1, 1};
    auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
    auto copy = cf.copy(cpu, gpu, n);
    kernel.precede(copy);
    tf::cudaStream stream;
    for(int i=0; i<100; i++) {
      cf.run(stream);
    }
    stream.synchronize();
  });

  auto freetask = taskflow.emplace([&](){
    for(unsigned i=0; i<n; ++i) {
      REQUIRE(cpu[i] == 100);
    }
    REQUIRE(cudaFree(gpu) == cudaSuccess);
    std::free(cpu);
  });

  cputask.precede(gputask);
  gputask.precede(freetask);

  executor.run(taskflow).wait();
}
// ----------------------------------------------------------------------------
// Repeat
// ----------------------------------------------------------------------------
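// identical in structure to Predicate above: 100 back-to-back runs of the
// same cudaFlow on one stream, synchronized once at the end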
TEST_CASE("Repeat") {

  tf::Taskflow taskflow;
  tf::Executor executor;

  const unsigned n = 1000;

  int* cpu = nullptr;
  int* gpu = nullptr;

  auto cputask = taskflow.emplace([&](){
    cpu = static_cast<int*>(std::calloc(n, sizeof(int)));
    REQUIRE(cudaMalloc(&gpu, n*sizeof(int)) == cudaSuccess);
    REQUIRE(cudaMemcpy(gpu, cpu, n*sizeof(int), cudaMemcpyHostToDevice) == cudaSuccess);
  });

  auto gputask = taskflow.emplace([&]() {
    tf::cudaFlow cf;
    dim3 g = {(n+255)/256, 1, 1};
    dim3 b = {256, 1, 1};
    auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, n, 1);
    auto copy = cf.copy(cpu, gpu, n);
    kernel.precede(copy);
    tf::cudaStream stream;
    for(int i=0; i<100; i++) {
      cf.run(stream);
    }
    stream.synchronize();
  });

  auto freetask = taskflow.emplace([&](){
    for(unsigned i=0; i<n; ++i) {
      REQUIRE(cpu[i] == 100);
    }
    REQUIRE(cudaFree(gpu) == cudaSuccess);
    std::free(cpu);
  });

  cputask.precede(gputask);
  gputask.precede(freetask);

  executor.run(taskflow).wait();
}