// mesytec-mnode/external/taskflow-3.8.0/unittests/cuda/test_cuda_basic_updates.cu
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
#include <taskflow/cuda/algorithm/reduce.hpp>
#include <taskflow/cuda/algorithm/for_each.hpp>
#include <taskflow/cuda/algorithm/transform.hpp>
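// Helper: run a cudaFlow or cudaFlowCapturer once on a freshly created stream
// and block the calling thread until all of its work completes.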
template <typename T>
void run_and_wait(T& cf) {
tf::cudaStream stream;
cf.run(stream);
stream.synchronize();
}
// verify: element-wise comparison of two device arrays; clears *check on the
// first mismatch (grid-stride loop)
template <typename T>
__global__
void verify(const T* a, const T* b, bool* check, size_t size) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for(;tid < size; tid += gridDim.x * blockDim.x) {
if(a[tid] != b[tid]) {
*check = false;
return;
}
}
}
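// k_add: adds `value` to each of the first N elements; launched with one
// thread per element (no grid-stride loop), so the grid must cover N.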
template <typename T>
__global__ void k_add(T* ptr, size_t N, T value) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < N) {
ptr[i] += value;
}
}
// add: element-wise c[i] = a[i] + b[i] over a grid-stride loop
template <typename T>
__global__
void add(const T* a, const T* b, T* c, size_t size) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for(;tid < size; tid += gridDim.x * blockDim.x) {
c[tid] = a[tid] + b[tid];
}
}
// multiply: element-wise c[i] = a[i] * b[i] over a grid-stride loop
template <typename T>
__global__
void multiply(const T* a, const T* b, T* c, size_t size) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for(;tid < size; tid += gridDim.x * blockDim.x) {
c[tid] = a[tid] * b[tid];
}
}
// ----------------------------------------------------------------------------
// Incrementality
// ----------------------------------------------------------------------------
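// This test builds a three-task graph (H2D copy -> kernel -> D2H copy) with a
// cudaFlowCapturer and checks its incremental-update behavior: rebinding a
// task's parameters without changing the topology keeps the same native
// executable (only the native graph object changes), whereas adding a task
// may force a new instantiation. The update idiom exercised below is roughly
// the following sketch (grid/block/shm/my_kernel/args are placeholders):
//
//   auto task = cf.kernel(grid, block, shm, my_kernel, args...);   // create
//   cf.kernel(task, grid, block, shm, my_kernel, new_args...);     // rebind
//
// The exact graph/executable reuse is a property of the CUDA graph runtime
// observed by the REQUIREs below.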
TEST_CASE("cudaFlowCapturer.Incrementality") {
// use size_t for N so the argument type matches k_add's size_t parameter
size_t N = 1024;
tf::cudaFlowCapturer cf;
// build a three-task graph: H2D copy -> k_add kernel -> D2H copy
auto cpu = static_cast<int*>(std::calloc(N, sizeof(int)));
auto gpu = tf::cuda_malloc_device<int>(N);
dim3 g = {static_cast<unsigned>((N+255)/256), 1, 1};
dim3 b = {256, 1, 1};
auto h2d = cf.copy(gpu, cpu, N);
auto kernel = cf.kernel(g, b, 0, k_add<int>, gpu, N, 17);
auto d2h = cf.copy(cpu, gpu, N);
h2d.precede(kernel);
kernel.precede(d2h);
REQUIRE(cf.num_tasks() == 3);
REQUIRE(cf.empty() == false);
REQUIRE(cf.native_executable() == nullptr);
// run
cf.run(0);
cudaStreamSynchronize(0);
auto native_graph = cf.native_graph();
auto native_executable = cf.native_executable();
REQUIRE(native_graph != nullptr);
REQUIRE(native_executable != nullptr);
REQUIRE(cf.num_tasks() == 3);
REQUIRE(cf.empty() == false);
REQUIRE(cf.native_graph() != nullptr);
REQUIRE(cf.native_executable() != nullptr);
REQUIRE(tf::cuda_graph_get_num_nodes(cf.native_graph()) == cf.num_tasks());
for(unsigned i=0; i<N; ++i) {
REQUIRE(cpu[i] == 17);
}
// update task without changing topology and run
int sum = 17;
for(int j=0; j<10; j++) {  // int, so the argument type matches k_add<int>'s value parameter
sum += j;
cf.kernel(kernel, g, b, 0, k_add<int>, gpu, N, j);
cf.run(0);
cudaStreamSynchronize(0);
auto updated_native_graph = cf.native_graph();
auto updated_native_executable = cf.native_executable();
REQUIRE(updated_native_graph != native_graph);
REQUIRE(updated_native_executable == native_executable);
REQUIRE(cf.num_tasks() == 3);
REQUIRE(cf.empty() == false);
REQUIRE(cf.native_graph() != nullptr);
REQUIRE(cf.native_executable() != nullptr);
REQUIRE(tf::cuda_graph_get_num_nodes(cf.native_graph()) == cf.num_tasks());
for(unsigned i=0; i<N; ++i) {
REQUIRE(cpu[i] == sum);
}
native_graph = updated_native_graph;
native_executable = updated_native_executable;
}
// change topology and run
auto d2h2 = cf.copy(cpu, gpu, N);
d2h.precede(d2h2);
cf.run(0);
cudaStreamSynchronize(0);
auto updated_native_graph = cf.native_graph();
auto updated_native_executable = cf.native_executable();
REQUIRE(updated_native_graph != native_graph);
// CUDA runtime may reuse the previous executable
//REQUIRE(updated_native_executable == native_executable);
REQUIRE(cf.num_tasks() == 4);
REQUIRE(cf.empty() == false);
REQUIRE(cf.native_graph() != nullptr);
REQUIRE(cf.native_executable() != nullptr);
REQUIRE(tf::cuda_graph_get_num_nodes(cf.native_graph()) == cf.num_tasks());
}
//----------------------------------------------------------------------
//rebind kernel
//----------------------------------------------------------------------
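// rebind_kernel<T, F>: for increasing problem sizes, build a two-kernel graph
// (multiply then add) with flow type F, run it, then rebind both kernels to
// new launch configurations and operand orders and run twice more. A separate
// verify graph replays the same six operations with F::transform on shadow
// arrays (ans_operand) and compares the results element-wise on the GPU.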
template <typename T, typename F>
void rebind_kernel() {
tf::Executor executor;
for(size_t N = 1; N < 65529; N = N * 2 + 1) {
tf::Taskflow taskflow;
std::vector<T*> operand(3, nullptr);
std::vector<T*> ans_operand(3, nullptr);
std::vector<int> ind(3);
std::generate_n(ind.data(), 3, [&](){ return ::rand() % 3; });
bool* check {nullptr};
//allocate
auto allocate_t = taskflow.emplace([&]() {
for(int i = 0; i < 3; ++i) {
REQUIRE(cudaMallocManaged(&operand[i], N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&ans_operand[i], N * sizeof(T)) == cudaSuccess);
}
REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
}).name("allocate");
//initialize
auto initialize_t = taskflow.emplace([&](){
for(int i = 0; i < 3; ++i) {
std::generate_n(operand[i], N, [&](){ return ::rand() % N - N / 2 + i; });
std::memcpy(ans_operand[i], operand[i], N * sizeof(T));
}
*check = true;
}).name("initialize");
// rebind_kernel: build the two-kernel graph, then rebind both kernel tasks
// twice to new launch configurations and operand orders
auto add_t = taskflow.emplace([&]() {
F cf;
auto multi_t = cf.kernel(
32, 512, 0,
multiply<T>,
operand[ind[0]], operand[ind[1]], operand[ind[2]], N
);
auto add_t = cf.kernel(
32, 512, 0,
add<T>,
operand[ind[1]], operand[ind[2]], operand[ind[0]], N
);
multi_t.precede(add_t);
run_and_wait(cf);
cf.kernel(
multi_t,
64, 128, 0,
multiply<T>,
operand[ind[2]], operand[ind[0]], operand[ind[1]], N
);
cf.kernel(
add_t,
16, 256, 0,
add<T>,
operand[ind[1]], operand[ind[0]], operand[ind[2]], N
);
run_and_wait(cf);
cf.kernel(
multi_t,
8, 1024, 0,
multiply<T>,
operand[ind[0]], operand[ind[2]], operand[ind[1]], N
);
cf.kernel(
add_t,
64, 64, 0,
add<T>,
operand[ind[2]], operand[ind[1]], operand[ind[0]], N
);
run_and_wait(cf);
}).name("add");
// verify: replay the same six operations with transform tasks on the shadow
// arrays, then compare element-wise on the GPU
auto verify_t = taskflow.emplace([&]() {
F cf;
auto multi1_t = cf.transform(
ans_operand[ind[0]], ans_operand[ind[0]] + N, ans_operand[ind[1]],
ans_operand[ind[2]],
[] __device__ (T& v1, T& v2) { return v1*v2; }
);
auto add1_t = cf.transform(
ans_operand[ind[1]], ans_operand[ind[1]]+N, ans_operand[ind[2]],
ans_operand[ind[0]],
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto multi2_t = cf.transform(
ans_operand[ind[2]], ans_operand[ind[2]] + N, ans_operand[ind[0]],
ans_operand[ind[1]],
[] __device__ (T& v1, T& v2) { return v1 * v2; }
);
auto add2_t = cf.transform(
ans_operand[ind[1]], ans_operand[ind[1]] + N, ans_operand[ind[0]],
ans_operand[ind[2]],
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto multi3_t = cf.transform(
ans_operand[ind[0]], ans_operand[ind[0]] + N, ans_operand[ind[2]],
ans_operand[ind[1]],
[] __device__ (T& v1, T& v2) { return v1 * v2; }
);
auto add3_t = cf.transform(
ans_operand[ind[2]], ans_operand[ind[2]] + N, ans_operand[ind[1]],
ans_operand[ind[0]],
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto verify1_t = cf.kernel(
32, 512, 0,
verify<T>,
operand[ind[0]], ans_operand[ind[0]], check, N
);
auto verify2_t = cf.kernel(
32, 512, 0,
verify<T>,
operand[ind[1]], ans_operand[ind[1]], check, N
);
auto verify3_t = cf.kernel(
32, 512, 0,
verify<T>,
operand[ind[2]], ans_operand[ind[2]], check, N
);
multi1_t.precede(add1_t);
add1_t.precede(multi2_t);
multi2_t.precede(add2_t);
add2_t.precede(multi3_t);
multi3_t.precede(add3_t);
add3_t.precede(verify1_t).precede(verify2_t).precede(verify3_t);
run_and_wait(cf);
REQUIRE(*check);
}).name("verify");
//free memory
auto deallocate_t = taskflow.emplace([&]() {
for(int i = 0; i < 3; ++i) {
REQUIRE(cudaFree(operand[i]) == cudaSuccess);
REQUIRE(cudaFree(ans_operand[i]) == cudaSuccess);
}
REQUIRE(cudaFree(check) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(initialize_t);
initialize_t.precede(add_t);
add_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
// cudaFlow
TEST_CASE("cudaFlow.rebind.kernel.int" * doctest::timeout(300)) {
rebind_kernel<int, tf::cudaFlow>();
}
TEST_CASE("cudaFlow.rebind.kernel.float" * doctest::timeout(300)) {
rebind_kernel<float, tf::cudaFlow>();
}
TEST_CASE("cudaFlow.rebind.kernel.double" * doctest::timeout(300)) {
rebind_kernel<double, tf::cudaFlow>();
}
// cudaFlowCapturer
TEST_CASE("cudaFlowCapturer.rebind.kernel.int" * doctest::timeout(300)) {
rebind_kernel<int, tf::cudaFlowCapturer>();
}
TEST_CASE("cudaFlowCapturer.rebind.kernel.float" * doctest::timeout(300)) {
rebind_kernel<float, tf::cudaFlowCapturer>();
}
TEST_CASE("cudaFlowCapturer.rebind.kernel.double" * doctest::timeout(300)) {
rebind_kernel<double, tf::cudaFlowCapturer>();
}
//----------------------------------------------------------------------
//rebind copy
//----------------------------------------------------------------------
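// rebind_copy<T, F>: an H2D copy task is created once and then rebound twice
// to move three different host vectors (ha, hb, hc) into device buffers; a
// D2H copy task is likewise rebound to read back two results. The expected
// values (-21 - N and 2 * N - 26) follow from dz = da + db and dc = dc - dz
// with ha = N + 5, hb = N - 31, hc = N - 47.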
template <typename T, typename F>
void rebind_copy() {
tf::Executor executor;
for(int N = 1; N < 65459; N = N * 2 + 1) {
tf::Taskflow taskflow;
std::vector<T> ha(N, N + 5);
std::vector<T> hb(N, N - 31);
std::vector<T> hc(N, N - 47);
std::vector<T> hz(N);
T* da {nullptr};
T* db {nullptr};
T* dc {nullptr};
T* dz {nullptr};
//allocate
auto allocate_t = taskflow.emplace([&]() {
REQUIRE(cudaMalloc(&da, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&db, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&dc, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&dz, N * sizeof(T)) == cudaSuccess);
}).name("allocate");
// rebind_copy: create one H2D copy task and rebind it to different
// destination/source pairs between runs
auto h2d_t = taskflow.emplace([&]() {
F cf;
auto h2d_t = cf.copy(da, ha.data(), N).name("h2d");
run_and_wait(cf);
cf.copy(h2d_t, db, hb.data(), N);
run_and_wait(cf);
cf.copy(h2d_t, dc, hc.data(), N);
run_and_wait(cf);
});
auto kernel_t = taskflow.emplace([&]() {
F cf;
auto add1_t = cf.transform(
da, da+N, db,
dz,
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto add2_t = cf.transform(
dc, dc + N, dz,
dc,
[] __device__ (T& v1, T& v2) { return v1 - v2; }
);
add1_t.precede(add2_t);
run_and_wait(cf);
});
auto d2h_t = taskflow.emplace([&]() {
F cf;
auto d2h_t = cf.copy(hc.data(), dc, N).name("d2h");
run_and_wait(cf);
cf.copy(d2h_t, hz.data(), dz, N);
run_and_wait(cf);
});
//verify
auto verify_t = taskflow.emplace([&]() {
for(auto& c: hc) {
REQUIRE(c == -21 - N);
}
for(auto& z: hz) {
REQUIRE(z == 2 * N - 26);
}
});
//free memory
auto deallocate_t = taskflow.emplace([&]() {
REQUIRE(cudaFree(da) == cudaSuccess);
REQUIRE(cudaFree(db) == cudaSuccess);
REQUIRE(cudaFree(dc) == cudaSuccess);
REQUIRE(cudaFree(dz) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(h2d_t);
h2d_t.precede(kernel_t);
kernel_t.precede(d2h_t);
d2h_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
// cudaFlow
TEST_CASE("cudaFlow.rebind.copy.int" * doctest::timeout(300)) {
rebind_copy<int, tf::cudaFlow>();
}
TEST_CASE("cudaFlow.rebind.copy.float" * doctest::timeout(300)) {
rebind_copy<float, tf::cudaFlow>();
}
TEST_CASE("cudaFlow.rebind.copy.double" * doctest::timeout(300)) {
rebind_copy<double, tf::cudaFlow>();
}
// cudaFlowCapturer
TEST_CASE("cudaFlowCapturer.rebind.copy.int" * doctest::timeout(300)) {
rebind_copy<int, tf::cudaFlowCapturer>();
}
TEST_CASE("cudaFlowCapturer.rebind.copy.float" * doctest::timeout(300)) {
rebind_copy<float, tf::cudaFlowCapturer>();
}
TEST_CASE("cudaFlowCapturer.rebind.copy.double" * doctest::timeout(300)) {
rebind_copy<double, tf::cudaFlowCapturer>();
}
//----------------------------------------------------------------------
// rebind memcpy
//----------------------------------------------------------------------
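// rebind_memcpy<T, F>: same scenario as rebind_copy above, but the host/device
// transfers use the untyped memcpy interface (byte counts) instead of the
// element-typed copy interface.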
template <typename T, typename F>
void rebind_memcpy() {
tf::Executor executor;
for(int N = 1; N < 65459; N = N * 2 + 1) {
tf::Taskflow taskflow;
std::vector<T> ha(N, N + 5);
std::vector<T> hb(N, N - 31);
std::vector<T> hc(N, N - 47);
std::vector<T> hz(N);
T* da {nullptr};
T* db {nullptr};
T* dc {nullptr};
T* dz {nullptr};
//allocate
auto allocate_t = taskflow.emplace([&]() {
REQUIRE(cudaMalloc(&da, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&db, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&dc, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&dz, N * sizeof(T)) == cudaSuccess);
}).name("allocate");
// rebind_memcpy: create one H2D memcpy task and rebind it to different
// destination/source pairs between runs
auto h2d_t = taskflow.emplace([&]() {
F cf;
auto h2d_t = cf.memcpy(da, ha.data(), sizeof(T) * N).name("h2d");
run_and_wait(cf);
cf.memcpy(h2d_t, db, hb.data(), sizeof(T) * N);
run_and_wait(cf);
cf.memcpy(h2d_t, dc, hc.data(), sizeof(T) * N);
run_and_wait(cf);
});
auto kernel_t = taskflow.emplace([&]() {
F cf;
auto add1_t = cf.transform(
da, da + N, db,
dz,
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto add2_t = cf.transform(
dc, dc + N, dz,
dc,
[] __device__ (T& v1, T& v2) { return v1 - v2; }
);
add1_t.precede(add2_t);
run_and_wait(cf);
});
auto d2h_t = taskflow.emplace([&]() {
F cf;
auto d2h_t = cf.memcpy(hc.data(), dc, sizeof(T) * N).name("d2h");
run_and_wait(cf);
cf.memcpy(d2h_t, hz.data(), dz, sizeof(T) * N);
run_and_wait(cf);
});
//verify
auto verify_t = taskflow.emplace([&]() {
for(auto& c: hc) {
REQUIRE(c == -21 - N);
}
for(auto& z: hz) {
REQUIRE(z == 2 * N - 26);
}
});
//free memory
auto deallocate_t = taskflow.emplace([&]() {
REQUIRE(cudaFree(da) == cudaSuccess);
REQUIRE(cudaFree(db) == cudaSuccess);
REQUIRE(cudaFree(dc) == cudaSuccess);
REQUIRE(cudaFree(dz) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(h2d_t);
h2d_t.precede(kernel_t);
kernel_t.precede(d2h_t);
d2h_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
// cudaFlow
TEST_CASE("cudaFlow.rebind.memcpy.int" * doctest::timeout(300)) {
rebind_memcpy<int, tf::cudaFlow>();
}
TEST_CASE("cudaFlow.rebind.memcpy.float" * doctest::timeout(300)) {
rebind_memcpy<float, tf::cudaFlow>();
}
TEST_CASE("cudaFlow.rebind.memcpy.double" * doctest::timeout(300)) {
rebind_memcpy<double, tf::cudaFlow>();
}
// cudaFlowCapturer
TEST_CASE("cudaFlowCapturer.rebind.memcpy.int" * doctest::timeout(300)) {
rebind_memcpy<int, tf::cudaFlowCapturer>();
}
TEST_CASE("cudaFlowCapturer.rebind.memcpy.float" * doctest::timeout(300)) {
rebind_memcpy<float, tf::cudaFlowCapturer>();
}
TEST_CASE("cudaFlowCapturer.rebind.memcpy.double" * doctest::timeout(300)) {
rebind_memcpy<double, tf::cudaFlowCapturer>();
}
//----------------------------------------------------------------------
//rebind memset
//----------------------------------------------------------------------
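// rebind_memset<T, F>: a memset task is created on ans_a and then rebound to
// zero-fill `a` and to byte-fill `b` with 1; the verify step compares `a` and
// `b` against reference buffers that were filled with cudaMemset directly.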
template <typename T, typename F>
void rebind_memset() {
tf::Executor executor;
tf::Taskflow taskflow;
for(size_t N = 1; N < 65199; N = N * 2 + 1) {
taskflow.clear();
T* a {nullptr};
T* b {nullptr};
T* ans_a {nullptr};
T* ans_b {nullptr};
bool* check {nullptr};
//allocate
auto allocate_t = taskflow.emplace([&]() {
REQUIRE(cudaMallocManaged(&a, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&b, (N + 37) * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&ans_a, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&ans_b, (N + 37) * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
}).name("allocate");
//initialize
auto initialize_t = taskflow.emplace([&]() {
std::generate_n(a, N, [&](){ return ::rand() % N - N / 2; });
std::generate_n(b, N + 37, [&](){ return ::rand() % N + N / 2; });
REQUIRE(cudaMemset(ans_a, 0, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMemset(ans_b, 1, (N + 37) * sizeof(T)) == cudaSuccess);
*check = true;
}).name("initialize");
// rebind_memset: create one memset task and rebind it to different targets,
// sizes, and fill values between runs
auto memset_t = taskflow.emplace([&]() {
F cf;
auto memset_t = cf.memset(ans_a, 0, N * sizeof(T));
run_and_wait(cf);
cf.memset(memset_t, a, 0, N * sizeof(T));
run_and_wait(cf);
cf.memset(memset_t, b, 1, (N + 37) * sizeof(T));
run_and_wait(cf);
}).name("memset");
//verify
auto verify_t = taskflow.emplace([&]() {
F cf;
cf.kernel(
32, 512, 0,
verify<T>,
a, ans_a, check, N
);
cf.kernel(
32, 512, 0,
verify<T>,
b, ans_b, check, N + 37
);
run_and_wait(cf);
REQUIRE(*check);
}).name("verify");
//free memory
auto deallocate_t = taskflow.emplace([&]() {
REQUIRE(cudaFree(a) == cudaSuccess);
REQUIRE(cudaFree(b) == cudaSuccess);
REQUIRE(cudaFree(ans_a) == cudaSuccess);
REQUIRE(cudaFree(ans_b) == cudaSuccess);
REQUIRE(cudaFree(check) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(initialize_t);
initialize_t.precede(memset_t);
memset_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
// cudaFlow
TEST_CASE("cudaFlow.rebind.memset.int" * doctest::timeout(300)) {
rebind_memset<int, tf::cudaFlow>();
}
TEST_CASE("cudaFlow.rebind.memset.float" * doctest::timeout(300)) {
rebind_memset<float, tf::cudaFlow>();
}
TEST_CASE("cudaFlow.rebind.memset.double" * doctest::timeout(300)) {
rebind_memset<double, tf::cudaFlow>();
}
// cudaFlowCapturer
TEST_CASE("cudaFlowCapturer.rebind.memset.int" * doctest::timeout(300)) {
rebind_memset<int, tf::cudaFlowCapturer>();
}
TEST_CASE("cudaFlowCapturer.rebind.memset.float" * doctest::timeout(300)) {
rebind_memset<float, tf::cudaFlowCapturer>();
}
TEST_CASE("cudaFlowCapturer.rebind.memset.double" * doctest::timeout(300)) {
rebind_memset<double, tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// rebind algorithms
// ----------------------------------------------------------------------------
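// A single algorithm task (for_each) in a cudaFlowCapturer is rebound in place
// to single_task and for_each_index variants; the task count must stay at one
// across rebinds. Clearing the capturer and re-running the now-empty graph
// must leave the device data untouched.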
TEST_CASE("cudaFlowCapturer.rebind.algorithms") {
tf::cudaFlowCapturer capturer;
auto data = tf::cuda_malloc_shared<int>(10000);
auto res = tf::cuda_malloc_shared<int>(1);
auto task = capturer.for_each(
data, data+10000, []__device__(int& i) {
i = 10;
}
);
run_and_wait(capturer);
for(int i=0; i<10000; i++) {
REQUIRE(data[i] == 10);
}
REQUIRE(capturer.num_tasks() == 1);
// rebind to single task
capturer.single_task(task, [=] __device__ () {*data = 2;});
run_and_wait(capturer);
REQUIRE(*data == 2);
for(int i=1; i<10000; i++) {
REQUIRE(data[i] == 10);
}
REQUIRE(capturer.num_tasks() == 1);
// rebind to for each index
capturer.for_each_index(task, 0, 10000, 1,
[=] __device__ (int i) {
data[i] = -23;
}
);
run_and_wait(capturer);
for(int i=0; i<10000; i++) {
REQUIRE(data[i] == -23);
}
REQUIRE(capturer.num_tasks() == 1);
// rebind to single task
capturer.single_task(task, [res]__device__(){ *res = 999; });
run_and_wait(capturer);
REQUIRE(*res == 999);
REQUIRE(capturer.num_tasks() == 1);
// clear the capturer
capturer.clear();
REQUIRE(capturer.num_tasks() == 0);
run_and_wait(capturer);
REQUIRE(*res == 999);
for(int i=0; i<10000; i++) {
REQUIRE(data[i] == -23);
}
// clear the memory
tf::cuda_free(data);
tf::cuda_free(res);
}