// mesytec-mnode/external/taskflow-3.8.0/unittests/cuda/cuda_algorithm_updates.cu

#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
#include <taskflow/cuda/algorithm/transform.hpp>
#include <taskflow/cuda/algorithm/for_each.hpp>
#include <taskflow/cuda/algorithm/reduce.hpp>
#include <taskflow/cuda/algorithm/scan.hpp>
#include <taskflow/cuda/algorithm/find.hpp>
#include <taskflow/cuda/algorithm/sort.hpp>

// standard headers used explicitly by the tests below
#include <algorithm>
#include <cmath>
#include <cstring>
#include <numeric>
#include <random>
#include <vector>
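
// This file exercises task "updates" in tf::cudaFlow and tf::cudaFlowCapturer:
// a task recorded in a CUDA graph (kernel launch, copy, memset, or algorithm)
// is rebound to new parameters between offloads, and each offload() or
// offload_n() re-runs the updated graph.
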
// verify: element-wise comparison of a and b on the device; writes false
// to *check at the first mismatch (grid-stride loop)
template <typename T>
__global__
void verify(const T* a, const T* b, bool* check, size_t size) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for(;tid < size; tid += gridDim.x * blockDim.x) {
if(a[tid] != b[tid]) {
*check = false;
return;
}
}
}
// add: element-wise c = a + b (grid-stride loop)
template <typename T>
__global__
void add(const T* a, const T* b, T* c, size_t size) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for(;tid < size; tid += gridDim.x * blockDim.x) {
c[tid] = a[tid] + b[tid];
}
}
// multiply: element-wise c = a * b (grid-stride loop)
template <typename T>
__global__
void multiply(const T* a, const T* b, T* c, size_t size) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for(;tid < size; tid += gridDim.x * blockDim.x) {
c[tid] = a[tid] * b[tid];
}
}
//----------------------------------------------------------------------
// offload_n
//----------------------------------------------------------------------
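// Pattern under test (API as exercised below):
//   cf.kernel(32, 512, 0, add<int>, a, a, a, N);  // a = a + a, i.e. a *= 2
//   cf.offload_n(times + 1);                      // launch the graph times+1 times
// Each element should therefore end up scaled by 2^(times + 1), which the
// verify task checks against a reference array updated by cf.for_each.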
template <typename F>
void offload_n() {
tf::Executor executor;
tf::Taskflow taskflow;
for(size_t N = 1; N < 65532; N = N * 2 + 1) {
taskflow.clear();
int* a {nullptr};
int* ans_a {nullptr};
bool* check {nullptr};
int times = ::rand() % 7;
//allocate
auto allocate_t = taskflow.emplace([&]() {
REQUIRE(cudaMallocManaged(&a, N * sizeof(int)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&ans_a, N * sizeof(int)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
}).name("allocate");
//initialize
auto initialize_t = taskflow.emplace([&]() {
std::generate_n(a, N, [&](){ return ::rand() % N; });
std::memcpy(ans_a, a, N * sizeof(int));
*check = true;
}).name("initialize");
//offload
auto offload_t = taskflow.emplace([&](F& cf) {
cf.kernel(
32, 512, 0,
add<int>,
a, a, a, N
);
cf.offload_n(times+1);
}).name("offload");
//verify
auto verify_t = taskflow.emplace([&](F& cf) {
auto ans_t = cf.for_each(
ans_a, ans_a + N,
[=] __device__(int& v) { v *= std::pow(2, (times + 1)); }
);
auto verify_t = cf.kernel(
32, 512, 0,
verify<int>,
a, ans_a, check, N
);
ans_t.precede(verify_t);
cf.offload();
REQUIRE(*check);
}).name("verify");
//free memory
auto deallocate_t = taskflow.emplace([&]() {
REQUIRE(cudaFree(a) == cudaSuccess);
REQUIRE(cudaFree(ans_a) == cudaSuccess);
REQUIRE(cudaFree(check) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(initialize_t);
initialize_t.precede(offload_t);
offload_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
TEST_CASE("cudaFlow.offload" * doctest::timeout(300)) {
offload_n<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.offload" * doctest::timeout(300)) {
offload_n<tf::cudaFlowCapturer>();
}
//----------------------------------------------------------------------
// join
//----------------------------------------------------------------------
template <typename F>
void join() {
tf::Executor executor;
tf::Taskflow taskflow;
for(size_t N = 1; N < 65532; N = N * 2 + 1) {
taskflow.clear();
int* a {nullptr};
int* ans_a {nullptr};
bool* check {nullptr};
int times = ::rand() % 7;
//allocate
auto allocate_t = taskflow.emplace([&]() {
REQUIRE(cudaMallocManaged(&a, N * sizeof(int)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&ans_a, N * sizeof(int)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
}).name("allocate");
//initialize
auto initialize_t = taskflow.emplace([&]() {
std::generate_n(a, N, [&](){ return ::rand() % N; });
std::memcpy(ans_a, a, N * sizeof(int));
*check = true;
}).name("initialize");
//join
auto join_t = taskflow.emplace([&](F& cf) {
cf.kernel(
32, 512, 0,
add<int>,
a, a, a, N
);
cf.offload_n(times);
}).name("join");
//verify
auto verify_t = taskflow.emplace([&](F& cf) {
auto ans_t = cf.for_each(
ans_a, ans_a + N,
[=] __device__(int& v) { v *= std::pow(2, (times)); }
);
auto verify_t = cf.kernel(
32, 512, 0,
verify<int>,
a, ans_a, check, N
);
ans_t.precede(verify_t);
cf.offload();
REQUIRE(*check);
}).name("verify");
//free memory
auto deallocate_t = taskflow.emplace([&]() {
REQUIRE(cudaFree(a) == cudaSuccess);
REQUIRE(cudaFree(ans_a) == cudaSuccess);
REQUIRE(cudaFree(check) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(initialize_t);
initialize_t.precede(join_t);
join_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
TEST_CASE("cudaFlow.join" * doctest::timeout(300)) {
join<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.join" * doctest::timeout(300)) {
join<tf::cudaFlowCapturer>();
}
//----------------------------------------------------------------------
// update kernel
//----------------------------------------------------------------------
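// Rebinding a kernel task in place (as exercised below):
//   auto t = cf.kernel(grid, block, shm, func, args...);  // create
//   cf.offload();                                         // run
//   cf.kernel(t, grid2, block2, shm, func, args2...);     // update
//   cf.offload();                                         // run updated graph
// The test rotates three operand buffers through multiply/add kernels,
// mirrors every step with cf.transform on reference buffers, and then
// compares operand[i] against ans_operand[i].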
template <typename F, typename T>
void update_kernel() {
tf::Executor executor;
tf::Taskflow taskflow;
for(size_t N = 1; N < 65529; N = N * 2 + 1) {
taskflow.clear();
std::vector<T*> operand(3, nullptr);
std::vector<T*> ans_operand(3, nullptr);
std::vector<int> ind(3);
std::generate_n(ind.data(), 3, [&](){ return ::rand() % 3; });
bool* check {nullptr};
//allocate
auto allocate_t = taskflow.emplace([&]() {
for(int i = 0; i < 3; ++i) {
REQUIRE(cudaMallocManaged(&operand[i], N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&ans_operand[i], N * sizeof(T)) == cudaSuccess);
}
REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
}).name("allocate");
//initialize
auto initialize_t = taskflow.emplace([&](){
for(int i = 0; i < 3; ++i) {
// cast to int first: N is size_t, and the unsigned wrap-around in
// ::rand() % N - N / 2 would produce huge values instead of negatives
std::generate_n(operand[i], N, [&](){ return static_cast<int>(::rand() % N) - static_cast<int>(N) / 2 + i; });
std::memcpy(ans_operand[i], operand[i], N * sizeof(T));
}
*check = true;
}).name("initialize");
//update_kernel
auto add_t = taskflow.emplace([&](F& cf) {
auto multi_t = cf.kernel(
32, 512, 0,
multiply<T>,
operand[ind[0]], operand[ind[1]], operand[ind[2]], N
);
auto add_t = cf.kernel(
32, 512, 0,
add<T>,
operand[ind[1]], operand[ind[2]], operand[ind[0]], N
);
multi_t.precede(add_t);
cf.offload();
cf.kernel(
multi_t,
64, 128, 0, multiply<T>,
operand[ind[2]], operand[ind[0]], operand[ind[1]], N
);
cf.kernel(
add_t,
16, 256, 0, add<T>,
operand[ind[1]], operand[ind[0]], operand[ind[2]], N
);
cf.offload();
cf.kernel(
multi_t,
8, 1024, 0, multiply<T>,
operand[ind[0]], operand[ind[2]], operand[ind[1]], N
);
cf.kernel(
add_t,
64, 64, 0, add<T>,
operand[ind[2]], operand[ind[1]], operand[ind[0]], N
);
cf.offload();
}).name("add");
//verify
auto verify_t = taskflow.emplace([&](F& cf) {
auto multi1_t = cf.transform(
ans_operand[ind[0]], ans_operand[ind[0]] + N, ans_operand[ind[1]],
ans_operand[ind[2]],
[] __device__ (T& v1, T& v2) { return v1 * v2; }
);
auto add1_t = cf.transform(
ans_operand[ind[1]], ans_operand[ind[1]]+N, ans_operand[ind[2]],
ans_operand[ind[0]],
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto multi2_t = cf.transform(
ans_operand[ind[2]], ans_operand[ind[2]] + N, ans_operand[ind[0]],
ans_operand[ind[1]],
[] __device__ (T& v1, T& v2) { return v1 * v2; }
);
auto add2_t = cf.transform(
ans_operand[ind[1]], ans_operand[ind[1]] + N, ans_operand[ind[0]],
ans_operand[ind[2]],
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto multi3_t = cf.transform(
ans_operand[ind[0]], ans_operand[ind[0]] + N, ans_operand[ind[2]],
ans_operand[ind[1]],
[] __device__ (T& v1, T& v2) { return v1 * v2; }
);
auto add3_t = cf.transform(
ans_operand[ind[2]], ans_operand[ind[2]] + N, ans_operand[ind[1]],
ans_operand[ind[0]],
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto verify1_t = cf.kernel(
32, 512, 0,
verify<T>,
operand[ind[0]], ans_operand[ind[0]], check, N
);
auto verify2_t = cf.kernel(
32, 512, 0,
verify<T>,
operand[ind[1]], ans_operand[ind[1]], check, N
);
auto verify3_t = cf.kernel(
32, 512, 0,
verify<T>,
operand[ind[2]], ans_operand[ind[2]], check, N
);
multi1_t.precede(add1_t);
add1_t.precede(multi2_t);
multi2_t.precede(add2_t);
add2_t.precede(multi3_t);
multi3_t.precede(add3_t);
add3_t.precede(verify1_t).precede(verify2_t).precede(verify3_t);
cf.offload();
REQUIRE(*check);
}).name("verify");
//free memory
auto deallocate_t = taskflow.emplace([&]() {
for(int i = 0; i < 3; ++i) {
REQUIRE(cudaFree(operand[i]) == cudaSuccess);
REQUIRE(cudaFree(ans_operand[i]) == cudaSuccess);
}
REQUIRE(cudaFree(check) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(initialize_t);
initialize_t.precede(add_t);
add_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
TEST_CASE("cudaFlow.update.kernel.int" * doctest::timeout(300)) {
update_kernel<tf::cudaFlow, int>();
}
TEST_CASE("cudaFlow.update.kernel.float" * doctest::timeout(300)) {
update_kernel<tf::cudaFlow, float>();
}
TEST_CASE("cudaFlow.update.kernel.double" * doctest::timeout(300)) {
update_kernel<tf::cudaFlow, double>();
}
TEST_CASE("cudaFlowCapturer.update.kernel.int" * doctest::timeout(300)) {
update_kernel<tf::cudaFlowCapturer, int>();
}
TEST_CASE("cudaFlowCapturer.update.kernel.float" * doctest::timeout(300)) {
update_kernel<tf::cudaFlowCapturer, float>();
}
TEST_CASE("cudaFlowCapturer.update.kernel.double" * doctest::timeout(300)) {
update_kernel<tf::cudaFlowCapturer, double>();
}
//----------------------------------------------------------------------
// update copy
//----------------------------------------------------------------------
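// cf.copy(task, dst, src, count) rebinds an existing copy task to a new
// source/destination pair. The test reuses one H2D task for three
// host-to-device transfers and one D2H task for two device-to-host
// transfers, offloading after each rebind.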
template <typename F, typename T>
void update_copy() {
tf::Executor executor;
tf::Taskflow taskflow;
for(int N = 1; N < 65459; N = N * 2 + 1) {
taskflow.clear();
std::vector<T> ha(N, N + 5);
std::vector<T> hb(N, N - 31);
std::vector<T> hc(N, N - 47);
std::vector<T> hz(N);
T* da {nullptr};
T* db {nullptr};
T* dc {nullptr};
T* dz {nullptr};
//allocate
auto allocate_t = taskflow.emplace([&]() {
REQUIRE(cudaMalloc(&da, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&db, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&dc, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&dz, N * sizeof(T)) == cudaSuccess);
}).name("allocate");
//update_copy
auto h2d_t = taskflow.emplace([&](F& cf) {
auto h2d_t = cf.copy(da, ha.data(), N).name("h2d");
cf.offload();
cf.copy(h2d_t, db, hb.data(), N);
cf.offload();
cf.copy(h2d_t, dc, hc.data(), N);
cf.offload();
});
auto kernel_t = taskflow.emplace([&](F& cf) {
auto add1_t = cf.transform(
da, da+N, db, dz,
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto add2_t = cf.transform(
dc, dc+N, dz, dc,
[] __device__ (T& v1, T& v2) { return v1 - v2; }
);
add1_t.precede(add2_t);
});
auto d2h_t = taskflow.emplace([&](F& cf) {
auto d2h_t = cf.copy(hc.data(), dc, N).name("d2h");
cf.offload();
cf.copy(d2h_t, hz.data(), dz, N);
cf.offload();
});
//verify
auto verify_t = taskflow.emplace([&]() {
for(auto& c: hc) {
REQUIRE(c == -21 - N);
}
for(auto& z: hz) {
REQUIRE(z == 2 * N - 26);
}
});
//free memory
auto deallocate_t = taskflow.emplace([&]() {
REQUIRE(cudaFree(da) == cudaSuccess);
REQUIRE(cudaFree(db) == cudaSuccess);
REQUIRE(cudaFree(dc) == cudaSuccess);
REQUIRE(cudaFree(dz) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(h2d_t);
h2d_t.precede(kernel_t);
kernel_t.precede(d2h_t);
d2h_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
TEST_CASE("cudaFlow.update.copy.int" * doctest::timeout(300)) {
update_copy<tf::cudaFlow, int>();
}
TEST_CASE("cudaFlow.update.copy.float" * doctest::timeout(300)) {
update_copy<tf::cudaFlow, float>();
}
TEST_CASE("cudaFlow.update.copy.double" * doctest::timeout(300)) {
update_copy<tf::cudaFlow, double>();
}
TEST_CASE("cudaFlowCapturer.update.copy.int" * doctest::timeout(300)) {
update_copy<tf::cudaFlowCapturer, int>();
}
TEST_CASE("cudaFlowCapturer.update.copy.float" * doctest::timeout(300)) {
update_copy<tf::cudaFlowCapturer, float>();
}
TEST_CASE("cudaFlowCapturer.update.copy.double" * doctest::timeout(300)) {
update_copy<tf::cudaFlowCapturer, double>();
}
//----------------------------------------------------------------------
// update memcpy
//----------------------------------------------------------------------
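// Same pattern as update_copy above, but with the untyped variant:
// cf.memcpy takes a size in bytes (sizeof(T) * N), whereas cf.copy takes
// an element count.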
template <typename F, typename T>
void update_memcpy() {
tf::Executor executor;
tf::Taskflow taskflow;
for(int N = 1; N < 65459; N = N * 2 + 1) {
taskflow.clear();
std::vector<T> ha(N, N + 5);
std::vector<T> hb(N, N - 31);
std::vector<T> hc(N, N - 47);
std::vector<T> hz(N);
T* da {nullptr};
T* db {nullptr};
T* dc {nullptr};
T* dz {nullptr};
//allocate
auto allocate_t = taskflow.emplace([&]() {
REQUIRE(cudaMalloc(&da, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&db, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&dc, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMalloc(&dz, N * sizeof(T)) == cudaSuccess);
}).name("allocate");
//update_memcpy
auto h2d_t = taskflow.emplace([&](F& cf) {
auto h2d_t = cf.memcpy(da, ha.data(), sizeof(T) * N).name("h2d");
cf.offload();
cf.memcpy(h2d_t, db, hb.data(), sizeof(T) * N);
cf.offload();
cf.memcpy(h2d_t, dc, hc.data(), sizeof(T) * N);
cf.offload();
});
auto kernel_t = taskflow.emplace([&](F& cf) {
auto add1_t = cf.transform(
da, da+N, db, dz,
[] __device__ (T& v1, T& v2) { return v1 + v2; }
);
auto add2_t = cf.transform(
dc, dc+N, dz, dc,
[] __device__ (T& v1, T& v2) { return v1 - v2; }
);
add1_t.precede(add2_t);
});
auto d2h_t = taskflow.emplace([&](F& cf) {
auto d2h_t = cf.memcpy(hc.data(), dc, sizeof(T) * N).name("d2h");
cf.offload();
cf.memcpy(d2h_t, hz.data(), dz, sizeof(T) * N);
cf.offload();
});
//verify
auto verify_t = taskflow.emplace([&]() {
for(auto& c: hc) {
REQUIRE(c == -21 - N);
}
for(auto& z: hz) {
REQUIRE(z == 2 * N - 26);
}
});
//free memory
auto deallocate_t = taskflow.emplace([&]() {
REQUIRE(cudaFree(da) == cudaSuccess);
REQUIRE(cudaFree(db) == cudaSuccess);
REQUIRE(cudaFree(dc) == cudaSuccess);
REQUIRE(cudaFree(dz) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(h2d_t);
h2d_t.precede(kernel_t);
kernel_t.precede(d2h_t);
d2h_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
TEST_CASE("cudaFlow.update.memcpy.int" * doctest::timeout(300)) {
update_memcpy<tf::cudaFlow, int>();
}
TEST_CASE("cudaFlow.update.memcpy.float" * doctest::timeout(300)) {
update_memcpy<tf::cudaFlow, float>();
}
TEST_CASE("cudaFlow.update.memcpy.double" * doctest::timeout(300)) {
update_memcpy<tf::cudaFlow, double>();
}
TEST_CASE("cudaFlowCapturer.update.memcpy.int" * doctest::timeout(300)) {
update_memcpy<tf::cudaFlowCapturer, int>();
}
TEST_CASE("cudaFlowCapturer.update.memcpy.float" * doctest::timeout(300)) {
update_memcpy<tf::cudaFlowCapturer, float>();
}
TEST_CASE("cudaFlowCapturer.update.memcpy.double" * doctest::timeout(300)) {
update_memcpy<tf::cudaFlowCapturer, double>();
}
//----------------------------------------------------------------------
// update memset
//----------------------------------------------------------------------
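// cf.memset(task, ptr, value, bytes) rebinds a memset task. memset fills
// individual bytes, so value 1 yields the byte pattern 0x01...01 in every
// element; the verification only needs the device and reference arrays to
// be filled identically, so the resulting element values never matter.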
template <typename F, typename T>
void update_memset() {
tf::Executor executor;
tf::Taskflow taskflow;
for(size_t N = 1; N < 65199; N = N * 2 + 1) {
taskflow.clear();
T* a {nullptr};
T* b {nullptr};
T* ans_a {nullptr};
T* ans_b {nullptr};
bool* check {nullptr};
//allocate
auto allocate_t = taskflow.emplace([&]() {
REQUIRE(cudaMallocManaged(&a, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&b, (N + 37) * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&ans_a, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&ans_b, (N + 37) * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
}).name("allocate");
//initialize
auto initialize_t = taskflow.emplace([&]() {
// cast to int first to avoid unsigned wrap-around (N is size_t)
std::generate_n(a, N, [&](){ return static_cast<int>(::rand() % N) - static_cast<int>(N) / 2; });
std::generate_n(b, N + 37, [&](){ return static_cast<int>(::rand() % N) + static_cast<int>(N) / 2; });
REQUIRE(cudaMemset(ans_a, 0, N * sizeof(T)) == cudaSuccess);
REQUIRE(cudaMemset(ans_b, 1, (N + 37) * sizeof(T)) == cudaSuccess);
*check = true;
}).name("initialize");
//update_memset
auto memset_t = taskflow.emplace([&](F& cf) {
auto memset_t = cf.memset(ans_a, 0, N * sizeof(T));
cf.offload();
cf.memset(memset_t, a, 0, N * sizeof(T));
cf.offload();
cf.memset(memset_t, b, 1, (N + 37) * sizeof(T));
cf.offload();
}).name("memset");
//verify
auto verify_t = taskflow.emplace([&](F& cf) {
cf.kernel(
32, 512, 0,
verify<T>,
a, ans_a, check, N
);
cf.kernel(
32, 512, 0,
verify<T>,
b, ans_b, check, N + 37
);
cf.offload();
REQUIRE(*check);
}).name("verify");
//free memory
auto deallocate_t = taskflow.emplace([&]() {
REQUIRE(cudaFree(a) == cudaSuccess);
REQUIRE(cudaFree(b) == cudaSuccess);
REQUIRE(cudaFree(ans_a) == cudaSuccess);
REQUIRE(cudaFree(ans_b) == cudaSuccess);
REQUIRE(cudaFree(check) == cudaSuccess);
}).name("deallocate");
allocate_t.precede(initialize_t);
initialize_t.precede(memset_t);
memset_t.precede(verify_t);
verify_t.precede(deallocate_t);
executor.run(taskflow).wait();
}
}
TEST_CASE("cudaFlow.update.memset.int" * doctest::timeout(300)) {
update_memset<tf::cudaFlow, int>();
}
TEST_CASE("cudaFlow.update.memset.float" * doctest::timeout(300)) {
update_memset<tf::cudaFlow, float>();
}
TEST_CASE("cudaFlow.update.memset.double" * doctest::timeout(300)) {
update_memset<tf::cudaFlow, double>();
}
TEST_CASE("cudaFlowCapturer.update.memset.int" * doctest::timeout(300)) {
update_memset<tf::cudaFlowCapturer, int>();
}
TEST_CASE("cudaFlowCapturer.update.memset.float" * doctest::timeout(300)) {
update_memset<tf::cudaFlowCapturer, float>();
}
TEST_CASE("cudaFlowCapturer.update.memset.double" * doctest::timeout(300)) {
update_memset<tf::cudaFlowCapturer, double>();
}
// ----------------------------------------------------------------------------
// update algorithms
// ----------------------------------------------------------------------------
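// The algorithm-update tests below use named functors carrying a runtime
// flag or scalar instead of device lambdas (the lambda equivalents are
// kept as commented-out code). Presumably this is because an update keeps
// the task's callable type and every lambda has a distinct closure type,
// so a named functor is what allows the same task to be rebound with
// different behavior via its captured state.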
struct SetValue {
__device__ void operator()(int& a) const { a = v; }
int v;
};
struct SetValueOnIndex {
__device__ void operator()(int i) const { data[i] = v; }
int* data;
int v;
};
struct AddOrMultiply {
__device__ int operator()(int a, int b) const { return v ? a + b : a * b; }
bool v;
};
struct AddScalar {
__device__ int operator()(int a) const { return a + v; }
int v;
};
struct MultiplyScalar {
__device__ int operator()(int a) const { return a*v; }
int v;
};
struct LessOrGreater {
__device__ bool operator()(int a, int b) const { return v ? a < b : a > b; }
bool v;
};
struct IsEqual {
int v;
__device__ bool operator()(int a) const { return v == a; }
};
// ----------------------------------------------------------------------------
// update for_each
// ----------------------------------------------------------------------------
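// Create/offload/rebind/offload pattern shared by all algorithm tests below:
//   auto task = cf.for_each(first, last, op);  // build the task
//   cf.offload();                              // run it
//   cf.for_each(task, first2, last2, op2);     // rebind its parameters
//   cf.offload();                              // run the updated graph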
template <typename F>
void update_for_each() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto data = tf::cuda_malloc_shared<int>(N);
// for each task
//auto task = cf.for_each(data, data+N, [] __device__ (int& a){ a = 100; });
auto task = cf.for_each(data, data+N, SetValue{100});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
for(int i=0; i<N; i++) {
REQUIRE(data[i] == 100);
}
// update for each parameters
//cf.for_each(task, data, data+N, [] __device__ (int& a){ a = 1234; });
cf.for_each(task, data, data+N, SetValue{1234});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
for(int i=0; i<N; i++) {
REQUIRE(data[i] == 1234);
}
tf::cuda_free(data);
}
}
TEST_CASE("cudaFlow.update.for_each" * doctest::timeout(300)) {
update_for_each<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.for_each" * doctest::timeout(300)) {
update_for_each<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update for_each_index
// ----------------------------------------------------------------------------
template <typename F>
void update_for_each_index() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto data = tf::cuda_malloc_shared<int>(N);
// for each index
auto task = cf.for_each_index(0, N, 1, SetValueOnIndex{data, 100});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
for(int i=0; i<N; i++) {
REQUIRE(data[i] == 100);
}
// update for each index
cf.for_each_index(task, 0, N, 1, SetValueOnIndex{data, -100});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
for(int i=0; i<N; i++) {
REQUIRE(data[i] == -100);
}
tf::cuda_free(data);
}
}
TEST_CASE("cudaFlow.update.for_each_index" * doctest::timeout(300)) {
update_for_each_index<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.for_each_index" * doctest::timeout(300)) {
update_for_each_index<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update reduce
// ----------------------------------------------------------------------------
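// cf.reduce folds the initial value of *soln into the result: with
// *soln = 0 and N elements of -1, the sum is -N; after the update, with
// *soln = -1, the product of N + 1 factors of -1 is 1 for odd N and -1
// for even N.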
template <typename F>
void update_reduce() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto data = tf::cuda_malloc_shared<int>(N);
auto soln = tf::cuda_malloc_shared<int>(1);
for(int i=0; i<N; i++) data[i] = -1;
*soln = 0;
// reduce
//auto task = cf.reduce(
// data, data + N, soln, [] __device__ (int a, int b){ return a + b; }
//);
auto task = cf.reduce(data, data + N, soln, AddOrMultiply{true});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(*soln == -N);
// update reduce parameters (same range, multiply instead of add)
*soln = -1;
//cf.reduce(
// task, data, data + N, soln, [] __device__ (int a, int b){ return a * b; }
//);
cf.reduce(task, data, data + N, soln, AddOrMultiply{false});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(*soln == ((N&1) ? 1 : -1));
tf::cuda_free(data);
tf::cuda_free(soln);
}
}
TEST_CASE("cudaFlow.update.reduce" * doctest::timeout(300)) {
update_reduce<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.reduce" * doctest::timeout(300)) {
update_reduce<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update uninitialized reduce
// ----------------------------------------------------------------------------
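// cf.uninitialized_reduce ignores the existing value of *soln (which is
// deliberately never initialized here): the sum of N elements of -1 is
// -N, and the product is (-1)^N, i.e. -1 for odd N and 1 for even N.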
template <typename F>
void update_uninitialized_reduce() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto data = tf::cuda_malloc_shared<int>(N);
auto soln = tf::cuda_malloc_shared<int>(1);
for(int i=0; i<N; i++) data[i] = -1;
// uninitialized_reduce
//auto task = cf.uninitialized_reduce(
// data, data + N, soln, [] __device__ (int a, int b){ return a + b; }
//);
auto task = cf.uninitialized_reduce(data, data + N, soln, AddOrMultiply{true});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(*soln == -N);
// update reduce parameters (multiply instead of add)
//cf.uninitialized_reduce(
// task, data, data + N, soln, [] __device__ (int a, int b){ return a * b; }
//);
cf.uninitialized_reduce(
task, data, data + N, soln, AddOrMultiply{false}
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(*soln == ((N&1) ? -1 : 1));
tf::cuda_free(data);
tf::cuda_free(soln);
}
}
TEST_CASE("cudaFlow.update.uninitialized_reduce" * doctest::timeout(300)) {
update_uninitialized_reduce<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.uninitialized_reduce" * doctest::timeout(300)) {
update_uninitialized_reduce<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update transform reduce
// ----------------------------------------------------------------------------
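// transform_reduce applies the unary op before reducing: with data[i] = -1
// and AddScalar{2}, every element maps to 1, so the sum (plus the initial
// *soln = 0) is N. After the update, AddScalar{1} maps every element to 0,
// so the product (including the initial *soln = 8) is 0.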
template <typename F>
void update_transform_reduce() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto data = tf::cuda_malloc_shared<int>(N);
auto soln = tf::cuda_malloc_shared<int>(1);
for(int i=0; i<N; i++) data[i] = -1;
*soln = 0;
// transform_reduce
auto task = cf.transform_reduce(
data, data + N, soln, AddOrMultiply{true}, AddScalar{2}
//[] __device__ (int a, int b) { return a + b; },
//[] __device__ (int a) { return a + 2; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(*soln == N);
// update transform-reduce parameters
*soln = 8;
cf.transform_reduce(
task, data, data + N, soln, AddOrMultiply{false}, AddScalar{1}
//[] __device__ (int a, int b){ return a * b; },
//[] __device__ (int a) { return a + 1; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(*soln == 0);
tf::cuda_free(data);
tf::cuda_free(soln);
}
}
TEST_CASE("cudaFlow.update.transform_reduce" * doctest::timeout(300)) {
update_transform_reduce<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.transform_reduce" * doctest::timeout(300)) {
update_transform_reduce<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update transform uninitialized reduce
// ----------------------------------------------------------------------------
template <typename F>
void update_transform_uninitialized_reduce() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto data = tf::cuda_malloc_shared<int>(N);
auto soln = tf::cuda_malloc_shared<int>(1);
for(int i=0; i<N; i++) data[i] = -1;
*soln = 100;
// transform_uninitialized_reduce
auto task = cf.transform_uninitialized_reduce(
data, data + N, soln, AddOrMultiply{true}, AddScalar{2}
//[] __device__ (int a, int b) { return a + b; },
//[] __device__ (int a) { return a + 2; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(*soln == N);
// update transform-uninitialized-reduce parameters
*soln = 8;
cf.transform_uninitialized_reduce(
task, data, data + N, soln, AddOrMultiply{false}, AddScalar{0}
//[] __device__ (int a, int b){ return a * b; },
//[] __device__ (int a) { return a; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(*soln == ((N&1) ? -1 : 1));
tf::cuda_free(data);
tf::cuda_free(soln);
}
}
TEST_CASE("cudaFlow.update.transform_uninitialized_reduce" * doctest::timeout(300)) {
update_transform_uninitialized_reduce<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.transform_uninitialized_reduce" * doctest::timeout(300)) {
update_transform_uninitialized_reduce<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// transform
// ----------------------------------------------------------------------------
template <typename F>
void update_transform() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto input1 = tf::cuda_malloc_shared<int>(N);
auto input2 = tf::cuda_malloc_shared<int>(N);
auto output = tf::cuda_malloc_shared<int>(N);
for(int i=0; i<N; i++) {
input1[i] = i;
input2[i] = -i;
output[i] = 7;
}
// transform
auto task = cf.transform(
input1, input1+N, output, MultiplyScalar{2}
//[] __device__ (int& a) { return 2*a; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
for(int i=0; i<N; i++) {
REQUIRE(output[i] == input1[i]*2);
}
// update transform parameters
cf.transform(task,
input2, input2+N, output, MultiplyScalar{10}
// [] __device__ (int& a) { return 10*a; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
for(int i=0; i<N; i++) {
REQUIRE(output[i] == input2[i]*10);
}
tf::cuda_free(input1);
tf::cuda_free(input2);
tf::cuda_free(output);
}
}
TEST_CASE("cudaFlow.update.transform" * doctest::timeout(300)) {
update_transform<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.transform" * doctest::timeout(300)) {
update_transform<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// binary transform
// ----------------------------------------------------------------------------
// update binary_transform
template <typename F>
void update_binary_transform() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto input1 = tf::cuda_malloc_shared<int>(N);
auto input2 = tf::cuda_malloc_shared<int>(N);
auto output = tf::cuda_malloc_shared<int>(N);
for(int i=0; i<N; i++) {
input1[i] = i;
input2[i] = -i;
output[i] = 7;
}
// binary transform
auto task = cf.transform(
input1, input1+N, input2, output, AddOrMultiply{false}
//[] __device__ (int a, int b) { return a*b; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
for(int i=0; i<N; i++) {
REQUIRE(output[i] == input1[i] * input2[i]);
}
// update transform parameters
cf.transform(task,
input1, input1+N, input2, output, AddOrMultiply{true}
//[] __device__ (int a, int b) { return a+b; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
for(int i=0; i<N; i++) {
REQUIRE(output[i] == input1[i]+input2[i]);
}
tf::cuda_free(input1);
tf::cuda_free(input2);
tf::cuda_free(output);
}
}
TEST_CASE("cudaFlow.update.binary_transform" * doctest::timeout(300)) {
update_binary_transform<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.binary_transform" * doctest::timeout(300)) {
update_binary_transform<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update scan
// ----------------------------------------------------------------------------
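// The scans are verified through their defining recurrences rather than a
// reference array: inclusive scan satisfies out[i] == out[i-1] + in[i],
// exclusive scan satisfies out[i] == out[i-1] + in[i-1].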
template <typename F>
void update_scan() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto input1 = tf::cuda_malloc_shared<int>(N);
auto input2 = tf::cuda_malloc_shared<int>(N);
auto output1 = tf::cuda_malloc_shared<int>(N);
auto output2 = tf::cuda_malloc_shared<int>(N);
for(int i=0; i<N; i++) {
input1[i] = i;
input2[i] = -i;
output1[i] = 0;
output2[i] = 0;
}
// scan
auto inclusive_scan = cf.inclusive_scan(
input1, input1+N, output1, AddOrMultiply{true}
);
auto exclusive_scan = cf.exclusive_scan(
input2, input2+N, output2, AddOrMultiply{true}
);
cf.offload();
REQUIRE(cf.num_tasks() == 2);
for(int i=1; i<N; i++) {
REQUIRE(output1[i] == output1[i-1] + input1[i]);
REQUIRE(output2[i] == output2[i-1] + input2[i-1]);
}
// update scan
cf.inclusive_scan(inclusive_scan,
input2, input2+N, output2, AddOrMultiply{true}
);
cf.exclusive_scan(exclusive_scan,
input1, input1+N, output1, AddOrMultiply{true}
);
cf.offload();
REQUIRE(cf.num_tasks() == 2);
for(int i=1; i<N; i++) {
REQUIRE(output1[i] == output1[i-1] + input1[i-1]);
REQUIRE(output2[i] == output2[i-1] + input2[i]);
}
// ---------- transform_scan
cf.clear();
inclusive_scan = cf.transform_inclusive_scan(
input1, input1+N, output1, AddOrMultiply{true}, MultiplyScalar{2}
//[] __device__ (int a, int b) { return a + b; },
//[] __device__ (int a) { return a * 2; }
);
exclusive_scan = cf.transform_exclusive_scan(
input2, input2+N, output2, AddOrMultiply{true}, MultiplyScalar{10}
//[] __device__ (int a, int b) { return a + b; },
//[] __device__ (int a) { return a * 10; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 2);
for(int i=1; i<N; i++) {
REQUIRE(output1[i] == output1[i-1] + input1[i]*2);
REQUIRE(output2[i] == output2[i-1] + input2[i-1]*10);
}
// ---------- update transform scan
cf.transform_inclusive_scan(inclusive_scan,
input2, input2+N, output2, AddOrMultiply{true}, MultiplyScalar{2}
//[] __device__ (int a, int b) { return a + b; },
//[] __device__ (int a) { return a * 2; }
);
cf.transform_exclusive_scan(exclusive_scan,
input1, input1+N, output1, AddOrMultiply{true}, MultiplyScalar{10}
//[] __device__ (int a, int b) { return a + b; },
//[] __device__ (int a) { return a * 10; }
);
cf.offload();
REQUIRE(cf.num_tasks() == 2);
for(int i=1; i<N; i++) {
REQUIRE(output2[i] == output2[i-1] + input2[i]*2);
REQUIRE(output1[i] == output1[i-1] + input1[i-1]*10);
}
tf::cuda_free(input1);
tf::cuda_free(input2);
tf::cuda_free(output1);
tf::cuda_free(output2);
}
}
TEST_CASE("cudaFlow.update.scan" * doctest::timeout(300)) {
update_scan<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.scan" * doctest::timeout(300)) {
update_scan<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update merge
// ----------------------------------------------------------------------------
template <typename F>
void update_merge() {
F cf;
for(int N=1; N<=100000; N += (N/10+1)) {
cf.clear();
auto input1 = tf::cuda_malloc_shared<int>(N);
auto input2 = tf::cuda_malloc_shared<int>(2*N);
auto output1 = tf::cuda_malloc_shared<int>(3*N);
auto output2 = tf::cuda_malloc_shared<int>(3*N);
std::iota(input1, input1+N, 0);
std::iota(input2, input2+2*N, 0);
std::merge(input1, input1+N, input2, input2+2*N, output2);
// merge
auto merge = cf.merge(
input1, input1+N, input2, input2+2*N, output1, tf::cuda_less<int>()
);
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(std::is_sorted(output1, output1+3*N));
for(int i=0; i<3*N; i++) {
REQUIRE(output1[i] == output2[i]);
output1[i] = output2[i] = rand();
}
// update merge (the outputs were scrambled above; the second input
// range shrinks to N, so only the first 2N outputs are compared)
cf.merge(merge,
input1, input1+N, input2, input2+N, output2, tf::cuda_less<int>()
);
cf.offload();
std::merge(input1, input1+N, input2, input2+N, output1);
REQUIRE(cf.num_tasks() == 1);
REQUIRE(std::is_sorted(output2, output2+2*N));
for(int i=0; i<2*N; i++) {
REQUIRE(output1[i] == output2[i]);
}
tf::cuda_free(input1);
tf::cuda_free(input2);
tf::cuda_free(output1);
tf::cuda_free(output2);
}
}
TEST_CASE("cudaFlow.update.merge" * doctest::timeout(300)) {
update_merge<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.merge" * doctest::timeout(300)) {
update_merge<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update sort
// ----------------------------------------------------------------------------
template <typename F>
void update_sort() {
F cf;
for(int N=1; N<=100000; N += (N/100+1)) {
cf.clear();
auto input1 = tf::cuda_malloc_shared<int>(N);
auto input2 = tf::cuda_malloc_shared<int>(N);
for(int i=0; i<N; i++) {
input1[i] = rand();
input2[i] = rand();
}
// create sort
auto sort = cf.sort(input1, input1+N, LessOrGreater{true});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(std::is_sorted(input1, input1+N));
// update sort
cf.sort(sort, input2, input2+N, LessOrGreater{true});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(std::is_sorted(input2, input2+N, std::less<int>()));
// update sort with a different kernel
cf.sort(sort, input1, input1+N, tf::cuda_greater<int>());
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(std::is_sorted(input1, input1+N, std::greater<int>()));
// free the data
tf::cuda_free(input1);
tf::cuda_free(input2);
}
}
TEST_CASE("cudaFlow.update.sort" * doctest::timeout(300)) {
update_sort<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.sort" * doctest::timeout(300)) {
update_sort<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update sort_by_key
// ----------------------------------------------------------------------------
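// The CPU reference permutation is built by sorting an index array with
// the same keys; the keys are a shuffled 0..N-1 sequence (all unique), so
// the expected order is deterministic and must match the device-side
// sort_by_key exactly.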
template <typename F>
void update_sort_by_key() {
std::random_device rd;
std::mt19937 g(rd());
F cf;
for(int N=1; N<=100000; N += (N/100+1)) {
cf.clear();
auto input1 = tf::cuda_malloc_shared<int>(N);
auto input2 = tf::cuda_malloc_shared<int>(N);
auto index1 = tf::cuda_malloc_shared<int>(N);
auto index2 = tf::cuda_malloc_shared<int>(N);
std::vector<int> index(N);
for(int i=0; i<N; i++) {
input1[i] = i;
input2[i] = i;
index1[i] = i;
index2[i] = i;
index [i] = i;
}
std::shuffle(input1, input1+N, g);
std::shuffle(input2, input2+N, g);
// create sort_by_key (with a CPU reference permutation)
std::sort(index.begin(), index.end(), [&](auto i, auto j){
return input1[i] < input1[j];
});
auto sort = cf.sort_by_key(input1, input1+N, index1, LessOrGreater{true});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(std::is_sorted(input1, input1+N));
for(int i=0; i<N; i++) {
REQUIRE(index[i] == index1[i]);
}
// update sort_by_key
for(int i=0; i<N; i++) {
index[i] = i;
}
std::sort(index.begin(), index.end(), [&](auto i, auto j){
return input2[i] > input2[j];
});
cf.sort_by_key(sort, input2, input2+N, index2, LessOrGreater{false});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
REQUIRE(std::is_sorted(input2, input2+N, std::greater<int>()));
for(int i=0; i<N; i++) {
REQUIRE(index[i] == index2[i]);
}
// free the data
tf::cuda_free(input1);
tf::cuda_free(input2);
tf::cuda_free(index1);
tf::cuda_free(index2);
}
}
TEST_CASE("cudaFlow.update.sort_by_key" * doctest::timeout(300)) {
update_sort_by_key<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.sort_by_key" * doctest::timeout(300)) {
update_sort_by_key<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update find
// ----------------------------------------------------------------------------
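// cf.find_if writes the position of the first element satisfying the
// predicate, or N when nothing matches (hence the *index2 == N branch
// below once N/2 + 1 falls outside the range).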
template <typename F>
void update_find() {
F cf;
for(unsigned N=1; N<=100000; N += (N/100+1)) {
cf.clear();
auto input1 = tf::cuda_malloc_shared<int>(N);
auto input2 = tf::cuda_malloc_shared<int>(N);
auto index1 = tf::cuda_malloc_shared<unsigned>(1);
auto index2 = tf::cuda_malloc_shared<unsigned>(1);
for(unsigned i=0; i<N; i++) {
input1[i] = i;
input2[i] = i;
}
// create find
auto find_if = cf.find_if(input1, input1+N, index1, IsEqual{(int)(N/2)});
cf.offload();
REQUIRE(*index1 != N);
REQUIRE(input1[*index1] == N/2);
// update find
cf.find_if(find_if, input2, input2+N, index2, IsEqual{(int)(N/2 + 1)});
cf.offload();
REQUIRE(cf.num_tasks() == 1);
if( N/2+1 >= N) {
REQUIRE(*index2 == N);
}
else {
REQUIRE(input2[*index2] == (N/2+1));
}
// free the data
tf::cuda_free(input1);
tf::cuda_free(input2);
tf::cuda_free(index1);
tf::cuda_free(index2);
}
}
TEST_CASE("cudaFlow.update.find" * doctest::timeout(300)) {
update_find<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.find" * doctest::timeout(300)) {
update_find<tf::cudaFlowCapturer>();
}
// ----------------------------------------------------------------------------
// update min-/max-element
// ----------------------------------------------------------------------------
template <typename F>
void update_minmax_element() {
F cf;
for(unsigned N=1; N<=100000; N += (N/100+1)) {
cf.clear();
auto input1 = tf::cuda_malloc_shared<int>(N);
auto input2 = tf::cuda_malloc_shared<int>(N);
auto index1 = tf::cuda_malloc_shared<unsigned>(1);
auto index2 = tf::cuda_malloc_shared<unsigned>(1);
for(unsigned i=0; i<N; i++) {
input1[i] = rand();
input2[i] = rand();
}
// create min/max element tasks
auto find_min = cf.min_element(input1, input1+N, index1, tf::cuda_less<int>());
auto find_max = cf.max_element(input2, input2+N, index2, tf::cuda_less<int>());
cf.offload();
REQUIRE(input1[*index1] == *std::min_element(input1, input1+N));
REQUIRE(input2[*index2] == *std::max_element(input2, input2+N));
// update min/max element tasks
cf.min_element(find_min, input2, input2+N, index2, tf::cuda_less<int>());
cf.max_element(find_max, input1, input1+N, index1, tf::cuda_less<int>());
cf.offload();
REQUIRE(cf.num_tasks() == 2);
REQUIRE(input2[*index2] == *std::min_element(input2, input2+N));
REQUIRE(input1[*index1] == *std::max_element(input1, input1+N));
// free the data
tf::cuda_free(input1);
tf::cuda_free(input2);
tf::cuda_free(index1);
tf::cuda_free(index2);
}
}
TEST_CASE("cudaFlow.update.minmax_element" * doctest::timeout(300)) {
update_minmax_element<tf::cudaFlow>();
}
TEST_CASE("cudaFlowCapturer.update.minmax_element" * doctest::timeout(300)) {
update_minmax_element<tf::cudaFlowCapturer>();
}