#pragma once
#include "reduce.hpp"
/**
@file taskflow/cuda/algorithm/scan.hpp
@brief CUDA scan algorithm include file
*/
namespace tf::detail {
// ----------------------------------------------------------------------------
// scan
// ----------------------------------------------------------------------------
/** @private */
inline constexpr unsigned cudaScanRecursionThreshold = 8;
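// inputs spanning at most this many tiles are scanned by a single kernel;
// larger inputs take the recursive upsweep/downsweep path in cuda_scan_loop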
/** @private */
enum class cudaScanType : int {
  EXCLUSIVE = 1,
  INCLUSIVE
};
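
// e.g. scanning {1,2,3,4} with plus: INCLUSIVE yields {1,3,6,10} and
// EXCLUSIVE yields {0,1,3,6}, seeded with a default-constructed T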
/** @private */
template<typename T, unsigned vt = 0, bool is_array = (vt > 0)>
struct cudaScanResult {
  T scan;
  T reduction;
};
/** @private */
template<typename T, unsigned vt>
struct cudaScanResult<T, vt, true> {
  cudaArray<T, vt> scan;
  T reduction;
};
//-----------------------------------------------------------------------------
/** @private */
template<unsigned nt, typename T>
struct cudaBlockScan {
  const static unsigned num_warps  = nt / CUDA_WARP_SIZE;
  const static unsigned num_passes = log2(nt);
  const static unsigned capacity   = nt + num_warps;

  /** @private */
  union storage_t {
    T data[2 * nt];
    struct { T threads[nt], warps[num_warps]; };
  };
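
  // the pass loop in operator() ping-pongs between the two nt-element
  // halves of data; the threads/warps members give a warp-based overlay
  // of the same storage (capacity = nt + num_warps elements)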
  // standard scan
  template<typename op_t>
  __device__ cudaScanResult<T> operator ()(
    unsigned tid,
    T x,
    storage_t& storage,
    unsigned count = nt,
    op_t op = op_t(),
    T init = T(),
    cudaScanType type = cudaScanType::EXCLUSIVE
  ) const;

  // vectorized scan. accepts multiple values per thread and adds in
  // optional global carry-in
  template<unsigned vt, typename op_t>
  __device__ cudaScanResult<T, vt> operator()(
    unsigned tid,
    cudaArray<T, vt> x,
    storage_t& storage,
    T carry_in = T(),
    bool use_carry_in = false,
    unsigned count = nt,
    op_t op = op_t(),
    T init = T(),
    cudaScanType type = cudaScanType::EXCLUSIVE
  ) const;
};
// standard scan
template <unsigned nt, typename T>
template<typename op_t>
__device__ cudaScanResult<T> cudaBlockScan<nt, T>::operator () (
  unsigned tid, T x, storage_t& storage, unsigned count, op_t op,
  T init, cudaScanType type
) const {

  unsigned first = 0;
  storage.data[first + tid] = x;
  __syncthreads();
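
  // Kogge-Stone style scan: on pass p, every thread with tid >= 2^p
  // combines its value with the element 2^p slots behind it, ping-ponging
  // between the two nt-element halves of storage.data.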
  cuda_iterate<num_passes>([&](auto pass) {
    if(auto offset = 1 << pass; tid >= offset) {
      x = op(storage.data[first + tid - offset], x);
    }
    first = nt - first;
    storage.data[first + tid] = x;
    __syncthreads();
  });

  cudaScanResult<T> result;
  result.reduction = storage.data[first + count - 1];
  result.scan = (tid < count) ?
    (cudaScanType::INCLUSIVE == type ? x :
      (tid ? storage.data[first + tid - 1] : init)) :
    result.reduction;
  __syncthreads();

  return result;
}
// vectorized scan block
template <unsigned nt, typename T>
template<unsigned vt, typename op_t>
__device__ cudaScanResult<T, vt> cudaBlockScan<nt, T>::operator()(
  unsigned tid,
  cudaArray<T, vt> x,
  storage_t& storage,
  T carry_in,
  bool use_carry_in,
  unsigned count,
  op_t op,
  T init,
  cudaScanType type
) const {
  // Start with an inclusive scan of the in-range elements.
  if(count >= nt * vt) {
    cuda_iterate<vt>([&](auto i) {
      x[i] = i ? op(x[i], x[i - 1]) : x[i];
    });
  } else {
    cuda_iterate<vt>([&](auto i) {
      auto index = vt * tid + i;
      x[i] = i ?
        ((index < count) ? op(x[i], x[i - 1]) : x[i - 1]) :
        ((index < count) ? x[i] : init);
    });
  }
  // Scan the thread-local reductions for a carry-in for each thread.
  auto result = operator()(
    tid, x[vt - 1], storage,
    (count + vt - 1) / vt, op, init, cudaScanType::EXCLUSIVE
  );

  // Perform the scan downsweep and add both the global carry-in and the
  // thread carry-in to the values.
  if(use_carry_in) {
    result.reduction = op(carry_in, result.reduction);
    result.scan = tid ? op(carry_in, result.scan) : carry_in;
  } else {
    use_carry_in = tid > 0;
  }

  cudaArray<T, vt> y;
  cuda_iterate<vt>([&](auto i) {
    if(cudaScanType::EXCLUSIVE == type) {
      y[i] = i ? x[i - 1] : result.scan;
      if(use_carry_in && i > 0) y[i] = op(result.scan, y[i]);
    } else {
      y[i] = use_carry_in ? op(x[i], result.scan) : x[i];
    }
  });

  return cudaScanResult<T, vt> { y, result.reduction };
}
/**
@private
@brief single-pass scan for small input
*/
template <typename P, typename I, typename O, typename C>
void cuda_single_pass_scan(
  P&& p,
  cudaScanType scan_type,
  I input,
  unsigned count,
  O output,
  C op
  //reduction_it reduction,
) {
  using T = typename std::iterator_traits<O>::value_type;
  using E = std::decay_t<P>;

  // Small input specialization. This is the non-recursive branch.
  cuda_kernel<<<1, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {

    using scan_t = cudaBlockScan<E::nt, T>;

    __shared__ union {
      typename scan_t::storage_t scan;
      T values[E::nv];
    } shared;

    auto carry_in = T();

    for(unsigned cur = 0; cur < count; cur += E::nv) {
      // Cooperatively load values into register.
      auto count2 = min(count - cur, E::nv);
      auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(input + cur,
        tid, count2, shared.values);

      auto result = scan_t()(tid, x, shared.scan,
        carry_in, cur > 0, count2, op, T(), scan_type);

      // Store the scanned values back to global memory.
      cuda_reg_to_mem_thread<E::nt, E::vt>(result.scan, tid, count2,
        output + cur, shared.values);

      // Roll the reduction into carry_in.
      carry_in = result.reduction;
    }

    // Store the carry-out to the reduction pointer. This may be a
    // discard_iterator_t if no reduction is wanted.
    //if(!tid) *reduction = carry_in;
  });
}
/**
@private
@brief main scan loop
*/
template<typename P, typename I, typename O, typename C>
void cuda_scan_loop(
  P&& p,
  cudaScanType scan_type,
  I input,
  unsigned count,
  O output,
  C op,
  //reduction_it reduction,
  void* ptr
) {

  using E = std::decay_t<P>;
  using T = typename std::iterator_traits<O>::value_type;

  T* buffer = static_cast<T*>(ptr);

  //launch_t::cta_dim(context).B(count);
  unsigned B = (count + E::nv - 1) / E::nv;
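  // (B is the number of tiles; each block covers one tile of
  // E::nv = E::nt * E::vt items)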
  if(B > cudaScanRecursionThreshold) {

    //cudaDeviceVector<T> partials(B);
    //auto buffer = partials.data();

    // upsweep phase
    cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {

      __shared__ typename cudaBlockReduce<E::nt, T>::Storage shm;

      // Load the tile's data into register.
      auto tile = cuda_get_tile(bid, E::nv, count);
      auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
        input + tile.begin, tid, tile.count()
      );

      // Reduce the thread's values into a scalar.
      T scalar;
      cuda_strided_iterate<E::nt, E::vt>(
        [&] (auto i, auto j) { scalar = i ? op(scalar, x[i]) : x[0]; },
        tid, tile.count()
      );

      // Reduce across all threads.
      auto all_reduce = cudaBlockReduce<E::nt, T>()(
        tid, scalar, shm, tile.count(), op
      );

      // Store the final reduction to the partials.
      if(!tid) {
        buffer[bid] = all_reduce;
      }
    });
    // recursively call scan
    //cuda_scan_loop(p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, S);
    cuda_scan_loop(
      p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, buffer+B
    );
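    // each recursion level stores its own partials past the first B
    // entries (buffer+B), so successive levels use disjoint regions of
    // the caller-provided scratch buffer sized by scan_bufsz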
    // downsweep: perform an intra-tile scan and add the scan of the
    // partials as carry-in
    cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {

      using scan_t = cudaBlockScan<E::nt, T>;

      __shared__ union {
        typename scan_t::storage_t scan;
        T values[E::nv];
      } shared;

      // Load a tile to register in thread order.
      auto tile = cuda_get_tile(bid, E::nv, count);
      auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(
        input + tile.begin, tid, tile.count(), shared.values
      );

      // Scan the array with carry-in from the partials.
      auto y = scan_t()(tid, x, shared.scan,
        buffer[bid], bid > 0, tile.count(), op, T(),
        scan_type).scan;

      // Store the scanned values to the output.
      cuda_reg_to_mem_thread<E::nt, E::vt>(
        y, tid, tile.count(), output + tile.begin, shared.values
      );
    });
  }
  // Small input specialization. This is the non-recursive branch.
  else {
    cuda_single_pass_scan(p, scan_type, input, count, output, op);
  }
}
} // namespace tf::detail ----------------------------------------------------
namespace tf {
// Function: scan_bufsz
template <unsigned NT, unsigned VT>
template <typename T>
unsigned cudaExecutionPolicy<NT, VT>::scan_bufsz(unsigned count) {
  unsigned B = num_blocks(count);
  unsigned n = 0;
  for(auto b=B; b>detail::cudaScanRecursionThreshold; b=num_blocks(b)) {
    n += b;
  }
  return n*sizeof(T);
}
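// a zero return means the input fits the single-pass branch of
// detail::cuda_scan_loop, which never touches the temporary buffer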
/**
@brief performs asynchronous inclusive scan over a range of items
@tparam P execution policy type
@tparam I input iterator
@tparam O output iterator
@tparam C binary operator type
@param p execution policy
@param first iterator to the beginning of the input range
@param last iterator to the end of the input range
@param output iterator to the beginning of the output range
@param op binary operator to apply to scan
@param buf pointer to the temporary buffer
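
A minimal usage sketch; @c stream, @c in, @c out, and @c N are illustrative
placeholders for an existing CUDA stream, device-accessible input/output
ranges, and the input size:

@code{.cpp}
tf::cudaDefaultExecutionPolicy p(stream);
auto bytes = p.scan_bufsz<int>(N);    // temporary storage in bytes
void* buf = nullptr;
if(bytes) cudaMalloc(&buf, bytes);    // zero bytes for single-pass inputs
tf::cuda_inclusive_scan(
  p, in, in + N, out,
  [] __device__ (int a, int b) { return a + b; },  // binary op
  buf
);                                    // runs asynchronously on p.stream()
@endcode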
*/
template<typename P, typename I, typename O, typename C>
void cuda_inclusive_scan(
  P&& p, I first, I last, O output, C op, void* buf
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // launch the scan loop
  detail::cuda_scan_loop(
    p, detail::cudaScanType::INCLUSIVE, first, count, output, op, buf
  );
}
/**
@brief performs asynchronous inclusive scan over a range of transformed items
@tparam P execution policy type
@tparam I input iterator
@tparam O output iterator
@tparam C binary operator type
@tparam U unary operator type
@param p execution policy
@param first iterator to the beginning of the input range
@param last iterator to the end of the input range
@param output iterator to the beginning of the output range
@param bop binary operator to apply to scan
@param uop unary operator to apply to transform each item before scan
@param buf pointer to the temporary buffer
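
For example, following the setup shown for tf::cuda_inclusive_scan (with
@c in, @c out, @c N, and @c buf again illustrative), scanning the squares
of the input values:

@code{.cpp}
tf::cuda_transform_inclusive_scan(
  p, in, in + N, out,
  [] __device__ (int a, int b) { return a + b; },  // bop: combine
  [] __device__ (int v) { return v * v; },         // uop: applied per item
  buf
);
@endcode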
*/
template<typename P, typename I, typename O, typename C, typename U>
void cuda_transform_inclusive_scan(
  P&& p, I first, I last, O output, C bop, U uop, void* buf
) {

  using T = typename std::iterator_traits<O>::value_type;

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // launch the scan loop
  detail::cuda_scan_loop(
    p, detail::cudaScanType::INCLUSIVE,
    cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
    count, output, bop, buf
  );
}
/**
@brief performs asynchronous exclusive scan over a range of items
@tparam P execution policy type
@tparam I input iterator
@tparam O output iterator
@tparam C binary operator type
@param p execution policy
@param first iterator to the beginning of the input range
@param last iterator to the end of the input range
@param output iterator to the beginning of the output range
@param op binary operator to apply to scan
@param buf pointer to the temporary buffer
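
For example, scanning {1,2,3,4} with plus writes {0,1,3,6}: each output
element combines all inputs before it, starting from a default-constructed
@c T.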
*/
template<typename P, typename I, typename O, typename C>
void cuda_exclusive_scan(
  P&& p, I first, I last, O output, C op, void* buf
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // launch the scan loop
  detail::cuda_scan_loop(
    p, detail::cudaScanType::EXCLUSIVE, first, count, output, op, buf
  );
}
/**
@brief performs asynchronous exclusive scan over a range of transformed items
@tparam P execution policy type
@tparam I input iterator
@tparam O output iterator
@tparam C binary operator type
@tparam U unary operator type
@param p execution policy
@param first iterator to the beginning of the input range
@param last iterator to the end of the input range
@param output iterator to the beginning of the output range
@param bop binary operator to apply to scan
@param uop unary operator to apply to transform each item before scan
@param buf pointer to the temporary buffer
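
Usage mirrors tf::cuda_transform_inclusive_scan above: @c uop transforms
each item on the fly, @c bop combines the transformed values, and the
result is offset as in tf::cuda_exclusive_scan.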
*/
template<typename P, typename I, typename O, typename C, typename U>
void cuda_transform_exclusive_scan(
  P&& p, I first, I last, O output, C bop, U uop, void* buf
) {

  using T = typename std::iterator_traits<O>::value_type;

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // launch the scan loop
  detail::cuda_scan_loop(
    p, detail::cudaScanType::EXCLUSIVE,
    cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
    count, output, bop, buf
  );
}
} // end of namespace tf -----------------------------------------------------