mesytec-mnode/external/taskflow-3.8.0/taskflow/cuda/algorithm/transform.hpp

283 lines
7.1 KiB
C++
Raw Permalink Normal View History

2025-01-04 01:25:05 +01:00
#pragma once
#include "../cudaflow.hpp"
/**
@file taskflow/cuda/algorithm/transform.hpp
@brief cuda parallel-transform algorithms include file
*/
namespace tf {
// ----------------------------------------------------------------------------
// transform
// ----------------------------------------------------------------------------
namespace detail {
/**
@private
*/
template <size_t nt, size_t vt, typename I, typename O, typename C>
__global__ void cuda_transform_kernel(I first, unsigned count, O output, C op) {
auto tid = threadIdx.x;
auto bid = blockIdx.x;
auto tile = cuda_get_tile(bid, nt*vt, count);
cuda_strided_iterate<nt, vt>(
[=]__device__(auto, auto j) {
auto offset = j + tile.begin;
*(output + offset) = op(*(first+offset));
},
tid,
tile.count()
);
}
/**
@private
*/
template <size_t nt, size_t vt, typename I1, typename I2, typename O, typename C>
__global__ void cuda_transform_kernel(
I1 first1, I2 first2, unsigned count, O output, C op
) {
auto tid = threadIdx.x;
auto bid = blockIdx.x;
auto tile = cuda_get_tile(bid, nt*vt, count);
cuda_strided_iterate<nt, vt>(
[=]__device__(auto, auto j) {
auto offset = j + tile.begin;
*(output + offset) = op(*(first1+offset), *(first2+offset));
},
tid,
tile.count()
);
}
} // end of namespace detail -------------------------------------------------
// ----------------------------------------------------------------------------
// CUDA standard algorithms: transform
// ----------------------------------------------------------------------------
/**
@brief performs asynchronous parallel transforms over a range of items
@tparam P execution policy type
@tparam I input iterator type
@tparam O output iterator type
@tparam C unary operator type
@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param output iterator to the beginning of the output range
@param op unary operator to apply to transform each item
This method is equivalent to the parallel execution of the following loop on a GPU:
@code{.cpp}
while (first != last) {
*output++ = op(*first++);
}
@endcode
*/
template <typename P, typename I, typename O, typename C>
void cuda_transform(P&& p, I first, I last, O output, C op) {
using E = std::decay_t<P>;
unsigned count = std::distance(first, last);
if(count == 0) {
return;
}
detail::cuda_transform_kernel<E::nt, E::vt, I, O, C>
<<<E::num_blocks(count), E::nt, 0, p.stream()>>> (
first, count, output, op
);
}
/**
@brief performs asynchronous parallel transforms over two ranges of items
@tparam P execution policy type
@tparam I1 first input iterator type
@tparam I2 second input iterator type
@tparam O output iterator type
@tparam C binary operator type
@param p execution policy
@param first1 iterator to the beginning of the first range
@param last1 iterator to the end of the first range
@param first2 iterator to the beginning of the second range
@param output iterator to the beginning of the output range
@param op binary operator to apply to transform each pair of items
This method is equivalent to the parallel execution of the following loop on a GPU:
@code{.cpp}
while (first1 != last1) {
*output++ = op(*first1++, *first2++);
}
@endcode
*/
template <typename P, typename I1, typename I2, typename O, typename C>
void cuda_transform(
P&& p, I1 first1, I1 last1, I2 first2, O output, C op
) {
using E = std::decay_t<P>;
unsigned count = std::distance(first1, last1);
if(count == 0) {
return;
}
detail::cuda_transform_kernel<E::nt, E::vt, I1, I2, O, C>
<<<E::num_blocks(count), E::nt, 0, p.stream()>>> (
first1, first2, count, output, op
);
}
// ----------------------------------------------------------------------------
// cudaFlow
// ----------------------------------------------------------------------------
// Function: transform
template <typename I, typename O, typename C>
cudaTask cudaFlow::transform(I first, I last, O output, C c) {
using E = cudaDefaultExecutionPolicy;
unsigned count = std::distance(first, last);
// TODO:
//if(count == 0) {
// return;
//}
return kernel(
E::num_blocks(count), E::nt, 0,
detail::cuda_transform_kernel<E::nt, E::vt, I, O, C>,
first, count, output, c
);
}
// Function: transform
template <typename I1, typename I2, typename O, typename C>
cudaTask cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C c) {
using E = cudaDefaultExecutionPolicy;
unsigned count = std::distance(first1, last1);
// TODO:
//if(count == 0) {
// return;
//}
return kernel(
E::num_blocks(count), E::nt, 0,
detail::cuda_transform_kernel<E::nt, E::vt, I1, I2, O, C>,
first1, first2, count, output, c
);
}
// Function: update transform
template <typename I, typename O, typename C>
void cudaFlow::transform(cudaTask task, I first, I last, O output, C c) {
using E = cudaDefaultExecutionPolicy;
unsigned count = std::distance(first, last);
// TODO:
//if(count == 0) {
// return;
//}
kernel(task,
E::num_blocks(count), E::nt, 0,
detail::cuda_transform_kernel<E::nt, E::vt, I, O, C>,
first, count, output, c
);
}
// Function: update transform
template <typename I1, typename I2, typename O, typename C>
void cudaFlow::transform(
cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
) {
using E = cudaDefaultExecutionPolicy;
unsigned count = std::distance(first1, last1);
// TODO:
//if(count == 0) {
// return;
//}
kernel(task,
E::num_blocks(count), E::nt, 0,
detail::cuda_transform_kernel<E::nt, E::vt, I1, I2, O, C>,
first1, first2, count, output, c
);
}
// ----------------------------------------------------------------------------
// cudaFlowCapturer
// ----------------------------------------------------------------------------
// Function: transform
template <typename I, typename O, typename C>
cudaTask cudaFlowCapturer::transform(I first, I last, O output, C op) {
return on([=](cudaStream_t stream) mutable {
cudaDefaultExecutionPolicy p(stream);
cuda_transform(p, first, last, output, op);
});
}
// Function: transform
template <typename I1, typename I2, typename O, typename C>
cudaTask cudaFlowCapturer::transform(
I1 first1, I1 last1, I2 first2, O output, C op
) {
return on([=](cudaStream_t stream) mutable {
cudaDefaultExecutionPolicy p(stream);
cuda_transform(p, first1, last1, first2, output, op);
});
}
// Function: transform
template <typename I, typename O, typename C>
void cudaFlowCapturer::transform(
cudaTask task, I first, I last, O output, C op
) {
on(task, [=] (cudaStream_t stream) mutable {
cudaDefaultExecutionPolicy p(stream);
cuda_transform(p, first, last, output, op);
});
}
// Function: transform
template <typename I1, typename I2, typename O, typename C>
void cudaFlowCapturer::transform(
cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op
) {
on(task, [=] (cudaStream_t stream) mutable {
cudaDefaultExecutionPolicy p(stream);
cuda_transform(p, first1, last1, first2, output, op);
});
}
} // end of namespace tf -----------------------------------------------------