mesytec-mnode/external/taskflow-3.8.0/taskflow/algorithm/scan.hpp
2025-01-04 01:25:05 +01:00

633 lines
18 KiB
C++

#pragma once
#include "launch.hpp"
namespace tf {
namespace detail {
// Function: scan_loop
//
// Second phase of the blocked scan. Each worker calls this after it has
// scanned its own chunk and left the chunk's running total in buf[w].
//
//   rt          runtime whose executor is used to co-run while waiting
//   counter     arrival counter shared by all workers (callers start it at 0)
//   buf         one slot per worker; slot w holds worker w's chunk total
//   bop         binary operator used to combine values
//   d_beg       first output element of the calling worker's chunk
//   W           number of participating workers
//   w           index of the calling worker
//   chunk_size  number of output elements in the calling worker's chunk
template <typename Iterator, typename BufferT, typename B>
void scan_loop(
  tf::Runtime& rt,
  std::atomic<size_t>& counter,
  BufferT& buf,
  B bop,
  Iterator d_beg,
  size_t W,
  size_t w,
  size_t chunk_size
){
  // whoever finishes the last performs global scan over the chunk totals
  if(counter.fetch_add(1, std::memory_order_acq_rel) == W-1) {
    for(size_t i=1; i<buf.size(); i++) {
      buf[i].data = bop(buf[i-1].data, buf[i].data);
    }
    // resetting the counter to zero is what releases the waiting workers
    counter.store(0, std::memory_order_release);
  }

  // first worker no need to do any work: nothing precedes its chunk
  if(w==0) {
    return;
  }

  // need to do public corun because multiple workers can call this
  rt.executor().corun_until([&counter](){
    return counter.load(std::memory_order_acquire) == 0;
  });

  // block addup: fold the total of all preceding chunks into every element.
  // The element is read and written first and the iterator advanced in a
  // separate statement, so the result does not depend on the C++17-only rule
  // that the right operand of an assignment is evaluated before the left.
  for(size_t i=0; i<chunk_size; i++) {
    *d_beg = bop(buf[w-1].data, *d_beg);
    ++d_beg;
  }
}
} // end of namespace tf::detail ---------------------------------------------
// Function: make_inclusive_scan_task
//
// Builds a callable taking a Runtime& that writes the inclusive scan of
// [first, last) under bop to the range starting at d_first. This overload
// uses no initial value.
template <typename B, typename E, typename D, typename BOP, typename P = DefaultPartitioner,
  std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* = nullptr
>
auto make_inclusive_scan_task(
  B first, E last, D d_first, BOP bop, P part = P()
) {

  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
  using value_type = typename std::iterator_traits<B_t>::value_type;

  return [=] (Runtime& rt) mutable {

    // read the captured (stateful) iterators at execution time
    B_t src = first;
    E_t src_end = last;
    D_t dst = d_first;

    // empty input: nothing to write
    if(src == src_end) {
      return;
    }

    size_t num_workers = rt.executor().num_workers();
    size_t num_items = std::distance(src, src_end);

    // one worker or a tiny range: run the sequential algorithm directly
    if(num_workers <= 1 || num_items <= 2) {
      launch_loop(part, [&](){
        std::inclusive_scan(src, src_end, dst, bop);
      });
      return;
    }

    // never create more chunks than there are elements
    if(num_items < num_workers) {
      num_workers = num_items;
    }

    // one chunk-total slot per chunk, plus the arrival counter for scan_loop
    std::vector<CachelineAligned<value_type>> buf(num_workers);
    std::atomic<size_t> counter(0);

    // every chunk holds `quota` items; the first `extra` chunks hold one more
    size_t quota = num_items / num_workers;
    size_t extra = num_items % num_workers;

    size_t w = 0;
    size_t offset = 0;
    while(w < num_workers && offset < num_items) {

      size_t chunk_size = std::min(quota + (w < extra), num_items - offset);

      // the closure copies the current src/dst/w/chunk_size for its chunk
      launch_loop(num_workers, w, rt, part, [=, &rt, &bop, &buf, &counter] () mutable {

        // start of this chunk's output, needed again by the fix-up pass
        auto chunk_out = dst;

        // local scan per worker; the running total accumulates in buf[w]
        auto& acc = buf[w].data;
        acc = *src++;
        *dst++ = acc;
        for(size_t i=1; i<chunk_size; ++i) {
          acc = bop(acc, *src++);
          *dst++ = acc;
        }

        // block scan: combine chunk totals and fold them into this chunk
        detail::scan_loop(rt, counter, buf, bop, chunk_out, num_workers, w, chunk_size);
      });

      // move on to the next chunk
      std::advance(src, chunk_size);
      std::advance(dst, chunk_size);
      offset += chunk_size;
      ++w;
    }

    rt.corun_all();
  };
}
// Function: make_inclusive_scan_task
//
// Overload with an initial value. Builds a callable taking a Runtime& that
// writes the inclusive scan of [first, last) under bop, seeded with init, to
// the range starting at d_first.
//
// The captured init is only ever copied, never moved from, so the returned
// callable starts from the same initial value each time it is invoked.
template <typename B, typename E, typename D, typename BOP, typename T, typename P = DefaultPartitioner,
  std::enable_if_t<!is_partitioner_v<std::decay_t<T>>, void>* = nullptr
>
auto make_inclusive_scan_task(
  B first, E last, D d_first, BOP bop, T init, P part = P()
) {

  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
  using value_type = typename std::iterator_traits<B_t>::value_type;

  return [=] (Runtime& rt) mutable {

    // fetch the stateful values
    B_t s_beg = first;
    E_t s_end = last;
    D_t d_beg = d_first;

    // empty input: nothing to write
    if(s_beg == s_end) {
      return;
    }

    size_t W = rt.executor().num_workers();
    size_t N = std::distance(s_beg, s_end);

    // only myself - no need to spawn another graph
    if(W <= 1 || N <= 2) {
      launch_loop(part, [&](){
        std::inclusive_scan(s_beg, s_end, d_beg, bop, init);
      });
      return;
    }

    // never create more chunks than there are elements
    if(N < W) {
      W = N;
    }

    // one chunk-total slot per chunk, plus the arrival counter for scan_loop
    std::vector<CachelineAligned<value_type>> buf(W);
    std::atomic<size_t> counter(0);

    // set up the initial value for the first worker. Copy instead of move:
    // init is a by-value capture of this closure, and moving from it would
    // leave a moved-from value behind for the next invocation.
    buf[0].data = init;

    // every chunk holds Q items; the first R chunks hold one more
    size_t Q = N/W;
    size_t R = N%W;

    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {

      chunk_size = std::min(Q + (w < R), N - curr_b);

      // block scan
      launch_loop(W, w, rt, part, [=, &rt, &bop, &buf, &counter] () mutable {

        // start of this chunk's output, needed again by the fix-up pass
        auto result = d_beg;

        // local scan per worker; only chunk 0 folds in the seeded init
        auto& local = buf[w].data;
        *d_beg++ = local = (w == 0) ? bop(local, *s_beg++) : *s_beg++;
        for(size_t i=1; i<chunk_size; i++){
          *d_beg++ = local = bop(local, *s_beg++);
        }

        // block scan
        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
      });

      std::advance(s_beg, chunk_size);
      std::advance(d_beg, chunk_size);
      curr_b += chunk_size;
    }

    rt.corun_all();
  };
}
// ----------------------------------------------------------------------------
// Transform Inclusive Scan
// ----------------------------------------------------------------------------
// Function: make_transform_inclusive_scan_task
//
// Builds a callable taking a Runtime& that applies uop to every element of
// [first, last) and writes the inclusive scan of the transformed values under
// bop to the range starting at d_first. This overload uses no initial value.
template <typename B, typename E, typename D, typename BOP, typename UOP, typename P = DefaultPartitioner,
  std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>* = nullptr
>
auto make_transform_inclusive_scan_task(
  B first, E last, D d_first, BOP bop, UOP uop, P part = P()
) {

  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
  using value_type = typename std::iterator_traits<B_t>::value_type;

  return [=] (Runtime& rt) mutable {

    // read the captured (stateful) iterators at execution time
    B_t src = first;
    E_t src_end = last;
    D_t dst = d_first;

    // empty input: nothing to write
    if(src == src_end) {
      return;
    }

    size_t num_workers = rt.executor().num_workers();
    size_t num_items = std::distance(src, src_end);

    // one worker or a tiny range: run the sequential algorithm directly
    if(num_workers <= 1 || num_items <= 2) {
      launch_loop(part, [&](){
        std::transform_inclusive_scan(src, src_end, dst, bop, uop);
      });
      return;
    }

    // never create more chunks than there are elements
    if(num_items < num_workers) {
      num_workers = num_items;
    }

    // one chunk-total slot per chunk, plus the arrival counter for scan_loop
    std::vector<CachelineAligned<value_type>> buf(num_workers);
    std::atomic<size_t> counter(0);

    // every chunk holds `quota` items; the first `extra` chunks hold one more
    size_t quota = num_items / num_workers;
    size_t extra = num_items % num_workers;

    size_t w = 0;
    size_t offset = 0;
    while(w < num_workers && offset < num_items) {

      size_t chunk_size = std::min(quota + (w < extra), num_items - offset);

      // the closure copies the current src/dst/w/chunk_size for its chunk
      launch_loop(num_workers, w, rt, part, [=, &rt, &bop, &uop, &buf, &counter] () mutable {

        // start of this chunk's output, needed again by the fix-up pass
        auto chunk_out = dst;

        // local scan per worker over the transformed values
        auto& acc = buf[w].data;
        acc = uop(*src++);
        *dst++ = acc;
        for(size_t i=1; i<chunk_size; ++i) {
          acc = bop(acc, uop(*src++));
          *dst++ = acc;
        }

        // block scan: combine chunk totals and fold them into this chunk
        detail::scan_loop(rt, counter, buf, bop, chunk_out, num_workers, w, chunk_size);
      });

      // move on to the next chunk
      std::advance(src, chunk_size);
      std::advance(dst, chunk_size);
      offset += chunk_size;
      ++w;
    }

    rt.corun_all();
  };
}
// Function: make_transform_inclusive_scan_task
//
// Overload with an initial value. Builds a callable taking a Runtime& that
// applies uop to every element of [first, last) and writes the inclusive scan
// of the transformed values under bop, seeded with init, to the range
// starting at d_first.
//
// The captured init is only ever copied, never moved from, so the returned
// callable starts from the same initial value each time it is invoked.
template <typename B, typename E, typename D, typename BOP, typename UOP, typename T, typename P = DefaultPartitioner,
  std::enable_if_t<!is_partitioner_v<std::decay_t<T>>, void>* = nullptr
>
auto make_transform_inclusive_scan_task(
  B first, E last, D d_first, BOP bop, UOP uop, T init, P part = P()
) {

  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
  using value_type = typename std::iterator_traits<B_t>::value_type;

  return [=] (Runtime& rt) mutable {

    // fetch the stateful values
    B_t s_beg = first;
    E_t s_end = last;
    D_t d_beg = d_first;

    // empty input: nothing to write
    if(s_beg == s_end) {
      return;
    }

    size_t W = rt.executor().num_workers();
    size_t N = std::distance(s_beg, s_end);

    // only myself - no need to spawn another graph
    if(W <= 1 || N <= 2) {
      launch_loop(part, [&](){
        std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop, init);
      });
      return;
    }

    // never create more chunks than there are elements
    if(N < W) {
      W = N;
    }

    // one chunk-total slot per chunk, plus the arrival counter for scan_loop
    std::vector<CachelineAligned<value_type>> buf(W);
    std::atomic<size_t> counter(0);

    // set up the initial value for the first worker. Copy instead of move:
    // init is a by-value capture of this closure, and moving from it would
    // leave a moved-from value behind for the next invocation.
    buf[0].data = init;

    // every chunk holds Q items; the first R chunks hold one more
    size_t Q = N/W;
    size_t R = N%W;

    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {

      chunk_size = std::min(Q + (w < R), N - curr_b);

      // block scan
      launch_loop(W, w, rt, part, [=, &rt, &bop, &uop, &buf, &counter] () mutable {

        // start of this chunk's output, needed again by the fix-up pass
        auto result = d_beg;

        // local scan per worker; only chunk 0 folds in the seeded init
        auto& local = buf[w].data;
        *d_beg++ = local = (w == 0) ? bop(local, uop(*s_beg++)) : uop(*s_beg++);
        for(size_t i=1; i<chunk_size; i++){
          *d_beg++ = local = bop(local, uop(*s_beg++));
        }

        // block scan
        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
      });

      std::advance(s_beg, chunk_size);
      std::advance(d_beg, chunk_size);
      curr_b += chunk_size;
    }

    rt.corun_all();
  };
}
// ----------------------------------------------------------------------------
// Exclusive Scan
// ----------------------------------------------------------------------------
// Function: make_exclusive_scan_task
//
// Builds a callable taking a Runtime& that writes the exclusive scan of
// [first, last) under bop, starting from init, to the range starting at
// d_first.
//
// The captured init is only ever copied, never moved from, so the returned
// callable starts from the same initial value each time it is invoked.
template <typename B, typename E, typename D, typename T, typename BOP, typename P = DefaultPartitioner>
auto make_exclusive_scan_task(
  B first, E last, D d_first, T init, BOP bop, P part = P()
) {

  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
  using value_type = typename std::iterator_traits<B_t>::value_type;

  return [=] (Runtime& rt) mutable {

    // fetch the stateful values
    B_t s_beg = first;
    E_t s_end = last;
    D_t d_beg = d_first;

    // empty input: nothing to write
    if(s_beg == s_end) {
      return;
    }

    size_t W = rt.executor().num_workers();
    size_t N = std::distance(s_beg, s_end);

    // only myself - no need to spawn another graph
    if(W <= 1 || N <= 2) {
      launch_loop(part, [&](){
        std::exclusive_scan(s_beg, s_end, d_beg, init, bop);
      });
      return;
    }

    // never create more chunks than there are elements
    if(N < W) {
      W = N;
    }

    // one chunk-total slot per chunk, plus the arrival counter for scan_loop
    std::vector<CachelineAligned<value_type>> buf(W);
    std::atomic<size_t> counter(0);

    // every chunk holds Q items; the first R chunks hold one more
    size_t Q = N/W;
    size_t R = N%W;

    // fetch the init value of every chunk: chunk 0 starts from init, every
    // later chunk starts from the last input element of the chunk before it
    auto s_beg_temp = s_beg;
    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
      chunk_size = std::min(Q + (w<R), N - curr_b);
      // init is copied, not moved: it is a by-value capture of this closure
      // and must stay intact for the next invocation
      buf[w].data = w ? *s_beg_temp : init;
      // the first advance stops one short so the iterator rests on the last
      // element of chunk 0; later advances keep it on each chunk's last one
      std::advance(s_beg_temp, chunk_size - !w);
      curr_b += chunk_size;
    }

    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {

      chunk_size = std::min(Q + (w < R), N - curr_b);

      // block scan
      launch_loop(W, w, rt, part, [=, &rt, &bop, &buf, &counter] () mutable {

        // start of this chunk's output, needed again by the fix-up pass
        auto result = d_beg;

        // local scan per worker: emit the running value before folding in
        // the next input, so each output excludes its own input element
        auto& local = buf[w].data;
        for(size_t i=1; i<chunk_size; i++) {
          auto v = local;
          local = bop(local, *s_beg++);
          *d_beg++ = std::move(v);
        }
        *d_beg++ = local;

        // block scan
        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
      });

      std::advance(s_beg, chunk_size);
      std::advance(d_beg, chunk_size);
      curr_b += chunk_size;
    }

    rt.corun_all();
  };
}
// ----------------------------------------------------------------------------
// Transform Exclusive Scan
// ----------------------------------------------------------------------------
// Function: make_transform_exclusive_scan_task
//
// Builds a callable taking a Runtime& that applies uop to every element of
// [first, last) and writes the exclusive scan of the transformed values under
// bop, starting from init, to the range starting at d_first.
//
// The captured init is only ever copied, never moved from, so the returned
// callable starts from the same initial value each time it is invoked.
template <typename B, typename E, typename D, typename T, typename BOP, typename UOP, typename P = DefaultPartitioner>
auto make_transform_exclusive_scan_task(
  B first, E last, D d_first, T init, BOP bop, UOP uop, P part = P()
) {

  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
  using D_t = std::decay_t<unwrap_ref_decay_t<D>>;
  using value_type = typename std::iterator_traits<B_t>::value_type;

  return [=] (Runtime& rt) mutable {

    // fetch the stateful values
    B_t s_beg = first;
    E_t s_end = last;
    D_t d_beg = d_first;

    // empty input: nothing to write
    if(s_beg == s_end) {
      return;
    }

    size_t W = rt.executor().num_workers();
    size_t N = std::distance(s_beg, s_end);

    // only myself - no need to spawn another graph
    if(W <= 1 || N <= 2) {
      launch_loop(part, [&](){
        std::transform_exclusive_scan(s_beg, s_end, d_beg, init, bop, uop);
      });
      return;
    }

    // never create more chunks than there are elements
    if(N < W) {
      W = N;
    }

    // one chunk-total slot per chunk, plus the arrival counter for scan_loop
    std::vector<CachelineAligned<value_type>> buf(W);
    std::atomic<size_t> counter(0);

    // every chunk holds Q items; the first R chunks hold one more
    size_t Q = N/W;
    size_t R = N%W;

    // fetch the init value of every chunk: chunk 0 starts from init, every
    // later chunk starts from the transformed last input element of the chunk
    // before it
    auto s_beg_temp = s_beg;
    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {
      chunk_size = std::min(Q + (w<R), N - curr_b);
      // init is copied, not moved: it is a by-value capture of this closure
      // and must stay intact for the next invocation
      buf[w].data = w ? uop(*s_beg_temp) : init;
      // the first advance stops one short so the iterator rests on the last
      // element of chunk 0; later advances keep it on each chunk's last one
      std::advance(s_beg_temp, chunk_size - !w);
      curr_b += chunk_size;
    }

    for(size_t w=0, curr_b=0, chunk_size; w<W && curr_b < N; ++w) {

      chunk_size = std::min(Q + (w < R), N - curr_b);

      // block scan
      launch_loop(W, w, rt, part, [=, &rt, &bop, &uop, &buf, &counter] () mutable {

        // start of this chunk's output, needed again by the fix-up pass
        auto result = d_beg;

        // local scan per worker: emit the running value before folding in
        // the next transformed input, so each output excludes its own input
        auto& local = buf[w].data;
        for(size_t i=1; i<chunk_size; i++) {
          auto v = local;
          local = bop(local, uop(*s_beg++));
          *d_beg++ = std::move(v);
        }
        *d_beg++ = local;

        // block scan
        detail::scan_loop(rt, counter, buf, bop, result, W, w, chunk_size);
      });

      std::advance(s_beg, chunk_size);
      std::advance(d_beg, chunk_size);
      curr_b += chunk_size;
    }

    rt.corun_all();
  };
}
// ----------------------------------------------------------------------------
// Inclusive Scan
// ----------------------------------------------------------------------------
// Function: inclusive_scan
//
// Emplaces the callable built by make_inclusive_scan_task (overload without
// an initial value) and returns the resulting Task.
template <typename B, typename E, typename D, typename BOP, typename P,
  std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>*
>
Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, P part) {
  auto scan_task = make_inclusive_scan_task(first, last, d_first, bop, part);
  return emplace(std::move(scan_task));
}
// Function: inclusive_scan
//
// Emplaces the callable built by make_inclusive_scan_task (overload with an
// initial value) and returns the resulting Task.
template <typename B, typename E, typename D, typename BOP, typename T, typename P,
  std::enable_if_t<!is_partitioner_v<std::decay_t<T>>, void>*
>
Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, T init, P part) {
  auto scan_task = make_inclusive_scan_task(first, last, d_first, bop, init, part);
  return emplace(std::move(scan_task));
}
// ----------------------------------------------------------------------------
// Transform Inclusive Scan
// ----------------------------------------------------------------------------
// Function: transform_inclusive_scan
//
// Emplaces the callable built by make_transform_inclusive_scan_task (overload
// without an initial value) and returns the resulting Task.
template <typename B, typename E, typename D, typename BOP, typename UOP, typename P,
  std::enable_if_t<is_partitioner_v<std::decay_t<P>>, void>*
>
Task FlowBuilder::transform_inclusive_scan(
  B first, E last, D d_first, BOP bop, UOP uop, P part
) {
  auto scan_task = make_transform_inclusive_scan_task(
    first, last, d_first, bop, uop, part
  );
  return emplace(std::move(scan_task));
}
// Function: transform_inclusive_scan
//
// Emplaces the callable built by make_transform_inclusive_scan_task (overload
// with an initial value) and returns the resulting Task.
template <typename B, typename E, typename D, typename BOP, typename UOP, typename T, typename P,
  std::enable_if_t<!is_partitioner_v<std::decay_t<T>>, void>*
>
Task FlowBuilder::transform_inclusive_scan(
  B first, E last, D d_first, BOP bop, UOP uop, T init, P part
) {
  auto scan_task = make_transform_inclusive_scan_task(
    first, last, d_first, bop, uop, init, part
  );
  return emplace(std::move(scan_task));
}
// ----------------------------------------------------------------------------
// Exclusive Scan
// ----------------------------------------------------------------------------
// Function: exclusive_scan
//
// Emplaces the callable built by make_exclusive_scan_task and returns the
// resulting Task.
template <typename B, typename E, typename D, typename T, typename BOP, typename P>
Task FlowBuilder::exclusive_scan(B first, E last, D d_first, T init, BOP bop, P part) {
  auto scan_task = make_exclusive_scan_task(
    first, last, d_first, init, bop, part
  );
  return emplace(std::move(scan_task));
}
// ----------------------------------------------------------------------------
// Transform Exclusive Scan
// ----------------------------------------------------------------------------
// Function: transform_exclusive_scan
//
// Emplaces the callable built by make_transform_exclusive_scan_task and
// returns the resulting Task.
template <typename B, typename E, typename D, typename T, typename BOP, typename UOP, typename P>
Task FlowBuilder::transform_exclusive_scan(
  B first, E last, D d_first, T init, BOP bop, UOP uop, P part
) {
  auto scan_task = make_transform_exclusive_scan_task(
    first, last, d_first, init, bop, uop, part
  );
  return emplace(std::move(scan_task));
}
} // end of namespace tf -----------------------------------------------------