// ----------------------------------------------------------------------------
// Parallel scan: task factories and tf::FlowBuilder entry points
// ----------------------------------------------------------------------------
//
// Definitions visible in this file, in order of appearance:
//   - tf::detail::scan_loop: helper that receives a Runtime, a shared atomic
//     counter, a buffer, the binary operator, an output iterator and the
//     worker count/index/chunk size. It compares counter.fetch_add(1) against
//     W-1; per the original comment, "whoever finishes the last performs
//     global scan".
//   - make_inclusive_scan_task            (overloads without and with `T init`)
//   - make_transform_inclusive_scan_task  (overloads without and with `T init`)
//   - make_exclusive_scan_task
//   - make_transform_exclusive_scan_task
//   - FlowBuilder::inclusive_scan, ::transform_inclusive_scan, ::exclusive_scan
//     and ::transform_exclusive_scan, each of which emplaces the task returned
//     by the matching make_*_task factory, passing its own arguments through
//     in the same order.
//
// Every make_*_task factory returns a mutable lambda taking `Runtime& rt` that
//   1. copies the captured iterators into s_beg / s_end / d_beg and returns
//      immediately when s_beg == s_end;
//   2. reads W = rt.executor().num_workers() and N = std::distance(s_beg, s_end);
//   3. when W <= 1 or N <= 2, calls the corresponding std:: scan algorithm
//      inside launch_loop(part, ...) and returns;
//   4. otherwise lowers W to N when N < W, creates a buffer `buf` of W
//      elements and an atomic `counter` initialised to 0, and computes
//      Q = N/W and R = N%W ahead of the per-worker loop.
// The inclusive overloads that take `init` move it into buf[0].data before the
// loop; the exclusive variants keep a copy of s_beg in s_beg_temp.
//
// NOTE(review): the text below looks damaged and should be restored from
// version control before any logic change is attempted:
//   * line breaks are collapsed, so everything after the first `//` on each
//     physical line is read as a comment by the compiler;
//   * text between a `<` and the next `>` appears to be missing, e.g.
//     `template void scan_loop(`, `std::atomic& counter`,
//     `std::vector> buf(W)` and `for(size_t i=1; i>, void>* = nullptr >`.
//     This drops the template parameter lists and the bodies of the
//     per-worker loops, so what those loops do cannot be determined from here.
#pragma once #include "launch.hpp" namespace tf { namespace detail { // Function: scan_loop template void scan_loop( tf::Runtime& rt, std::atomic& counter, BufferT& buf, B bop, Iterator d_beg, size_t W, size_t w, size_t chunk_size ){ // whoever finishes the last performs global scan if(counter.fetch_add(1, std::memory_order_acq_rel) == W-1) { for(size_t i=1; i>, void>* = nullptr > auto make_inclusive_scan_task( B first, E last, D d_first, BOP bop, P part = P() ) { using B_t = std::decay_t>; using E_t = std::decay_t>; using D_t = std::decay_t>; using value_type = typename std::iterator_traits::value_type; return [=] (Runtime& rt) mutable { // fetch the stateful values B_t s_beg = first; E_t s_end = last; D_t d_beg = d_first; if(s_beg == s_end) { return; } size_t W = rt.executor().num_workers(); size_t N = std::distance(s_beg, s_end); // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { launch_loop(part, [&](){ std::inclusive_scan(s_beg, s_end, d_beg, bop); }); return; } if(N < W) { W = N; } std::vector> buf(W); std::atomic counter(0); size_t Q = N/W; size_t R = N%W; //auto orig_d_beg = d_beg; //ExecutionPolicy policy; for(size_t w=0, curr_b=0, chunk_size; w>, void>* = nullptr > auto make_inclusive_scan_task( B first, E last, D d_first, BOP bop, T init, P part = P() ) { using B_t = std::decay_t>; using E_t = std::decay_t>; using D_t = std::decay_t>; using value_type = typename std::iterator_traits::value_type; return [=] (Runtime& rt) mutable { // fetch the stateful values B_t s_beg = first; E_t s_end = last; D_t d_beg = d_first; if(s_beg == s_end) { return; } size_t W = rt.executor().num_workers(); size_t N = std::distance(s_beg, s_end); // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { launch_loop(part, [&](){ std::inclusive_scan(s_beg, s_end, d_beg, bop, init); }); return; } if(N < W) { W = N; } std::vector> buf(W); std::atomic counter(0); // set up the initial value for the first worker buf[0].data = std::move(init); size_t 
Q = N/W; size_t R = N%W; for(size_t w=0, curr_b=0, chunk_size; w>, void>* = nullptr > auto make_transform_inclusive_scan_task( B first, E last, D d_first, BOP bop, UOP uop, P part = P() ) { using B_t = std::decay_t>; using E_t = std::decay_t>; using D_t = std::decay_t>; using value_type = typename std::iterator_traits::value_type; return [=] (Runtime& rt) mutable { // fetch the stateful values B_t s_beg = first; E_t s_end = last; D_t d_beg = d_first; if(s_beg == s_end) { return; } size_t W = rt.executor().num_workers(); size_t N = std::distance(s_beg, s_end); // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { launch_loop(part, [&](){ std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop); }); return; } if(N < W) { W = N; } std::vector> buf(W); std::atomic counter(0); size_t Q = N/W; size_t R = N%W; for(size_t w=0, curr_b=0, chunk_size; w>, void>* = nullptr > auto make_transform_inclusive_scan_task( B first, E last, D d_first, BOP bop, UOP uop, T init, P part = P() ) { using B_t = std::decay_t>; using E_t = std::decay_t>; using D_t = std::decay_t>; using value_type = typename std::iterator_traits::value_type; return [=] (Runtime& rt) mutable { // fetch the stateful values B_t s_beg = first; E_t s_end = last; D_t d_beg = d_first; if(s_beg == s_end) { return; } size_t W = rt.executor().num_workers(); size_t N = std::distance(s_beg, s_end); // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { launch_loop(part, [&](){ std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop, init); }); return; } if(N < W) { W = N; } std::vector> buf(W); std::atomic counter(0); // set up the initial value for the first worker buf[0].data = std::move(init); size_t Q = N/W; size_t R = N%W; for(size_t w=0, curr_b=0, chunk_size; w auto make_exclusive_scan_task( B first, E last, D d_first, T init, BOP bop, P part = P() ) { using B_t = std::decay_t>; using E_t = std::decay_t>; using D_t = std::decay_t>; using value_type = typename 
std::iterator_traits::value_type; return [=] (Runtime& rt) mutable { // fetch the stateful values B_t s_beg = first; E_t s_end = last; D_t d_beg = d_first; if(s_beg == s_end) { return; } size_t W = rt.executor().num_workers(); size_t N = std::distance(s_beg, s_end); // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { launch_loop(part, [&](){ std::exclusive_scan(s_beg, s_end, d_beg, init, bop); }); return; } if(N < W) { W = N; } std::vector> buf(W); std::atomic counter(0); size_t Q = N/W; size_t R = N%W; // fetch the init value auto s_beg_temp = s_beg; for(size_t w=0, curr_b=0, chunk_size; w auto make_transform_exclusive_scan_task( B first, E last, D d_first, T init, BOP bop, UOP uop, P part = P() ) { using B_t = std::decay_t>; using E_t = std::decay_t>; using D_t = std::decay_t>; using value_type = typename std::iterator_traits::value_type; return [=] (Runtime& rt) mutable { // fetch the stateful values B_t s_beg = first; E_t s_end = last; D_t d_beg = d_first; if(s_beg == s_end) { return; } size_t W = rt.executor().num_workers(); size_t N = std::distance(s_beg, s_end); // only myself - no need to spawn another graph if(W <= 1 || N <= 2) { launch_loop(part, [&](){ std::transform_exclusive_scan(s_beg, s_end, d_beg, init, bop, uop); }); return; } if(N < W) { W = N; } std::vector> buf(W); std::atomic counter(0); size_t Q = N/W; size_t R = N%W; // fetch the init value auto s_beg_temp = s_beg; for(size_t w=0, curr_b=0, chunk_size; w>, void>* > Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, P part) { return emplace(make_inclusive_scan_task(first, last, d_first, bop, part)); } // Function: inclusive_scan template >, void>* > Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, T init, P part) { return emplace(make_inclusive_scan_task(first, last, d_first, bop, init, part)); } // ---------------------------------------------------------------------------- // Transform Inclusive Scan // 
---------------------------------------------------------------------------- // Function: transform_inclusive_scan template >, void>* > Task FlowBuilder::transform_inclusive_scan( B first, E last, D d_first, BOP bop, UOP uop, P part ) { return emplace(make_transform_inclusive_scan_task( first, last, d_first, bop, uop, part )); } // Function: transform_inclusive_scan template >, void>* > Task FlowBuilder::transform_inclusive_scan( B first, E last, D d_first, BOP bop, UOP uop, T init, P part ) { return emplace(make_transform_inclusive_scan_task( first, last, d_first, bop, uop, init, part )); } // ---------------------------------------------------------------------------- // Exclusive Scan // ---------------------------------------------------------------------------- // Function: exclusive_scan template Task FlowBuilder::exclusive_scan(B first, E last, D d_first, T init, BOP bop, P part) { return emplace(make_exclusive_scan_task( first, last, d_first, init, bop, part )); } // ---------------------------------------------------------------------------- // Transform Exclusive Scan // ---------------------------------------------------------------------------- // Function: transform_exclusive_scan template Task FlowBuilder::transform_exclusive_scan( B first, E last, D d_first, T init, BOP bop, UOP uop, P part ) { return emplace(make_transform_exclusive_scan_task( first, last, d_first, init, bop, uop, part )); } } // end of namespace tf -----------------------------------------------------