mesytec-mnode/external/taskflow-3.8.0/3rd-party/ff/parallel_for_internals.hpp
/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*!
* \link
* \file parallel_for_internals.hpp
* \ingroup aux_classes
*
* \brief Internal classes and functions for parallel_for/parallel_reduce skeletons.
*/
/* ***************************************************************************
*
* FastFlow is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License version 3 as
* published by the Free Software Foundation.
* Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
* or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
****************************************************************************
*/
/*
* - Author:
* Massimo Torquati <torquati@di.unipi.it>
*
* History:
* - started in May 2013
* - January 2014: code optimized
* - February 2014:
* - avoided starting the Scheduler thread when it is not needed.
* A new (non lock-free) decentralized scheduler has been implemented
* for the case in which adding an extra thread is not useful.
* - introduced the parallel_for functions
* - added the ParallelFor and ParallelForReduce classes
* - June 2014:
* - parallel_for_static
*
*/
#ifndef FF_PARFOR_INTERNALS_HPP
#define FF_PARFOR_INTERNALS_HPP
// #ifndef __INTEL_COMPILER
// // see http://www.stroustrup.com/C++11FAQ.html#11
// #if __cplusplus <= 199711L
// #error "parallel_for requires C++11 features"
// #endif
// #endif
#include <atomic>
#include <algorithm>
#include <deque>
#include <vector>
#include <cmath>
#include <functional>
#include <ff/lb.hpp>
#include <ff/node.hpp>
#include <ff/farm.hpp>
#include <ff/spin-lock.hpp>
enum {FF_AUTO=-1};
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
static int dummyTask;
static bool globalSchedRunning;
#endif
#if defined(__ICC)
#define PRAGMA_IVDEP _Pragma("ivdep")
#else
#define PRAGMA_IVDEP
#endif
namespace ff {
/* -------------------- Parallel For/Reduce Macros -------------------- */
/* Usage examples:
*
*   // sequential loop:
*   for(int i=0;i<N;++i) A[i]=f(i);
*
*   // loop parallelization using 3 workers and a minimum task grain of 2:
*   wthread = 3;
*   grain   = 2;
*   FF_PARFOR_BEGIN(for,i,0,N,1,grain,wthread) {
*       A[i]=f(i);
*   } FF_PARFOR_END(for);
*
*   // parallel for + reduction (sequential: for(int i=0;i<N;++i) s*=f(i);):
*   s=4;
*   FF_PARFORREDUCE_BEGIN(for,s,0,i,0,N,1,grain,wthread) {
*       s*=f(i);
*   } FF_PARFORREDUCE_END(for,s,*);
*
*   // reusable version: the worker threads are created by FF_PARFOR_INIT and
*   // released by FF_PARFOR_DONE, so the loop can be executed several times:
*   FF_PARFOR_INIT(pf,maxwthread);
*   ....
*   while(k<nTime) {
*       FF_PARFORREDUCE_START(pf,s,0,i,0,N,1,grain,wthread) {
*           s*=f(i,k);
*       } FF_PARFORREDUCE_STOP(pf,s,*);
*   }
*   ....
*   FF_PARFOR_DONE(pf);
*
* NOTE: inside the body of the PARFOR/PARFORREDUCE, the const integer variable
*       '_ff_thread_id' can be used to identify the id of the thread executing
*       the current portion of the loop.
*/
/**
* name : of the parallel for
* idx : iteration index
* begin: for starting point
* end : for ending point
* step : for step
* chunk: chunk size
* nw : n. of worker threads
*/
#define FF_PARFOR_BEGIN(name, idx, begin, end, step, chunk, nw) \
ff_forall_farm<forallreduce_W<int> > name(nw,false,true); \
name.setloop(begin,end,step,chunk,nw); \
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
const int _ff_thread_id, const int) { \
FF_IGNORE_UNUSED(_ff_thread_id); \
PRAGMA_IVDEP; \
for(long idx=ff_start_##idx;idx<ff_stop_##idx;idx+=step)
/* This is equivalent to the above one except that the user has to define
* the for loop over the range [ff_start_idx, ff_stop_idx).
* This can be useful if you have to perform some actions before starting
* the local loop and/or some actions after the local loop finishes
* (see the illustrative sketch after FF_PARFOR_BEGIN_IDX below).
* The onoff parameter allows enabling/disabling the scheduler thread
* (by default the scheduler is active).
*/
#define FF_PARFOR_BEGIN_IDX(name, idx, begin, end, step, chunk, nw, onoff) \
ff_forall_farm<forallreduce_W<int> > name(nw,false,true); \
name.setloop(begin,end,step,chunk, nw); \
name.disableScheduler(onoff); \
auto F_##name = [&] (const long ff_start_idx, const long ff_stop_idx, \
const int _ff_thread_id, const int) { \
/* here you have to define the for loop using ff_start/stop_idx */
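/* Illustrative sketch (not part of the original sources): a possible use of
* FF_PARFOR_BEGIN_IDX in which a per-partition prologue/epilogue surrounds the
* local loop. The names 'partial', 'g', 'N' and 'nworkers' are hypothetical.
*
*   FF_PARFOR_BEGIN_IDX(loop, i, 0, N, 1, 0, nworkers, false) {
*       double acc = 0.0;                                   // prologue
*       for (long i = ff_start_idx; i < ff_stop_idx; ++i)   // local loop
*           acc += g(i);
*       partial[_ff_thread_id] += acc;                      // epilogue
*   } FF_PARFOR_END(loop);
*/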
#define FF_PARFOR_END(name) \
}; \
{ \
if (name.getnw()>1) { \
name.setF(F_##name); \
if (name.run_and_wait_end()<0) { \
error("running parallel for\n"); \
} \
} else F_##name(name.startIdx(),name.stopIdx(),0,0); \
}
/* ---------------------------------------------- */
/**
* name : of the parallel for
* var : variable on which the reduce operator is applied
* identity: the value such that var == var op identity
* idx : iteration index
* begin : for starting point
* end : for ending point
* step : for step
* chunk : chunk size
* nw : n. of worker threads
*
* op : reduce operation (+ * ....)
*/
#define FF_PARFORREDUCE_BEGIN(name, var,identity, idx,begin,end,step, chunk, nw) \
ff_forall_farm<forallreduce_W<decltype(var)> > name(nw,false,true); \
name.setloop(begin,end,step,chunk,nw); \
auto idtt_##name =identity; \
auto F_##name =[&](const long start,const long stop,const int _ff_thread_id, \
decltype(var) &var) { \
FF_IGNORE_UNUSED(_ff_thread_id); \
PRAGMA_IVDEP; \
for(long idx=start;idx<stop;idx+=step)
#define FF_PARFORREDUCE_BEGIN_IDX(name, var,identity, idx,begin,end,step, chunk, nw, onoff) \
ff_forall_farm<forallreduce_W<decltype(var)> > name(nw,false,true); \
name.setloop(begin,end,step,chunk,nw); \
name.disableScheduler(onoff); \
auto idtt_##name =identity; \
auto F_##name =[&](const long ff_start_idx,const long ff_stop_idx,const int _ff_thread_id, \
decltype(var) &var) { \
FF_IGNORE_UNUSED(_ff_thread_id);
#define FF_PARFORREDUCE_END(name, var, op) \
}; \
if (name.getnw()>1) { \
auto ovar_##name = var; \
name.setF(F_##name,idtt_##name); \
if (name.run_and_wait_end()<0) { \
error("running forall_##name\n"); \
} \
var = ovar_##name; \
for(size_t i=0;i<name.getnw();++i) { \
var op##= name.getres(i); \
} \
} else { \
F_##name(name.startIdx(),name.stopIdx(),0,var); \
}
#define FF_PARFORREDUCE_F_END(name, var, F) \
}; \
if (name.getnw()>1) { \
auto ovar_##name = var; \
name.setF(F_##name,idtt_##name); \
if (name.run_and_wait_end()<0) \
error("running ff_forall_farm (reduce F end)\n"); \
var = ovar_##name; \
for(size_t i=0;i<name.getnw();++i) { \
F(var,name.getres(i)); \
} \
} else { \
F_##name(name.startIdx(),name.stopIdx(),0,var); \
}
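/* Illustrative sketch (not part of the original sources): FF_PARFORREDUCE_F_END
* combines the per-thread partial results with a user-provided binary function
* instead of a plain operator, e.g. a max-reduction. The names 'v', 'N',
* 'grain' and 'nworkers' are hypothetical.
*
*   double m = std::numeric_limits<double>::lowest();
*   FF_PARFORREDUCE_BEGIN(loop, m, std::numeric_limits<double>::lowest(),
*                         i, 0, N, 1, grain, nworkers) {
*       m = std::max(m, v[i]);
*   } FF_PARFORREDUCE_F_END(loop, m,
*         [](double &a, const double b) { a = std::max(a, b); });
*/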
/* ---------------------------------------------- */
/* FF_PARFOR_START and FF_PARFOR_STOP have the same meaning of
* FF_PARFOR_BEGIN and FF_PARFOR_END but they have to be used in
* conjunction with FF_PARFOR_INIT FF_PARFOR_END.
*
* The same is for FF_PARFORREDUCE_START/STOP.
*/
#define FF_PARFOR_INIT(name, nw) \
ff_forall_farm<forallreduce_W<int> > *name = \
new ff_forall_farm<forallreduce_W<int> >(nw)
#define FF_PARFOR_DECL(name) ff_forall_farm<forallreduce_W<int> > * name
#define FF_PARFOR_ASSIGN(name,nw) name=new ff_forall_farm<forallreduce_W<int> >(nw)
#define FF_PARFOR_DONE(name) name->stop(); name->wait(); delete name;
#define FF_PARFORREDUCE_INIT(name, type, nw) \
ff_forall_farm<forallreduce_W<type> > *name = \
new ff_forall_farm<forallreduce_W<type> >(nw)
#define FF_PARFORREDUCE_DECL(name,type) ff_forall_farm<forallreduce_W<type> > * name
#define FF_PARFORREDUCE_ASSIGN(name,type,nw) name= \
new ff_forall_farm<forallreduce_W<type> >(nw)
#define FF_PARFORREDUCE_DONE(name) name->stop();name->wait();delete name
#define FF_PARFOR_START(name, idx, begin, end, step, chunk, nw) \
name->setloop(begin,end,step,chunk,nw); \
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
const int _ff_thread_id, const int) { \
FF_IGNORE_UNUSED(_ff_thread_id); \
PRAGMA_IVDEP; \
for(long idx=ff_start_##idx;idx<ff_stop_##idx;idx+=step)
#define FF_PARFOR2_START(name, idx, begin, end, step, chunk, nw) \
name->setloop(begin,end,step,chunk,nw); \
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
const int _ff_thread_id, const int) { \
FF_IGNORE_UNUSED(_ff_thread_id);
/* here you have to define the for loop using ff_start/stop_##idx */
/* this is equivalent to FF_PARFOR2_START but the start/stop indexes have a fixed name */
#define FF_PARFOR_START_IDX(name, idx, begin, end, step, chunk, nw) \
name->setloop(begin,end,step,chunk,nw); \
auto F_##name = [&] (const long ff_start_idx, const long ff_stop_idx, \
const int _ff_thread_id, const int) { \
FF_IGNORE_UNUSED(_ff_thread_id);
/* here you have to define the for loop using ff_start/stop_idx */
// just another variant that may be used together with FF_PARFORREDUCE_INIT
#define FF_PARFOR_T_START(name, type, idx, begin, end, step, chunk, nw) \
name->setloop(begin,end,step,chunk,nw); \
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
const int _ff_thread_id, const type&) { \
FF_IGNORE_UNUSED(_ff_thread_id); \
PRAGMA_IVDEP; \
for(long idx=ff_start_##idx;idx<ff_stop_##idx;idx+=step)
// just another variant that may be used together with FF_PARFORREDUCE_INIT
#define FF_PARFOR_T_START_STATIC(name, type, idx, begin, end, step, chunk, nw) \
assert(chunk<=0); \
name->setloop(begin,end,step,chunk,nw); \
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
const int _ff_thread_id, const type&) { \
const long _ff_jump0=(name->getnw())*(-chunk*step); \
const long _ff_jump1=(-chunk*step); \
FF_IGNORE_UNUSED(_ff_thread_id); \
PRAGMA_IVDEP; \
for(long _ff_##idx=ff_start_##idx;_ff_##idx<ff_stop_##idx;_ff_##idx+=_ff_jump0) \
for(long idx=_ff_##idx,_ff_end_##idx=std::min(ff_stop_##idx,_ff_##idx +_ff_jump1); \
idx<_ff_end_##idx;idx+=step)
#define FF_PARFOR_T_START_IDX(name, type, idx, begin, end,step,chunk, nw) \
name->setloop(begin,end,step,chunk,nw); \
auto F_##name = [&] (const long ff_start_idx, const long ff_stop_idx, \
const int _ff_thread_id, const type&) { \
FF_IGNORE_UNUSED(_ff_thread_id);
/* here you have to use the fixed indexes ff_start_idx, ff_stop_idx */
#define FF_PARFOR_STOP(name) \
}; \
if (name->getnw()>1) { \
name->setF(F_##name); \
if (name->run_then_freeze(name->getnw())<0) \
error("running ff_forall_farm (name)\n"); \
name->wait_freezing(); \
} else F_##name(name->startIdx(),name->stopIdx(),0,0);
#define FF_PARFOR_T_STOP(name, type) \
}; \
if (name->getnw()>1) { \
name->setF(F_##name, type()); \
if (name->run_then_freeze(name->getnw())<0) \
error("running ff_forall_farm (name)\n"); \
name->wait_freezing(); \
} else { \
F_##name(name->startIdx(),name->stopIdx(),0,type()); \
}
#define FF_PARFORREDUCE_START(name, var,identity, idx,begin,end,step, chunk, nw) \
name->setloop(begin,end,step,chunk,nw); \
auto idtt_##name =identity; \
auto F_##name =[&](const long ff_start_##idx, const long ff_stop_##idx, \
const int _ff_thread_id, decltype(var) &var) { \
FF_IGNORE_UNUSED(_ff_thread_id); \
PRAGMA_IVDEP \
for(long idx=ff_start_##idx;idx<ff_stop_##idx;idx+=step)
#define FF_PARFORREDUCE_START_IDX(name, var,identity, idx,begin,end,step, chunk, nw) \
name->setloop(begin,end,step,chunk,nw); \
auto idtt_##name =identity; \
auto F_##name =[&](const long ff_start_idx, const long ff_stop_idx, \
const int _ff_thread_id, decltype(var) &var) { \
FF_IGNORE_UNUSED(_ff_thread_id);
#define FF_PARFORREDUCE_START_STATIC(name, var,identity, idx,begin,end,step, chunk, nw) \
assert(chunk<=0); \
name->setloop(begin,end,step,chunk,nw); \
auto idtt_##name =identity; \
auto F_##name =[&](const long ff_start_##idx, const long ff_stop_##idx, \
const int _ff_thread_id, decltype(var) &var) { \
const long _ff_jump0=(name->getnw())*(-chunk*step); \
const long _ff_jump1=(-chunk*step); \
PRAGMA_IVDEP; \
for(long _ff_##idx=ff_start_##idx;_ff_##idx<ff_stop_##idx;_ff_##idx+=_ff_jump0) \
for(long idx=_ff_##idx,_ff_end_##idx=std::min(ff_stop_##idx,_ff_##idx +_ff_jump1); \
idx<_ff_end_##idx;idx+=step)
#define FF_PARFORREDUCE_STOP(name, var, op) \
}; \
if (name->getnw()>1) { \
auto ovar_##name = var; \
name->setF(F_##name,idtt_##name); \
if (name->run_then_freeze(name->getnw())<0) \
error("running ff_forall_farm (name)\n"); \
name->wait_freezing(); \
var = ovar_##name; \
for(size_t i=0;i<name->getnw();++i) { \
var op##= name->getres(i); \
} \
} else { \
F_##name(name->startIdx(),name->stopIdx(),0,var); \
}
#define FF_PARFORREDUCE_F_STOP(name, var, F) \
}; \
if (name->getnw()>1) { \
auto ovar_##name = var; \
name->setF(F_##name,idtt_##name); \
if (name->run_then_freeze(name->getnw())<0) \
error("running ff_forall_farm (name)\n"); \
name->wait_freezing(); \
var = ovar_##name; \
for(size_t i=0;i<name->getnw();++i) { \
F(var,name->getres(i)); \
} \
} else { \
F_##name(name->startIdx(),name->stopIdx(),0,var); \
}
//
// see NOTE in setloop to understand the meaning of 'default static'
// 'static with grain size' and 'dynamic with grain size'
//
#define PARFOR_STATIC(X) (X>0?-X:X)
#define PARFOR_DYNAMIC(X) (X<0?-X:X)
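// Illustrative note (sketch, not part of the original sources): these helpers
// simply adjust the sign of a user-supplied grain so that it matches the
// convention documented in ff_forall_farm::setloop below, e.g.:
//
//   PARFOR_STATIC(4)    // -> -4 : static scheduling with chunks of 4 iterations
//   PARFOR_STATIC(0)    // ->  0 : default static scheduling
//   PARFOR_DYNAMIC(4)   // ->  4 : dynamic scheduling with a grain of 4 iterations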
/* ------------------------------------------------------------------- */
// parallel-for task: it represents a range [start, end) of indexes
struct forall_task_t {
forall_task_t() : end(0) {
start.store(0); // MA: consistency of store to be checked
}
forall_task_t(const forall_task_t &t):end(t.end) {
start.store(t.start.load(std::memory_order_relaxed)); // MA: consistency of store to be checked
}
forall_task_t & operator=(const forall_task_t &t) {
start=t.start.load(std::memory_order_relaxed), end=t.end;
return *this;
}
void set(long s, long e) { start=s,end=e; }
std::atomic_long start;
long end;
};
struct dataPair {
std::atomic_long ntask;
ALIGN_TO_PRE(CACHE_LINE_SIZE)
forall_task_t task;
ALIGN_TO_POST(CACHE_LINE_SIZE)
dataPair():task() {
ntask.store(0); // MA: consistency of store to be checked
};
dataPair(const dataPair &d):task(d.task) {
ntask.store(d.ntask.load(std::memory_order_relaxed)); // MA: consistency of store to be checked
}
dataPair& operator=(const dataPair &d) { ntask=d.ntask.load(std::memory_order_relaxed), task=d.task; return *this; }
};
// comparison function
static inline bool data_cmp(const dataPair &a,const dataPair &b) {
return a.ntask < b.ntask;
}
// delay function for worker threads
static inline void workerlosetime_in(const bool aggressive) {
if (aggressive) PAUSE();
else ff_relax(0);
}
// parallel for/reduce task scheduler
class forall_Scheduler: public ff_node {
protected:
std::vector<bool> eossent;
std::vector<dataPair> data;
std::atomic_long maxid;
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
std::atomic_long _nextIteration;
#endif
protected:
// initialize the data vector
virtual inline size_t init_data(ssize_t start, ssize_t stop) {
static_scheduling = false; // enable work stealing in the nextTaskConcurrent
const long numtasks = std::lrint(std::ceil((stop-start)/(double)_step));
long totalnumtasks = std::lrint(std::ceil(numtasks/(double)_chunk));
long tt = totalnumtasks;
size_t ntxw = totalnumtasks / _nw;
size_t r = totalnumtasks % _nw;
// try to keep the n. of tasks per worker as small as possible
if (ntxw == 0 && r>=1) { ntxw = 1, r = 0; }
data.resize(_nw); eossent.resize(_nw);
taskv.resize(8*_nw); // 8 is the maximum n. of jumps, see the heuristic below
skip1=false,jump=0,maxid=-1;
ssize_t end, t=0, e;
for(size_t i=0;i<_nw && totalnumtasks>0;++i, totalnumtasks-=t) {
t = ntxw + ( (r>1 && (i<r)) ? 1 : 0 );
e = start + (t*_chunk - 1)*_step + 1;
end = (e<stop) ? e : stop;
data[i].ntask=t;
data[i].task.set(start,end);
start = (end-1)+_step;
}
if (totalnumtasks) {
assert(totalnumtasks==1);
// try to keep the n. of tasks per worker as small as possible
if (ntxw > 1) data[_nw-1].ntask += totalnumtasks;
else { --tt, _chunk*=2; }
data[_nw-1].task.end = stop;
}
// printf("init_data\n");
// for(size_t i=0;i<_nw;++i) {
// printf("W=%ld %ld <%ld,%ld>\n", i, data[i].ntask.load(), data[i].task.start.load(), data[i].task.end);
// }
// printf("totaltasks=%ld\n", tt);
return tt;
}
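// Worked example (illustrative only, values chosen for this sketch): with
// start=0, stop=100, step=1, _chunk=10 and _nw=4 the code above computes
//   numtasks      = ceil(100/1)  = 100
//   totalnumtasks = ceil(100/10) = 10
//   ntxw = 10/4 = 2,  r = 10%4 = 2
// so workers 0 and 1 initially own 3 chunks (iterations [0,30) and [30,60))
// while workers 2 and 3 own 2 chunks ([60,80) and [80,100)); no leftover task
// remains in this case.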
// initialize the data vector
virtual inline size_t init_data_static(long start, long stop) {
assert(_chunk <= 0);
static_scheduling = true; // this forces static scheduling in the nextTaskConcurrent
skip1=false,jump=0,maxid=-1;
if (_chunk == 0) {
// default static scheduling, i.e. the iteration space is almost equally divided
// in contiguous chunks among threads
const long numtasks = std::lrint(std::ceil((stop-start)/(double)_step));
long totalnumtasks = (long)_nw;
size_t r = numtasks % _nw;
_chunk = numtasks / long(_nw);
data.resize(_nw); taskv.resize(_nw);eossent.resize(_nw);
long end, e;
for(size_t i=0; totalnumtasks>0; ++i,--totalnumtasks) {
e = start + (_chunk - 1)*_step + 1 + ((i<r) ? _step : 0 );
end = (e<stop) ? e : stop;
data[i].ntask=1;
data[i].task.set(start,end);
start = (end-1)+_step;
}
if (r) ++_chunk;
return _nw;
}
// fill out the table with only the first task just to start the worker threads
long chunk = -_chunk;
_chunk = stop; // needed because sendTask has to send the range [begin, stop)
const long numtasks = std::lrint(std::ceil((stop-start)/(double)_step));
const long totalnumtasks = std::lrint(std::ceil(numtasks/(double)chunk));
const size_t ntxw = (std::min)(_nw, (size_t)totalnumtasks);
for(size_t i=0;i<ntxw;++i) {
data[i].ntask = 1;
data[i].task.set(start+long(i)*chunk,stop);
}
// printf("init_data_static\n");
// for(size_t i=0;i<_nw;++i) {
// long start=data[i].task.start;
// long ntask=data[i].ntask;
// printf("W=%ld %ld <%ld,%ld>\n", i, ntask.load(), start.load(), data[i].task.end);
// }
// printf("total task=%ld\n", ntxw);
return ntxw;
}
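// Worked example for the default static case (illustrative only): with
// start=0, stop=10, step=1, _chunk=0 and _nw=4 the code above assigns the
// contiguous ranges [0,3), [3,6), [6,8), [8,10), i.e. the first r = 10%4 = 2
// workers receive one extra iteration each; _chunk is finally set to 3.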
public:
forall_Scheduler(ff_loadbalancer* lb, long start, long stop, long step, long chunk, size_t nw):
lb(lb),_start(start),_stop(stop),_step(step),_chunk(chunk),totaltasks(0),_nw(nw),
jump(0),skip1(false),workersspinwait(false),static_scheduling(false) {
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
_nextIteration = _start;
#endif
maxid.store(-1); // MA: consistency of store to be checked
if (_chunk<=0) totaltasks = init_data_static(start,stop);
else totaltasks = init_data(start,stop);
assert(totaltasks>=1);
}
forall_Scheduler(ff_loadbalancer* lb, size_t nw):
lb(lb),_start(0),_stop(0),_step(1),_chunk(1),totaltasks(0),_nw(nw),
jump(0),skip1(false),workersspinwait(false),static_scheduling(false) {
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
_nextIteration = 0;
#endif
maxid.store(-1); // MA: consistency of store to be checked
totaltasks = init_data(0,0);
assert(totaltasks==0);
}
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
inline bool canUseNoStealing(){
return !globalSchedRunning && !static_scheduling && _step == 1 && _chunk == 1;
}
#endif
inline bool sendTask(const bool skipmore=false) {
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
if(canUseNoStealing()){
// Just start the workers and die.
for(size_t wid=0;wid<_nw;++wid) {
lb->ff_send_out_to((void*) &dummyTask, (int) wid);
}
return true;
}
#endif
size_t remaining = totaltasks;
const long endchunk = (_chunk-1)*_step + 1;
more:
for(size_t wid=0;wid<_nw;++wid) {
if (data[wid].ntask >0) {
long start = data[wid].task.start;
long end = (std::min)(start+endchunk, data[wid].task.end);
taskv[wid+jump].set(start, end);
lb->ff_send_out_to(&taskv[wid+jump], (int) wid);
--remaining, --data[wid].ntask;
(data[wid].task).start = (end-1)+_step;
eossent[wid]=false;
} else skip1=true; //skip2=skip3=true;
}
// January 2014 (massimo): this heuristic might not be the best option in the
// presence of very high load imbalance between iterations.
// Update: removed skip2 and skip3 so that it is less aggressive!
jump+=long(_nw);
assert((jump / _nw) <= 8);
// heuristic: try to assign more tasks at the very beginning
if (!skipmore && !skip1 && totaltasks>=4*_nw) { skip1=true; goto more;}
//if (!skip2 && totaltasks>=64*_nw) { skip1=false; skip2=true; goto moretask;}
//if (!skip3 && totaltasks>=1024*_nw){ skip1=false; skip2=false; skip3=true; goto moretask;}
return (remaining>0);
}
inline void sendWakeUp() {
for(size_t id=0;id<_nw;++id) {
taskv[id].set(0,0);
lb->ff_send_out_to(&taskv[id], int(id));
}
}
inline bool nextTaskConcurrentNoStealing(forall_task_t *task, const int wid) {
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
long r = _nextIteration.fetch_add(_step);
if(r >= _stop){return false;}
task->set(r, r + _step);
return true;
#else
FF_IGNORE_UNUSED(task);
FF_IGNORE_UNUSED(wid);
error("To use nextTaskConcurrentNoStealing you need to define macro FF_PARFOR_PASSIVE_NOSTEALING\n");
return false;
#endif
}
// this method is accessed concurrently by all worker threads
inline bool nextTaskConcurrent(forall_task_t *task, const int wid) {
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
if(canUseNoStealing()){
return nextTaskConcurrentNoStealing(task, wid);
}
#endif
const long endchunk = (_chunk-1)*_step + 1; // next end-point
auto id = wid;
L1:
if (data[id].ntask.load(std::memory_order_acquire)>0) {
auto oldstart = data[id].task.start.load(std::memory_order_relaxed);
auto end = (std::min)(oldstart+endchunk, data[id].task.end);
auto newstart = (end-1)+_step;
if (!data[id].task.start.compare_exchange_weak(oldstart, newstart,
std::memory_order_release,
std::memory_order_relaxed)) {
workerlosetime_in(_nw <= lb->getnworkers());
goto L1; // restart the sequence from the beginning
}
// after fetch_sub ntask may be less than 0
data[id].ntask.fetch_sub(1,std::memory_order_release);
if (oldstart<end) { // it might be possible that oldstart == end
task->set(oldstart, end);
return true;
}
}
// no available task for the current thread
if (static_scheduling) return false; // <------------------------------------
#if !defined(PARFOR_MULTIPLE_TASKS_STEALING)
// the following task-scheduling policy focuses mostly on load balancing
long _maxid = 0, ntask = 0;
if (maxid.load(std::memory_order_acquire)<0)
_maxid = (long) (std::max_element(data.begin(),data.end(),data_cmp) - data.begin());
else _maxid = maxid;
ntask = data[_maxid].ntask.load(std::memory_order_relaxed);
if (ntask>0) {
if (_maxid != maxid) maxid.store(_maxid, std::memory_order_release);
id = _maxid;
goto L1;
}
// no more tasks, exit
#else
// the following task-scheduling policy is a little more complex and costly.
// It tries to find a trade-off between task-to-thread locality and
// load balancing by moving a bunch of tasks from one thread to another.
long _maxid = 0, ntask = 0;
if (maxid.load(std::memory_order_acquire)<0)
_maxid = (std::max_element(data.begin(),data.end(),data_cmp) - data.begin());
else _maxid = maxid;
L2:
ntask = data[_maxid].ntask.load(std::memory_order_relaxed);
if (ntask>0) {
if (_maxid != maxid) maxid.store(_maxid, std::memory_order_release);
if (ntask<=3) { id = _maxid; goto L1; }
// try to steal half of the tasks remaining to _maxid
auto oldstart = data[_maxid].task.start.load(std::memory_order_relaxed);
auto q = ((data[_maxid].task.end-oldstart)/_chunk) >> 1;
if (q<=3) { id = _maxid; goto L1; }
auto newstart = oldstart + (q*_chunk-1)*_step +1;
if (!data[_maxid].task.start.compare_exchange_weak(oldstart, newstart,
std::memory_order_release,
std::memory_order_relaxed)) {
workerlosetime_in(_nw <= lb->getnworkers());
goto L2; // restart the sequence from the beginning
}
assert(newstart <= data[_maxid].task.end);
data[_maxid].ntask.fetch_sub(q, std::memory_order_release);
data[wid].task.start.store(oldstart, std::memory_order_relaxed);
data[wid].task.end = newstart;
data[wid].ntask.store(q, std::memory_order_release);
id = wid;
goto L1;
}
#endif
return false;
}
inline bool nextTask(forall_task_t *task, const int wid) {
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
if(canUseNoStealing()){
return nextTaskConcurrentNoStealing(task, wid);
}
#endif
const long endchunk = (_chunk-1)*_step + 1;
int id = wid;
if (data[id].ntask) {
L1:
long start = data[id].task.start;
long end = (std::min)(start+endchunk, data[id].task.end);
--data[id].ntask, (data[id].task).start = (end-1)+_step;
task->set(start, end);
return true;
}
// no available task for the current thread
#if !defined(PARFOR_MULTIPLE_TASKS_STEALING)
// the following task-scheduling policy focuses mostly on load balancing
if (maxid<0) { //check if maxid has been set
L2:
maxid = (long) (std::max_element(data.begin(),data.end(),data_cmp) - data.begin());
if (data[maxid].ntask > 0) {
id=maxid;
goto L1;
}
// no more tasks, exit
} else {
if (data[maxid].ntask > 0) {
id=maxid;
goto L1;
}
goto L2;
}
#else
auto flag=false;
if (maxid<0) {
L2:
maxid = (std::max_element(data.begin(),data.end(),data_cmp) - data.begin());
flag=true;
}
id = maxid;
if (data[id].ntask>0) {
if (data[id].ntask<=3) goto L1;
// steal half of the tasks
auto q = data[id].ntask >> 1, r = data[id].ntask & 0x1;
data[id].ntask = q;
data[wid].ntask = q+r;
data[wid].task.end = data[id].task.end;
data[id].task.end = data[id].task.start + (q*_chunk-1)*_step +1;
data[wid].task.start = data[id].task.end;
id = wid;
goto L1;
} else if (!flag) goto L2;
#endif
return false;
}
inline void* svc(void* t) {
if (t==NULL) {
if (totaltasks==0) { lb->broadcast_task(GO_OUT); return GO_OUT;}
sendTask();
return GO_ON;
}
auto wid = lb->get_channel_id();
assert(wid>=0);
if (--totaltasks <=0) {
if (!eossent[wid]) {
lb->ff_send_out_to(workersspinwait?EOS_NOFREEZE:GO_OUT, int(wid));
eossent[wid]=true;
}
return GO_OUT;
}
if (nextTask((forall_task_t*)t, (int) wid)) lb->ff_send_out_to(t, int(wid));
else {
if (!eossent[wid]) {
lb->ff_send_out_to((workersspinwait?EOS_NOFREEZE:GO_OUT), int(wid));
eossent[wid]=true;
}
}
return GO_ON;
}
inline void setloop(long start, long stop, long step, long chunk, size_t nw) {
_start=start, _stop=stop, _step=step, _chunk=chunk, _nw=nw;
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
_nextIteration = _start;
#endif
if (_chunk<=0) totaltasks = init_data_static(start,stop);
else totaltasks = init_data(start,stop);
assert(totaltasks>=1);
// adjust the number of workers that have to be started
if ( (totaltasks/(double)_nw) <= 1.0 || (totaltasks==1) )
_nw = totaltasks;
}
inline long startIdx() const { return _start;}
inline long stopIdx() const { return _stop;}
inline long stepIdx() const { return _step;}
inline size_t running() const { return _nw; }
inline void workersSpinWait() { workersspinwait=true;}
inline size_t getnumtasks() const { return totaltasks;}
protected:
// the following fields are used only by the scheduler thread
ff_loadbalancer *lb;
long _start,_stop,_step; // for step
long _chunk; // a chunk of indexes
size_t totaltasks; // total n. of tasks
size_t _nw; // num. of workers
long jump;
bool skip1;
bool workersspinwait;
bool static_scheduling;
std::vector<forall_task_t> taskv;
};
// parallel for/reduce worker node
template<typename Tres>
class forallreduce_W: public ff_node {
public:
typedef Tres Tres_t;
typedef std::function<void(const long,const long, const int, Tres&)> F_t;
protected:
virtual inline void losetime_in(unsigned long) {
//FFTRACE(lostpopticks+=ff_node::TICKS2WAIT; ++popwait); // FIX
workerlosetime_in(aggressive);
}
public:
forallreduce_W(forall_Scheduler *const sched, ffBarrier *const loopbar, F_t F):
sched(sched),loopbar(loopbar), schedRunning(true),
spinwait(false), aggressive(true),F(F) {}
inline void setSchedRunning(bool r) { schedRunning = r; }
inline void* svc(void* t) {
auto task = (forall_task_t*)t;
auto myid = get_my_id();
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
forall_task_t tmptask;
if(t != (void*) &dummyTask || schedRunning){
F(task->start,task->end,myid,res);
if (schedRunning) return t;
}else{
task = &tmptask;
}
#else
F(task->start,task->end,myid,res);
if (schedRunning) return t;
#endif
// the code below is executed only if the scheduler thread is not running
while(sched->nextTaskConcurrent(task,myid))
F(task->start,task->end,myid,res);
if (spinwait) {
loopbar->doBarrier(myid);
return GO_ON;
}
return GO_OUT;
}
inline void enableSpinWait() { spinwait=true; }
inline void setF(F_t _F, const Tres& idtt, bool a=true) {
F=_F, res=idtt, aggressive=a;
}
inline const Tres& getres() const { return res; }
protected:
forall_Scheduler *const sched;
ffBarrier *const loopbar;
bool schedRunning;
protected:
bool spinwait,aggressive;
F_t F;
Tres res;
};
class forallpipereduce_W: public forallreduce_W<ff_buffernode> {
public:
typedef ff_buffernode Tres_t;
typedef std::function<void(const long,const long, const int, ff_buffernode&)> F_t;
public:
forallpipereduce_W(forall_Scheduler *const sched,ffBarrier *const loopbar, F_t F):
forallreduce_W<ff_buffernode>(sched,loopbar,F) {
res.set(8192,false,get_my_id());
res.init_blocking_stuff();
}
inline void* svc(void* t) {
auto task = (forall_task_t*)t;
auto myid = get_my_id();
F(task->start,task->end,myid,res);
if (schedRunning) return t;
// the code below is executed only if the scheduler thread is not running
while(sched->nextTaskConcurrent(task,myid))
F(task->start,task->end,myid,res);
if (spinwait) {
res.ff_send_out(EOS);
loopbar->doBarrier(myid);
return GO_ON;
}
return GO_OUT;
}
void svc_end() { res.ff_send_out(EOS); }
inline void setF(F_t _F, const Tres_t&, bool a=true) {
F=_F, aggressive=a;
}
// The following methods are custom for this node which is not multi-output. FIX
bool isMultiOutput() const { return true; }
void get_out_nodes(svector<ff_node*> &w) { w.push_back(&res); }
void get_out_nodes_feedback(svector<ff_node*> &w) { w.push_back(this); }
};
template <typename Worker_t>
class ff_forall_farm: public ff_farm {
public:
typedef typename Worker_t::Tres_t Tres_t;
typedef typename Worker_t::F_t F_t;
protected:
// removes possible EOS still in the input queues of the workers
inline void resetqueues(const int _nw) {
const svector<ff_node*> &nodes = getWorkers();
for(int i=0;i<_nw;++i) nodes[i]->reset();
}
// used just to redefine losetime_in
class foralllb_t: public ff_loadbalancer {
protected:
virtual inline void losetime_in(unsigned long) {
if ((int)(getnworkers())>=ncores) {
//FFTRACE(lostpopticks+=(100*TICKS2WAIT);++popwait); // FIX: adjust tracing
ff_relax(0);
return;
}
//FFTRACE(lostpushticks+=TICKS2WAIT;++pushwait);
PAUSE();
}
public:
foralllb_t(size_t n):ff_loadbalancer(n),ncores(ff_realNumCores()) {}
inline int getNCores() const { return ncores;}
private:
const int ncores;
};
private:
Tres_t t; // not used
size_t numCores;
ffBarrier *loopbar;
public:
ff_forall_farm(ssize_t maxnw, const bool spinwait=false, const bool skipwarmup=false, const bool spinbarrier=false):
ff_farm(false,8*DEF_MAX_NUM_WORKERS,8*DEF_MAX_NUM_WORKERS,
true, DEF_MAX_NUM_WORKERS,true), // cleanup at exit !
loopbar( (spinwait && spinbarrier) ?
(ffBarrier*)(new spinBarrier(maxnw<=0?DEF_MAX_NUM_WORKERS+1:(size_t)(maxnw+1))) :
(ffBarrier*)(new Barrier(maxnw<=0?DEF_MAX_NUM_WORKERS+1:(size_t)(maxnw+1))) ),
skipwarmup(skipwarmup),spinwait(spinwait) {
foralllb_t* _lb = new foralllb_t(DEF_MAX_NUM_WORKERS);
assert(_lb);
ff_farm::setlb(_lb);
numCores = ((foralllb_t*const)getlb())->getNCores();
if (maxnw<=0) maxnw=numCores;
std::vector<ff_node *> forall_w;
auto donothing=[](const long,const long,const int,const Tres_t&) -> void { };
forall_Scheduler *sched = new forall_Scheduler(getlb(),maxnw);
ff_farm::add_emitter(sched);
for(size_t i=0;i<(size_t)maxnw;++i)
forall_w.push_back(new Worker_t(sched, loopbar, donothing));
ff_farm::add_workers(forall_w);
ff_farm::wrap_around();
// needed to avoid the initial barrier (see (**) below)
if (ff_farm::prepare() < 0)
error("running base forall farm(2)\n");
// NOTE: the warmup phase has to be done, if not now then later on.
// The run_then_freeze method will fail if skipwarmup is true.
if (!skipwarmup) {
auto r=-1;
getlb()->freeze();
if (getlb()->run() != -1)
r = getlb()->wait_freezing();
if (r<0) error("running base forall farm(1)\n");
}
if (spinwait) {
sched->workersSpinWait();
for(size_t i=0;i<(size_t)maxnw;++i) {
//auto w = (forallreduce_W<Tres>*)forall_w[i];
auto w = (Worker_t*)forall_w[i];
w->enableSpinWait();
}
//resetqueues(maxnw);
}
ff_farm::cleanup_all(); // delete everything at exit
}
virtual ~ff_forall_farm() {
if (loopbar) delete loopbar;
if (ff_farm::getlb()) delete ff_farm::getlb();
}
// It returns true if the scheduler has to be started, false otherwise.
//
// Unless the removeSched flag is set, the scheduler thread will be started
// only if there are fewer threads than cores AND the number of tasks per thread
// is greater than 1. In case of static scheduling (i.e. chunk<=0), the scheduler
// is never started because numtasks == nwtostart.
//
// By defining at compile time NO_PARFOR_SCHEDULER_THREAD the
// scheduler won't be started.
//
// To always start the scheduler thread, the PARFOR_SCHEDULER_THREAD
// may be defined at compile time.
//
inline bool startScheduler(const size_t nwtostart, const size_t numtasks) const {
#if defined(NO_PARFOR_SCHEDULER_THREAD)
return false;
#elif defined(PARFOR_SCHEDULER_THREAD)
return true;
#else
if (removeSched) return false;
return ((numtasks > nwtostart) && (nwtostart < numCores));
#endif
}
// set/reset removeSched flag
// By calling this method with 'true' the scheduler will be disabled.
//
// NOTE:
// Sometimes it may be useful (in terms of performance) to explicitly disable
// the scheduler thread when #numworkers > ff_realNumCores() on systems where
// ff_numCores() > ff_realNumCores() (i.e. HT or SMT is enabled).
inline void disableScheduler(bool onoff=true) { removeSched=onoff; }
inline int run_then_freeze(ssize_t nw_=-1) {
assert(skipwarmup == false);
const ssize_t nwtostart = (nw_ == -1)?getNWorkers():nw_;
auto r = -1;
if (schedRunning) {
getlb()->skipfirstpop(true);
if (spinwait) {
// NOTE: here we have to be sure to send one task to each worker!
((forall_Scheduler*)getEmitter())->sendTask(true);
}
r=ff_farm::run_then_freeze(nwtostart);
} else {
if (spinwait) {
// all worker threads have already crossed the barrier so it is safe to restart it
loopbar->barrierSetup(nwtostart+1);
// NOTE: here it is not possible to use sendTask, otherwise there could be
// a race between the main thread and the workers in accessing the task table.
((forall_Scheduler*)getEmitter())->sendWakeUp();
} else
((forall_Scheduler*)getEmitter())->sendTask(true);
r = getlb()->thawWorkers(true, nwtostart);
}
return r;
}
inline int run_and_wait_end() {
assert(spinwait == false);
const size_t nwtostart = getnw();
auto r= -1;
if (schedRunning) {
//resetqueues(nwtostart);
getlb()->skipfirstpop(true);
// (**) this way we avoid the initial barrier
if (getlb()->runlb()!= -1) {
if (getlb()->runWorkers(nwtostart)!=-1)
r = getlb()->wait();
}
} else {
((forall_Scheduler*)getEmitter())->sendTask(true);
if (getlb()->runWorkers(nwtostart) != -1)
r = getlb()->waitWorkers();
}
return r;
}
// it puts all threads to sleep but does not disable the spinWait flag
inline int stopSpinning() {
if (!spinwait) return -1;
// getnworkers() returns the number of threads that are running
// it may be different from getnw() (i.e. the n. of threads currently
// executing the parallel iterations)
size_t running = getlb()->getnworkers();
if (running == (size_t)-1) return 0;
getlb()->freezeWorkers();
getlb()->broadcast_task(GO_OUT);
return getlb()->wait_freezingWorkers();
}
inline int enableSpinning() {
if (spinwait) return -1;
const svector<ff_node*> &nodes = getWorkers();
for(size_t i=0;i<nodes.size();++i) {
auto w = (Worker_t*)nodes[i];
w->enableSpinWait();
}
((forall_Scheduler*)getEmitter())->workersSpinWait();
spinwait = true;
return 0;
}
inline int wait_freezing() {
//if (startScheduler(getnw())) return getlb()->wait_lb_freezing();
if (schedRunning) return getlb()->wait_lb_freezing();
if (spinwait) {
loopbar->doBarrier(getnw());
return 0;
}
return getlb()->wait_freezingWorkers();
}
inline int wait() {
if (spinwait){
const svector<ff_node*> &nodes = getWorkers();
for(size_t i=0;i<nodes.size();++i)
getlb()->ff_send_out_to(EOS,i);
}
return ff_farm::wait();
}
inline void setF(F_t _F, const Tres_t& idtt=Tres_t()) { //(Tres)0) {
const size_t nw = getnw();
const svector<ff_node*> &nodes = getWorkers();
// aggressive mode enabled if the number of threads is less than
// or equal to the number of cores
const bool mode = (nw <= numCores);
// NOTE: in case of static scheduling, the scheduler is never started !
schedRunning = (!removeSched && startScheduler(nw, ((forall_Scheduler*)getEmitter())->getnumtasks()));
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
globalSchedRunning = schedRunning;
#endif
if (schedRunning) {
for(size_t i=0;i<nw;++i) {
//auto w = (forallreduce_W<Tres>*)nodes[i];
auto w = (Worker_t*)nodes[i];
w->setF(_F, idtt, mode);
w->setSchedRunning(true);
}
} else {
for(size_t i=0;i<nw;++i) {
//auto w = (forallreduce_W<Tres>*)nodes[i];
auto w = (Worker_t*)nodes[i];
w->setF(_F, idtt, mode);
w->setSchedRunning(false);
}
}
}
/* NOTE: - chunk>0 means dynamic scheduling with grain equal to chunk, that is,
*          no more than chunk iterations at a time are computed by one thread;
*        - chunk==0 means default static scheduling, that is, a bunch of
*          ~(#iterations/nw) contiguous iterations is computed by each thread;
*        - chunk<0 means static scheduling with grain equal to |chunk|, that is,
*          the iteration space is divided into chunks, each of no more than
*          |chunk| iterations, and chunks are assigned to the threads in a
*          round-robin fashion (see the illustrative sketch after setloop below).
*/
inline void setloop(long begin,long end,long step,long chunk,long nw) {
if (nw>(ssize_t)getNWorkers()) {
error("The number of threads specified is greater than the number set in the ParallelFor* constructor, it will be downsized\n");
nw = getNWorkers();
}
assert(nw<=(ssize_t)getNWorkers());
forall_Scheduler *sched = (forall_Scheduler*)getEmitter();
sched->setloop(begin,end,step,chunk,(nw<=0)?getNWorkers():(size_t)nw);
}
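/* Illustrative sketch (not part of the original sources): assuming an
* ff_forall_farm-based object 'pf' and hypothetical values N and nw, the three
* scheduling modes described in the NOTE above can be selected as follows:
*
*   pf.setloop(0, N, 1,  8, nw);  // dynamic scheduling, at most 8 iterations at a time
*   pf.setloop(0, N, 1,  0, nw);  // default static, ~N/nw contiguous iterations per thread
*   pf.setloop(0, N, 1, -8, nw);  // static scheduling, chunks of 8 iterations, round-robin
*/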
// return the number of workers running or supposed to run
inline size_t getnw() { return ((const forall_Scheduler*)getEmitter())->running(); }
inline const Tres_t& getres(int i) {
//return ((forallreduce_W<Tres>*)(getWorkers()[i]))->getres();
return ((Worker_t*)(getWorkers()[i]))->getres();
}
inline long startIdx(){ return ((const forall_Scheduler*)getEmitter())->startIdx(); }
inline long stopIdx() { return ((const forall_Scheduler*)getEmitter())->stopIdx(); }
inline long stepIdx() { return ((const forall_Scheduler*)getEmitter())->stepIdx(); }
void resetskipwarmup() { assert(skipwarmup); skipwarmup=false;}
protected:
bool removeSched = false;
bool schedRunning= true;
bool skipwarmup = false;
bool spinwait = false;
};
} // namespace ff
#endif /* FF_PARFOR_INTERNALS_HPP */