/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */

/*!
 * \link
 * \file parallel_for_internals.hpp
 * \ingroup aux_classes
 *
 * \brief Internal classes and functions for parallel_for/parallel_reduce skeletons.
 */

/* ***************************************************************************
 *
 *  FastFlow is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License version 3 as
 *  published by the Free Software Foundation.
 *  Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
 *  or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
 *
 *  This program is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 *  License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software Foundation,
 *  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 ****************************************************************************
 */

/*
 *  - Author:
 *    Massimo Torquati <torquati@di.unipi.it>
 *
 *  History:
 *   - started in May 2013
 *   - January 2014: code optimized
 *   - February 2014:
 *      - avoided starting the Scheduler thread when it is not needed;
 *        a new (non lock-free) decentralized scheduler has been implemented
 *        for the case when adding an extra thread is not useful
 *      - introduced the parallel_for functions
 *      - added the ParallelFor and ParallelForReduce classes
 *   - June 2014:
 *      - parallel_for_static
 *
 */
|
|
|
|
#ifndef FF_PARFOR_INTERNALS_HPP
#define FF_PARFOR_INTERNALS_HPP

// #ifndef __INTEL_COMPILER
// // see http://www.stroustrup.com/C++11FAQ.html#11
// #if __cplusplus <= 199711L
// #error "parallel_for requires C++11 features"
// #endif
// #endif

#include <atomic>
#include <algorithm>
#include <deque>
#include <vector>
#include <cmath>
#include <functional>
#include <ff/lb.hpp>
#include <ff/node.hpp>
#include <ff/farm.hpp>
#include <ff/spin-lock.hpp>

enum {FF_AUTO=-1};

#ifdef FF_PARFOR_PASSIVE_NOSTEALING
static int dummyTask;
static bool globalSchedRunning;
#endif

#if defined(__ICC)
#define PRAGMA_IVDEP _Pragma("ivdep")
#else
#define PRAGMA_IVDEP
#endif
|
|
|
|
namespace ff {
|
|
|
|
/* -------------------- Parallel For/Reduce Macros -------------------- */
/* Usage example:
 *                                 // loop parallelization using 3 workers
 *                                 // and a minimum task grain of 2
 *                                 wthread = 3;
 *                                 grain   = 2;
 *  for(int i=0;i<N;++i)           FF_PARFOR_BEGIN(for,i,0,N,1,grain,wthread) {
 *    A[i]=f(i)              ---->   A[i]=f(i);
 *                                 } FF_PARFOR_END(for);
 *
 *  parallel for + reduction:
 *
 *                                 s=4;
 *  for(int i=0;i<N;++i)           FF_PARFORREDUCE_BEGIN(for,s,0,i,0,N,1,grain,wthread) {
 *    s*=f(i)                ---->   s*=f(i);
 *                                 } FF_PARFORREDUCE_END(for,s,*);
 *
 *
 *                                 FF_PARFOR_INIT(pf,maxwthread);
 *                                 ....
 *  while(k<nTime) {               while(k<nTime) {
 *    for(int i=0;i<N;++i)           FF_PARFORREDUCE_START(pf,s,0,i,0,N,1,grain,wthread) {
 *      s*=f(i,k);           ---->     s*=f(i,k);
 *  } }                              } FF_PARFORREDUCE_STOP(pf,s,*);
 *                                 }
 *                                 ....
 *
 *                                 FF_PARFOR_DONE(pf);
 *
 *
 * NOTE: inside the body of the PARFOR/PARFORREDUCE, it is possible to use the
 *       '_ff_thread_id' const integer variable to identify the thread id
 *       running the sequential portion of the loop.
 */
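
/* A minimal, self-contained usage sketch of the INIT/START/STOP/DONE pattern
 * shown above (illustrative only; it assumes the FastFlow headers are on the
 * include path and that ff/parallel_for.hpp pulls in these internals):
 *
 *   #include <vector>
 *   #include <ff/parallel_for.hpp>
 *   using namespace ff;
 *
 *   int main() {
 *       const long N = 1000;
 *       std::vector<double> A(N, 0.0);
 *       FF_PARFOR_INIT(pf, 4);                      // at most 4 worker threads
 *       FF_PARFOR_START(pf, i, 0, N, 1, 2, 4) {     // grain 2, 4 workers
 *           A[i] = 2.0 * i;
 *       } FF_PARFOR_STOP(pf);
 *       FF_PARFOR_DONE(pf);
 *       return 0;
 *   }
 */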
|
|
|
|
/**
|
|
* name : of the parallel for
|
|
* idx : iteration index
|
|
* begin: for starting point
|
|
* end : for ending point
|
|
* step : for step
|
|
* chunk: chunk size
|
|
* nw : n. of worker threads
|
|
*/
|
|
#define FF_PARFOR_BEGIN(name, idx, begin, end, step, chunk, nw) \
|
|
ff_forall_farm<forallreduce_W<int> > name(nw,false,true); \
|
|
name.setloop(begin,end,step,chunk,nw); \
|
|
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
|
|
const int _ff_thread_id, const int) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id); \
|
|
PRAGMA_IVDEP; \
|
|
for(long idx=ff_start_##idx;idx<ff_stop_##idx;idx+=step)
|
|
|
|
/* This is equivalent to the one above except that the user has to define
 * the for loop over the range [ff_start_idx, ff_stop_idx).
 * This can be useful if you have to perform some actions before starting
 * the local loop and/or some actions after the local loop finishes.
 * The onoff parameter allows the scheduler thread to be enabled/disabled
 * (by default the scheduler is active).
 */
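
/* A minimal sketch of the _IDX variant (illustrative; 'A', 'f', 'N' and
 * 'nworkers' are user-defined; passing false as the last parameter keeps the
 * scheduler thread active):
 *
 *   FF_PARFOR_BEGIN_IDX(loopidx, i, 0, N, 1, 8, nworkers, false) {
 *       // per-thread prologue may go here
 *       for (long i = ff_start_idx; i < ff_stop_idx; ++i) A[i] = f(i);
 *       // per-thread epilogue may go here
 *   } FF_PARFOR_END(loopidx);
 */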
|
|
#define FF_PARFOR_BEGIN_IDX(name, idx, begin, end, step, chunk, nw, onoff) \
|
|
ff_forall_farm<forallreduce_W<int> > name(nw,false,true); \
|
|
name.setloop(begin,end,step,chunk, nw); \
|
|
name.disableScheduler(onoff); \
|
|
auto F_##name = [&] (const long ff_start_idx, const long ff_stop_idx, \
|
|
const int _ff_thread_id, const int) { \
|
|
/* here you have to define the for loop using ff_start/stop_idx */
|
|
|
|
|
|
#define FF_PARFOR_END(name) \
|
|
}; \
|
|
{ \
|
|
if (name.getnw()>1) { \
|
|
name.setF(F_##name); \
|
|
if (name.run_and_wait_end()<0) { \
|
|
error("running parallel for\n"); \
|
|
} \
|
|
} else F_##name(name.startIdx(),name.stopIdx(),0,0); \
|
|
}
|
|
|
|
|
|
/* ---------------------------------------------- */
|
|
|
|
/**
|
|
* name : of the parallel for
|
|
* var : variable on which the reduce operator is applied
|
|
* identity: the value such that var == var op identity
|
|
* idx : iteration index
|
|
* begin : for starting point
|
|
* end : for ending point
|
|
* step : for step
|
|
* chunk : chunk size
|
|
* nw : n. of worker threads
|
|
*
|
|
* op : reduce operation (+ * ....)
|
|
*/
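
/* A minimal sum-reduction sketch (illustrative; 'A', 'N', 'grain' and
 * 'nworkers' are user-defined):
 *
 *   double sum = 0.0;
 *   FF_PARFORREDUCE_BEGIN(redsum, sum, 0.0, i, 0, N, 1, grain, nworkers) {
 *       sum += A[i];
 *   } FF_PARFORREDUCE_END(redsum, sum, +);
 */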
|
|
#define FF_PARFORREDUCE_BEGIN(name, var,identity, idx,begin,end,step, chunk, nw) \
|
|
ff_forall_farm<forallreduce_W<decltype(var)> > name(nw,false,true); \
|
|
name.setloop(begin,end,step,chunk,nw); \
|
|
auto idtt_##name =identity; \
|
|
auto F_##name =[&](const long start,const long stop,const int _ff_thread_id, \
|
|
decltype(var) &var) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id); \
|
|
PRAGMA_IVDEP; \
|
|
for(long idx=start;idx<stop;idx+=step)
|
|
|
|
#define FF_PARFORREDUCE_BEGIN_IDX(name, var,identity, idx,begin,end,step, chunk, nw, onoff) \
|
|
ff_forall_farm<forallreduce_W<decltype(var)> > name(nw,false,true); \
|
|
name.setloop(begin,end,step,chunk,nw); \
|
|
name.disableScheduler(onoff); \
|
|
auto idtt_##name =identity; \
|
|
auto F_##name =[&](const long ff_start_idx,const long ff_stop_idx,const int _ff_thread_id, \
|
|
decltype(var) &var) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id);
|
|
|
|
|
|
|
|
#define FF_PARFORREDUCE_END(name, var, op) \
|
|
}; \
|
|
if (name.getnw()>1) { \
|
|
auto ovar_##name = var; \
|
|
name.setF(F_##name,idtt_##name); \
|
|
if (name.run_and_wait_end()<0) { \
|
|
error("running forall_##name\n"); \
|
|
} \
|
|
var = ovar_##name; \
|
|
for(size_t i=0;i<name.getnw();++i) { \
|
|
var op##= name.getres(i); \
|
|
} \
|
|
} else { \
F_##name(name.startIdx(),name.stopIdx(),0,var); \
}
|
|
|
|
|
|
#define FF_PARFORREDUCE_F_END(name, var, F) \
|
|
}; \
|
|
if (name.getnw()>1) { \
|
|
auto ovar_##name = var; \
|
|
name.setF(F_##name,idtt_##name); \
|
|
if (name.run_and_wait_end()<0) \
|
|
error("running ff_forall_farm (reduce F end)\n"); \
|
|
var = ovar_##name; \
|
|
for(size_t i=0;i<name.getnw();++i) { \
|
|
F(var,name.getres(i)); \
|
|
} \
|
|
} else { \
|
|
F_##name(name.startIdx(),name.stopIdx(),0,var); \
|
|
}
|
|
|
|
|
|
/* ---------------------------------------------- */
|
|
|
|
/* FF_PARFOR_START and FF_PARFOR_STOP have the same meaning as
 * FF_PARFOR_BEGIN and FF_PARFOR_END, but they have to be used in
 * conjunction with FF_PARFOR_INIT and FF_PARFOR_DONE.
 *
 * The same holds for FF_PARFORREDUCE_START/STOP.
 */
|
|
#define FF_PARFOR_INIT(name, nw) \
|
|
ff_forall_farm<forallreduce_W<int> > *name = \
|
|
new ff_forall_farm<forallreduce_W<int> >(nw)
|
|
|
|
#define FF_PARFOR_DECL(name) ff_forall_farm<forallreduce_W<int> > * name
|
|
#define FF_PARFOR_ASSIGN(name,nw) name=new ff_forall_farm<forallreduce_W<int> >(nw)
|
|
#define FF_PARFOR_DONE(name) name->stop(); name->wait(); delete name;
|
|
|
|
#define FF_PARFORREDUCE_INIT(name, type, nw) \
|
|
ff_forall_farm<forallreduce_W<type> > *name = \
|
|
new ff_forall_farm<forallreduce_W<type> >(nw)
|
|
|
|
#define FF_PARFORREDUCE_DECL(name,type) ff_forall_farm<forallreduce_W<type> > * name
|
|
#define FF_PARFORREDUCE_ASSIGN(name,type,nw) name= \
|
|
new ff_forall_farm<forallreduce_W<type> >(nw)
|
|
#define FF_PARFORREDUCE_DONE(name) name->stop();name->wait();delete name
|
|
|
|
#define FF_PARFOR_START(name, idx, begin, end, step, chunk, nw) \
|
|
name->setloop(begin,end,step,chunk,nw); \
|
|
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
|
|
const int _ff_thread_id, const int) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id); \
|
|
PRAGMA_IVDEP; \
|
|
for(long idx=ff_start_##idx;idx<ff_stop_##idx;idx+=step)
|
|
|
|
#define FF_PARFOR2_START(name, idx, begin, end, step, chunk, nw) \
|
|
name->setloop(begin,end,step,chunk,nw); \
|
|
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
|
|
const int _ff_thread_id, const int) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id);
|
|
/* here you have to define the for loop using ff_start/stop_##idx */
|
|
|
|
/* this is equivalent to FF_PARFOR2_START but the start/stop indexes have a fixed name */
|
|
#define FF_PARFOR_START_IDX(name, idx, begin, end, step, chunk, nw) \
|
|
name->setloop(begin,end,step,chunk,nw); \
|
|
auto F_##name = [&] (const long ff_start_idx, const long ff_stop_idx, \
|
|
const int _ff_thread_id, const int) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id);
|
|
/* here you have to define the for loop using ff_start/stop_idx */
|
|
|
|
|
|
// just another variant that may be used together with FF_PARFORREDUCE_INIT
|
|
#define FF_PARFOR_T_START(name, type, idx, begin, end, step, chunk, nw) \
|
|
name->setloop(begin,end,step,chunk,nw); \
|
|
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
|
|
const int _ff_thread_id, const type&) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id); \
|
|
PRAGMA_IVDEP; \
|
|
for(long idx=ff_start_##idx;idx<ff_stop_##idx;idx+=step)
|
|
|
|
|
|
// just another variant that may be used together with FF_PARFORREDUCE_INIT
|
|
#define FF_PARFOR_T_START_STATIC(name, type, idx, begin, end, step, chunk, nw) \
|
|
assert(chunk<=0); \
|
|
name->setloop(begin,end,step,chunk,nw); \
|
|
auto F_##name = [&] (const long ff_start_##idx, const long ff_stop_##idx, \
|
|
const int _ff_thread_id, const type&) { \
|
|
const long _ff_jump0=(name->getnw())*(-chunk*step); \
|
|
const long _ff_jump1=(-chunk*step); \
|
|
FF_IGNORE_UNUSED(_ff_thread_id); \
|
|
PRAGMA_IVDEP; \
|
|
for(long _ff_##idx=ff_start_##idx;_ff_##idx<ff_stop_##idx;_ff_##idx+=_ff_jump0) \
|
|
for(long idx=_ff_##idx,_ff_end_##idx=std::min(ff_stop_##idx,_ff_##idx +_ff_jump1); \
|
|
idx<_ff_end_##idx;idx+=step)
|
|
|
|
|
|
#define FF_PARFOR_T_START_IDX(name, type, idx, begin, end,step,chunk, nw) \
|
|
name->setloop(begin,end,step,chunk,nw); \
|
|
auto F_##name = [&] (const long ff_start_idx, const long ff_stop_idx, \
|
|
const int _ff_thread_id, const type&) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id);
|
|
/* here you have to use the fixed indexes ff_start_idx, ff_stop_idx */
|
|
|
|
#define FF_PARFOR_STOP(name) \
|
|
}; \
|
|
if (name->getnw()>1) { \
|
|
name->setF(F_##name); \
|
|
if (name->run_then_freeze(name->getnw())<0) \
|
|
error("running ff_forall_farm (name)\n"); \
|
|
name->wait_freezing(); \
|
|
} else F_##name(name->startIdx(),name->stopIdx(),0,0);
|
|
|
|
#define FF_PARFOR_T_STOP(name, type) \
|
|
}; \
|
|
if (name->getnw()>1) { \
|
|
name->setF(F_##name, type()); \
|
|
if (name->run_then_freeze(name->getnw())<0) \
|
|
error("running ff_forall_farm (name)\n"); \
|
|
name->wait_freezing(); \
|
|
} else { \
|
|
F_##name(name->startIdx(),name->stopIdx(),0,type()); \
|
|
}
|
|
|
|
#define FF_PARFORREDUCE_START(name, var,identity, idx,begin,end,step, chunk, nw) \
|
|
name->setloop(begin,end,step,chunk,nw); \
|
|
auto idtt_##name =identity; \
|
|
auto F_##name =[&](const long ff_start_##idx, const long ff_stop_##idx, \
|
|
const int _ff_thread_id, decltype(var) &var) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id); \
|
|
PRAGMA_IVDEP \
|
|
for(long idx=ff_start_##idx;idx<ff_stop_##idx;idx+=step)
|
|
|
|
#define FF_PARFORREDUCE_START_IDX(name, var,identity, idx,begin,end,step, chunk, nw) \
|
|
name->setloop(begin,end,step,chunk,nw); \
|
|
auto idtt_##name =identity; \
|
|
auto F_##name =[&](const long ff_start_idx, const long ff_stop_idx, \
|
|
const int _ff_thread_id, decltype(var) &var) { \
|
|
FF_IGNORE_UNUSED(_ff_thread_id);
|
|
|
|
|
|
|
|
#define FF_PARFORREDUCE_START_STATIC(name, var,identity, idx,begin,end,step, chunk, nw) \
|
|
assert(chunk<=0); \
|
|
name->setloop(begin,end,step,chunk,nw); \
|
|
auto idtt_##name =identity; \
|
|
auto F_##name =[&](const long ff_start_##idx, const long ff_stop_##idx, \
|
|
const int _ff_thread_id, decltype(var) &var) { \
|
|
const long _ff_jump0=(name->getnw())*(-chunk*step); \
|
|
const long _ff_jump1=(-chunk*step); \
|
|
PRAGMA_IVDEP; \
|
|
for(long _ff_##idx=ff_start_##idx;_ff_##idx<ff_stop_##idx;_ff_##idx+=_ff_jump0) \
|
|
for(long idx=_ff_##idx,_ff_end_##idx=std::min(ff_stop_##idx,_ff_##idx +_ff_jump1); \
|
|
idx<_ff_end_##idx;idx+=step)
|
|
|
|
|
|
#define FF_PARFORREDUCE_STOP(name, var, op) \
|
|
}; \
|
|
if (name->getnw()>1) { \
|
|
auto ovar_##name = var; \
|
|
name->setF(F_##name,idtt_##name); \
|
|
if (name->run_then_freeze(name->getnw())<0) \
|
|
error("running ff_forall_farm (name)\n"); \
|
|
name->wait_freezing(); \
|
|
var = ovar_##name; \
|
|
for(size_t i=0;i<name->getnw();++i) { \
|
|
var op##= name->getres(i); \
|
|
} \
|
|
} else { \
|
|
F_##name(name->startIdx(),name->stopIdx(),0,var); \
|
|
}
|
|
|
|
|
|
#define FF_PARFORREDUCE_F_STOP(name, var, F) \
|
|
}; \
|
|
if (name->getnw()>1) { \
|
|
auto ovar_##name = var; \
|
|
name->setF(F_##name,idtt_##name); \
|
|
if (name->run_then_freeze(name->getnw())<0) \
|
|
error("running ff_forall_farm (name)\n"); \
|
|
name->wait_freezing(); \
|
|
var = ovar_##name; \
|
|
for(size_t i=0;i<name->getnw();++i) { \
|
|
F(var,name->getres(i)); \
|
|
} \
|
|
} else { \
|
|
F_##name(name->startIdx(),name->stopIdx(),0,var); \
|
|
}
|
|
|
|
|
|
|
|
//
|
|
// see NOTE in setloop to understand the meaning of 'default static'
|
|
// 'static with grain size' and 'dynamic with grain size'
|
|
//
|
|
#define PARFOR_STATIC(X) (X>0?-X:X)
|
|
#define PARFOR_DYNAMIC(X) (X<0?-X:X)
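
// Example (see the NOTE above ff_forall_farm::setloop): PARFOR_STATIC(4)
// evaluates to -4, i.e. static scheduling with grain 4, while PARFOR_DYNAMIC(-4)
// evaluates to 4, i.e. dynamic scheduling with grain 4; a zero argument is left
// unchanged and selects the default static partitioning.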
|
|
|
|
/* ------------------------------------------------------------------- */
|
|
|
|
|
|
|
|
// parallel-for task: it represents a half-open range [start, end) of indexes
|
|
struct forall_task_t {
|
|
forall_task_t() : end(0) {
|
|
start.store(0); // MA: consistency of store to be checked
|
|
}
|
|
forall_task_t(const forall_task_t &t):end(t.end) {
|
|
start.store(t.start.load(std::memory_order_relaxed)); // MA: consistency of store to be checked
|
|
}
|
|
forall_task_t & operator=(const forall_task_t &t) {
|
|
start=t.start.load(std::memory_order_relaxed), end=t.end;
|
|
return *this;
|
|
}
|
|
void set(long s, long e) { start=s,end=e; }
|
|
|
|
std::atomic_long start;
|
|
long end;
|
|
};
|
|
struct dataPair {
|
|
std::atomic_long ntask;
|
|
ALIGN_TO_PRE(CACHE_LINE_SIZE)
|
|
forall_task_t task;
|
|
ALIGN_TO_POST(CACHE_LINE_SIZE)
|
|
|
|
dataPair():task() {
|
|
ntask.store(0); // MA: consistency of store to be checked
|
|
};
|
|
dataPair(const dataPair &d):task(d.task) {
|
|
ntask.store(d.ntask.load(std::memory_order_relaxed)); // MA: consistency of store to be checked
|
|
}
|
|
dataPair& operator=(const dataPair &d) { ntask=d.ntask.load(std::memory_order_relaxed), task=d.task; return *this; }
|
|
};
|
|
|
|
// comparison function
|
|
static inline bool data_cmp(const dataPair &a,const dataPair &b) {
|
|
return a.ntask < b.ntask;
|
|
}
|
|
// delay function for worker threads
|
|
static inline void workerlosetime_in(const bool aggressive) {
|
|
if (aggressive) PAUSE();
|
|
else ff_relax(0);
|
|
}
|
|
|
|
|
|
// parallel for/reduce task scheduler
|
|
class forall_Scheduler: public ff_node {
|
|
protected:
|
|
std::vector<bool> eossent;
|
|
std::vector<dataPair> data;
|
|
std::atomic_long maxid;
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
std::atomic_long _nextIteration;
|
|
#endif
|
|
protected:
|
|
// initialize the data vector
|
|
virtual inline size_t init_data(ssize_t start, ssize_t stop) {
|
|
static_scheduling = false; // enable work stealing in the nextTaskConcurrent
|
|
const long numtasks = std::lrint(std::ceil((stop-start)/(double)_step));
|
|
long totalnumtasks = std::lrint(std::ceil(numtasks/(double)_chunk));
|
|
long tt = totalnumtasks;
|
|
size_t ntxw = totalnumtasks / _nw;
|
|
size_t r = totalnumtasks % _nw;
|
|
|
|
// try to keep the n. of tasks per worker as small as possible
|
|
if (ntxw == 0 && r>=1) { ntxw = 1, r = 0; }
|
|
|
|
data.resize(_nw); eossent.resize(_nw);
|
|
taskv.resize(8*_nw); // 8 is the maximum n. of jumps, see the heuristic below
|
|
skip1=false,jump=0,maxid=-1;
|
|
|
|
ssize_t end, t=0, e;
|
|
for(size_t i=0;i<_nw && totalnumtasks>0;++i, totalnumtasks-=t) {
|
|
t = ntxw + ( (r>1 && (i<r)) ? 1 : 0 );
|
|
e = start + (t*_chunk - 1)*_step + 1;
|
|
end = (e<stop) ? e : stop;
|
|
data[i].ntask=t;
|
|
data[i].task.set(start,end);
|
|
start = (end-1)+_step;
|
|
}
|
|
|
|
if (totalnumtasks) {
|
|
assert(totalnumtasks==1);
|
|
// try to keep the n. of tasks per worker as small as possible
|
|
if (ntxw > 1) data[_nw-1].ntask += totalnumtasks;
|
|
else { --tt, _chunk*=2; }
|
|
data[_nw-1].task.end = stop;
|
|
}
|
|
// printf("init_data\n");
|
|
// for(size_t i=0;i<_nw;++i) {
|
|
// printf("W=%ld %ld <%ld,%ld>\n", i, data[i].ntask.load(), data[i].task.start.load(), data[i].task.end);
|
|
// }
|
|
// printf("totaltasks=%ld\n", tt);
|
|
return tt;
|
|
}
|
|
// initialize the data vector
|
|
virtual inline size_t init_data_static(long start, long stop) {
|
|
assert(_chunk <= 0);
|
|
static_scheduling = true; // this forces static scheduling in the nextTaskConcurrent
|
|
skip1=false,jump=0,maxid=-1;
|
|
|
|
if (_chunk == 0) {
|
|
// default static scheduling, i.e. the iteration space is almost equally divided
|
|
// in contiguous chunks among threads
|
|
const long numtasks = std::lrint(std::ceil((stop-start)/(double)_step));
|
|
long totalnumtasks = (long)_nw;
|
|
size_t r = numtasks % _nw;
|
|
_chunk = numtasks / long(_nw);
|
|
|
|
data.resize(_nw); taskv.resize(_nw);eossent.resize(_nw);
|
|
|
|
long end, e;
|
|
for(size_t i=0; totalnumtasks>0; ++i,--totalnumtasks) {
|
|
e = start + (_chunk - 1)*_step + 1 + ((i<r) ? _step : 0 );
|
|
end = (e<stop) ? e : stop;
|
|
data[i].ntask=1;
|
|
data[i].task.set(start,end);
|
|
start = (end-1)+_step;
|
|
}
|
|
if (r) ++_chunk;
|
|
return _nw;
|
|
}
|
|
// fill out the table with only the first task just to start the worker threads
|
|
long chunk = -_chunk;
|
|
_chunk = stop; // needed because sendTask has to send the range [begin, stop)
|
|
const long numtasks = std::lrint(std::ceil((stop-start)/(double)_step));
|
|
const long totalnumtasks = std::lrint(std::ceil(numtasks/(double)chunk));
|
|
const size_t ntxw = (std::min)(_nw, (size_t)totalnumtasks);
|
|
|
|
for(size_t i=0;i<ntxw;++i) {
|
|
data[i].ntask = 1;
|
|
data[i].task.set(start+long(i)*chunk,stop);
|
|
}
|
|
// printf("init_data_static\n");
|
|
// for(size_t i=0;i<_nw;++i) {
|
|
// long start=data[i].task.start;
|
|
// long ntask=data[i].ntask;
|
|
// printf("W=%ld %ld <%ld,%ld>\n", i, ntask.load(), start.load(), data[i].task.end);
|
|
// }
|
|
// printf("total task=%ld\n", ntxw);
|
|
|
|
return ntxw;
|
|
}
|
|
public:
|
|
forall_Scheduler(ff_loadbalancer* lb, long start, long stop, long step, long chunk, size_t nw):
|
|
lb(lb),_start(start),_stop(stop),_step(step),_chunk(chunk),totaltasks(0),_nw(nw),
|
|
jump(0),skip1(false),workersspinwait(false),static_scheduling(false) {
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
_nextIteration = _start;
|
|
#endif
|
|
maxid.store(-1); // MA: consistency of store to be checked
|
|
if (_chunk<=0) totaltasks = init_data_static(start,stop);
|
|
else totaltasks = init_data(start,stop);
|
|
assert(totaltasks>=1);
|
|
}
|
|
forall_Scheduler(ff_loadbalancer* lb, size_t nw):
|
|
lb(lb),_start(0),_stop(0),_step(1),_chunk(1),totaltasks(0),_nw(nw),
|
|
jump(0),skip1(false),workersspinwait(false),static_scheduling(false) {
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
_nextIteration = 0;
|
|
#endif
|
|
maxid.store(-1); // MA: consistency of store to be checked
|
|
totaltasks = init_data(0,0);
|
|
assert(totaltasks==0);
|
|
}
|
|
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
inline bool canUseNoStealing(){
|
|
return !globalSchedRunning && !static_scheduling && _step == 1 && _chunk == 1;
|
|
}
|
|
#endif
|
|
inline bool sendTask(const bool skipmore=false) {
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
if(canUseNoStealing()){
|
|
// Just start the workers and die.
|
|
for(size_t wid=0;wid<_nw;++wid) {
|
|
lb->ff_send_out_to((void*) &dummyTask, (int) wid);
|
|
}
|
|
return true;
|
|
}
|
|
#endif
|
|
size_t remaining = totaltasks;
|
|
const long endchunk = (_chunk-1)*_step + 1;
|
|
|
|
more:
|
|
for(size_t wid=0;wid<_nw;++wid) {
|
|
if (data[wid].ntask >0) {
|
|
long start = data[wid].task.start;
|
|
long end = (std::min)(start+endchunk, data[wid].task.end);
|
|
taskv[wid+jump].set(start, end);
|
|
lb->ff_send_out_to(&taskv[wid+jump], (int) wid);
|
|
--remaining, --data[wid].ntask;
|
|
(data[wid].task).start = (end-1)+_step;
|
|
eossent[wid]=false;
|
|
} else skip1=true; //skip2=skip3=true;
|
|
}
|
|
// January 2014 (massimo): this heuristic might not be the best option in the presence
// of very high load imbalance between iterations.
// Update: removed skip2 and skip3 so that it is less aggressive!
|
|
|
|
jump+=long(_nw);
|
|
assert((jump / _nw) <= 8);
|
|
// heuristic: try to assign more task at the very beginning
|
|
if (!skipmore && !skip1 && totaltasks>=4*_nw) { skip1=true; goto more;}
|
|
//if (!skip2 && totaltasks>=64*_nw) { skip1=false; skip2=true; goto moretask;}
|
|
//if (!skip3 && totaltasks>=1024*_nw){ skip1=false; skip2=false; skip3=true; goto moretask;}
|
|
return (remaining>0);
|
|
}
|
|
|
|
inline void sendWakeUp() {
|
|
for(size_t id=0;id<_nw;++id) {
|
|
taskv[id].set(0,0);
|
|
lb->ff_send_out_to(&taskv[id], int(id));
|
|
}
|
|
}
|
|
inline bool nextTaskConcurrentNoStealing(forall_task_t *task, const int wid) {
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
long r = _nextIteration.fetch_add(_step);
|
|
if(r >= _stop){return false;}
|
|
task->set(r, r + _step);
|
|
return true;
|
|
#else
|
|
FF_IGNORE_UNUSED(task);
|
|
FF_IGNORE_UNUSED(wid);
|
|
error("To use nextTaskConcurrentNoStealing you need to define macro FF_PARFOR_PASSIVE_NOSTEALING\n");
|
|
return false;
|
|
#endif
|
|
}
|
|
|
|
// this method is accessed concurrently by all worker threads
|
|
inline bool nextTaskConcurrent(forall_task_t *task, const int wid) {
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
if(canUseNoStealing()){
|
|
return nextTaskConcurrentNoStealing(task, wid);
|
|
}
|
|
#endif
|
|
const long endchunk = (_chunk-1)*_step + 1; // next end-point
|
|
auto id = wid;
|
|
L1:
|
|
if (data[id].ntask.load(std::memory_order_acquire)>0) {
|
|
auto oldstart = data[id].task.start.load(std::memory_order_relaxed);
|
|
auto end = (std::min)(oldstart+endchunk, data[id].task.end);
|
|
auto newstart = (end-1)+_step;
|
|
|
|
if (!data[id].task.start.compare_exchange_weak(oldstart, newstart,
|
|
std::memory_order_release,
|
|
std::memory_order_relaxed)) {
|
|
workerlosetime_in(_nw <= lb->getnworkers());
|
|
goto L1; // restart the sequence from the beginning
|
|
}
|
|
|
|
// after fetch_sub ntask may be less than 0
|
|
data[id].ntask.fetch_sub(1,std::memory_order_release);
|
|
if (oldstart<end) { // it might be possible that oldstart == end
|
|
task->set(oldstart, end);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// no available task for the current thread
|
|
if (static_scheduling) return false; // <------------------------------------
|
|
|
|
#if !defined(PARFOR_MULTIPLE_TASKS_STEALING)
|
|
// the following scheduling policy for the tasks focuses mostly on load balancing
|
|
long _maxid = 0, ntask = 0;
|
|
if (maxid.load(std::memory_order_acquire)<0)
|
|
_maxid = (long) (std::max_element(data.begin(),data.end(),data_cmp) - data.begin());
|
|
else _maxid = maxid;
|
|
ntask = data[_maxid].ntask.load(std::memory_order_relaxed);
|
|
if (ntask>0) {
|
|
if (_maxid != maxid) maxid.store(_maxid, std::memory_order_release);
|
|
id = _maxid;
|
|
goto L1;
|
|
}
|
|
// no more tasks, exit
|
|
|
|
#else
|
|
// the following scheduling policy for the tasks is a little more
// complex and costly. It tries to find a trade-off between
// task-to-thread locality and load balancing by moving a bunch of tasks
// from one thread to another
|
|
long _maxid = 0, ntask = 0;
|
|
if (maxid.load(std::memory_order_acquire)<0)
|
|
_maxid = (std::max_element(data.begin(),data.end(),data_cmp) - data.begin());
|
|
else _maxid = maxid;
|
|
L2:
|
|
ntask = data[_maxid].ntask.load(std::memory_order_relaxed);
|
|
if (ntask>0) {
|
|
if (_maxid != maxid) maxid.store(_maxid, std::memory_order_release);
|
|
if (ntask<=3) { id = _maxid; goto L1; }
|
|
|
|
// try to steal half of the tasks remaining to _maxid
|
|
|
|
auto oldstart = data[_maxid].task.start.load(std::memory_order_relaxed);
|
|
auto q = ((data[_maxid].task.end-oldstart)/_chunk) >> 1;
|
|
if (q<=3) { id = _maxid; goto L1; }
|
|
auto newstart = oldstart + (q*_chunk-1)*_step +1;
|
|
if (!data[_maxid].task.start.compare_exchange_weak(oldstart, newstart,
|
|
std::memory_order_release,
|
|
std::memory_order_relaxed)) {
|
|
workerlosetime_in(_nw <= lb->getnworkers());
|
|
goto L2; // restart the sequence from the beginning
|
|
}
|
|
assert(newstart <= data[_maxid].task.end);
|
|
|
|
data[_maxid].ntask.fetch_sub(q, std::memory_order_release);
|
|
data[wid].task.start.store(oldstart, std::memory_order_relaxed);
|
|
data[wid].task.end = newstart;
|
|
data[wid].ntask.store(q, std::memory_order_release);
|
|
id = wid;
|
|
goto L1;
|
|
}
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
inline bool nextTask(forall_task_t *task, const int wid) {
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
if(canUseNoStealing()){
|
|
return nextTaskConcurrentNoStealing(task, wid);
|
|
}
|
|
#endif
|
|
const long endchunk = (_chunk-1)*_step + 1;
|
|
int id = wid;
|
|
if (data[id].ntask) {
|
|
L1:
|
|
long start = data[id].task.start;
|
|
long end = (std::min)(start+endchunk, data[id].task.end);
|
|
--data[id].ntask, (data[id].task).start = (end-1)+_step;
|
|
task->set(start, end);
|
|
return true;
|
|
}
|
|
// no available task for the current thread
|
|
#if !defined(PARFOR_MULTIPLE_TASKS_STEALING)
|
|
// the following scheduling policy for the tasks focuses mostly on load balancing
|
|
if (maxid<0) { //check if maxid has been set
|
|
L2:
|
|
maxid = (long) (std::max_element(data.begin(),data.end(),data_cmp) - data.begin());
|
|
if (data[maxid].ntask > 0) {
|
|
id=maxid;
|
|
goto L1;
|
|
}
|
|
// no more tasks, exit
|
|
} else {
|
|
if (data[maxid].ntask > 0) {
|
|
id=maxid;
|
|
goto L1;
|
|
}
|
|
goto L2;
|
|
}
|
|
#else
|
|
auto flag=false;
|
|
if (maxid<0) {
|
|
L2:
|
|
maxid = (std::max_element(data.begin(),data.end(),data_cmp) - data.begin());
|
|
flag=true;
|
|
}
|
|
id = maxid;
|
|
if (data[id].ntask>0) {
|
|
if (data[id].ntask<=3) goto L1;
|
|
|
|
// steal half of the tasks
|
|
auto q = data[id].ntask >> 1, r = data[id].ntask & 0x1;
|
|
data[id].ntask = q;
|
|
data[wid].ntask = q+r;
|
|
data[wid].task.end = data[id].task.end;
|
|
data[id].task.end = data[id].task.start + (q*_chunk-1)*_step +1;
|
|
data[wid].task.start = data[id].task.end;
|
|
id = wid;
|
|
goto L1;
|
|
} else if (!flag) goto L2;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
inline void* svc(void* t) {
|
|
if (t==NULL) {
|
|
if (totaltasks==0) { lb->broadcast_task(GO_OUT); return GO_OUT;}
|
|
sendTask();
|
|
return GO_ON;
|
|
}
|
|
auto wid = lb->get_channel_id();
|
|
assert(wid>=0);
|
|
if (--totaltasks <=0) {
|
|
if (!eossent[wid]) {
|
|
lb->ff_send_out_to(workersspinwait?EOS_NOFREEZE:GO_OUT, int(wid));
|
|
eossent[wid]=true;
|
|
}
|
|
return GO_OUT;
|
|
}
|
|
if (nextTask((forall_task_t*)t, (int) wid)) lb->ff_send_out_to(t, int(wid));
|
|
else {
|
|
if (!eossent[wid]) {
|
|
lb->ff_send_out_to((workersspinwait?EOS_NOFREEZE:GO_OUT), int(wid));
|
|
eossent[wid]=true;
|
|
}
|
|
}
|
|
return GO_ON;
|
|
}
|
|
|
|
inline void setloop(long start, long stop, long step, long chunk, size_t nw) {
|
|
_start=start, _stop=stop, _step=step, _chunk=chunk, _nw=nw;
|
|
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
_nextIteration = _start;
|
|
#endif
|
|
if (_chunk<=0) totaltasks = init_data_static(start,stop);
|
|
else totaltasks = init_data(start,stop);
|
|
|
|
assert(totaltasks>=1);
|
|
// adjust the number of workers that have to be started
|
|
if ( (totaltasks/(double)_nw) <= 1.0 || (totaltasks==1) )
|
|
_nw = totaltasks;
|
|
}
|
|
|
|
inline long startIdx() const { return _start;}
|
|
inline long stopIdx() const { return _stop;}
|
|
inline long stepIdx() const { return _step;}
|
|
inline size_t running() const { return _nw; }
|
|
inline void workersSpinWait() { workersspinwait=true;}
|
|
inline size_t getnumtasks() const { return totaltasks;}
|
|
protected:
|
|
// the following fields are used only by the scheduler thread
|
|
ff_loadbalancer *lb;
|
|
long _start,_stop,_step; // for step
|
|
long _chunk; // a chunk of indexes
|
|
size_t totaltasks; // total n. of tasks
|
|
size_t _nw; // num. of workers
|
|
long jump;
|
|
bool skip1;
|
|
bool workersspinwait;
|
|
bool static_scheduling;
|
|
std::vector<forall_task_t> taskv;
|
|
};
|
|
|
|
// parallel for/reduce worker node
|
|
template<typename Tres>
|
|
class forallreduce_W: public ff_node {
|
|
public:
|
|
typedef Tres Tres_t;
|
|
typedef std::function<void(const long,const long, const int, Tres&)> F_t;
|
|
protected:
|
|
virtual inline void losetime_in(unsigned long) {
|
|
//FFTRACE(lostpopticks+=ff_node::TICKS2WAIT; ++popwait); // FIX
|
|
workerlosetime_in(aggressive);
|
|
}
|
|
public:
|
|
forallreduce_W(forall_Scheduler *const sched, ffBarrier *const loopbar, F_t F):
|
|
sched(sched),loopbar(loopbar), schedRunning(true),
|
|
spinwait(false), aggressive(true),F(F) {}
|
|
|
|
inline void setSchedRunning(bool r) { schedRunning = r; }
|
|
|
|
inline void* svc(void* t) {
|
|
auto task = (forall_task_t*)t;
|
|
auto myid = get_my_id();
|
|
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
forall_task_t tmptask;
|
|
if(t != (void*) &dummyTask || schedRunning){
|
|
F(task->start,task->end,myid,res);
|
|
if (schedRunning) return t;
|
|
}else{
|
|
task = &tmptask;
|
|
}
|
|
#else
|
|
F(task->start,task->end,myid,res);
|
|
if (schedRunning) return t;
|
|
#endif
|
|
|
|
// the code below is executed only if the scheduler thread is not running
|
|
while(sched->nextTaskConcurrent(task,myid))
|
|
F(task->start,task->end,myid,res);
|
|
|
|
if (spinwait) {
|
|
loopbar->doBarrier(myid);
|
|
return GO_ON;
|
|
}
|
|
return GO_OUT;
|
|
}
|
|
|
|
inline void enableSpinWait() { spinwait=true; }
|
|
|
|
inline void setF(F_t _F, const Tres& idtt, bool a=true) {
|
|
F=_F, res=idtt, aggressive=a;
|
|
}
|
|
inline const Tres& getres() const { return res; }
|
|
|
|
protected:
|
|
forall_Scheduler *const sched;
|
|
ffBarrier *const loopbar;
|
|
bool schedRunning;
|
|
protected:
|
|
bool spinwait,aggressive;
|
|
F_t F;
|
|
Tres res;
|
|
};
|
|
|
|
|
|
class forallpipereduce_W: public forallreduce_W<ff_buffernode> {
|
|
public:
|
|
typedef ff_buffernode Tres_t;
|
|
typedef std::function<void(const long,const long, const int, ff_buffernode&)> F_t;
|
|
public:
|
|
forallpipereduce_W(forall_Scheduler *const sched,ffBarrier *const loopbar, F_t F):
|
|
forallreduce_W<ff_buffernode>(sched,loopbar,F) {
|
|
res.set(8192,false,get_my_id());
|
|
res.init_blocking_stuff();
|
|
}
|
|
|
|
inline void* svc(void* t) {
|
|
auto task = (forall_task_t*)t;
|
|
auto myid = get_my_id();
|
|
|
|
F(task->start,task->end,myid,res);
|
|
if (schedRunning) return t;
|
|
|
|
// the code below is executed only if the scheduler thread is not running
|
|
while(sched->nextTaskConcurrent(task,myid))
|
|
F(task->start,task->end,myid,res);
|
|
|
|
if (spinwait) {
|
|
res.ff_send_out(EOS);
|
|
loopbar->doBarrier(myid);
|
|
return GO_ON;
|
|
}
|
|
return GO_OUT;
|
|
}
|
|
|
|
void svc_end() { res.ff_send_out(EOS); }
|
|
|
|
inline void setF(F_t _F, const Tres_t&, bool a=true) {
|
|
F=_F, aggressive=a;
|
|
}
|
|
|
|
// The following methods are custom for this node which is not multi-output. FIX
|
|
bool isMultiOutput() const { return true; }
|
|
void get_out_nodes(svector<ff_node*> &w) { w.push_back(&res); }
|
|
void get_out_nodes_feedback(svector<ff_node*> &w) { w.push_back(this); }
|
|
};
|
|
|
|
|
|
|
|
template <typename Worker_t>
|
|
class ff_forall_farm: public ff_farm {
|
|
public:
|
|
typedef typename Worker_t::Tres_t Tres_t;
|
|
typedef typename Worker_t::F_t F_t;
|
|
protected:
|
|
// removes possible EOS still in the input queues of the workers
|
|
inline void resetqueues(const int _nw) {
|
|
const svector<ff_node*> &nodes = getWorkers();
|
|
for(int i=0;i<_nw;++i) nodes[i]->reset();
|
|
}
|
|
// used just to redefine losetime_in
|
|
class foralllb_t: public ff_loadbalancer {
|
|
protected:
|
|
virtual inline void losetime_in(unsigned long) {
|
|
if ((int)(getnworkers())>=ncores) {
|
|
//FFTRACE(lostpopticks+=(100*TICKS2WAIT);++popwait); // FIX: adjust tracing
|
|
ff_relax(0);
|
|
return;
|
|
}
|
|
//FFTRACE(lostpushticks+=TICKS2WAIT;++pushwait);
|
|
PAUSE();
|
|
}
|
|
public:
|
|
foralllb_t(size_t n):ff_loadbalancer(n),ncores(ff_realNumCores()) {}
|
|
inline int getNCores() const { return ncores;}
|
|
private:
|
|
const int ncores;
|
|
};
|
|
|
|
private:
|
|
Tres_t t; // not used
|
|
size_t numCores;
|
|
ffBarrier *loopbar;
|
|
public:
|
|
|
|
ff_forall_farm(ssize_t maxnw, const bool spinwait=false, const bool skipwarmup=false, const bool spinbarrier=false):
|
|
ff_farm(false,8*DEF_MAX_NUM_WORKERS,8*DEF_MAX_NUM_WORKERS,
|
|
true, DEF_MAX_NUM_WORKERS,true), // cleanup at exit !
|
|
loopbar( (spinwait && spinbarrier) ?
|
|
(ffBarrier*)(new spinBarrier(maxnw<=0?DEF_MAX_NUM_WORKERS+1:(size_t)(maxnw+1))) :
|
|
(ffBarrier*)(new Barrier(maxnw<=0?DEF_MAX_NUM_WORKERS+1:(size_t)(maxnw+1))) ),
|
|
skipwarmup(skipwarmup),spinwait(spinwait) {
|
|
|
|
foralllb_t* _lb = new foralllb_t(DEF_MAX_NUM_WORKERS);
|
|
assert(_lb);
|
|
ff_farm::setlb(_lb);
|
|
|
|
numCores = ((foralllb_t*const)getlb())->getNCores();
|
|
if (maxnw<=0) maxnw=numCores;
|
|
std::vector<ff_node *> forall_w;
|
|
auto donothing=[](const long,const long,const int,const Tres_t&) -> void { };
|
|
forall_Scheduler *sched = new forall_Scheduler(getlb(),maxnw);
|
|
ff_farm::add_emitter(sched);
|
|
for(size_t i=0;i<(size_t)maxnw;++i)
|
|
forall_w.push_back(new Worker_t(sched, loopbar, donothing));
|
|
ff_farm::add_workers(forall_w);
|
|
ff_farm::wrap_around();
|
|
|
|
// needed to avoid the initial barrier (see (**) below)
|
|
if (ff_farm::prepare() < 0)
|
|
error("running base forall farm(2)\n");
|
|
|
|
// NOTE: the warmup phase has to be done, if not now then later on.
// The run_then_freeze method will fail if skipwarmup is true.
|
|
if (!skipwarmup) {
|
|
auto r=-1;
|
|
getlb()->freeze();
|
|
if (getlb()->run() != -1)
|
|
r = getlb()->wait_freezing();
|
|
if (r<0) error("running base forall farm(1)\n");
|
|
}
|
|
|
|
if (spinwait) {
|
|
sched->workersSpinWait();
|
|
for(size_t i=0;i<(size_t)maxnw;++i) {
|
|
//auto w = (forallreduce_W<Tres>*)forall_w[i];
|
|
auto w = (Worker_t*)forall_w[i];
|
|
w->enableSpinWait();
|
|
}
|
|
//resetqueues(maxnw);
|
|
}
|
|
ff_farm::cleanup_all(); // delete everything at exit
|
|
}
|
|
virtual ~ff_forall_farm() {
|
|
if (loopbar) delete loopbar;
|
|
if (ff_farm::getlb()) delete ff_farm::getlb();
|
|
}
|
|
|
|
|
|
// It returns true if the scheduler has to be started, false otherwise.
//
// Unless the removeSched flag is set, the scheduler thread will be started
// only if there are fewer threads than cores AND if the number of tasks per thread
// is greater than 1. In case of static scheduling (i.e. chunk<=0), the scheduler
// is never started because numtasks == nwtostart.
//
// By defining NO_PARFOR_SCHEDULER_THREAD at compile time, the
// scheduler won't be started.
//
// To always start the scheduler thread, PARFOR_SCHEDULER_THREAD
// may be defined at compile time.
//
|
|
inline bool startScheduler(const size_t nwtostart, const size_t numtasks) const {
|
|
#if defined(NO_PARFOR_SCHEDULER_THREAD)
|
|
return false;
|
|
#elif defined(PARFOR_SCHEDULER_THREAD)
|
|
return true;
|
|
#else
|
|
if (removeSched) return false;
|
|
return ((numtasks > nwtostart) && (nwtostart < numCores));
|
|
#endif
|
|
}
|
|
// set/reset the removeSched flag
// By calling this method with 'true' the scheduler will be disabled.
//
// NOTE:
// Sometimes it may be useful (in terms of performance) to explicitly disable
// the scheduler thread when #numworkers > ff_realNumCores() on systems where
// ff_numCores() > ff_realNumCores() (i.e. HT or SMT is enabled)
|
|
inline void disableScheduler(bool onoff=true) { removeSched=onoff; }
|
|
|
|
inline int run_then_freeze(ssize_t nw_=-1) {
|
|
assert(skipwarmup == false);
|
|
const ssize_t nwtostart = (nw_ == -1)?getNWorkers():nw_;
|
|
auto r = -1;
|
|
if (schedRunning) {
|
|
getlb()->skipfirstpop(true);
|
|
if (spinwait) {
|
|
// NOTE: here we have to be sure to send one task to each worker!
|
|
((forall_Scheduler*)getEmitter())->sendTask(true);
|
|
}
|
|
r=ff_farm::run_then_freeze(nwtostart);
|
|
} else {
|
|
if (spinwait) {
|
|
// all worker threads have already crossed the barrier so it is safe to restart it
|
|
loopbar->barrierSetup(nwtostart+1);
|
|
// NOTE: here it is not possible to use sendTask because otherwise there could be
// a race between the main thread and the workers in accessing the task table.
|
|
((forall_Scheduler*)getEmitter())->sendWakeUp();
|
|
} else
|
|
((forall_Scheduler*)getEmitter())->sendTask(true);
|
|
|
|
r = getlb()->thawWorkers(true, nwtostart);
|
|
}
|
|
return r;
|
|
}
|
|
|
|
inline int run_and_wait_end() {
|
|
assert(spinwait == false);
|
|
const size_t nwtostart = getnw();
|
|
auto r= -1;
|
|
if (schedRunning) {
|
|
//resetqueues(nwtostart);
|
|
getlb()->skipfirstpop(true);
|
|
// (**) this way we avoid the initial barrier
|
|
if (getlb()->runlb()!= -1) {
|
|
if (getlb()->runWorkers(nwtostart)!=-1)
|
|
r = getlb()->wait();
|
|
}
|
|
} else {
|
|
((forall_Scheduler*)getEmitter())->sendTask(true);
|
|
if (getlb()->runWorkers(nwtostart) != -1)
|
|
r = getlb()->waitWorkers();
|
|
}
|
|
return r;
|
|
}
|
|
|
|
// it puts all threads to sleep but does not disable the spinWait flag
|
|
inline int stopSpinning() {
|
|
if (!spinwait) return -1;
|
|
// getnworkers() returns the number of threads that are running
|
|
// it may be different from getnw() (i.e. the n. of threads currently
|
|
// executing the parallel iterations)
|
|
size_t running = getlb()->getnworkers();
|
|
if (running == (size_t)-1) return 0;
|
|
getlb()->freezeWorkers();
|
|
getlb()->broadcast_task(GO_OUT);
|
|
return getlb()->wait_freezingWorkers();
|
|
}
|
|
|
|
inline int enableSpinning() {
|
|
if (spinwait) return -1;
|
|
const svector<ff_node*> &nodes = getWorkers();
|
|
for(size_t i=0;i<nodes.size();++i) {
|
|
auto w = (Worker_t*)nodes[i];
|
|
w->enableSpinWait();
|
|
}
|
|
((forall_Scheduler*)getEmitter())->workersSpinWait();
|
|
spinwait = true;
|
|
return 0;
|
|
}
|
|
|
|
inline int wait_freezing() {
|
|
//if (startScheduler(getnw())) return getlb()->wait_lb_freezing();
|
|
if (schedRunning) return getlb()->wait_lb_freezing();
|
|
if (spinwait) {
|
|
loopbar->doBarrier(getnw());
|
|
return 0;
|
|
}
|
|
return getlb()->wait_freezingWorkers();
|
|
}
|
|
|
|
inline int wait() {
|
|
if (spinwait){
|
|
const svector<ff_node*> &nodes = getWorkers();
|
|
for(size_t i=0;i<nodes.size();++i)
|
|
getlb()->ff_send_out_to(EOS,i);
|
|
}
|
|
return ff_farm::wait();
|
|
}
|
|
|
|
inline void setF(F_t _F, const Tres_t& idtt=Tres_t()) { //(Tres)0) {
|
|
const size_t nw = getnw();
|
|
const svector<ff_node*> &nodes = getWorkers();
|
|
// aggressive mode enabled if the number of threads is less than
|
|
// or equal to the number of cores
|
|
const bool mode = (nw <= numCores);
|
|
|
|
// NOTE: in case of static scheduling, the scheduler is never started !
|
|
schedRunning = (!removeSched && startScheduler(nw, ((forall_Scheduler*)getEmitter())->getnumtasks()));
|
|
|
|
#ifdef FF_PARFOR_PASSIVE_NOSTEALING
|
|
globalSchedRunning = schedRunning;
|
|
#endif
|
|
|
|
if (schedRunning) {
|
|
for(size_t i=0;i<nw;++i) {
|
|
//auto w = (forallreduce_W<Tres>*)nodes[i];
|
|
auto w = (Worker_t*)nodes[i];
|
|
w->setF(_F, idtt, mode);
|
|
w->setSchedRunning(true);
|
|
}
|
|
} else {
|
|
for(size_t i=0;i<nw;++i) {
|
|
//auto w = (forallreduce_W<Tres>*)nodes[i];
|
|
auto w = (Worker_t*)nodes[i];
|
|
w->setF(_F, idtt, mode);
|
|
w->setSchedRunning(false);
|
|
}
|
|
}
|
|
}
|
|
/* NOTE: - chunk>0 means dynamic scheduling with grain equal to chunk, that is,
 *         no more than chunk iterations at a time are computed by one thread
 *       - chunk==0 means default static scheduling, that is, a bunch of
 *         ~(#iterations/nw) contiguous iterations is computed by each thread
 *       - chunk<0 means static scheduling with grain equal to chunk, that is,
 *         the iteration space is divided into chunks, each of no more
 *         than chunk iterations; chunks are then assigned to the threads
 *         in a round-robin fashion.
 */
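/* For example (values are illustrative):
 *   setloop(0, 1000, 1,  16, 8);  // dynamic scheduling, at most 16 iterations per task
 *   setloop(0, 1000, 1,   0, 8);  // default static, ~125 contiguous iterations per thread
 *   setloop(0, 1000, 1, -16, 8);  // static, chunks of 16 iterations assigned round-robin
 */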
|
|
inline void setloop(long begin,long end,long step,long chunk,long nw) {
|
|
if (nw>(ssize_t)getNWorkers()) {
|
|
error("The number of threads specified is greater than the number set in the ParallelFor* constructor, it will be downsized\n");
|
|
nw = getNWorkers();
|
|
}
|
|
assert(nw<=(ssize_t)getNWorkers());
|
|
forall_Scheduler *sched = (forall_Scheduler*)getEmitter();
|
|
sched->setloop(begin,end,step,chunk,(nw<=0)?getNWorkers():(size_t)nw);
|
|
}
|
|
// return the number of workers running or supposed to run
|
|
inline size_t getnw() { return ((const forall_Scheduler*)getEmitter())->running(); }
|
|
|
|
inline const Tres_t& getres(int i) {
|
|
//return ((forallreduce_W<Tres>*)(getWorkers()[i]))->getres();
|
|
return ((Worker_t*)(getWorkers()[i]))->getres();
|
|
}
|
|
inline long startIdx(){ return ((const forall_Scheduler*)getEmitter())->startIdx(); }
|
|
inline long stopIdx() { return ((const forall_Scheduler*)getEmitter())->stopIdx(); }
|
|
inline long stepIdx() { return ((const forall_Scheduler*)getEmitter())->stepIdx(); }
|
|
|
|
void resetskipwarmup() { assert(skipwarmup); skipwarmup=false;}
|
|
protected:
|
|
bool removeSched = false;
|
|
bool schedRunning= true;
|
|
bool skipwarmup = false;
|
|
bool spinwait = false;
|
|
};
|
|
|
|
|
|
} // namespace ff
|
|
|
|
#endif /* FF_PARFOR_INTERNALS_HPP */
|
|
|