888 lines
39 KiB
C++
888 lines
39 KiB
C++
|
/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
||
|
|
||
|
/*!
|
||
|
* \file parallel_for.hpp
|
||
|
* \ingroup high_level_patterns
|
||
|
*
|
||
|
* \brief It describes the ParallelFor/ParallelForReduce/ParallelForPipeReduce patterns.
|
||
|
*
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
/* ***************************************************************************
|
||
|
*
|
||
|
* FastFlow is free software; you can redistribute it and/or modify it
|
||
|
* under the terms of the GNU Lesser General Public License version 3 as
|
||
|
* published by the Free Software Foundation.
|
||
|
* Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
|
||
|
* or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
|
||
|
*
|
||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
* License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU Lesser General Public License
|
||
|
* along with this program; if not, write to the Free Software Foundation,
|
||
|
* Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||
|
*
|
||
|
****************************************************************************
|
||
|
*/
|
||
|
/*
|
||
|
* - Author:
|
||
|
* Massimo Torquati <torquati@di.unipi.it>
|
||
|
*
|
||
|
*
|
||
|
* This file contains the ParallelFor and the ParallelForReduce classes
|
||
|
* (and also some static functions).
|
||
|
*
|
||
|
* Iterations scheduling:
|
||
|
*
|
||
|
* As a general rule, the scheduling strategy is selected according to the chunk value:
|
||
|
* - chunk == 0 means default static scheduling, that is, ~(#iteration_space/num_workers)
|
||
|
* iterations per thread assigned in one single shot at the beginning.
|
||
|
* - chunk > 0 means dynamic scheduling with grain equal to the chunk size, that is,
|
||
|
* no more than chunk iterations at a time is assigned to one Worker, the
|
||
|
* chunk is assigned to Workers dynamically
|
||
|
* - chunk < 0 means static scheduling with grain equal to the chunk size, that is,
|
||
|
* the iteration space is divided into chunks each one of no more
|
||
|
* than chunk iterations. Then chunks are assigned to the Workers statically
|
||
|
* and in a round-robin fashion.
|
||
|
*
|
||
|
* If you want to use the static scheduling policy (either default or with a given grain),
|
||
|
* please use the **parallel_for_static** method.
|
||
|
*
|
||
|
* To use or not to use a scheduler thread ?
|
||
|
* As always, it depends on the application, scheduling strategy, platform at hand,
|
||
|
* parallelism degree, ...etc....
|
||
|
*
|
||
|
* The general rule is: a scheduler thread is started if:
|
||
|
* 1. the dynamic scheduling policy is used (chunk>0);
|
||
|
* 2. there are enough cores for hosting both worker threads and the scheduler thread;
|
||
|
* 3. the number of tasks per thread is greater than 1.
|
||
|
*
|
||
|
* In case of static scheduling (chunk <= 0), the scheduler thread is never started.
|
||
|
* It is possible to explicitly disable/enable the presence of the scheduler thread
|
||
|
* both at compile time and at run-time by using the disableScheduler method and the
|
||
|
* two defines NO_PARFOR_SCHEDULER_THREAD and PARFOR_SCHEDULER_THREAD.
|
||
|
*
|
||
|
*
|
||
|
* How to use the ParallelFor (in a nutshell) :
|
||
|
* ParallelForReduce<long> pfr;
|
||
|
* for(long i=0;i<N;i++) pfr.parallel_for(0,N,[&](const long i) {
|
||
|
* A[i]=f(i); A[i]=f(i);
|
||
|
* long sum=0; ---> });
|
||
|
* for(long i=0; i<N;++i) long sum=0;
|
||
|
* sum+=g(A[i]); pfr.parallel_reduce(sum,0,0,N,[&](const long i,long &sum) {
|
||
|
* sum+=g(A[i]);
|
||
|
* }, [](long &v, const long elem) {v+=elem;});
|
||
|
*
|
||
|
*
|
||
|
*
|
||
|
* For just a single parallel loop, it is better to use the one-shot version (see at the end of
|
||
|
* this file). Useful when there is just
|
||
|
* a single parallel loop.
|
||
|
* This version should not be used if the parallel loop is called many
|
||
|
* times (e.g., within a sequential loop) or if there are several loops
|
||
|
* that can be parallelized by using the same ParallelFor* object.
|
||
|
* If this is the case, the version with the object instance is more
|
||
|
* efficient because the Worker threads are created once and then
|
||
|
* re-used many times.
|
||
|
* On the contrary, the one-shot version has a lower setup overhead but
|
||
|
* Worker threads are destroyed at the end of the loop.
|
||
|
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
#ifndef FF_PARFOR_HPP
|
||
|
#define FF_PARFOR_HPP
|
||
|
|
||
|
#include <ff/pipeline.hpp>
|
||
|
#include <ff/parallel_for_internals.hpp>
|
||
|
|
||
|
namespace ff {
|
||
|
|
||
|
//
|
||
|
// TODO: to re-write the ParallelFor class as a specialization of the ParallelForReduce
|
||
|
//
|
||
|
|
||
|
/*!
|
||
|
* \class ParallelFor
|
||
|
* \ingroup high_level_patterns
|
||
|
*
|
||
|
* \brief Parallel for loop. Run automatically.
|
||
|
*
|
||
|
* Identifies an iterative work-sharing construct that specifies a region
|
||
|
* (i.e. a Lambda function) in which the iterations of the associated loop
|
||
|
* should be executed in parallel.
|
||
|
*
|
||
|
* \example parfor_basic.cpp
|
||
|
*/
|
||
|
class ParallelFor: public ff_forall_farm<forallreduce_W<int> > {
public:
    /**
     * \brief Constructor
     *
     * Sets up the ParallelFor pattern run-time support (i.e. spawns the
     * worker threads). A single object can be used as many times as needed
     * to run different parallel-for pattern instances (different loop
     * bodies). They cannot be nested nor recursive.
     * The nonblocking policy is to be preferred in case of repeated calls
     * to some of the parallel_for methods (e.g. within a strict outer
     * loop). On the same ParallelFor object different parallel_for methods
     * (e.g. parallel_for and parallel_for_thid, parallel_for_idx) can be
     * called in sequence.
     *
     * \param maxnw Maximum number of worker threads (not including the
     *        active scheduler, if any). Default <b>FF_AUTO</b> = n. of
     *        real cores.
     * \param spinwait \p true nonblocking, \p false blocking.
     * \param spinbarrier \p true it uses a spinning barrier, \p false it
     *        uses a blocking barrier.
     *        The nonblocking behaviour will leave worker threads active
     *        until class destruction (the threads will be active and in the
     *        nonblocking barrier only after the first call to one of the
     *        parallel_for methods). To put threads to sleep between
     *        different calls, the <b>threadPause</b> method may be called.
     */
    explicit ParallelFor(const long maxnw=FF_AUTO, bool spinwait=false, bool spinbarrier=false):
        ff_forall_farm<forallreduce_W<int> >(maxnw,spinwait,false,spinbarrier) {}

    /**
     * \brief Destructor
     *
     * Terminates the ParallelFor run-time support and performs resource
     * housekeeping. Both nonblocking and blocking worker threads are
     * terminated.
     */
    ~ParallelFor() {
        ff_forall_farm<forallreduce_W<int> >::stop();
        ff_forall_farm<forallreduce_W<int> >::wait();
    }

    /**
     * \brief Disable active scheduler (i.e. Emitter thread)
     *
     * Disables the active scheduler (i.e. the Emitter thread of the
     * master-worker implementation). Active scheduling uses one dedicated
     * nonblocking thread. In passive scheduling, workers cooperatively
     * schedule tasks via synchronisations in memory. Neither of the two is
     * always faster than the other: it depends on parallelism degree, task
     * grain and platform.
     * As a rule of thumb, on large multicores with fine-grain tasks active
     * scheduling is faster; on few cores the passive scheduler enhances
     * overall performance. Active scheduling is the default option.
     *
     * \param onoff <b>true</b> disable active scheduling,
     *        <b>false</b> enable active scheduling
     */
    inline void disableScheduler(bool onoff=true) {
        ff_forall_farm<forallreduce_W<int> >::disableScheduler(onoff);
    }

    // It puts all spinning threads to sleep. It does not disable the
    // spinWait flag, so at the next call threads start spinning again.
    inline int threadPause() {
        return ff_forall_farm<forallreduce_W<int> >::stopSpinning();
    }

    // -------------------- parallel_for --------------------

    /**
     * \brief Parallel for region (basic) - static
     *
     * Static scheduling onto nw worker threads.
     * Data is statically partitioned in blocks, i.e.
     * partition size = (last-first)/(nw*step).
     *
     * \param first first value of the iteration variable
     * \param last last value of the iteration variable
     * \param f <b>f(const long idx)</b> Lambda function,
     *        body of the parallel loop. <b>idx</b>: iterator
     * \param nw number of worker threads (default FF_AUTO)
     */
    template <typename Function>
    inline void parallel_for(long first, long last, const Function& f,
                             const long nw=FF_AUTO) {
        FF_PARFOR_START(this, parforidx,first,last,1,PARFOR_STATIC(0),nw) {
            f(parforidx);
        } FF_PARFOR_STOP(this);
    }

    /**
     * \brief Parallel for region (step) - static
     *
     * Static scheduling onto nw worker threads.
     * Iteration space is walked with stride <b>step</b>.
     * Data is statically partitioned in blocks, i.e.
     * partition size = (last-first)/(nw*step).
     *
     * \param first first value of the iteration variable
     * \param last last value of the iteration variable
     * \param step step increment for the iteration variable
     * \param f <b>f(const long idx)</b> body of the parallel loop
     * \param nw number of worker threads
     */
    template <typename Function>
    inline void parallel_for(long first, long last, long step, const Function& f,
                             const long nw=FF_AUTO) {
        FF_PARFOR_START(this, parforidx,first,last,step,PARFOR_STATIC(0),nw) {
            f(parforidx);
        } FF_PARFOR_STOP(this);
    }

    /**
     * @brief Parallel for region (step, grain) - dynamic
     *
     * Dynamic scheduling onto nw worker threads. Iterations are scheduled
     * in blocks of minimal size <b>grain</b>.
     * Iteration space is walked with stride <b>step</b>.
     *
     * @param first first value of the iteration variable
     * @param last last value of the iteration variable
     * @param step step increment for the iteration variable
     * @param grain (> 0) minimum computation grain
     *        (n. of iterations scheduled together to a single worker)
     * @param f <b>f(const long idx)</b> Lambda function,
     *        body of the parallel loop. <b>idx</b>: iteration
     * @param nw number of worker threads
     */
    template <typename Function>
    inline void parallel_for(long first, long last, long step, long grain,
                             const Function& f, const long nw=FF_AUTO) {
        FF_PARFOR_START(this, parforidx,first,last,step,PARFOR_DYNAMIC(grain),nw) {
            f(parforidx);
        } FF_PARFOR_STOP(this);
    }

    /**
     * @brief Parallel for region with threadID (step, grain, thid) - dynamic
     *
     * Dynamic scheduling onto nw worker threads. Iterations are scheduled
     * in blocks of minimal size <b>grain</b>.
     * Iteration space is walked with stride <b>step</b>. The <b>thid</b>
     * Worker thread ID is made available as a Lambda parameter.
     *
     * @param first first value of the iteration variable
     * @param last last value of the iteration variable
     * @param step step increment for the iteration variable
     * @param grain minimum computation grain (n. of iterations scheduled together to a single worker)
     * @param f <b>f(const long idx, const int thid)</b> Lambda function, body of the parallel loop. <b>idx</b>: iteration, <b>thid</b>: worker_id
     * @param nw number of worker threads (default n. of platform HW contexts)
     */
    template <typename Function>
    inline void parallel_for_thid(long first, long last, long step, long grain,
                                  const Function& f, const long nw=FF_AUTO) {
        FF_PARFOR_START(this, parforidx,first,last,step,PARFOR_DYNAMIC(grain),nw) {
            f(parforidx,_ff_thread_id);
        } FF_PARFOR_STOP(this);
    }

    /**
     * @brief Parallel for region with indexes ranges (step, grain, thid, idx) -
     * dynamic - advanced usage
     *
     * Dynamic scheduling onto nw worker threads. Iterations are scheduled
     * in blocks of minimal size <b>grain</b>. Iteration space is walked
     * with stride <b>step</b>. A chunk of <b>grain</b> iterations is
     * assigned to each worker but the iterations are not automatically
     * walked. Each chunk can be traversed within the parallel_for body
     * (e.g. with a for loop within <b>f</b> using the same step).
     *
     * \note It requires some expertise.
     *
     * @param first first value of the iteration variable
     * @param last last value of the iteration variable
     * @param step step increment for the iteration variable
     * @param grain (> 0) minimum computation grain (n. of iterations
     *        scheduled together to a single worker)
     * @param f <b>f(const long start_idx, const long stop_idx, const int thid)</b>
     *        Lambda function, body of the parallel loop.
     *        <b>start_idx</b> and <b>stop_idx</b>: iteration bounds assigned
     *        to worker_id <b>thid</b>.
     * @param nw number of worker threads (default n. of platform HW contexts)
     */
    template <typename Function>
    inline void parallel_for_idx(long first, long last, long step, long grain,
                                 const Function& f, const long nw=FF_AUTO) {
        FF_PARFOR_START_IDX(this,parforidx,first,last,step,PARFOR_DYNAMIC(grain),nw) {
            f(ff_start_idx, ff_stop_idx,_ff_thread_id);
        } FF_PARFOR_STOP(this);
    }

    /**
     * @brief Parallel for region (step, grain) - static
     *
     * Static scheduling onto nw worker threads. Iterations are scheduled
     * in blocks of minimal size <b>grain > 1</b>, or in maximal partitions
     * when <b>grain == 0</b>. Iteration space is walked with stride
     * <b>step</b>.
     *
     * @param first first value of the iteration variable
     * @param last last value of the iteration variable
     * @param step step increment for the iteration variable
     * @param grain (> 0) minimum computation grain (n. of iterations
     *        scheduled together to a single worker); 0 selects maximal
     *        even partitioning
     * @param f <b>f(const long idx)</b>
     *        Lambda function, body of the parallel loop.
     * @param nw number of worker threads (default n. of platform HW contexts)
     */
    template <typename Function>
    inline void parallel_for_static(long first, long last, long step, long grain,
                                    const Function& f, const long nw=FF_AUTO) {
        if (grain==0 || nw==1) {
            // Divide the iteration space into evenly partitioned parts
            FF_PARFOR_START(this, parforidx,first,last,step,PARFOR_STATIC(grain),nw) {
                f(parforidx);
            } FF_PARFOR_STOP(this);
        } else {
            // Round-robin static assignment of fixed-size chunks
            FF_PARFOR_T_START_STATIC(this, int, parforidx,first,last,step,PARFOR_STATIC(grain),nw) {
                f(parforidx);
            } FF_PARFOR_T_STOP(this,int);
        }
    }
};
|
||
|
|
||
|
/*!
|
||
|
* \class ParallelForReduce
|
||
|
* \ingroup high_level_patterns
|
||
|
*
|
||
|
* \brief Parallel for and reduce. Run automatically.
|
||
|
*
|
||
|
* Set up the run-time for parallel for and parallel reduce.
|
||
|
*
|
||
|
* Parallel for: Identifies an iterative work-sharing construct that
|
||
|
* specifies a region
|
||
|
* (i.e. a Lambda function) in which the iterations of the associated loop
|
||
|
 * should be executed in parallel.
|
||
|
*
|
||
|
* Parallel reduce: reduce an array of T to a single value by way of
|
||
|
* an associative operation.
|
||
|
*
|
||
|
* \tparam T reduction op type: op(T,T) -> T
|
||
|
*/
|
||
|
|
||
|
template<typename T>
|
||
|
class ParallelForReduce: public ff_forall_farm<forallreduce_W<T> > {
|
||
|
public:
|
||
|
/**
|
||
|
* @brief Constructor
|
||
|
* @param maxnw Maximum number of worker threads
|
||
|
* @param spinwait \p true for noblocking support (run-time thread
|
||
|
* will never suspend, even between successive calls to \p parallel_for
|
||
|
* and \p parallel_reduce, useful when they are called in sequence on
|
||
|
* small kernels), \p false blocking support
|
||
|
*/
|
||
|
explicit ParallelForReduce(const long maxnw=FF_AUTO, bool spinwait=false, bool spinbarrier=false):
|
||
|
ff_forall_farm<forallreduce_W<T> >(maxnw,spinwait,false,spinbarrier) {}
|
||
|
|
||
|
|
||
|
// this constructor is useful to skip loop warmup and to disable spinwait
|
||
|
/**
|
||
|
* @brief Constructor
|
||
|
* @param maxnw Maximum number of worker threads
|
||
|
* @param spinWait \p true Noblocking support (run-time thread
|
||
|
* will never suspend, even between successive calls to \p parallel_for
|
||
|
* and \p parallel_reduce, useful when they are called in sequence on
|
||
|
* small kernels), \p false blocking support
|
||
|
* @param skipWarmup Skip warmup phase (autotuning)
|
||
|
*/
|
||
|
ParallelForReduce(const long maxnw, bool /*spinWait*/, bool /*skipWarmup*/, bool /*spinbarrier*/):
|
||
|
ff_forall_farm<forallreduce_W<T> >(maxnw,false, true, false) {}
|
||
|
|
||
|
|
||
|
~ParallelForReduce() {
|
||
|
ff_forall_farm<forallreduce_W<T> >::stop();
|
||
|
ff_forall_farm<forallreduce_W<T> >::wait();
|
||
|
}
|
||
|
|
||
|
// By calling this method with 'true' the scheduler will be disabled,
|
||
|
// to restore the usage of the scheduler thread just pass 'false' as
|
||
|
// parameter
|
||
|
inline void disableScheduler(bool onoff=true) {
|
||
|
ff_forall_farm<forallreduce_W<T> >::disableScheduler(onoff);
|
||
|
}
|
||
|
|
||
|
// It puts all spinning threads to sleep. It does not disable the spinWait flag
|
||
|
// so at the next call, threads start spinning again.
|
||
|
inline int threadPause() {
|
||
|
return ff_forall_farm<forallreduce_W<T> >::stopSpinning();
|
||
|
}
|
||
|
|
||
|
/* -------------------- parallel_for -------------------- */
|
||
|
/**
|
||
|
* \brief Parallel for region (basic) - static
|
||
|
*
|
||
|
* Static scheduling onto nw worker threads.
|
||
|
* Data is statically partitioned in blocks, i.e.
|
||
|
* partition size = last-first/(nw*step).
|
||
|
*
|
||
|
* \param first first value of the iteration variable
|
||
|
* \param last last value of the iteration variable
|
||
|
* \param f <b>f(const long idx)</b> Lambda function,
|
||
|
* body of the parallel loop. <b>idx</b>: iterator
|
||
|
* \param nw number of worker threads (default FF_AUTO)
|
||
|
*/
|
||
|
template <typename Function>
|
||
|
inline void parallel_for(long first, long last, const Function& f,
|
||
|
const long nw=FF_AUTO) {
|
||
|
FF_PARFOR_T_START(this, T, parforidx,first,last,1,PARFOR_STATIC(0),nw) {
|
||
|
f(parforidx);
|
||
|
} FF_PARFOR_T_STOP(this,T);
|
||
|
}
|
||
|
/**
|
||
|
* \brief Parallel for region (step) - static
|
||
|
*
|
||
|
* Static scheduling onto nw worker threads.
|
||
|
* Data is statically partitioned in blocks, i.e.
|
||
|
* partition size = last-first/(nw*step).
|
||
|
*
|
||
|
* \param first first value of the iteration variable
|
||
|
* \param last last value of the iteration variable
|
||
|
* \param step step increment for the iteration variable
|
||
|
* \param f <b>f(const long idx)</b> body of the parallel loop
|
||
|
* \param nw number of worker threads
|
||
|
*/
|
||
|
template <typename Function>
|
||
|
inline void parallel_for(long first, long last, long step, const Function& f,
|
||
|
const long nw=FF_AUTO) {
|
||
|
FF_PARFOR_T_START(this, T, parforidx,first,last,step,PARFOR_STATIC(0),nw) {
|
||
|
f(parforidx);
|
||
|
} FF_PARFOR_T_STOP(this,T);
|
||
|
}
|
||
|
/**
|
||
|
* @brief Parallel for region (step, grain) - dynamic
|
||
|
*
|
||
|
* Dynamic scheduling onto nw worker threads. Iterations are scheduled in
|
||
|
* blocks of minimal size \p grain.
|
||
|
* Iteration space is walked with stride \p step.
|
||
|
*
|
||
|
* @param first first value of the iteration variable
|
||
|
* @param last last value of the iteration variable
|
||
|
* @param step step increment for the iteration variable
|
||
|
* @param grain (> 0) minimum computation grain
|
||
|
* (n. of iterations scheduled together to a single worker)
|
||
|
* @param f <b>f(const long idx)</b> Lambda function,
|
||
|
* body of the parallel loop. <b>idx</b>: iteration
|
||
|
* param nw number of worker threads
|
||
|
*/
|
||
|
template <typename Function>
|
||
|
inline void parallel_for(long first, long last, long step, long grain,
|
||
|
const Function& f, const long nw=FF_AUTO) {
|
||
|
FF_PARFOR_T_START(this, T, parforidx,first,last,step,PARFOR_DYNAMIC(grain),nw) {
|
||
|
f(parforidx);
|
||
|
} FF_PARFOR_STOP(this);
|
||
|
}
|
||
|
/**
|
||
|
* @brief Parallel for region with threadID (step, grain, thid) - dynamic
|
||
|
*
|
||
|
* Dynamic scheduling onto nw worker threads. Iterations are scheduled in
|
||
|
* blocks of minimal size \p grain.
|
||
|
* Iteration space is walked with stride \p step. \p thid Worker thread ID
|
||
|
* is made available via a Lambda parameter.
|
||
|
*
|
||
|
* @param first first value of the iteration variable
|
||
|
* @param last last value of the iteration variable
|
||
|
* @param step step increment for the iteration variable
|
||
|
* @param grain minimum computation grain (n. of iterations scheduled together to a single worker)
|
||
|
* @param f <b>f(const long idx, const int thid)</b> Lambda function, body of the parallel loop. <b>idx</b>: iteration, <b>thid</b>: worker_id
|
||
|
* @param nw number of worker threads (default n. of platform HW contexts)
|
||
|
*/
|
||
|
template <typename Function>
|
||
|
inline void parallel_for_thid(long first, long last, long step, long grain,
|
||
|
const Function& f, const long nw=FF_AUTO) {
|
||
|
FF_PARFOR_T_START(this,T, parforidx,first,last,step,PARFOR_DYNAMIC(grain),nw) {
|
||
|
f(parforidx,_ff_thread_id);
|
||
|
} FF_PARFOR_T_STOP(this,T);
|
||
|
}
|
||
|
/**
|
||
|
* @brief Parallel for region with indexes ranges (step, grain, thid, idx) -
|
||
|
* dynamic - advanced usage
|
||
|
*
|
||
|
* Dynamic scheduling onto nw worker threads. Iterations are scheduled in
|
||
|
* blocks of minimal size <b>grain</b>. Iteration space is walked with stride
|
||
|
* <b>step</b>. A block of <b>grain</b> iterations are assigned to each worker but
|
||
|
* they are not automatically walked. Each block can be traversed within the
|
||
|
* parallel_for body (e.g. with a for loop within <b>f</b> with the same step).
|
||
|
*
|
||
|
* \note Useful in few cases only - requires some expertise
|
||
|
*
|
||
|
* @param first first value of the iteration variable
|
||
|
* @param last last value of the iteration variable
|
||
|
* @param step step increment for the iteration variable
|
||
|
* @param grain (> 0) minimum computation grain (n. of iterations scheduled
|
||
|
* together to a single worker)
|
||
|
* @param f <b>f(const long start_idx, const long stop_idx, const int thid)
|
||
|
* </b> Lambda function, body of the parallel loop.
|
||
|
* <b>start_idx</b> and <b>stop_idx</b>: iteration bounds assigned to
|
||
|
* worker_id <b>thid</b>.
|
||
|
* @param nw number of worker threads (default n. of platform HW contexts)
|
||
|
*/
|
||
|
template <typename Function>
|
||
|
inline void parallel_for_idx(long first, long last, long step, long grain,
|
||
|
const Function& f, const long nw=FF_AUTO) {
|
||
|
|
||
|
FF_PARFOR_T_START_IDX(this,T, parforidx,first,last,step,PARFOR_DYNAMIC(grain),nw) {
|
||
|
f(ff_start_idx, ff_stop_idx,_ff_thread_id);
|
||
|
} FF_PARFOR_T_STOP(this,T);
|
||
|
}
|
||
|
/**
|
||
|
* \brief Parallel for region (step) - static
|
||
|
*
|
||
|
* Static scheduling onto nw worker threads.
|
||
|
* Iteration space is walked with stride <b>step</b>.
|
||
|
* Data is statically partitioned in blocks, i.e.
|
||
|
* partition size = last-first/(nw*step)
|
||
|
*
|
||
|
* \param first first value of the iteration variable
|
||
|
* \param last last value of the iteration variable
|
||
|
* \param step step increment for the iteration variable
|
||
|
* \param f <b>f(const long idx)</b> body of the parallel loop
|
||
|
* \param nw number of worker threads
|
||
|
*/
|
||
|
template <typename Function>
|
||
|
inline void parallel_for_static(long first, long last, long step, long grain,
|
||
|
const Function& f, const long nw=FF_AUTO) {
|
||
|
if (grain==0 || nw==1) {
|
||
|
FF_PARFOR_T_START(this, T, parforidx,first,last,step,PARFOR_STATIC(grain),nw) {
|
||
|
f(parforidx);
|
||
|
} FF_PARFOR_T_STOP(this,T);
|
||
|
} else {
|
||
|
FF_PARFOR_T_START_STATIC(this, T, parforidx,first,last,step,PARFOR_STATIC(grain),nw) {
|
||
|
f(parforidx);
|
||
|
} FF_PARFOR_T_STOP(this,T);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* ------------------ parallel_reduce ------------------- */
|
||
|
/**
|
||
|
* \brief Parallel reduce (basic)
|
||
|
*
|
||
|
* Data is statically partitioned in blocks, i.e.
|
||
|
* partition size = last-first/(nw*step)
|
||
|
*
|
||
|
* Reduce is executed in two phases: the first phase execute in
|
||
|
* parallel a partial reduce (by way of \p partialreduce_body function),
|
||
|
* the second reduces partial results (by way of \p finalresult_body).
|
||
|
* Typically the two function are really the same.
|
||
|
*
|
||
|
* \param var inital value of reduction variable (accumulator)
|
||
|
* \param indentity indetity value for the reduction function
|
||
|
* \param first first value of the iteration variable
|
||
|
* \param last last value of the iteration variable
|
||
|
* \param partialreduce_body reduce operation (1st phase, executed in parallel)
|
||
|
* \param finalreduce_body reduce operation (2nd phase, executed sequentially)
|
||
|
* \param nw number of worker threads
|
||
|
*/
|
||
|
template <typename Function, typename FReduction>
|
||
|
inline void parallel_reduce(T& var, const T& identity,
|
||
|
long first, long last,
|
||
|
const Function& partialreduce_body, const FReduction& finalreduce_body,
|
||
|
const long nw=FF_AUTO) {
|
||
|
FF_PARFORREDUCE_START(this, var, identity, parforidx, first, last, 1, PARFOR_STATIC(0), nw) {
|
||
|
partialreduce_body(parforidx, var);
|
||
|
} FF_PARFORREDUCE_F_STOP(this, var, finalreduce_body);
|
||
|
}
|
||
|
/**
|
||
|
* \brief Parallel reduce (step)
|
||
|
*
|
||
|
* Iteration space is walked with stride <b>step</b>.
|
||
|
* Data is statically partitioned in blocks, i.e.
|
||
|
* partition size = last-first/(nw*step)
|
||
|
*
|
||
|
* Reduce is executed in two phases: the first phase execute in
|
||
|
* parallel a partial reduce (by way of \p partialreduce_body function),
|
||
|
* the second reduces partial results (by way of \p finalresult_body).
|
||
|
* Typically the two function are really the same.
|
||
|
*
|
||
|
* \param var inital value of reduction variable (accumulator)
|
||
|
* \param indentity indetity value for the reduction function
|
||
|
* \param first first value of the iteration variable
|
||
|
* \param last last value of the iteration variable
|
||
|
* \param step step increment for the iteration variable
|
||
|
* \param partialreduce_body reduce operation (1st phase, executed in parallel)
|
||
|
* \param finalreduce_body reduce operation (2nd phase, executed sequentially)
|
||
|
* \param nw number of worker threads
|
||
|
*/
|
||
|
template <typename Function, typename FReduction>
|
||
|
inline void parallel_reduce(T& var, const T& identity,
|
||
|
long first, long last, long step,
|
||
|
const Function& body, const FReduction& finalreduce,
|
||
|
const long nw=FF_AUTO) {
|
||
|
FF_PARFORREDUCE_START(this, var, identity, parforidx,first,last,step,PARFOR_STATIC(0),nw) {
|
||
|
body(parforidx, var);
|
||
|
} FF_PARFORREDUCE_F_STOP(this, var, finalreduce);
|
||
|
}
|
||
|
/**
|
||
|
* \brief Parallel reduce (step, grain)
|
||
|
*
|
||
|
* Dynamic scheduling onto nw worker threads. Iterations are scheduled in
|
||
|
* blocks of minimal size \p grain.
|
||
|
* Iteration space is walked with stride /p step.
|
||
|
*
|
||
|
* Reduce is executed in two phases: the first phase execute in
|
||
|
* parallel a partial reduce (by way of \p partialreduce_body function),
|
||
|
* the second reduces partial results (by way of \p finalresult_body).
|
||
|
* Typically the two function are really the same.
|
||
|
*
|
||
|
* \param var inital value of reduction variable (accumulator)
|
||
|
* \param indentity indetity value for the reduction function
|
||
|
* \param first first value of the iteration variable
|
||
|
* \param last last value of the iteration variable
|
||
|
* \param step step increment for the iteration variable
|
||
|
* \param partialreduce_body reduce operation (1st phase, executed in parallel)
|
||
|
* \param finalreduce_body reduce operation (2nd phase, executed sequentially)
|
||
|
* \param nw number of worker threads
|
||
|
*/
|
||
|
template <typename Function, typename FReduction>
|
||
|
inline void parallel_reduce(T& var, const T& identity,
|
||
|
long first, long last, long step, long grain,
|
||
|
const Function& body, const FReduction& finalreduce,
|
||
|
const long nw=FF_AUTO) {
|
||
|
FF_PARFORREDUCE_START(this, var, identity, parforidx,first,last,step,PARFOR_DYNAMIC(grain),nw) {
|
||
|
body(parforidx, var);
|
||
|
} FF_PARFORREDUCE_F_STOP(this, var, finalreduce);
|
||
|
}
|
||
|
|
||
|
template <typename Function, typename FReduction>
|
||
|
inline void parallel_reduce_thid(T& var, const T& identity,
|
||
|
long first, long last, long step, long grain,
|
||
|
const Function& body, const FReduction& finalreduce,
|
||
|
const long nw=FF_AUTO) {
|
||
|
FF_PARFORREDUCE_START(this, var, identity, parforidx,first,last,step,PARFOR_DYNAMIC(grain),nw) {
|
||
|
body(parforidx, var, _ff_thread_id);
|
||
|
} FF_PARFORREDUCE_F_STOP(this, var, finalreduce);
|
||
|
}
|
||
|
|
||
|
template <typename Function, typename FReduction>
|
||
|
inline void parallel_reduce_idx(T& var, const T& identity,
|
||
|
long first, long last, long step, long grain,
|
||
|
const Function& body, const FReduction& finalreduce,
|
||
|
const long nw=FF_AUTO) {
|
||
|
FF_PARFORREDUCE_START_IDX(this, var, identity, idx,first,last,step,PARFOR_DYNAMIC(grain),nw) {
|
||
|
body(ff_start_idx, ff_stop_idx, var, _ff_thread_id);
|
||
|
} FF_PARFORREDUCE_F_STOP(this, var, finalreduce);
|
||
|
}
|
||
|
|
||
|
    /**
     * \brief Parallel reduce region (step) - static
     *
     * Static scheduling onto nw worker threads.
     * Iteration space is walked with stride \p step.
     * Data is statically partitioned in blocks, i.e.
     * partition size = (last-first)/(nw*step)
     *
     * \param var initial value of the reduction variable (accumulator)
     * \param identity identity value for the reduction function
     * \param first first value of the iteration variable
     * \param last last value of the iteration variable
     * \param step step increment for the iteration variable
     * \param grain chunk size; 0 selects the default static partitioning
     *        (~ one block of iterations per worker)
     * \param body <b>body(const long idx, T& var)</b> body of the parallel loop
     * \param finalreduce binary operation combining the per-thread partial results
     * \param nw number of worker threads
     */
    template <typename Function, typename FReduction>
    inline void parallel_reduce_static(T& var, const T& identity,
                                       long first, long last, long step, long grain,
                                       const Function& body, const FReduction& finalreduce,
                                       const long nw=FF_AUTO) {
        // grain==0 or a single worker: fall back to the default scheduling
        // (raw 'grain' passed through); otherwise use the static scheduling
        // variant with blocks of PARFOR_STATIC(grain) iterations.
        if (grain==0 || nw==1) {
            FF_PARFORREDUCE_START(this, var, identity, parforidx,first,last,step,grain,nw) {
                body(parforidx, var);
            } FF_PARFORREDUCE_F_STOP(this, var, finalreduce);
        } else {
            FF_PARFORREDUCE_START_STATIC(this, var, identity, parforidx,first,last,step,PARFOR_STATIC(grain),nw) {
                body(parforidx, var);
            } FF_PARFORREDUCE_F_STOP(this, var, finalreduce);
        }
    }
|
||
|
|
||
|
};
|
||
|
|
||
|
|
||
|
//#ifndef WIN32 //VS12
|
||
|
|
||
|
//! ParallelForPipeReduce class
/**
 * \brief Parallel pipelined map+reduce
 *
 * Two-stage ff_pipeline: the first stage (pfr) is a farm of workers
 * executing the Map part over the iteration space; the second stage
 * (reduce) is a multi-input node applying the user Reduce function to each
 * task streamed out of the Map workers.
 *
 * NOTE(review): tasks travel through the pipeline as void* and are converted
 * back with a plain reinterpret_cast<task_t>, which only compiles if task_t
 * is itself a pointer (or integral) type -- confirm against the callers.
 */
template<typename task_t>
class ParallelForPipeReduce: public ff_pipeline {
protected:
    // Map stage: farm of workers running the parallel-for body.
    ff_forall_farm<forallpipereduce_W> pfr;
    // Reduce stage: single multi-input node collecting the workers' output.
    struct reduceStage: ff_minode {
        typedef std::function<void(const task_t &)> F_t;
        // Applies the user reduce function F to each incoming task.
        // Always returns GO_ON: the stage keeps running until the stream ends.
        inline void *svc(void *t) {
            const task_t& task=reinterpret_cast<task_t>(t);
            F(task);
            return GO_ON;
        }
        inline int wait() { return ff_minode::wait(); }
        // Installs the reduce callback; called before each run.
        inline void setF(F_t f) { F = f; }

        F_t F;   // user reduce function, set via setF()
    } reduce;

public:
    /**
     * Builds the two-stage pipeline and performs a warmup run so that later
     * calls do not pay the thread-startup cost.
     *
     * \param maxnw maximum number of Map worker threads
     * \param spinwait if true, Map workers busy-wait between runs
     *        (spinning is never enabled for the Reduce stage)
     */
    explicit ParallelForPipeReduce(const long maxnw=FF_AUTO, bool spinwait=false, bool /*spinbarrier*/=false):
        pfr(maxnw,false,true,false) // skip loop warmup and disable spinwait/spinbarrier
    {
        ff_pipeline::add_stage(&pfr);
        ff_pipeline::add_stage(&reduce);

        // required to avoid error
        pfr.remove_collector();

        // avoiding initial barrier
        if (ff_pipeline::dryrun()<0) // preparing all connections
            error("ParallelForPipeReduce: preparing pipe\n");

        // warmup phase: start both stages once, then freeze them; wait_freezing
        // reports <0 if either stage failed to start
        pfr.resetskipwarmup();
        auto r=-1;
        if (pfr.run_then_freeze() != -1)
            if (reduce.run_then_freeze() != -1)
                r = ff_pipeline::wait_freezing();
        if (r<0) error("ParallelForPipeReduce: running pipe\n");


        if (spinwait) { // NOTE: spinning is enabled only for the Map part and not for the Reduce part
            if (pfr.enableSpinning() == -1)
                error("ParallelForPipeReduce: enabling spinwait\n");
        }
    }

    // Stops the Map farm and waits for both stages to terminate.
    ~ParallelForPipeReduce() {
        pfr.stop(); pfr.wait();
        reduce.wait();
    }

    // By calling this method with 'true' the scheduler will be disabled,
    // to restore the usage of the scheduler thread just pass 'false' as
    // parameter
    inline void disableScheduler(bool onoff=true) {
        pfr.disableScheduler(onoff);
    }

    // It puts all spinning threads to sleep. It does not disable the spinWait flag
    // so at the next call, threads start spinning again.
    inline int threadPause() {
        return pfr.stopSpinning();
    }

    /**
     * \brief map only call
     *
     * Runs only the Map part of the pipeline; the Reduce stage is given a
     * no-op callback so any task emitted by the workers is discarded.
     */
    template <typename Function>
    inline void parallel_for_idx(long first, long last, long step, long grain,
                                 const Function& Map, const long nw=FF_AUTO) {

        // setloop decides the real number of worker threads that will be
        // started -- it may differ from nw!
        pfr.setloop(first,last,step,grain,nw);
        pfr.setF(Map);
        auto donothing=[](task_t) { };   // Reduce stage discards its input
        reduce.setF(donothing);
        auto r=-1;
        if (pfr.run_then_freeze(pfr.getnw()) != -1)
            if (reduce.run_then_freeze(pfr.getnw()) != -1)
                r = ff_pipeline::wait_freezing();
        if (r<0) error("ParallelForPipeReduce: parallel_for_idx, starting pipe\n");
    }

    /**
     * \brief pipe(map,reduce)
     *
     * Runs the Map part on the farm workers and the user-provided Reduce
     * function on the collector stage, as a pipeline.
     */
    template <typename Function, typename FReduction>
    inline void parallel_reduce_idx(long first, long last, long step, long grain,
                                    const Function& Map, const FReduction& Reduce,
                                    const long nw=FF_AUTO) {

        // setloop decides the real number of worker threads that will be
        // started -- it may differ from nw!
        pfr.setloop(first,last,step,grain,nw);
        pfr.setF(Map);
        reduce.setF(Reduce);
        auto r=-1;
        if (pfr.run_then_freeze(pfr.getnw()) != -1)
            if (reduce.run_then_freeze(pfr.getnw()) != -1)
                r = ff_pipeline::wait_freezing();
        if (r<0) error("ParallelForPipeReduce: parallel_reduce_idx, starting pipe\n");
    }
};
|
||
|
//#endif //VS12
|
||
|
|
||
|
/// ---------------------------------------------------------------------------------
/// These are the one-shot versions: no object instance needs to be created.
/// They are useful (and more efficient) for a single parallel-loop execution,
/// or whenever no extra settings are needed.
|
||
|
|
||
|
// ----------------- parallel_for ----------------------
//! Parallel loop over a range of indexes (step=1)
/**
 * One-shot version: no ParallelFor object is required.
 *
 * \param first first value of the iteration variable
 * \param last last value of the iteration variable
 * \param body <b>body(const long idx)</b> body of the parallel loop
 * \param nw number of worker threads
 */
template <typename Function>
static void parallel_for(long first, long last, const Function& body,
                         const long nw=FF_AUTO) {
    // PARFOR_STATIC(0): default static scheduling (see file header notes).
    FF_PARFOR_BEGIN(pfor, parforidx,first,last,1,PARFOR_STATIC(0),nw) {
        body(parforidx);
    } FF_PARFOR_END(pfor);
}
|
||
|
//! Parallel loop over a range of indexes using a given step
/**
 * One-shot version: no ParallelFor object is required.
 *
 * \param first first value of the iteration variable
 * \param last last value of the iteration variable
 * \param step step increment for the iteration variable
 * \param body <b>body(const long idx)</b> body of the parallel loop
 * \param nw number of worker threads
 */
template <typename Function>
static void parallel_for(long first, long last, long step, const Function& body,
                         const long nw=FF_AUTO) {
    // PARFOR_STATIC(0): default static scheduling (see file header notes).
    FF_PARFOR_BEGIN(pfor, parforidx,first,last,step,PARFOR_STATIC(0),nw) {
        body(parforidx);
    } FF_PARFOR_END(pfor);
}
|
||
|
//! Parallel loop over a range of indexes using a given step and granularity
/**
 * One-shot version: no ParallelFor object is required.
 *
 * \param first first value of the iteration variable
 * \param last last value of the iteration variable
 * \param step step increment for the iteration variable
 * \param grain chunk size; passed through as-is (see the scheduling notes
 *        in the file header for how 0/-1/>0 are interpreted)
 * \param body <b>body(const long idx)</b> body of the parallel loop
 * \param nw number of worker threads
 */
template <typename Function>
static void parallel_for(long first, long last, long step, long grain,
                         const Function& body, const long nw=FF_AUTO) {
    FF_PARFOR_BEGIN(pfor, parforidx,first,last,step,grain,nw) {
        body(parforidx);
    } FF_PARFOR_END(pfor);
}
|
||
|
|
||
|
// advanced version
/**
 * One-shot parallel loop by chunk bounds: the body receives the bounds of
 * the chunk assigned to the worker (ff_start_idx / ff_stop_idx, names
 * introduced by the macro) and the worker-thread id, and is expected to
 * loop over that chunk itself. Chunks of size \p grain are scheduled
 * dynamically.
 *
 * \param f <b>f(const long start, const long stop, const int thid)</b>
 * \param nw number of worker threads
 * \param noActiveScheduler if true, no active scheduler thread is used
 *        -- presumably workers self-schedule; confirm against the
 *        FF_PARFOR_BEGIN_IDX macro definition.
 */
template <typename Function>
inline void parallel_for_idx(long first, long last, long step, long grain,
                             const Function& f, const long nw=FF_AUTO,
                             const bool noActiveScheduler=false) {
    FF_PARFOR_BEGIN_IDX(pfor,parforidx,first,last,step,PARFOR_DYNAMIC(grain),nw,noActiveScheduler){
        f(ff_start_idx, ff_stop_idx,_ff_thread_id);
    } FF_PARFOR_END(pfor);
}
|
||
|
|
||
|
|
||
|
|
||
|
// -------------- parallel_reduce -------------------
//! One-shot parallel loop with final reduction (dynamic scheduling)
/**
 * \param var initial value of the reduction variable (accumulator);
 *        on return it holds the reduced result
 * \param identity identity value for the reduction function
 * \param first first value of the iteration variable
 * \param last last value of the iteration variable
 * \param step step increment for the iteration variable
 * \param grain chunk size for dynamic scheduling
 * \param body <b>body(const long idx, Value_t& var)</b> body of the parallel loop
 * \param finalreduce binary operation combining the per-thread partial results
 * \param nw number of worker threads
 */
template <typename Function, typename Value_t, typename FReduction>
void parallel_reduce(Value_t& var, const Value_t& identity,
                     long first, long last, long step, long grain,
                     const Function& body, const FReduction& finalreduce,
                     const long nw=FF_AUTO) {
    // Accumulate into a local copy so 'var' is written only once at the end.
    Value_t _var = var;
    FF_PARFORREDUCE_BEGIN(pfr, _var, identity, parforidx, first, last, step, PARFOR_DYNAMIC(grain), nw) {
        body(parforidx, _var);
    } FF_PARFORREDUCE_F_END(pfr, _var, finalreduce);
    var=_var;
}
|
||
|
|
||
|
// advanced version
/**
 * One-shot parallel reduce by chunk bounds: the body receives the bounds of
 * the chunk assigned to the worker (ff_start_idx / ff_stop_idx, names
 * introduced by the macro), the accumulator, and the worker-thread id.
 *
 * \param var initial value of the reduction variable (accumulator);
 *        on return it holds the reduced result
 * \param identity identity value for the reduction function
 * \param grain chunk size for dynamic scheduling
 * \param body <b>body(const long start, const long stop, Value_t& var, const int thid)</b>
 * \param finalreduce binary operation combining the per-thread partial results
 * \param nw number of worker threads
 * \param noActiveScheduler if true, no active scheduler thread is used
 *        -- presumably workers self-schedule; confirm against the macro.
 */
template <typename Function, typename Value_t, typename FReduction>
void parallel_reduce_idx(Value_t& var, const Value_t& identity,
                         long first, long last, long step, long grain,
                         const Function& body, const FReduction& finalreduce,
                         const long nw=FF_AUTO, const bool noActiveScheduler=false) {
    // Accumulate into a local copy so 'var' is written only once at the end.
    Value_t _var = var;
    FF_PARFORREDUCE_BEGIN_IDX(pfr, _var, identity, idx,first,last,step,PARFOR_DYNAMIC(grain),nw,noActiveScheduler) {
        body(ff_start_idx, ff_stop_idx, _var, _ff_thread_id);
    } FF_PARFORREDUCE_F_END(pfr, _var, finalreduce);
    var=_var;
}
|
||
|
|
||
|
|
||
|
} // namespace ff
|
||
|
|
||
|
#endif /* FF_PARFOR_HPP */
|
||
|
|