mesytec-mnode/external/taskflow-3.8.0/3rd-party/ff/node.hpp

/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */

/*!
 * \link
 * \file node.hpp
 * \ingroup building_blocks
 *
 * \brief FastFlow ff_node
 *
 * @detail FastFlow basic contanier for a shared-memory parallel activity
 *
 */

#ifndef FF_NODE_HPP
#define FF_NODE_HPP

/* ***************************************************************************
 *
 *  FastFlow is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License version 3 as
 *  published by the Free Software Foundation.
 *  Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
 *  or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
 *
 *  This program is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 *  License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software Foundation,
 *  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 ****************************************************************************
 */

#include <stdlib.h>
#include <iosfwd>
#include <functional>
#include <ff/platforms/platform.h>
#include <ff/cycle.h>
#include <ff/utils.hpp>
#include <ff/buffer.hpp>
#include <ff/ubuffer.hpp>
#include <ff/mapper.hpp>
#include <ff/config.hpp>
#include <ff/svector.hpp>
#include <ff/barrier.hpp>
#include <atomic>

#ifdef DFF_ENABLED

#include <ff/distributed/ff_network.hpp>
#include <ff/distributed/ff_typetraits.hpp>
#include <cereal/cereal.hpp>
#include <cereal/types/polymorphic.hpp>
#include <cereal/archives/portable_binary.hpp>

#endif


namespace ff {

// distributed rts related type, but always defined
struct GroupInterface;


static void* FF_EOS           = (void*)(ULLONG_MAX);     /// automatically propagated
static void* FF_EOS_NOFREEZE  = (void*)(ULLONG_MAX-1);   /// not automatically propagated
static void* FF_EOSW          = (void*)(ULLONG_MAX-2);   /// propagated only by farm's stages
static void* FF_GO_ON         = (void*)(ULLONG_MAX-3);   /// not automatically propagated
static void* FF_GO_OUT        = (void*)(ULLONG_MAX-4);   /// not automatically propagated
static void* FF_TAG_MIN       = (void*)(ULLONG_MAX-10);  /// just a lower bound mark
// The FF_GO_OUT is quite similar to the FF_EOS_NOFREEZE. Both of them are not propagated automatically to
// the next stage, but while the first one is used to exit the main computation loop and, if this is the case, to be frozen,
// the second one is used to exit the computation loop and keep spinning on the input queue waiting for a new task
// without being frozen.
// EOSW is like EOS but it is not propagated outside a farm pattern. If an emitter receives EOSW in input,
// then it will be discarded.
//

/* optimization levels used in the optimize_static call (see optimize.hpp) */
struct OptLevel {
    ssize_t  max_nb_threads{MAX_NUM_THREADS};
    ssize_t  max_mapped_threads{MAX_NUM_THREADS};
    int      verbose_level{0};
    bool     no_initial_barrier{false};
    bool     no_default_mapping{false};
    bool     blocking_mode{false};
    bool     merge_with_emitter{false};
    bool     remove_collector{false};
    bool     merge_farms{false};
    bool     introduce_a2a{false};
};
struct OptLevel1: OptLevel {
    OptLevel1() {
        max_nb_threads=ff_numCores();   // TODO: use the mapper
        blocking_mode=true;
        no_initial_barrier=true;
        remove_collector=true;
    }
};
struct OptLevel2: OptLevel {
    OptLevel2() {
        max_nb_threads=ff_numCores();   // TODO: use the mapper
        blocking_mode=true;
        no_initial_barrier=true;
        merge_with_emitter=true;
        remove_collector=true;
        merge_farms= true;
    }
};
/* ----------------------------------------------------------------------- */

// This is just a counter, and is used to set the ff_node::tid value.
// The _noBarrier counter is to use with threads that are not part of a topology,
// such for example stand-alone nodes or manager node or ...etc...
static std::atomic_ulong   internal_threadCounter{0};
static std::atomic_ulong   internal_threadCounter_noBarrier{MAX_NUM_THREADS};

// TODO: Should be rewritten in terms of mapping_utils.hpp
#if defined(HAVE_PTHREAD_SETAFFINITY_NP) && !defined(NO_DEFAULT_MAPPING)

    /*
     *
     * \brief Initialize thread affinity
     * It initializes thread affinity i.e. which cpu the thread should be
     * assigned.
     *
     * \note Linux-specific code
     *
     * \param attr is the pthread attribute
     * \param cpuID is the identifier the core
     * \return -2  if error, the cpu identifier if successful
     */
static inline int init_thread_affinity(pthread_attr_t*attr, int cpuId) {
    // This is linux-specific code
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);

    int id;
    if (cpuId<0) {
        id = threadMapper::instance()->getCoreId();
        CPU_SET (id, &cpuset);
    } else  {
        id = cpuId;
        CPU_SET (cpuId, &cpuset);
    }

    if (pthread_attr_setaffinity_np (attr, sizeof(cpuset), &cpuset)<0) {
        perror("pthread_attr_setaffinity_np");
        return -2;
    }
    return id;
}
#elif !defined(HAVE_PTHREAD_SETAFFINITY_NP) && !defined(NO_DEFAULT_MAPPING)

/*
 * \brief Initializes thread affinity
 *
 * It initializes thread affinity i.e. it defines to which core ths thread
 * should be assigned.
 *
 * \return always return -1 because no thread mapping is done
 */
static inline int init_thread_affinity(pthread_attr_t*,int) {
    // Ensure that the threadMapper constructor is called
    threadMapper::instance();
    return -1;
}
#else
/*
 * \brief Initializes thread affinity
 *
 * It initializes thread affinity i.e. it defines to which core ths thread
 * should be assigned.
 *
 * \return always return -1 because no thread mapping is done
 */
static inline int init_thread_affinity(pthread_attr_t*,int) {
    // Do nothing
    return -1;
}
#endif /* HAVE_PTHREAD_SETAFFINITY_NP */


// forward decl
/*
 * \brief Proxy thread routine
 *
 */
static void * proxy_thread_routine(void * arg);

/*!
 *  \class ff_thread
 *  \ingroup buiding_blocks
 *
 *  \brief thread container for (leaves) ff_node
 *
 * It defines FastFlow's threading abstraction to run ff_node in parallel
 * in the shared-memory runtime
 *
 * \note Should not be used directly, it is called by ff_node
 */
class ff_thread {

    friend void * proxy_thread_routine(void *arg);

protected:
    ff_thread(BARRIER_T * barrier=NULL, bool default_mapping=true):
        tid((size_t)-1),threadid(0), default_mapping(default_mapping),
        barrier(barrier), stp(true), // only one shot by default
        spawned(false), freezing(0), frozen(false),isdone(false),
        init_error(false), attr(NULL) {
        (void)FF_TAG_MIN; // to avoid warnings

        /* Attr is NULL, default mutex attributes are used. Upon successful
         * initialization, the state of the mutex becomes initialized and
         * unlocked.
         * */
        if (pthread_mutex_init(&mutex,NULL)!=0) {
            error("FATAL ERROR: ff_thread: pthread_mutex_init fails!\n");
            abort();
        }
        if (pthread_cond_init(&cond,NULL)!=0) {
            error("FATAL ERROR: ff_thread: pthread_cond_init fails!\n");
            abort();
        }
        if (pthread_cond_init(&cond_frozen,NULL)!=0) {
            error("FATAL ERROR: ff_thread: pthread_cond_init fails!\n");
            abort();
        }
    }

    virtual ~ff_thread() {}

    void thread_routine() {
        threadid = ff_getThreadID();
#if defined(FF_INITIAL_BARRIER)
        if (barrier) {
            barrier->doBarrier(tid);
        }
        /* else {
         *    printf("THREAD %ld skip barrier\n", threadid);
         * }
         */
#endif
        void * ret;
        do {
            init_error=false;
            if (svc_init()<0) {
                error("ff_thread, svc_init failed, thread exit!!!\n");
                init_error=true;
                break;
            } else  {
                ret = svc(NULL);
            }
            svc_end();

            if (disable_cancelability()) {
                error("ff_thread, thread_routine, could not change thread cancelability");
                return;
            }

            // acquire lock. While freezing is true,
            // freeze and wait.
            pthread_mutex_lock(&mutex);
            if (ret != FF_EOS_NOFREEZE && !stp) {
                if ((freezing == 0) && (ret == FF_EOS)) stp = true;
                while(freezing==1) { // NOTE: freezing can change to 2
                    frozen=true;
                    pthread_cond_signal(&cond_frozen);
                    pthread_cond_wait(&cond,&mutex);
                }
            }

            //thawed=true;
            //pthread_cond_signal(&cond);
            //frozen=false;
            if (freezing != 0) freezing = 1; // freeze again next time
            pthread_mutex_unlock(&mutex);

            if (enable_cancelability()) {
                error("ff_thread, thread_routine, could not change thread cancelability");
                return;
            }
        } while(!stp);

        if (freezing) {
            pthread_mutex_lock(&mutex);
            frozen=true;
            pthread_cond_signal(&cond_frozen);
            pthread_mutex_unlock(&mutex);
        }
        isdone = true;
    }

    int disable_cancelability() {
        if (pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &old_cancelstate)) {
            perror("pthread_setcanceltype");
            return -1;
        }
        return 0;
    }

    int enable_cancelability() {
        if (pthread_setcancelstate(old_cancelstate, 0)) {
            perror("pthread_setcanceltype");
            return -1;
        }
        return 0;
    }


#if defined(FF_TASK_CALLBACK)
    virtual void callbackIn(void  * =NULL) { }
    virtual void callbackOut(void * =NULL) { }
#endif

public:

    virtual void* svc(void * task) = 0;
    virtual int   svc_init() { return 0; };
    virtual void  svc_end()  {}

    virtual void set_barrier(BARRIER_T * const b) { barrier=b;}
    virtual BARRIER_T* get_barrier() const { return barrier; }

    virtual void no_mapping() { default_mapping=false; }
    bool get_mapping() const { return default_mapping; }

    virtual int run(bool=false) { return spawn(); }

    virtual int spawn(int cpuId=-1) {
        if (spawned) return -1;

        if ((attr = (pthread_attr_t*)malloc(sizeof(pthread_attr_t))) == NULL) {
            error("spawn: pthread can not be created, malloc failed\n");
            return -1;
        }
        if (pthread_attr_init(attr)) {
                perror("pthread_attr_init: pthread can not be created.");
                return -1;
        }

        int CPUId = -1;
        if (default_mapping)
            init_thread_affinity(attr, cpuId);
        if (CPUId==-2) return -2;

        if (barrier)
            tid= internal_threadCounter.fetch_add(1);
        else
            tid= internal_threadCounter_noBarrier.fetch_add(1);
        int r=0;
        if ((r=pthread_create(&th_handle, attr,
                              proxy_thread_routine, this)) != 0) {
            errno=r;
            perror("pthread_create: pthread creation failed.");
            barrier?--internal_threadCounter:--internal_threadCounter_noBarrier;
            return -2;
        }
        spawned = true;
        return CPUId;
    }

    virtual int wait() {
        int r=0;
        stp=true;
        if (isfrozen()) {
            wait_freezing();
            thaw();
        }
        if (spawned) {
            pthread_join(th_handle, NULL);
            barrier ? --internal_threadCounter: --internal_threadCounter_noBarrier;
        }
        if (attr) {
            if (pthread_attr_destroy(attr)) {
                error("ERROR: ff_thread.wait: pthread_attr_destroy fails!");
                r=-1;
            }
            free(attr);
            attr = NULL;
        }
        spawned=false;
        return r;
    }

    virtual int wait_freezing() {
        pthread_mutex_lock(&mutex);
        while(!frozen) pthread_cond_wait(&cond_frozen,&mutex);
        pthread_mutex_unlock(&mutex);
        return (init_error?-1:0);
    }

    virtual void stop() { stp = true; };

    virtual void freeze() {
        stp=false;
        freezing = 1;
    }

    virtual void thaw(bool _freeze=false, ssize_t=-1) {
        pthread_mutex_lock(&mutex);
        // if this function is called even if the thread is not
        // in frozen state, then freezing has to be set to 1 and not 2
        //if (_freeze) freezing= (frozen?2:1); // next time freeze again the thread
        // October 2014, changed the above policy.
        // If thaw is called and the thread is not in the frozen stage,
        // then the thread won't fall to sleep at the next freezing point

        if (_freeze) freezing = 2; // next time freeze again the thread
        else freezing=0;
        //assert(thawed==false);
        frozen=false;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&mutex);

        //pthread_mutex_lock(&mutex);
        //while(!thawed) pthread_cond_wait(&cond, &mutex);
        //thawed=false;
        //pthread_mutex_unlock(&mutex);
    }
    virtual bool isfrozen() const { return freezing>0;}
    virtual bool done()     const { return isdone || (frozen && !stp);}

    pthread_t get_handle() const { return th_handle;}

    inline size_t getTid() const { return tid; }
    inline size_t getOSThreadId() const { return threadid; }

protected:
    size_t          tid;                /// unique logical id of the thread
    size_t          threadid;           /// OS specific thread ID
    bool            default_mapping;
private:
    BARRIER_T    *  barrier;            /// A \p Barrier object
    bool            stp;
    bool            spawned;
    int             freezing;
    bool            frozen,isdone;
    bool            init_error;
    pthread_t       th_handle;
    pthread_attr_t *attr;
    pthread_mutex_t mutex;
    pthread_cond_t  cond;
    pthread_cond_t  cond_frozen;
    int             old_cancelstate;
};

static void * proxy_thread_routine(void * arg) {
    ff_thread & obj = *(ff_thread *)arg;
    obj.thread_routine();
    pthread_exit(NULL);
    return NULL;
}

// forward declaration
class ff_loadbalancer;
class ff_gatherer;

/*!
 *  \class ff_node
 *  \ingroup building_blocks
 *
 *  \brief The FastFlow abstract contanier for a parallel activity (actor).
 *
 * Implements \p ff_node, i.e. the general container for a parallel
 * activity. From the orchestration viewpoint, the process model to
 * be employed is a CSP/Actor hybrid model where activities (\p
 * ff_nodes) are named and the data paths between processes are
 * clearly identified. \p ff_nodes synchronise each another via
 * abstract units of SPSC communications and synchronisation (namely
 * 1:1 channels), which models data dependency between two
 * \p ff_nodes.  It is used to encapsulate
 * sequential portions of code implementing functions.
 *
 * \p In a multicore, a ff_node is implemented as non-blocking thread.
 * It is not and should
 * not be confused with a task. Typically a \p ff_node uses the 100% of one CPU
 * context (i.e. one core, either physical or HT, if any). Overall, the number of
 * ff_nodes running should not exceed the number of logical cores of the platform.
 *
 * \p A ff_node behaves as a loop that gets an input (i.e. the parameter of \p svc
 * method) and produces one or more outputs (i.e. return parameter of \p svc method
 * or parameter of the \p ff_send_out method that can be called in the \p svc method).
 * The loop complete on the output of the special value "end-of_stream" (EOS).
 * The EOS is propagated across channels to the next \p ff_node.
 *
 * Key methods are: \p svc_init, \p svc_end (optional), and \p svc (pure virtual,
 * mandatory). The \p svc_init method is called once at node initialization,
 * while the \p svn_end method is called after a EOS task has been returned.
 *
 *  This class is defined in \ref node.hpp
 */

class ff_node {
private:

    friend class ff_farm;
    friend class ff_pipeline;
    friend class ff_map;
    template <typename IN,typename OUT>
    friend class ff_nodeSelector;
    friend class ff_loadbalancer;
    friend class ff_gatherer;
    friend class ff_minode;
    friend class ff_monode;
    friend class ff_a2a;
    friend class ff_comb;
    friend struct internal_mo_transformer;
    friend struct internal_mi_transformer;

#ifdef DFF_ENABLED
    friend class dGroups;
    friend class dGroup;
#endif

private:
    FFBUFFER        * in;           ///< Input buffer, built upon SWSR lock-free (wait-free)
                                    ///< (un)bounded FIFO queue
    FFBUFFER        * out;          ///< Output buffer, built upon SWSR lock-free (wait-free)
                                    ///< (un)bounded FIFO queue
    ssize_t           myid;         ///< This is the node id, it is valid only for farm's workers
    ssize_t           CPUId;
    ssize_t           neos=1;       ///< n. of EOS the node expects to receive before terminating
    bool              myoutbuffer;
    bool              myinbuffer;
    bool              skip1pop;
#ifdef DFF_ENABLED
    bool _skipallpop;
#endif

    bool              in_active;    // allows to disable/enable input tasks receiving
    bool              my_own_thread;

    ff_thread       * thread;       /// A \p thWorker object, which extends the \p ff_thread class
    bool (*callback)(void *, int, unsigned long,unsigned long, void *);
    void            * callback_arg;
    BARRIER_T       * barrier;      /// A \p Barrier object
    struct timeval tstart;
    struct timeval tstop;
    struct timeval wtstart;
    struct timeval wtstop;
    double wttime;

protected:

    virtual void set_id(ssize_t id) {
        myid = id;
    }
    // sets how many EOSs the node has to receive before terminating,
    // it also sets when eosnotify has to be called, by default at each input EOS
    virtual void set_neos(ssize_t n) { neos = n; }

    virtual inline bool push(void * ptr) { return out->push(ptr); }
    virtual inline bool pop(void ** ptr) {
        if (!in_active) return false; // it does not want to receive data
        return in->pop(ptr);
    }
    virtual inline bool Push(void *ptr, unsigned long retry=((unsigned long)-1), unsigned long ticks=(TICKS2WAIT)) {
        if (blocking_out) {
        retry:
            bool empty=out->empty();
            bool r = push(ptr);
            if (r) { // OK
                if (empty) pthread_cond_signal(p_cons_c);
            } else { // FULL
                struct timespec tv;
                timedwait_timeout(tv);
                pthread_mutex_lock(prod_m);
                pthread_cond_timedwait(prod_c,prod_m,&tv);
                pthread_mutex_unlock(prod_m);
                goto retry;
            }
            return true;
        }
        for(unsigned long i=0;i<retry;++i) {
            if (push(ptr)) return true;
            losetime_out(ticks);
        }
        return false;
    }

    virtual inline bool Pop(void **ptr, unsigned long retry=((unsigned long)-1), unsigned long ticks=(TICKS2WAIT)) {
        if (blocking_in) {
            if (!in_active) { *ptr=NULL; return false; }
        retry:
            bool r = in->pop(ptr);
            if (!r) { // EMPTY
                struct timespec tv;
                timedwait_timeout(tv);
                pthread_mutex_lock(cons_m);
                pthread_cond_timedwait(cons_c, cons_m,&tv);
                pthread_mutex_unlock(cons_m);
                goto retry;
            }
            return true;
        }
        for(unsigned long i=0;i<retry;++i) {
            if (!in_active) { *ptr=NULL; return false; }
            if (pop(ptr)) return true;
            losetime_in(ticks);
        }
        return true;
    }


    // consumer
    virtual inline bool init_input_blocking(pthread_mutex_t   *&m,
                                            pthread_cond_t    *&c,
                                            bool /*feedback*/=true) {
        if (cons_m == nullptr) {
            assert(cons_c==nullptr);
            cons_m = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t));
            cons_c = (pthread_cond_t*)malloc(sizeof(pthread_cond_t));
            assert(cons_m); assert(cons_c);
            if (pthread_mutex_init(cons_m, NULL) != 0) return false;
            if (pthread_cond_init(cons_c, NULL) != 0)  return false;
        }
        m = cons_m,  c = cons_c;
        return true;
    }
    // producer
    virtual inline bool init_output_blocking(pthread_mutex_t   *&m,
                                             pthread_cond_t    *&c,
                                             bool /*feedback*/=true) {
        if (prod_m == nullptr) {
            assert(prod_c==nullptr);
            prod_m = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t));
            prod_c = (pthread_cond_t*)malloc(sizeof(pthread_cond_t));
            assert(prod_m); assert(prod_c);
            if (pthread_mutex_init(prod_m, NULL) != 0) return false;
            if (pthread_cond_init(prod_c, NULL) != 0)  return false;
        }
        m = prod_m, c = prod_c;
        return true;
    }
    virtual inline void set_output_blocking(pthread_mutex_t   *&m,
                                            pthread_cond_t    *&c,
                                            bool canoverwrite=false) {
        assert(canoverwrite ||
               (p_cons_c == nullptr) ||
               (p_cons_c == c));
        FF_IGNORE_UNUSED(canoverwrite);
        FF_IGNORE_UNUSED(m);
        p_cons_c = c;
    }

    // this function is used mainly for combined node where the cond variable must
    // be shared with the first internal node
    virtual inline void  set_cons_c(pthread_cond_t *c) {
        assert(cons_c == nullptr);
        assert(cons_m == nullptr);
        cons_c = c;
    }
    virtual inline pthread_cond_t    &get_cons_c()       { return *cons_c;}

    /**
     * \brief Set the ff_node to start with no input task
     *
     * Setting it to true let the \p ff_node execute the \p svc method spontaneusly
     * before receiving a task on the input channel. \p skipfirstpop makes it possible
     * to define a "producer" node that starts the network.
     *
     * \param sk \p true start spontaneously (*task will be NULL)
     *
     */
    virtual inline void skipfirstpop(bool sk)   { skip1pop=sk;}

#ifdef DFF_ENABLED
    virtual inline void skipallpop(bool sk) {_skipallpop = sk;}
#endif

    /**
     * \brief Gets the status of spontaneous start
     *
     * If \p true the \p ff_node execute the \p svc method spontaneusly
     * before receiving a task on the input channel. \p skipfirstpop makes it possible
     * to define a "producer" node that produce the stream.
     *
     * \return \p true if skip-the-first-element mode is set, \p false otherwise
     *
     * Example: \ref l1_ff_nodes_graph.cpp
     */
    bool skipfirstpop() const { return skip1pop; }

#ifdef DFF_ENABLED
    bool skipallpop() {return _skipallpop;}
#endif


    /**
     * \brief Creates the input channel
     *
     *  \param nentries: the number of elements of the buffer
     *  \param fixedsize flag to decide whether the buffer is bound or unbound.
     *  Default is \p true.
     *
     *  \return 0 if successful, -1 otherwise
     */
    virtual int create_input_buffer(int nentries, bool fixedsize=FF_FIXED_SIZE) {
        if (in) return -1;
        if (nentries<=0) return -1;
        in = new FFBUFFER(nentries,fixedsize);
        if (!in) return -1;
        myinbuffer=true;
        if (!in->init()) return -1;
        return 0;
    }

    virtual int create_input_buffer_mp(int nentries, bool fixedsize=FF_FIXED_SIZE, int neos=1) {
        if (create_input_buffer(nentries,fixedsize)<0) return -1;
        // setting multi-producer push
        in->pushPMF = &FFBUFFER::mp_push;
        set_neos(neos);
        return 0;
    }

    /**
     *  \brief Creates the output channel
     *
     *  \param nentries: the number of elements of the buffer
     *  \param fixedsize flag to decide whether the buffer is bound or unbound.
     *  Default is \p true.
     *
     *  \return 0 if successful, -1 otherwise
     */
    virtual int create_output_buffer(int nentries, bool fixedsize=FF_FIXED_SIZE) {
        if (out) return -1;
        if (nentries<=0) return -1;
        out = new FFBUFFER(nentries,fixedsize);
        if (!out) return -1;
        myoutbuffer=true;
        if (!out->init()) return -1;
        return 0;
    }

    /**
     *  \brief Assign the output channelname to a channel
     *
     * Attach the output of a \p ff_node to an existing channel, typically the input
     * channel of another \p ff_node
     *
     *  \param o reference to a channel of type \p FFBUFFER
     *
     *  \return 0 if successful, -1 otherwise
     */
    virtual int set_output_buffer(FFBUFFER * const o) {
        if (myoutbuffer) return -1;
        out = o;
        return 0;
    }


    /**
     *  \brief Assign the input channelname to a channel
     *
     * Attach the input of a \p ff_node to an existing channel, typically the output
     * channel of another \p ff_node
     *
     *  \param i a buffer object of type \p FFBUFFER
     *
     *  \return 0 if successful, -1 otherwise
     */
    virtual int set_input_buffer(FFBUFFER * const i) {
        if (myinbuffer) return -1;
        in = i;
        return 0;
    }

    virtual inline int set_input(const svector<ff_node *> &) { return -1;}
    virtual inline int set_input(ff_node *n) {
        return set_input_buffer(n->get_in_buffer());
    }
    virtual inline int set_input_feedback(ff_node *) { return -1;}
    virtual inline int set_output(const svector<ff_node *> &) { return -1;}
    virtual inline int set_output(ff_node *n) {
        return set_output_buffer(n->get_in_buffer());
    }
    virtual inline int set_output_feedback(ff_node *) { return -1;}
    virtual inline void set_input_channelid(ssize_t, bool=true) {}

    virtual int prepare() { prepared=true; return 0; }
    virtual int dryrun() { if (!prepared) return prepare(); return 0; }

    virtual void set_scheduling_ondemand(const int /*inbufferentries*/=1) {}
    virtual int ondemand_buffer() const { return 0;}


    /**
     * \brief Run the ff_node
     *
     * \return 0 success, -1 otherwise
     */
    virtual int run(bool=false) {
        if (thread) delete reinterpret_cast<thWorker*>(thread);
        thread = new thWorker(this,neos);
        if (!thread) return -1;
        return thread->run();
    }

    #ifdef DFF_ENABLED
    virtual int run(ff_node*, bool=false) {return 0;}
    #endif

    /**
     * \brief Suspend (freeze) the ff_node and run it
     *
     * Only initialisation will be performed
     *
     * \return 0 success, -1 otherwise
     */
    virtual int freeze_and_run(bool=false) {
        if (thread) delete reinterpret_cast<thWorker*>(thread);
        thread = new thWorker(this,neos);
        if (!thread) return 0;
        freeze();
        return thread->run();
    }

    /**
     * \brief Wait ff_node termination
     *
     * \return 0 success, -1 otherwise
     */
    virtual int  wait() {
        if (!thread) return 0;
        return thread->wait();
    }

    /**
     * \brief Wait the freezing state
     *
     * It will happen on EOS arrival on the input channel
     *
     * \return 0 success, -1 otherwise
     */
    virtual int  wait_freezing() {
        if (!thread) return 0;
        return thread->wait_freezing();
    }

    virtual void stop() {
        if (!thread) return;
        thread->stop();
    }

    /**
     * \brief Freeze (suspend) a ff_node
     */
    virtual void freeze() {
        if (!thread) return;
        thread->freeze();
    }

    /**
     * \brief Thaw (resume) a ff_node
     */
    virtual void thaw(bool _freeze=false, ssize_t=-1) {
        if (!thread) return;
        thread->thaw(_freeze);
    }

    /**
     * \brief Checks if a ff_node is frozen
     * \return \p true is it frozen
     */
    virtual bool isfrozen() const {
        if (!thread) return false;
        return thread->isfrozen();
    }

    /**
     * \brief checks if the node is running
     *
     */
    virtual bool done() const  {
        if (!thread) return true;
        return thread->done();
    }


    virtual bool isoutbuffermine() const { return myoutbuffer;}

    virtual int  cardinality(BARRIER_T * const b) {
        barrier = b;
        return 1;
    }
    virtual int  cardinality() const { return 1; }

    virtual inline void setlb(ff_loadbalancer*,bool=false) {}
    virtual inline void setgt(ff_gatherer*,bool=false) {}


    /**
     * \brief Misure \ref ff::ff_node execution time
     *
     * \return time (ms)
     */
    virtual double ffTime() {
        return diffmsec(tstop,tstart);
    }

    /**
     * \brief Misure \ref ff_node::svc execution time
     *
     * \return time (ms)
     */
    virtual double wffTime() {
        return diffmsec(wtstop,wtstart);
    }

public:
    /*
     * \brief Default retry delay in nonblocking get/put on channels
     */
    enum {TICKS2WAIT=1000};

    void *const GO_ON        = FF_GO_ON;
    void *const GO_OUT       = FF_GO_OUT;
    void *const EOS_NOFREEZE = FF_EOS_NOFREEZE;
    void *const EOS          = FF_EOS;
    void *const EOSW         = FF_EOSW;


    ff_node(const ff_node&):ff_node() {}

    /**
     *  \brief Destructor, polymorphic deletion through base pointer is allowed.
     *
     *
     */
    virtual  ~ff_node() {
        if (in && myinbuffer) delete in;
        if (out && myoutbuffer) delete out;
        if (thread && my_own_thread) delete reinterpret_cast<thWorker*>(thread);
        if (cons_c && cons_m) {
            pthread_cond_destroy(cons_c);
            free(cons_c);
            cons_c = nullptr;
        }
        if (cons_m) {
            pthread_mutex_destroy(cons_m);
            free(cons_m);
            cons_m = nullptr;
        }
        if (prod_m) {
            pthread_mutex_destroy(prod_m);
            free(prod_m);
            prod_m = nullptr;
        }
        if (prod_c) {
            pthread_cond_destroy(prod_c);
            free(prod_c);
            prod_c = nullptr;
        }
    };

    /**
     * \brief The service callback (should be filled by user with parallel activity business code)
     *
     * \param task is a the input data stream item pointer (task)
     * \return output data stream item pointer
     */
    virtual void* svc(void * task) = 0;

    /**
     * \brief Service initialisation
     *
     * Called after run-time initialisation (e.g. thread spawning) but before
     * to start to get items from input stream (can be useful for initialisation
     * of parallel activities, e.g. manual thread pinning that cannot be done in
     * the costructor because threads stil do not exist).
     *
     * \return 0
     */
    virtual int svc_init() { return 0; }

    /**
     *
     * \brief Service finalisation
     *
     * Called after EOS arrived (logical termination) but before shutdding down
     * runtime support (can be useful for housekeeping)
     */
    virtual void  svc_end() {}


    /**
     * \brief Node initialisation
     *
     * This is a different initialization method with respect to svc_init (the default method).
     * This can be used to explicitly initialize the object when the node is not running as a thread.
     *
     * \return 0
     */
    virtual int   nodeInit() { return 0; }

    /**
     * \brief Node finalisation.
     *
     * This is a different finalisation method with respect to svc_end (the default method).
     * This can be used to explicitly finalise the object when the node is not running as a thread.
     */
    virtual void  nodeEnd()  { }

    /**
     * \brief EOS callback
     *
     * This method is called when an EOS has just been received from one input channel.
     * Inside this method it is possible to call ff_send_out to produce data elements in output
     * (this is not possible in the svc_end method).
     * The parameter \param id is the ID of the channel that received the EOS.
     */
    virtual void eosnotify(ssize_t /*id*/=-1) {}

    /**
     * \brief Returns the number of EOS the node has to receive before terminating.
     */
    virtual ssize_t get_neos() const { return neos;}

    /**
     *  \brief Returns the identifier of the node (not unique)
     */
    virtual ssize_t get_my_id() const { return myid; };

    /**
     * \brief Returns the OS specific thread id of the node.
     *
     * The returned id is valid (>0) only if the node is an active node (i.e. the thread has been created).
     *
     */
    inline size_t getOSThreadId() const { if (thread) return thread->getOSThreadId(); return 0; }

    virtual bool change_node(ff_node* old, ff_node* n, bool cleanup=false, bool remove_from_cleanuplist=false) { return false;}

    /**
     * Change the size of the outputchannel.
     * WARNING: this method should not be used if the queue is being used!!!!
     *
     */
    virtual bool change_outputqueuesize(size_t newsz, size_t &oldsz) {
        if (!out) { oldsz=0; return false; }
        oldsz = out->changesize(newsz);
        return true;
    }
    /**
     * Change the size of the inputchannel.
     * WARNING: this method should not be used if the queue is being used!!!!
     *
     */
    virtual bool change_inputqueuesize(size_t newsz, size_t &oldsz) {
        if (!in) { oldsz=0; return false; }
        oldsz = in->changesize(newsz);
        return true;
    }


#if defined(FF_TASK_CALLBACK)
    virtual void callbackIn(void * =NULL)  { }
    virtual void callbackOut(void * =NULL) { }
#endif

    virtual inline void get_out_nodes(svector<ff_node*>&w) { w.push_back(this); }
    virtual inline void get_out_nodes_feedback(svector<ff_node*>&) {}
    virtual inline void get_in_nodes(svector<ff_node*>&w) { w.push_back(this); }
    virtual inline void get_in_nodes_feedback(svector<ff_node*>&) {}


    /**
     * \brief Force ff_node-to-core pinning
     *
     * \param cpuID is the ID of the CPU to which the thread will be pinned.
     */
    virtual void  setAffinity(int cpuID) {
        if (cpuID<0 || !threadMapper::instance()->checkCPUId(cpuID) ) {
            error("setAffinity, invalid cpuID\n");
        }
        CPUId=cpuID;
    }

    virtual void set_barrier(BARRIER_T * const b) {
        barrier = b;
    }
    virtual BARRIER_T* get_barrier() const { return barrier; }

    /**
     * \internal
     * \brief Gets the CPU id (if set) of this node is pinned
     *
     * It gets the ID of the CPU where the ff_node is running.
     *
     * \return The identifier of the CPU.
     */
    virtual int getCPUId() const { return CPUId; }

    /**
     * \brief Nonblocking put into the input channel
     *
     * Wait-free and fence-free (under TSO)
     * This is called by a different node (e.g., lb) to push data
     * into the node's input queue.
     *
     * \param ptr is a pointer to the task
     *
     */
    virtual inline bool  put(void * ptr) {
        //return in->push(ptr);
        return (in->*in->pushPMF)(ptr);
    }

    /**
     * \brief Noblocking pop from the output channel
     *
     * Wait-free and fence-free (under TSO)
     *
     * \param ptr is a pointer to the task
     *
     */
    virtual inline bool  get(void **ptr) { return out->pop(ptr);}

    virtual inline void losetime_out(unsigned long ticks=ff_node::TICKS2WAIT) {
        FFTRACE(lostpushticks+=ticks; ++pushwait);
#if defined(SPIN_USE_PAUSE)
        const long n = (long)ticks/2000;
        for(int i=0;i<=n;++i) PAUSE();
#else
        ticks_wait(ticks);
#endif /* SPIN_USE_PAUSE */
    }

    virtual inline void losetime_in(unsigned long ticks=ff_node::TICKS2WAIT) {
        FFTRACE(lostpopticks+=ticks; ++popwait);
#if defined(SPIN_USE_PAUSE)
        const long n = (long)ticks/2000;
        for(int i=0;i<=n;++i) PAUSE();
#else
        ticks_wait(ticks);
#endif /* SPIN_USE_PAUSE */
    }

    /**
     * \brief Gets input channel
     *
     * It returns a pointer to the input buffer.
     *
     * \return A pointer to the input buffer
     */
    virtual FFBUFFER * get_in_buffer() const { return in;}

    /**
     * \brief Gets pointer to the output channel
     *
     * It returns a pointer to the output buffer.
     *
     * \return A pointer to the output buffer.
     */
    virtual FFBUFFER * get_out_buffer() const { return out;}

    virtual const struct timeval getstarttime() const { return tstart;}

    virtual const struct timeval getstoptime()  const { return tstop;}

    virtual const struct timeval getwstartime() const { return wtstart;}

    virtual const struct timeval getwstoptime() const { return wtstop;}

#if defined(TRACE_FASTFLOW)
    virtual void ffStats(std::ostream & out) {
        out << "ID: " << get_my_id()
            << "  work-time (ms): " << wttime    << "\n"
            << "  n. tasks      : " << taskcnt   << "\n"
            << "  svc ticks     : " << tickstot  << " (min= " << ticksmin << " max= " << ticksmax << ")\n"
            << "  n. push lost  : " << pushwait  << " (ticks=" << lostpushticks << ")" << "\n"
            << "  n. pop lost   : " << popwait   << " (ticks=" << lostpopticks  << ")" << "\n";
    }

    virtual double getworktime() const { return wttime; }
    virtual size_t getnumtask()  const { return taskcnt; }
    virtual ticks  getsvcticks() const { return tickstot; }
    virtual size_t getpushlost() const { return pushwait;}
    virtual size_t getpoplost()  const { return popwait; }
#endif

    /**
     * \brief Sends out the task
     *
     * It allows to emit tasks on output stream without returning from the \p svc method.
     * Make the ff_node to emit zero or more tasks per input task
     *
     * \param task a pointer to the task
     * \param retry number of tries to put (nonbloking partial) the task to output channel
     * \param ticks delay between successive retries
     *
     */
    virtual bool ff_send_out(void * task, int id=-1,
                             unsigned long retry=((unsigned long)-1),
                             unsigned long ticks=(TICKS2WAIT)) {
        if (callback) return  callback(task,id,retry,ticks,callback_arg);
        bool r =Push(task,retry,ticks);
#if defined(FF_TASK_CALLBACK)
        if (r) callbackOut();
#endif
        return r;
    }

    // Warning resetting queues while the node is running may produce unexpected results.
    virtual void reset() {
        if (in)  in->reset();
        if (out) out->reset();
    }

    /**
     *  checking for multi-input/output, all-to-all, farm, pipe
     *
     */
    virtual inline bool isMultiInput() const {  return false; }
    virtual inline bool isMultiOutput() const { return false; }
    virtual inline bool isAll2All() const     { return false; }
    virtual inline bool isFarm() const        { return false; }
    virtual inline bool isOFarm() const       { return false; }
    virtual inline bool isComp() const        { return false; }
    virtual inline bool isPipe() const        { return false; }

    virtual inline void set_multiinput()  {}

#if defined(FF_REPARA)
    struct rpr_measure_t {
        size_t schedule_id;
        size_t time_before, time_after;
        size_t problemSize;  // computed if the rpr::task_size attribute is defined otherwise is 0
        size_t bytesIn, bytesOut;
        size_t vmSize, vmPeak;
        double energy;
    };

    using RPR_devices_measure = std::vector<std::pair<int, std::vector<rpr_measure_t> > >;
    using RPR_measures_vector = std::vector<std::vector<RPR_devices_measure> >;

    /**
     *  Returns input data size
     */
    virtual size_t rpr_get_sizeIn()  const { return rpr_sizeIn; }

    /**
     *  Returns output data size
     */
    virtual size_t rpr_get_sizeOut() const { return rpr_sizeOut; }

    /**
     *  gets/sets energy flag
     */
    virtual bool rpr_get_measure_energy() const { return measureEnergy; }
    virtual void rpr_set_measure_energy(bool v) { measureEnergy = v; }

    /**
     *  Returns all measures collected by the node.
     *  The structure is:
     *    - the outermost vector is greater than 1 if the node is a pipeline or a farm
     *    - each stage of a pipeline or a worker of a farm can be a pipeline or a farm as well
     *      therefore the second level vector is grater than 1 only if the stage is a pipeline or a farm
     *    - each entry of a stage is a vector containing info for each device associated to the stage.
     *      The device is identified by the first entry of the std::pair, the second element of the pair
     *      is a vector containing the measurments for the period considered.
     */
    virtual RPR_measures_vector rpr_get_measures() { return RPR_measures_vector(); }


protected:
    bool   measureEnergy = false;
    size_t rpr_sizeIn      = {0};
    size_t rpr_sizeOut     = {0};
#endif  /* FF_REPARA */

    /**
     *  used for composition (see ff_comb)
     */
    static inline bool ff_send_out_comp(void * task, int, unsigned long /*retry*/,unsigned long /*ticks*/, void *obj) {
        return ((ff_node *)obj)->push_comp_local(task);
    }


    virtual bool push_comp_local(void *task) {
        (void)task;
        abort();  // to be removed, just for debugging purposes
    }


    virtual inline ssize_t get_channel_id() const           { return -1; }
    /** returns the total number of output channels */
    virtual inline size_t  get_num_outchannels() const      { return 0; }
    /** returns the total number of input channels */
    virtual inline size_t  get_num_inchannels() const       { return 0; } //(in?1:0); }
    virtual inline size_t  get_num_feedbackchannels() const { return 0; } //(out?1:0);}

    virtual void propagateEOS(void* task=FF_EOS) { (void)task; }

#ifdef DFF_ENABLED
    std::function<bool(void*, dataBuffer&)> serializeF;
    std::function<void(void*)> freetaskF;
    std::function<void*(dataBuffer&, bool&)> deserializeF;
    std::function<void*(char*, size_t)> alloctaskF;


    virtual bool isSerializable(){ return (bool)serializeF; }
    virtual bool isDeserializable(){ return (bool)deserializeF; }
    virtual std::pair<decltype(serializeF), decltype(freetaskF)> getSerializationFunction(){return std::make_pair(serializeF,freetaskF);}
    virtual std::pair<decltype(deserializeF), decltype(alloctaskF)> getDeserializationFunction(){ return std::make_pair(deserializeF,alloctaskF);}

#endif
    // always defined, the body will implement a no-op if the distributed runtime is disabled
    GroupInterface createGroup(std::string);

protected:

    ff_node():in(0),out(0),myid(-1),CPUId(-1),
              myoutbuffer(false),myinbuffer(false),
              skip1pop(false), in_active(true),
              my_own_thread(true),
              thread(NULL),callback(NULL),barrier(NULL) {
        time_setzero(tstart);time_setzero(tstop);
        time_setzero(wtstart);time_setzero(wtstop);
        wttime=0;
        FFTRACE(taskcnt=0;lostpushticks=0;pushwait=0;lostpopticks=0;popwait=0;ticksmin=(ticks)-1;ticksmax=0;tickstot=0);

        p_cons_c = NULL;

        blocking_in = blocking_out = FF_RUNTIME_MODE;
    };


    // move constructor
    ff_node(ff_node &&n) {
        tstart = n.tstart;
        tstop  = n.tstop;
        wtstart = n.wtstart;
        wtstop = n.wtstop;
        wttime = n.wttime;
        p_cons_c = n.p_cons_c;
        blocking_in = n.blocking_in;
        blocking_out = n.blocking_out;
        default_mapping = n.default_mapping;
        in_active = n.in_active;
        cons_m = n.cons_m;  cons_c = n.cons_c;
        prod_m = n.prod_m;  prod_c = n.prod_c;
        barrier = n.barrier;

        // TODO trace <------

        in = n.in;
        myinbuffer = n.myinbuffer;
        out = n.out;
        myoutbuffer = n.myoutbuffer;
        thread = n.thread;
        my_own_thread = n.my_own_thread;

        n.in = nullptr;
        n.myinbuffer = false;
        n.out = nullptr;
        n.myoutbuffer = false;
        n.thread = nullptr;
        n.my_own_thread = false;
        n.barrier = nullptr;
        n.cons_m = nullptr; n.cons_c = nullptr;
        n.prod_m = nullptr; n.prod_c = nullptr;
    }

    virtual inline void input_active(const bool onoff) {
        if (in_active != onoff)
            in_active= onoff;
    }

    virtual void registerCallback(bool (*cb)(void *,int,unsigned long,unsigned long,void *), void * arg) {
        callback=cb;
        callback_arg=arg;
    }
    virtual void registerAllGatherCallback(int (* /*cb*/)(void *,void **, void*), void * /*arg*/) {}

    /* WARNING: these method must be called before the run() method */
    virtual void blocking_mode(bool blk=true) {
        blocking_in = blocking_out = blk;
    }
    virtual void no_barrier() {
        initial_barrier=false;
    }
    virtual void no_mapping() {
        default_mapping=false;
    }

private:
    /* ------------------------------------------------------------------------------------- */
    class thWorker: public ff_thread {
    public:
        thWorker(ff_node * const filter, const ssize_t input_neos=1):
            ff_thread(filter->barrier, filter->default_mapping),filter(filter),input_neos(input_neos) {}

        inline bool push(void * task) {
            /* NOTE: filter->push and not buffer->push because of the filter can be a dnode
             *
             * It is not correct to call filter->Push because the filter could be a composition
             * so the ff_send_out allows to call the callback
             */
            //return filter->Push(task);
            return filter->ff_send_out(task);
        }

        inline bool pop(void ** task) {
            /*
             * NOTE: filter->pop and not buffer->pop because of the filter can be a dnode
             */
            return filter->Pop(task);
        }

        inline bool put(void * ptr) { return filter->put(ptr);}

        inline bool get(void **ptr) { return filter->get(ptr);}

        inline void* svc(void * ) {
            void * task = NULL;
            void * ret  = FF_EOS;
            bool inpresent  = (filter->get_in_buffer() != NULL);
            bool outpresent = (filter->get_out_buffer() != NULL);
            bool skipfirstpop = filter->skipfirstpop();
            bool exit=false;
            bool filter_outpresent = false;
            size_t neos=input_neos;


            // if the node is a combine where the last stage is a multi-output
            if ( filter && ( !outpresent && filter->isMultiOutput() ) ) {
                filter_outpresent=true;
            }
            gettimeofday(&filter->wtstart,NULL);
            do {
#ifdef DFF_ENABLED
                if (!filter->skipallpop() && inpresent){
#else
                if (inpresent) {
#endif
                    if (!skipfirstpop) pop(&task);
                    else skipfirstpop=false;
                    if ((task == FF_EOS) || (task == FF_EOSW) ||
                        (task == FF_EOS_NOFREEZE)) {
                        ret = task;

                        if (--neos > 0) continue;
                        filter->eosnotify();

                        // only EOS and EOSW are propagated
                        if ( (task == FF_EOS) || (task == FF_EOSW) )  {
                            if (outpresent)  push(task);
                            if (filter_outpresent) filter->propagateEOS();
                        }
                        break;
                    }
                    if (task == FF_GO_OUT) break;
                }
                FFTRACE(++filter->taskcnt);
                FFTRACE(ticks t0 = getticks());

#if defined(FF_TASK_CALLBACK)
                if (filter) callbackIn();
#endif

                ret = filter->svc(task);

#if defined(TRACE_FASTFLOW)
                ticks diff=(getticks()-t0);
                filter->tickstot +=diff;
                filter->ticksmin=(std::min)(filter->ticksmin,diff); // (std::min) for win portability)
                filter->ticksmax=(std::max)(filter->ticksmax,diff);
#endif

                if (ret == FF_GO_OUT) break;
                if (!ret || (ret >= FF_EOSW)) { // EOS or EOS_NOFREEZE or EOSW
                    // NOTE: The EOS is gonna be produced in the output queue
                    // and the thread exits even if there might be some tasks
                    // in the input queue !!!
                    if (!ret) ret = FF_EOS;
                    exit=true;
                }
                if ( outpresent && ((ret != FF_GO_ON) && (ret != FF_EOS_NOFREEZE)) ) {
                    push(ret);
#if defined(FF_TASK_CALLBACK)
                    if (filter) callbackOut();
#endif
                }
            } while(!exit);

            gettimeofday(&filter->wtstop,NULL);
            filter->wttime+=diffmsec(filter->wtstop,filter->wtstart);

            return ret;
        }

        int svc_init() {
#if !defined(HAVE_PTHREAD_SETAFFINITY_NP) && !defined(NO_DEFAULT_MAPPING)
            if (filter->default_mapping) {
                int cpuId = filter->getCPUId();
                if (ff_mapThreadToCpu((cpuId<0) ? (cpuId=threadMapper::instance()->getCoreId(tid)) : cpuId)!=0)
                    error("Cannot map thread %d to CPU %d, mask is %u,  size is %u,  going on...\n",tid, (cpuId<0) ? threadMapper::instance()->getCoreId(tid) : cpuId, threadMapper::instance()->getMask(), threadMapper::instance()->getCListSize());
                filter->setCPUId(cpuId);
            }
#endif
            gettimeofday(&filter->tstart,NULL);
            return filter->svc_init();
        }

        void svc_end() {
            filter->svc_end();
            gettimeofday(&filter->tstop,NULL);
        }

        int run(bool=false) {
            int CPUId = ff_thread::spawn(filter->getCPUId());
            filter->setCPUId(CPUId);
            return (CPUId==-2)?-1:0;
        }

        inline int  wait() { return ff_thread::wait();}
        inline int  wait_freezing() { return ff_thread::wait_freezing();}
        inline void freeze() { ff_thread::freeze();}
        inline bool isfrozen() const { return ff_thread::isfrozen();}
        inline bool done()     const { return ff_thread::done();}
        inline int  get_my_id() const { return filter->get_my_id(); };

    protected:
#if defined(FF_TASK_CALLBACK)
        void callbackIn(void  *t=NULL) { filter->callbackIn(t);  }
        void callbackOut(void *t=NULL) { filter->callbackOut(t); }
#endif
    protected:
        ff_node * const filter;
        const ssize_t input_neos;
    };
    /* ------------------------------------------------------------------------------------- */

    inline void   setCPUId(int id) { CPUId = id;}
    inline void   setThread(ff_thread *const th) { my_own_thread = false; thread = th; }
    inline size_t getTid() const {
        if (!thread) return (size_t)-1;
        return thread->getTid();
    }

protected:

#if defined(TRACE_FASTFLOW)
    size_t        taskcnt;
    ticks         lostpushticks;
    size_t        pushwait;
    ticks         lostpopticks;
    size_t        popwait;
    ticks         ticksmin;
    ticks         ticksmax;
    ticks         tickstot;
#endif

    // for the input queue
    pthread_mutex_t    *cons_m = nullptr;
    pthread_cond_t     *cons_c = nullptr;


    // for the output queue
    pthread_mutex_t    *prod_m = nullptr;
    pthread_cond_t     *prod_c = nullptr;

    // for synchronizing with the next multi-input stage
    pthread_cond_t     *p_cons_c = nullptr;

    bool               FF_MEM_ALIGN(blocking_in,32);
    bool               FF_MEM_ALIGN(blocking_out,32);

    bool                  prepared = false;
    bool                  initial_barrier = true;
    bool                  default_mapping = true;
};  // ff_node


/* *************************** Typed node ************************* */

//#ifndef WIN32 //VS12
/*!
 *  \class ff_node_base_t
 *  \ingroup building_blocks
 *
 *  \brief The FastFlow typed abstract contanier for a parallel activity (actor).
 *
 *  Key method is: \p svc (pure virtual).
 *
 *  This class is defined in \ref node.hpp
 */

template<typename IN_t, typename OUT_t = IN_t>
struct ff_node_t: ff_node {
    typedef IN_t  in_type;
    typedef OUT_t out_type;

    using ff_node::registerCallback;
    using ff_node::ff_send_out;

    ff_node_t():
        GO_ON((OUT_t*)FF_GO_ON),
        EOS((OUT_t*)FF_EOS),
        EOSW((OUT_t*)FF_EOSW),
        GO_OUT((OUT_t*)FF_GO_OUT),
        EOS_NOFREEZE((OUT_t*) FF_EOS_NOFREEZE) {
#ifdef DFF_ENABLED

        /* WARNING:
         *    the definition of functions alloctaskF, freetaskF, serializeF, deserializeF
         *    IS DUPLICATED for the ff_minode_t and ff_monode_t (see file multinode.hpp).
         *
         */
     if constexpr (traits::has_alloctask_v<IN_t>) {
         this->alloctaskF = [](char* ptr, size_t sz) -> void* {
                                IN_t* p = nullptr;
                                alloctaskWrapper<IN_t>(ptr, sz, p);
                                assert(p);
                                return p;
                           };
     } else {
         this->alloctaskF = [](char*, size_t ) -> void* {
                               IN_t* o = new IN_t;
                               assert(o);
                               return o;
                           };
     }

     if constexpr (traits::has_freetask_v<OUT_t>) {
        this->freetaskF = [](void* o) {
                              freetaskWrapper<OUT_t>(reinterpret_cast<OUT_t*>(o));
                          };

     } else {
         this->freetaskF = [](void* o) {
                               if constexpr (!std::is_void_v<OUT_t>) {
                                       OUT_t* obj = reinterpret_cast<OUT_t*>(o);
                                       delete obj;
                               }
                           };
     }

    // check on Serialization capabilities on the OUTPUT type!
    if constexpr (traits::is_serializable_v<OUT_t>){
        this->serializeF = [](void* o, dataBuffer& b) -> bool {
                               bool datacopied = true;
                               std::pair<char*, size_t> p = serializeWrapper<OUT_t>(reinterpret_cast<OUT_t*>(o,datacopied));
                               b.setBuffer(p.first, p.second);
                               return datacopied;
                           };
    } else if constexpr (cereal::traits::is_output_serializable<OUT_t, cereal::PortableBinaryOutputArchive>::value){
        this->serializeF = [](void* o, dataBuffer& b) -> bool {
                               std::ostream oss(&b);
                               cereal::PortableBinaryOutputArchive ar(oss);
                               ar << *reinterpret_cast<OUT_t*>(o);
                               return true;
                           };
    }

    // check on Serialization capabilities on the INPUT type!
    if constexpr (traits::is_deserializable_v<IN_t>) {
        this->deserializeF = [this](dataBuffer& b, bool& datacopied) -> void* {
                                 IN_t* ptr=(IN_t*)this->alloctaskF(b.getPtr(), b.getLen());
                                 datacopied = deserializeWrapper<IN_t>(b.getPtr(), b.getLen(), ptr);
                                 assert(ptr);
                                 return ptr;
                             };
    } else if constexpr(cereal::traits::is_input_serializable<IN_t, cereal::PortableBinaryInputArchive>::value){
            this->deserializeF = [this](dataBuffer& b, bool& datacopied) -> void* {
                                     std::istream iss(&b);
                                     cereal::PortableBinaryInputArchive ar(iss);
                                     IN_t* o = (IN_t*)this->alloctaskF(nullptr,0);
                                     assert(o);
                                     ar >> *o;
                                     datacopied = true;
                                     return o;
                                 };
        }
#endif

	}
    OUT_t * const GO_ON,  *const EOS, *const EOSW, *const GO_OUT, *const EOS_NOFREEZE;
    virtual ~ff_node_t()  {}
    virtual OUT_t* svc(IN_t*)=0;
    inline  void *svc(void *task) { return svc(reinterpret_cast<IN_t*>(task)); };
private:
    // deleting some functions that do not have to be used in the svc
    using ff_node::push;
    using ff_node::pop;
    using ff_node::Push;
    using ff_node::Pop;

};

#if (__cplusplus >= 201103L) || (defined __GXX_EXPERIMENTAL_CXX0X__) || (defined(HAS_CXX11_VARIADIC_TEMPLATES))

/*!
 *  \class ff_node_F
 *  \ingroup building_blocks
 *
 *  \brief The FastFlow typed abstract contanier for a parallel activity (actor).
 *
 *  Creates an ff_node_t from a lambdas, function pointer, etc
 *
 *  This class is defined in \ref node.hpp
 */
template<typename TIN, typename TOUT=TIN,
         typename FUNC=std::function<TOUT*(TIN*,ff_node*const)> >
struct ff_node_F: public ff_node_t<TIN,TOUT> {
   ff_node_F(FUNC f):F(f) {}
   TOUT* svc(TIN* task) { return F(task, this); }
   FUNC F;
};

#endif
//#endif


/* ------------------------ internal node implementations, should not be used -------- */

/* just a node interface for the input and output buffers
 * This is used in the internal implementation but can be used also
 * at the user level. In this second case
 */
struct ff_buffernode: ff_node {
    ff_buffernode() {}
    ff_buffernode(int nentries, bool fixedsize=FF_FIXED_SIZE, int id=-1, int multi_producer_eos=-1) {
        set(nentries,fixedsize,id, multi_producer_eos);
    }
    // NOTE: this constructor is supposed to be used only for implementing
    // internal FastFlow features!
    ff_buffernode(int id, FFBUFFER *in, FFBUFFER *out) {
        set_id(id);
        set_input_buffer(in);
        set_output_buffer(out);
    }
    void set(int nentries, bool fixedsize=FF_FIXED_SIZE, int id=-1, int multi_producer_eos=-1) {
        set_id(id);
        if (multi_producer_eos<0) {
            if (create_input_buffer(nentries,fixedsize) < 0) {
                error("FATAL ERROR: ff_buffernode::set: create_input_buffer fails!\n");
                abort();
            }
            set_output_buffer(ff_node::get_in_buffer());
        } else {
            if (create_input_buffer_mp(nentries,fixedsize, multi_producer_eos) < 0) {
                error("FATAL ERROR: ff_buffernode::set: create_input_buffer_mp fails!\n");
                abort();
            }
            set_output_buffer(ff_node::get_in_buffer());
        }
    }

    int init_blocking_stuff() {
        // blocking stuff
        pthread_mutex_t   *m        = NULL;
        pthread_cond_t    *c        = NULL;
        if (!ff_node::init_output_blocking(m,c)) {
            error("buffernode, FATAL ERROR, init input blocking mode for accelerator\n");
            return -1;
        }
        if (!ff_node::init_input_blocking(m,c)) {
            error("buffernode, FATAL ERROR, init input blocking mode for accelerator\n");
            return -1;
        }
        return 0;
    }

    void reset_blocking_out() { blocking_out = false; }

    bool ff_send_out(void *ptr, int id=-1,
                     unsigned long retry=((unsigned long)-1), unsigned long ticks=(ff_node::TICKS2WAIT)) {
        return ff_node::ff_send_out(ptr,id,retry,ticks);
    }
    bool gather_task(void **task, unsigned long retry=((unsigned long)-1), unsigned long ticks=(ff_node::TICKS2WAIT)) {
        bool r =ff_node::Pop(task,retry,ticks);
        return r;
    }

    template<typename T>
    bool gather_task(T *&task, unsigned long retry=((unsigned long)-1), unsigned long ticks=(ff_node::TICKS2WAIT)) {
        return gather_task((void **)&task, retry, ticks);
    }


protected:
    void* svc(void*){return NULL;}

    pthread_cond_t    &get_cons_c()       { return *p_cons_c;}


    // New Blocking protocol (both for bounded and unbounded buffer):
    // init_output_blocking initializes prod_*
    // set_output_blocking sets p_cons_*
    // init_input_blocking initializes cons_*

    // sender:
    //   empty=channel.empty();
    //   r=channel.send()
    //   if (!r) timewait(prod_c);
    //   if empty then signal(p_cons_c) // the channel was empty

    // receive:
    //  r=channel.receive()
    //  if (!r) timewait(cons_c);       // channel empty
};


} // namespace ff

#endif /* FF_NODE_HPP */