// mesytec-mnode/external/taskflow-3.8.0/3rd-party/ff/mpmc/MPMCqueues.hpp

/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */

/*!
 *  \file MPMCqueues.hpp
 *  \ingroup aux_classes
 *
 *  \brief This file contains several MPMC queue implementations. Not
 *  currently used.
 *
 * This file contains the following
 * Multi-Producer/Multi-Consumer queue implementations:
 * \li  MPMC_Ptr_Queue   bounded MPMC queue by Dmitry Vyukov
 * \li  uMPMC_Ptr_Queue  unbounded MPMC queue by Massimo Torquati
 * \li  MSqueue          unbounded MPMC queue by Michael & Scott
 */
/* ***************************************************************************
 *
 *  FastFlow is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License version 3 as
 *  published by the Free Software Foundation.
 *  Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
 *  or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
 *
 *  This program is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 *  License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software Foundation,
 *  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 ****************************************************************************
 */

#ifndef FF_MPMCQUEUE_HPP
#define FF_MPMCQUEUE_HPP

/*
 * This file contains Multi-Producer/Multi-Consumer queue implementations.
 *
 *   * MPMC_Ptr_Queue   bounded MPMC queue by Dmitry Vyukov
 *   * uMPMC_Ptr_Queue  unbounded MPMC queue by Massimo Torquati
 *   * MSqueue          unbounded MPMC queue by Michael & Scott
 *
 *  - Author:
 *     Massimo Torquati <torquati@di.unipi.it> <massimotor@gmail.com>
 *
 *  - History
 *    10 Jul 2012: M. Aldinucci: Minor fixes
 *     4 Oct 2015: M. Aldinucci: cleaning related to better c++11 compliance
 */


#include <cstdlib>
#include <vector>
#include <ff/buffer.hpp>
#include <ff/sysdep.h>
#include <ff/allocator.hpp>
#include <ff/platforms/platform.h>
#include <ff/mpmc/asm/abstraction_dcas.h>
#include <ff/spin-lock.hpp>


/*
 * NOTE: You should define NO_STD_C0X if you want to avoid c++0x and c++11
 *
 */

// Compile-time reminder: when NO_STD_C0X is not defined but the compiler does
// not report C++11 support (__cplusplus < 201103L), print a message telling
// the user to define NO_STD_C0X.
#if ( (!defined(NO_STD_C0X))  &&  !(__cplusplus >= 201103L))
#pragma message ("Define -DNO_STD_C0X to use a non c++0x/c++11 compiler")
#endif

//#define NO_STD_C0X


// // Check for g++ version >= 4.5
// #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
//  #include <atomic>
// #else
//  // Check for g++ version >= 4.4
//  #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
//   #include <cstdatomic>
//  #else
//   #define USE_STD_0X
//  #endif
// #endif
//#endif // USE_STD_C0X


// Short alias for abstraction_cas. In this file it is always called as
// CAS(address, new_value, expected_value) and its result is compared with
// expected_value to find out whether the swap took place.
#define CAS abstraction_cas

namespace ff {

/*
 *  In the following we implement two kinds of queues:
 *   - the MPMC_Ptr_Queue is an implementation of the ** bounded **
 *     Multi-Producer/Multi-Consumer queue algorithm by Dmitry Vyukov
 *     (www.1024cores.net). It stores pointers.
 *
 *   - the uMPMC_Ptr_Queue implements an ** unbounded **
 *     Multi-Producer/Multi-Consumer queue which does not require
 *     any special memory allocator to avoid dangling pointers.
 *     The implementation blends together the MPMC_Ptr_Queue and the
 *     uSWSR_Ptr_Buffer.
 *
 */

/*!
 * \class MPMC_Ptr_Queue
 *  \ingroup aux_classes
 *
 * \brief An implementation of the \a bounded Multi-Producer/Multi-Consumer queue. Not currently used.
 *
 * This class describes an implementation of the MPMC queue inspired by the solution
 * proposed by <a href="https://sites.google.com/site/1024cores/home/lock-free-algorithms/queues/bounded-mpmc-queue" target="_blank">Dmitry Vyukov</a>. \n
 *
 * \note There are two versions: 1) one based on C++11 std::atomic (the default) and 2) one based on the internal atomic operations (compile with -DNO_STD_C0X).
 *
 *
 */
#if !defined(NO_STD_C0X)
#include <atomic>

class MPMC_Ptr_Queue {
private:
    /// One slot of the circular buffer: the stored pointer plus the sequence
    /// number that tells producers and consumers whose turn it is on the slot.
    struct element_t {
        std::atomic<unsigned long> seq;
        void *                     data;
    };

public:
    /**
     * \brief Constructor
     *
     * Puts the object in a well-defined "no storage" state. init() must be
     * called before push()/pop(). Setting \p buf to NULL here is what makes
     * the destructor safe on a queue that was never initialised.
     */
    MPMC_Ptr_Queue():buf(NULL),mask(0) {}

    /**
     * \brief Destructor
     *
     * Releases the slot array allocated by init(), if any.
     */
    ~MPMC_Ptr_Queue() {
        if (buf) {
            delete [] buf;
            buf=NULL;
        }
    }

    /*    |  data  | seq |        |  data  | seq |        |  data  | seq |
     *    |  NULL  |  0  | ------ |  NULL  |  1  | ------ |  NULL  | ... |
     *    ||||||||||||||||        ||||||||||||||||        ||||||||||||||||
     *                |
     *                |
     *                |
     *          pwrite pread
     */

    /**
     * \brief init
     *
     * Allocates the slot array and resets both indexes.
     *
     * \param size requested capacity; values below 2 become 2 and values
     *        that are not a power of two are rounded with nextPowerOf2(),
     *        so that an index can be reduced with a bit mask.
     * \return true on success, false if the slot array could not be obtained.
     *
     * Calling init() again releases the previous slot array before
     * allocating the new one. It must not run concurrently with push()/pop().
     */
    inline bool init(size_t size) {
        if (size<2) size=2;
        // we need a size that is a power 2 in order to set the mask
        if (!isPowerOf2(size)) size = nextPowerOf2(size);

        // Re-initialisation: drop the old storage instead of leaking it.
        if (buf) {
            delete [] buf;
            buf = NULL;
        }
        mask = size-1;

        buf = new element_t[size];
        if (!buf) return false;
        for(size_t i=0;i<size;++i) {
            buf[i].data = NULL;
            // Slot i starts with seq == i, i.e. "writable by the producer
            // that obtains ticket i". Relaxed ordering is enough because no
            // other thread may use the queue while init() is running.
            buf[i].seq.store(i,std::memory_order_relaxed);
        }
        pwrite.store(0,std::memory_order_relaxed);
        pread.store(0,std::memory_order_relaxed);
        return true;
    }

    /**
     * \brief push: enqueue data
     *
     * This method is non-blocking and costs one CAS per operation.
     *
     * \param data pointer to store
     * \return true if the pointer was enqueued, false if the queue is full
     */
    inline bool push(void *const data) {
        unsigned long pw, seq;
        element_t * node;
        unsigned long bk = BACKOFF_MIN;
        do {
            pw    = pwrite.load(std::memory_order_relaxed);
            node  = &buf[pw & mask];
            // acquire: pairs with the release store done by the consumer
            // that freed this slot.
            seq   = node->seq.load(std::memory_order_acquire);

            if (pw == seq) { // slot is free for ticket pw: try to claim it
                if (pwrite.compare_exchange_weak(pw, pw+1, std::memory_order_relaxed))
                    break;

                // exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            } else
                if (pw > seq) return false; // queue full
            // otherwise another producer already took ticket pw: retry
        } while(1);
        node->data = data;
        // release: publishes data; seq == ticket+1 marks the slot readable.
        node->seq.store(seq+1,std::memory_order_release);
        return true;
    }

    /**
     * pop method: dequeue data from the queue.
     *
     * This is a non-blocking method.
     *
     * \param data receives the dequeued pointer
     * \return true if an element was extracted, false if the queue is empty
     */
    inline bool pop(void** data) {
        unsigned long pr, seq;
        element_t * node;
        unsigned long bk = BACKOFF_MIN;

        do {
            pr    = pread.load(std::memory_order_relaxed);
            node  = &buf[pr & mask];
            // acquire: pairs with the release store done by the producer
            // that filled this slot.
            seq   = node->seq.load(std::memory_order_acquire);

            // The slot is readable for ticket pr when seq == pr+1.
            long diff = seq - (pr+1);
            if (diff == 0) { // try to claim ticket pr
                if (pread.compare_exchange_weak(pr, (pr+1), std::memory_order_relaxed))
                    break;

                // exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            } else {
                if (diff < 0) return false; // queue empty
            }
        } while(1);
        *data = node->data;
        // pr+mask+1 == pr+size: the slot becomes writable for the producer
        // ticket of the next lap around the buffer.
        node->seq.store((pr+mask+1), std::memory_order_release);
        return true;
    }

private:
    // Each index lives in its own cache-line sized union so that producers
    // and consumers do not share a cache line.
    union {
        std::atomic<unsigned long>  pwrite; /// Pointer to the location where to write to
        char padding1[CACHE_LINE_SIZE];
    };
    union {
        std::atomic<unsigned long>  pread;  /// Pointer to the location where to read from
        char padding2[CACHE_LINE_SIZE];
    };
    element_t *                 buf;   ///< slot array, NULL until init() succeeds
    unsigned long               mask;  ///< size-1, size being a power of two
};


#else  // using internal atomic operations
#include <ff/mpmc/asm/atomic.h>

class MPMC_Ptr_Queue {
protected:

    /// One slot of the circular buffer: the stored pointer plus the sequence
    /// number that tells producers and consumers whose turn it is on the slot.
    struct element_t {
        atomic_long_t seq;
        void *        data;
    };

public:
    /**
     *  \brief Constructor
     *
     * Puts the object in a well-defined "no storage" state. init() must be
     * called before push()/pop(). Setting \p buf to NULL here is what makes
     * the destructor safe on a queue that was never initialised.
     */
    MPMC_Ptr_Queue():buf(NULL),mask(0) {}

    /**
     *
     * \brief Destructor
     *
     * Releases the slot array obtained by init(), if any.
     */
    ~MPMC_Ptr_Queue() {
        if (buf) {
            freeAlignedMemory(buf);
            buf = NULL;
        }
    }

    /*    |  data  | seq |        |  data  | seq |        |  data  | seq |
     *    |  NULL  |  0  | ------ |  NULL  |  1  | ------ |  NULL  | ... |
     *    ||||||||||||||||        ||||||||||||||||        ||||||||||||||||
     *                |
     *                |
     *                |
     *          pwrite pread
     */

    /**
     * \brief init
     *
     * Allocates the slot array and resets both indexes.
     *
     * \param size requested capacity; values below 2 become 2 and values
     *        that are not a power of two are rounded with nextPowerOf2(),
     *        so that an index can be reduced with a bit mask.
     * \return true on success, false if the memory could not be allocated.
     *
     * Calling init() again releases the previous slot array before
     * allocating the new one. It must not run concurrently with push()/pop().
     */
    inline bool init(size_t size) {
        if (size<2) size=2;
        // we need a size that is a power 2 in order to set the mask
        if (!isPowerOf2(size)) size = nextPowerOf2(size);

        // Re-initialisation: drop the old storage instead of leaking it.
        if (buf) {
            freeAlignedMemory(buf);
            buf = NULL;
        }
        mask = (unsigned long) (size-1);

        buf=(element_t*)getAlignedMemory(longxCacheLine*sizeof(long),size*sizeof(element_t));
        if (!buf) return false;
        for(size_t i=0;i<size;++i) {
            buf[i].data = NULL;
            // slot i starts as "writable by the producer holding ticket i"
            atomic_long_set(&buf[i].seq,long(i));
        }
        atomic_long_set(&pwrite,0);
        atomic_long_set(&pread,0);

        return true;
    }

    /**
     * Push method: enqueue data in the queue.
     *
     * This method is non-blocking and costs one CAS per operation.
     *
     * \param data pointer to store
     * \return true if the pointer was enqueued, false if the queue is full
     */
    inline bool push(void *const data) {
        unsigned long pw, seq;
        element_t * node;
        unsigned long bk = BACKOFF_MIN;

        do {
            pw    = atomic_long_read(&pwrite);
            node  = &buf[pw & mask];
            seq   = atomic_long_read(&node->seq);

            if (pw == seq) { // slot is free for ticket pw: try to claim it
                if (abstraction_cas((volatile atom_t*)&pwrite, (atom_t)(pw+1), (atom_t)pw)==(atom_t)pw)
                    break;

                // exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            } else
                if (pw > seq) return false; // queue full

        } while(1);
        node->data = data;
        // seq == ticket+1 marks the slot as readable
        atomic_long_set(&node->seq, (seq+1));
        return true;
    }

    /**
     * Pop method: dequeue data from the queue.
     *
     * This is a non-blocking method.
     *
     * \param data receives the dequeued pointer
     * \return true if an element was extracted, false if the queue is empty
     */
    inline bool pop(void** data) {
        unsigned long pr , seq;
        element_t * node;
        unsigned long bk = BACKOFF_MIN;

        do {
            pr    = atomic_long_read(&pread);
            node  = &buf[pr & mask];
            seq   = atomic_long_read(&node->seq);
            // The slot is readable for ticket pr when seq == pr+1.
            long diff = seq - (pr+1);
            if (diff == 0) { // try to claim ticket pr
                if (abstraction_cas((volatile atom_t*)&pread, (atom_t)(pr+1), (atom_t)pr)==(atom_t)pr)
                    break;

                // exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            } else {
                if (diff < 0) return false; // queue empty
            }

        } while(1);
        *data = node->data;
        // pr+mask+1 == pr+size: the slot becomes writable for the producer
        // ticket of the next lap around the buffer.
        atomic_long_set(&node->seq,(pr+mask+1));
        return true;
    }

private:
    // WARNING: on 64-bit Windows platforms unsigned long is only 32 bits wide
    //          (sizeof(unsigned long) == 4) !!
    // Each index lives in its own cache-line sized union so that producers
    // and consumers do not share a cache line.
    union {
        atomic_long_t  pwrite;
        char           padding1[CACHE_LINE_SIZE];
    };
    union {
        atomic_long_t  pread;
        char           padding2[CACHE_LINE_SIZE];
    };
protected:
    element_t *    buf;   ///< slot array, NULL until init() succeeds
    unsigned long  mask;  ///< size-1, size being a power of two
};


/*!
 * \class uMPMC_Ptr_Queue
 *  \ingroup building_blocks
 *
 * \brief An implementation of the \a unbounded Multi-Producer/Multi-Consumer queue
 *
 * This class implements an \a unbounded  MPMC queue which does not require
 * any special memory allocator to avoid dangling pointers. The implementation blends
 * together the MPMC_Ptr_Queue and the uSWSR_Ptr_Buffer. \n
 *
 * It uses internal atomic operations.
 *
 * This class is defined in \ref MPMCqueues.hpp
 *
 */
class uMPMC_Ptr_Queue {
protected:
    enum {DEFAULT_NUM_QUEUES=4, DEFAULT_uSPSC_SIZE=2048};

    typedef void *        data_element_t;
    typedef atomic_long_t sequenceP_t;
    typedef atomic_long_t sequenceC_t;

public:
    /**
     * \brief Constructor
     */
    uMPMC_Ptr_Queue() {}

    /**
     * \brief Destructor
     */
    ~uMPMC_Ptr_Queue() {
        if (buf) {
            for(size_t i=0;i<(mask+1);++i) {
                if (buf[i]) delete (uSWSR_Ptr_Buffer*)(buf[i]);
            }
            freeAlignedMemory(buf);
            buf = NULL;
        }
        if (seqP) freeAlignedMemory(seqP);
        if (seqC) freeAlignedMemory(seqC);
    }

    /**
     * \brief init
     */
    inline bool init(unsigned long nqueues=DEFAULT_NUM_QUEUES, size_t size=DEFAULT_uSPSC_SIZE) {
        if (nqueues<2) nqueues=2;
        if (!isPowerOf2(nqueues)) nqueues = nextPowerOf2(nqueues);
        mask = nqueues-1;

        buf=(data_element_t*)getAlignedMemory(longxCacheLine*sizeof(long),nqueues*sizeof(data_element_t));
        seqP=(sequenceP_t*)getAlignedMemory(longxCacheLine*sizeof(long),nqueues*sizeof(sequenceP_t));
        seqC=(sequenceP_t*)getAlignedMemory(longxCacheLine*sizeof(long),nqueues*sizeof(sequenceC_t));

        for(size_t i=0;i<nqueues;++i) {
            buf[i]= new uSWSR_Ptr_Buffer(size);
            ((uSWSR_Ptr_Buffer*)(buf[i]))->init();
            atomic_long_set(&(seqP[i]),long(i));
            atomic_long_set(&(seqC[i]),long(i));
        }
        atomic_long_set(&preadP,0);
        atomic_long_set(&preadC,0);
        return true;
    }

    /**
     * \brief nonblocking push
     *
     * \return It always returns true
     */
    inline bool push(void *const data) {
        unsigned long pw,seq,idx;
        unsigned long bk = BACKOFF_MIN;
        do {
            pw    = atomic_long_read(&preadP);
            idx   = pw & mask;
            seq   = atomic_long_read(&seqP[idx]);
            if (pw == seq) {
                if (abstraction_cas((volatile atom_t*)&preadP, (atom_t)(pw+1), (atom_t)pw)==(atom_t)pw)
                    break;

                // exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            }
        } while(1);
        ((uSWSR_Ptr_Buffer*)(buf[idx]))->push(data); // cannot fail
        atomic_long_set(&seqP[idx],(pw+mask+1));
        return true;
    }

    /**
     * \brieg nonblocking pop
     *
     */
    inline bool pop(void ** data) {
        unsigned long pr,idx;
		long seq;
        unsigned long bk = BACKOFF_MIN;

        do {
            pr     = atomic_long_read(&preadC);
            idx    = pr & mask;
            seq    = atomic_long_read(&seqC[idx]);
            if (pr == (unsigned long)seq) {
                if (atomic_long_read(&seqP[idx]) <= (unsigned long)seq) return false; // queue
                if (abstraction_cas((volatile atom_t*)&preadC, (atom_t)(pr+1), (atom_t)pr)==(atom_t)pr)
                    break;

                // exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            }
        } while(1);
        ((uSWSR_Ptr_Buffer*)(buf[idx]))->pop(data);
        atomic_long_set(&seqC[idx],(pr+mask+1));
        return true;
    }

private:
    union {
        atomic_long_t  preadP;
        char           padding1[CACHE_LINE_SIZE];
    };
    union {
        atomic_long_t  preadC;
        char           padding2[CACHE_LINE_SIZE];
    };
protected:
    data_element_t *  buf;
    sequenceP_t    *  seqP;
    sequenceC_t    *  seqC;
    unsigned long     mask;

};


/*!
 * \class MSqueue
 * \ingroup aux_classes
 *
 * \brief Michael and Scott MPMC. Not currently used.
 *
 * See:  M. Michael and M. Scott, "Simple, Fast, and Practical
 * Non-Blocking and Blocking Concurrent Queue Algorithms", PODC 1996.
 *
 * The MSqueue implementation is inspired by the one in the \p liblfds
 * library, a portable, license-free, lock-free data structure
 * library written in C. The liblfds implementation uses double-word CAS
 * (aka DCAS) whereas this implementation uses only single-word CAS
 * since it relies on an implementation of a memory allocator (used to
 * allocate internal queue nodes) which implements a deferred reclamation
 * algorithm able to solve both the ABA problem and the dangling pointer
 * problem.
 *
 * More info about liblfds can be found at http://www.liblfds.org
 *
 */
class MSqueue {
private:
    // Index of the only element of Pointer::ptr.
    enum {MSQUEUE_PTR=0 };

    // forward decl of Node type
    struct Node;

    // Thin wrapper around a single Node pointer. It provides the conversions
    // to atom_t / atom_t* that the CAS calls in push() and pop() need.
    struct Pointer {
        // A default-constructed Pointer refers to no node.
        Pointer() { ptr[MSQUEUE_PTR]=0;}

        // True when the Pointer refers to no node.
        inline bool operator !() {
            return (ptr[MSQUEUE_PTR]==0);
        }
        // Copies the node address held by p.
        inline Pointer& operator=(const Pointer & p) {
            ptr[MSQUEUE_PTR]=p.ptr[MSQUEUE_PTR];
            return *this;
        }

        // Makes the Pointer refer to node.
        inline Pointer& operator=(Node & node) {
            ptr[MSQUEUE_PTR]=&node;
            return *this;
        }

        // Returns the 'next' link of the referenced node.
        // The Pointer must refer to a node: there is no NULL check.
        inline Pointer & getNodeNext() {
            return ptr[MSQUEUE_PTR]->next;
        }
        // Returns the raw node address (possibly 0).
        inline Node * getNode() { return  ptr[MSQUEUE_PTR]; }

        // Two Pointers are equal when they hold the same node address.
        inline bool operator==( const Pointer& r ) const {
            return ((ptr[MSQUEUE_PTR]==r.ptr[MSQUEUE_PTR]));
        }

        // Address of the stored pointer, reinterpreted through a union as a
        // 'volatile atom_t *': this is the location a CAS operates on.
        inline operator volatile atom_t * () const {
            union { Node* const volatile* p1; volatile atom_t * p2;} pn;
            pn.p1 = ptr;
            return pn.p2;
        }
        // Same as above, without the volatile qualifier.
        inline operator atom_t * () const {
            union { Node* const volatile* p1; atom_t * p2;} pn;
            pn.p1 = ptr;
            return pn.p2;
        }

        // Value of the stored pointer, reinterpreted through a union as an
        // atom_t: used as the new/expected value of a CAS.
        inline operator atom_t () const {
            union { Node* volatile p1; atom_t p2;} pn;
            pn.p1 = ptr[MSQUEUE_PTR];
            return pn.p2;
        }

        // Makes the Pointer refer to node (same effect as operator=(Node&)).
        inline void set(Node & node) {
            ptr[MSQUEUE_PTR]=&node;
        }

        // Payload of the referenced node; the Pointer must refer to a node.
        inline void * getData() const { return ptr[MSQUEUE_PTR]->getData(); }

        Node * volatile ptr[1];
    } ALIGN_TO_POST(ALIGN_SINGLE_POINTER);

    // Queue node: link to the next node plus the user payload.
    struct Node {
        Node():data(0) { next.ptr[MSQUEUE_PTR]=0;}
        Node(void * data):data(data) {
            next.ptr[MSQUEUE_PTR]=0;
        }

        // Address of the 'next' link, usable as a CAS target.
        inline operator atom_t * () const { return (atom_t *)next; }

        inline void   setData(void * const d) { data=d;}
        inline void * getData() const { return data; }

        Pointer   next;
        void    * data;
    } ALIGN_TO_POST(ALIGN_DOUBLE_POINTER);

    // head refers to the current dummy node and tail to the last (or next to
    // last) node; the padding arrays separate the two.
    Pointer  head;
    long     padding1[longxCacheLine-1];
    Pointer  tail;
    long     padding2[longxCacheLine-1];;
    // Allocator used for the nodes; created by init(), NULL before that.
    FFAllocator *delayedAllocator;

private:
    // Allocates a node with delayedAllocator (aligned to
    // ALIGN_DOUBLE_POINTER), constructs it in place with 'data' and makes p
    // refer to it. Aborts the process if the allocation fails.
    // init() must have been called: delayedAllocator is not checked here.
    inline void allocnode(Pointer & p, void * data) {
        union { Node * p1; void * p2;} pn;

        if (delayedAllocator->posix_memalign((void**)&pn.p2,ALIGN_DOUBLE_POINTER,sizeof(Node))!=0) {
            abort();
        }
        new (pn.p2) Node(data);
        p.set(*pn.p1);
    }

    // Destroys the node and gives its memory back to delayedAllocator.
    inline void deallocnode( Node * n) {
        n->~Node();
        delayedAllocator->free(n);
    }

public:
    // The queue is not usable until init() has been called.
    MSqueue(): delayedAllocator(NULL) { }

    // Deletes the allocator created by init().
    // NOTE(review): the nodes still linked in the queue are not released one
    // by one here; presumably deleting the FFAllocator reclaims them - confirm.
    ~MSqueue() {
        if (delayedAllocator)  {
            delete delayedAllocator;
            delayedAllocator = NULL;
        }
    }

    // Copies only the head and tail pointers; delayedAllocator is left
    // untouched, so both objects end up referring to the same nodes.
    // NOTE(review): this looks intended only for not-yet-initialised queues
    // (e.g. when stored in a std::vector) - confirm before relying on it.
    MSqueue& operator=(const MSqueue& v) {
        head=v.head;
        tail=v.tail;
        return *this;
    }

    /** initialize the MSqueue
     *
     * Creates the node allocator and the initial dummy node.
     *
     * \return 1 on success, 0 if the queue was already initialised,
     *         -1 if the allocator could not be created.
     */
    int init() {
        if (delayedAllocator) return 0;
        delayedAllocator = new FFAllocator(2);
        if (!delayedAllocator) {
            error("MSqueue::init, cannot allocate FFAllocator\n");
            return -1;
        }

        // create the first NULL node
        // so the queue is never really empty
        Pointer dummy;
        allocnode(dummy,NULL);

        head = dummy;
        tail = dummy;
        return 1;
    }

    // insert method, it never fails
    // (allocnode() aborts the process if no memory is available)
    inline bool push(void * const data) {
        bool done = false;

        Pointer tailptr ALIGN_TO_POST(ALIGN_SINGLE_POINTER);
        Pointer next    ALIGN_TO_POST(ALIGN_SINGLE_POINTER);
        Pointer node    ALIGN_TO_POST(ALIGN_SINGLE_POINTER);
        allocnode(node,data);

        do {
            // snapshot of tail and of the link that follows it
            tailptr = tail;
            next    = tailptr.getNodeNext();

            // go on only if tail did not change while taking the snapshot
            if (tailptr == tail) {
                if (!next) { // tail was pointing to the last node
                    // try to link the new node after the last one
                    done = (CAS((volatile atom_t *)(tailptr.getNodeNext()),
                                (atom_t)node,
                                (atom_t)next) == (atom_t)next);
                } else {     // tail was not pointing to the last node
                    // help the other producer: move tail one step forward
                    CAS((volatile atom_t *)tail, (atom_t)next, (atom_t)tailptr);
                }
            }
        } while(!done);
        // try to make tail refer to the new node; the result is ignored
        // because, on failure, another thread already moved tail
        CAS((volatile atom_t *)tail, (atom_t)node, (atom_t) tailptr);
        return true;
    }

    // extract method, it returns false if the queue is empty
    // On success *data receives the payload and the old dummy node is
    // handed back to the allocator.
    inline bool  pop(void ** data) {
        bool done = false;

        ALIGN_TO_PRE(ALIGN_SINGLE_POINTER) Pointer headptr;
        ALIGN_TO_PRE(ALIGN_SINGLE_POINTER) Pointer tailptr;
        ALIGN_TO_PRE(ALIGN_SINGLE_POINTER) Pointer next;

        do {
            // snapshot of head, tail and of the node that follows head
            headptr = head;
            tailptr = tail;
            next    = headptr.getNodeNext();

            // go on only if head did not change while taking the snapshot
            if (head == headptr) {
                if (headptr.getNode() == tailptr.getNode()) {
                    if (!next) return false; // empty
                    // tail is lagging behind: move it one step forward
                    CAS((volatile atom_t *)tail, (atom_t)next, (atom_t)tailptr);
                } else {
                    // read the payload before the CAS: once head has moved,
                    // 'next' is the new dummy node
                    *data = next.getData();
                    done = (CAS((volatile atom_t *)head, (atom_t)next, (atom_t)headptr) == (atom_t)headptr);
                }
            }
        } while(!done);

        deallocnode(headptr.getNode());
        return true;
    }

    // return true if the queue is empty
    // (head and tail refer to the same node and that node has no successor);
    // the queue must have been initialised, since head is dereferenced
    inline bool empty() {
        if ((head.getNode() == tail.getNode()) && !(head.getNodeNext()))
            return true;
        return false;
    }
};


/* ---------------------- experimental code -------------------------- */


class multiSWSR {
protected:
    enum {DEFAULT_NUM_QUEUES=4, DEFAULT_uSPSC_SIZE=2048};

public:
    multiSWSR() {}

    ~multiSWSR() {
        if (buf) {
            for(size_t i=0;i<(mask+1);++i) {
                if (buf[i]) delete buf[i];
            }
            freeAlignedMemory(buf);
            buf = NULL;
        }
        if (PLock) freeAlignedMemory(PLock);
        if (CLock) freeAlignedMemory(CLock);
    }

    inline bool init(unsigned long nqueues=DEFAULT_NUM_QUEUES, size_t size=DEFAULT_uSPSC_SIZE) {
        if (nqueues<2) nqueues=2;
        if (!isPowerOf2(nqueues)) nqueues = nextPowerOf2(nqueues);
        mask = nqueues-1;

        buf=(uSWSR_Ptr_Buffer**)getAlignedMemory(CACHE_LINE_SIZE,nqueues*sizeof(uSWSR_Ptr_Buffer*));
        PLock=(CLHSpinLock*)getAlignedMemory(CACHE_LINE_SIZE,nqueues*sizeof(CLHSpinLock));
        CLock=(CLHSpinLock*)getAlignedMemory(CACHE_LINE_SIZE,nqueues*sizeof(CLHSpinLock));

        for(size_t i=0;i<nqueues;++i) {
            buf[i]= new uSWSR_Ptr_Buffer(size);
            buf[i]->init();
            PLock[i].init();
            CLock[i].init();
        }
        atomic_long_set(&count, 0);
        atomic_long_set(&enqueue,0);
        atomic_long_set(&dequeue,0);
        return true;
    }

    // it always returns true
    inline bool push(void *const data, int tid) {
        long q = atomic_long_inc_return(&enqueue) & mask;
        PLock[q].spin_lock(tid);
        buf[q]->push(data);
        PLock[q].spin_unlock(tid);
        atomic_long_inc(&count);
        return true;
    }

    // non-blocking pop
    inline bool pop(void ** data, int tid) {
        if (!atomic_long_read(&count))  return false; // empty

        long q = atomic_long_inc_return(&dequeue) & mask;
        CLock[q].spin_lock(tid);
        bool r = buf[q]->pop(data);
        CLock[q].spin_unlock(tid);
        if (r) { atomic_long_dec(&count); return true;}
        return false;
    }

private:
    union {
        atomic_long_t  enqueue;
        char           padding1[CACHE_LINE_SIZE];
    };
    union {
        atomic_long_t  dequeue;
        char           padding2[CACHE_LINE_SIZE];
    };
    union {
        atomic_long_t  count;
        char           padding3[CACHE_LINE_SIZE];
    };
protected:
    uSWSR_Ptr_Buffer **buf;
    CLHSpinLock *PLock;
    CLHSpinLock *CLock;
    size_t   mask;
};


/*
 * Simple and scalable Multi-Producer/Multi-Consumer queue.
 * By defining at compile time MULTI_MPMC_RELAX_FIFO_ORDERING it is possible
 * to improve performance relaxing FIFO ordering in the pop method.
 *
 * The underlying MPMC queue (the Q template parameter) should export at least
 * the following methods:
 *
 *   bool push(T)
 *   bool pop(T&)
 *   bool empty()
 *
 *
 */
template <typename Q>
class scalableMPMCqueue {
public:
    enum {DEFAULT_POOL_SIZE=4};

    scalableMPMCqueue() {
        //enqueue.store(0);
        //count.store(0);
        atomic_long_set(&enqueue,0);
        atomic_long_set(&count,0);

#if !defined(MULTI_MPMC_RELAX_FIFO_ORDERING)
        // NOTE: dequeue must start from 1 because enqueue is incremented
        //       using atomic_long_inc_return which first increments and than
        //       return the value.
        //dequeue.store(1);
        atomic_long_set(&dequeue,1);
#else
        //dequeue.store(0);
        atomic_long_set(&dequeue,0);
#endif
    }

    int init(size_t poolsize = DEFAULT_POOL_SIZE) {
        if (poolsize > pool.size()) {
            pool.resize(poolsize);
        }

        // WARNING: depending on Q, pool elements may need to be initialized

        return 1;
    }

    // insert method, it never fails if data is not NULL
    inline bool push(void * const data) {
        //long q = (1 + enqueue.fetch_add(1)) % pool.size();
        long q = atomic_long_inc_return(&enqueue) % pool.size();
        bool r = pool[q].push(data);
        if (r) atomic_long_inc(&count);
        //if (r) count.fetch_add(1);
        return r;
    }

    // extract method, it returns false if the queue is empty
    inline bool  pop(void ** data) {
        if (!atomic_long_read(&count))  return false; // empty
        //if (!count.load()) return false;
#if !defined(MULTI_MPMC_RELAX_FIFO_ORDERING)
        unsigned long bk = BACKOFF_MIN;
        //
        // enforce FIFO ordering for the consumers
        //
        long q, q1;
        do {
            q  = atomic_long_read(&dequeue), q1 = atomic_long_read(&enqueue);
            //q = dequeue.load(); q1 = enqueue.load();
            if (q > q1) return false;
            if (CAS((volatile atom_t *)&dequeue, (atom_t)(q+1), (atom_t)q) == (atom_t)q) break;
            //if(dequeue.compare_exchange_strong(<#long &__e#>, <#long __d#>)
            // exponential delay with max value
            for(volatile unsigned i=0;i<bk;++i) ;
            bk <<= 1;
            bk &= BACKOFF_MAX;
        } while(1);

        q %= pool.size();
        if (pool[q].pop(data)) {
            atomic_long_dec(&count);
            //count.fetch_sub(1);
            return true;
        }
        return false;

#else  // MULTI_MPMC_RELAX_FIFO_ORDERING
        long q = atomic_long_inc_return(&dequeue) % pool.size();
        bool r = pool[q].pop(data);
        if (r) { atomic_long_dec(&count); return true;}
        return false;
#endif
    }

    // check if the queue is empty
    inline bool empty() {
        for(size_t i=0;i<pool.size();++i)
            if (!pool[i].empty()) return false;
        return true;
    }
private:
    // std::atomic<long> enqueue;
    atomic_long_t enqueue;
    long padding1[longxCacheLine-sizeof(atomic_long_t)];
    //std::atomic<long> dequeue;
    atomic_long_t dequeue;
    long padding2[longxCacheLine-sizeof(atomic_long_t)];
    //std::atomic<long> count;
    atomic_long_t count;
    long padding3[longxCacheLine-sizeof(atomic_long_t)];
protected:
    std::vector<Q> pool;
};

/*
 * multiMSqueue is a specialization of the scalableMPMCqueue which uses the MSqueue
*/
    class multiMSqueue: public scalableMPMCqueue<MSqueue> {
    public:

        /*
         * Builds a pool of 'poolsize' MSqueue objects and initialises each
         * of them. Any failure is reported with error() and terminates the
         * process with abort().
         */
        multiMSqueue(size_t poolsize = scalableMPMCqueue<MSqueue>::DEFAULT_POOL_SIZE) {
            // create the pool first ...
            const int created = scalableMPMCqueue<MSqueue>::init(poolsize);
            if (created == 0) {
                error("multiMSqueue init ERROR\n");
                abort();
            }

            // ... then set up the queues one by one
            size_t idx = 0;
            while (idx < poolsize) {
                const int rc = pool[idx].init();
                if (rc < 0) {
                    error("multiMSqueue init ERROR\n");
                    abort();
                }
                ++idx;
            }
        }
    };


#endif // NO_STD_C0X


/* ---------------------- MaX experimental code -------------------------- */
#if 0
/*
 *
 *   bool push(T)
 *   bool pop(T&)
 *   bool empty()
 *
 *
 */
    /* Node of the singly linked list attached to each vector entry. */
    typedef struct{
        unsigned long data;   /* user pointer, stored as an integer */
        unsigned long next;   /* link to the following node, stored as an integer */
        /* NOTE(review): the element count looks like a number of bytes
         * (64 minus the two fields above) but the elements are longs,
         * so the padding is larger than 64 bytes - confirm the intent. */
        long padding1[64-2*sizeof(unsigned long)];
    }utMPMC_list_node_t;

    /* Head and tail of one linked list, separated by padding.
     * NOTE(review): as for the list node, the padding length is expressed
     * as a byte count but applied to an array of longs - confirm. */
    typedef struct{
        /*HEAD*/
        utMPMC_list_node_t* head;
        long padding0[64-sizeof(unsigned long)];
        /*TAIL*/
        utMPMC_list_node_t* tail;
        long padding1[64-sizeof(unsigned long)];
    }utMPMC_list_info_t;

    /* One entry of the vector used by utMPMC_VB: a linked list plus the
     * status word compared against the enqueue/dequeue counters. */
    typedef struct{
        /*address*/
        utMPMC_list_info_t l;
        /*status*/
        unsigned long s;
        long padding0[64-sizeof(unsigned long)];
    }utMPMC_VB_note_t;

/*
 * Smallest power of two that is >= A, for A up to 2^31 (the argument is
 * handled as a 32-bit unsigned value by __builtin_clz).
 * Values 0 and 1 yield 1: they are tested explicitly because
 * __builtin_clz(0) is undefined.
 */
#if !defined(NEXT_SMALLEST_2_POW)
#define NEXT_SMALLEST_2_POW(A) (((A) <= 1) ? 1 : (1 << (32 - __builtin_clz((A)-1))))
#endif

/* Reads X through a volatile-qualified lvalue so that the access is not
 * cached in a register. */
#if !defined(VOLATILE_READ)
#define VOLATILE_READ(X)  (*(volatile __typeof__(X)*)&(X))
#endif

/* X modulo (Y+1), Y+1 being a power of two: Y is the mask, not the length. */
#if !defined(OPTIMIZED_MOD_ON_2_POW)
#define OPTIMIZED_MOD_ON_2_POW(X,Y) ((X) & (Y))
#endif

/* An entry is writable by the producer whose ticket equals its status. */
#define IS_WRITABLE(STATUS,MYEQC) ((STATUS)==(MYEQC))
#define WRITABLE_STATUS(STATUS,MYEQC) (MYEQC)
#define UPDATE_AFTER_WRITE(STATUS) ((STATUS)+1)

/* An entry is readable by the consumer whose ticket is its status minus 1;
 * after the read the status becomes the ticket of the next lap (ticket+LEN). */
#define IS_READABLE(STATUS,MYDQC) ((STATUS)==(MYDQC)+1)
#define READABLE_STATUS(STATUS,MYDQC) ((MYDQC)+1)
#define UPDATE_AFTER_READ(STATUS,LEN) ((STATUS)+(LEN)-1)

    template <typename Q>
    class utMPMC_VB {
    public:
        enum {DEFAULT_POOL_SIZE=4};

        utMPMC_VB() {
            dqc =0;
            eqc = 0;
            /*
             * Both push and pop start from index 0
             */
            dqc = 0;
            eqc = 0;
        }

        int init(size_t vector_len) {

            len_v = NEXT_SMALLEST_2_POW(vector_len);
            len_v_minus_one = len_v-1;
            /*
             * Allocation and Init of the Vector
             */
            int done = posix_memalign((void **) v, longxCacheLine,
                                      sizeof(utMPMC_VB_note_t) * len_v);
            if (done != 0) {
                return 0;
            }
            int i = 0;
            for (i = 0; i < len_v; i++) {
                v[i].s = i;
                utMPMC_list_node_t * new_node;
                do{new_node = (utMPMC_list_node_t *)
                    malloc (sizeof(utMPMC_list_node_t));}while(new_node);
                new_node->data=NULL;
                new_node->next=NULL;
                v[i].l.tail=new_node;
                v[i].l.head=new_node;
            }

            return 1;
        }

    // insert method, it never fails!!
    inline bool push(void * const p) {
        utMPMC_list_node_t * new_node;
        do{new_node = (utMPMC_list_node_t *)
            malloc (sizeof(utMPMC_list_node_t));}while(new_node);
        new_node->data= (unsigned long) p;
        new_node->next=NULL;

		unsigned long myEQC = __sync_fetch_and_add (&eqc, 1UL);;
		unsigned long myI = OPTIMIZED_MOD_ON_2_POW(myEQC, len_v_minus_one);

		unsigned long target_status = WRITABLE_STATUS(target_status, myEQC);
		do{}while(VOLATILE_READ(v[myI].s) != target_status);

        /* List Stuff TODO*/
        v[myI].l.tail->next = new_node;
        v[myI].l.tail = new_node;
        target_status = UPDATE_AFTER_WRITE(target_status);
        /*barrier*/
        __sync_synchronize();
        v[myI].s = target_status;

		return true;
    }

    // extract method, it returns false if the queue is empty
    inline bool  pop(void ** ret_val) {
        	for (;;) {
		unsigned long myDQC = VOLATILE_READ(dqc);
		unsigned long myI = OPTIMIZED_MOD_ON_2_POW(myDQC, len_v_minus_one);
		unsigned long target_status = v[myI].s;


		if (IS_READABLE(target_status,myDQC) && (v[myI].l.tail!=v[myI].l.head)) {
			int atomic_result = __sync_bool_compare_and_swap(&dqc, myDQC,
					myDQC + 1);
			if (atomic_result) {
				/*
				 * that is my lucky day!! I've fished something...
				 */
                utMPMC_list_node_t* to_be_remoed =  v[myI].l.head;
                /* First Advance */
                v[myI].l.head = v[myI].l.head->next;
                /* Secondly Extract elem */
                *ret_val = v[myI].l.head->data;
                /* update the rest */
				target_status = UPDATE_AFTER_READ(target_status,len_v);
                __sync_synchronize();
				v[myI].s = target_status;
                free(to_be_remoed);
				return true;
			} else {
				continue;
			}
		} else {
			/*
			 * Check if someone changed the card while I was playing
			 */
			if (myDQC != VOLATILE_READ(dqc)) {
				continue;
			}
			if (VOLATILE_READ(eqc) != VOLATILE_READ(dqc)) {
				continue;
			}
			/*
			 * Sorry.. no space for you...
			 */
			return false;
		}
	}
	/*
	 * Impossible to reach this point!!!
	 */
	return true;
    }

//     inline bool empty() {
//         for(size_t i=0;i<pool.size();++i)
//             if (!pool[i].empty()) return false;
//          return true;
//     }
private:
        long padding0[64 - sizeof(unsigned long)];
        unsigned long eqc;
        long padding1[64 - sizeof(unsigned long)];
        unsigned long dqc;
        long padding2[64 - sizeof(unsigned long)];
        unsigned long len_v;
        unsigned long len_v_minus_one;
        utMPMC_VB_note_t * v;
        long padding3[64 - 3*sizeof(unsigned long)];
    };

// /*
//  * multiMSqueue is a specialization of the scalableMPMCqueue which uses the MSqueue
// */
// class multiMSqueue: public scalableMPMCqueue<MSqueue> {
// public:

//     multiMSqueue(size_t poolsize = scalableMPMCqueue<MSqueue>::DEFAULT_POOL_SIZE) {
//         if (! scalableMPMCqueue<MSqueue>::init(poolsize)) {
//             std::cerr << "multiMSqueue init ERROR, abort....\n";
//             abort();
//         }

//         for(size_t i=0;i<poolsize;++i)
//             if (pool[i].init()<0) {
//                 std::cerr << "ERROR initializing MSqueue, abort....\n";
//                 abort();
//             }
//     }
// };
#endif

} // namespace

#endif /* FF_MPMCQUEUE_HPP */