mesytec-mnode/external/taskflow-3.8.0/3rd-party/ff/mpmc/MPMCqueues.hpp
2025-01-04 01:25:05 +01:00

1210 lines
35 KiB
C++
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*!
* \file MPMCqueues.hpp
* \ingroup aux_classes
*
* \brief This file contains several MPMC queue implementations. Not
* currently used.
*
* This file contains the following
* Multi-Producer/Multi-Consumer queue implementations:
* \li MPMC_Ptr_Queue bounded MPMC queue by Dmitry Vyukov
* \li uMPMC_Ptr_Queue unbounded MPMC queue by Massimo Torquati
* \li MSqueue unbounded MPMC queue by Michael & Scott
*/
/* ***************************************************************************
*
* FastFlow is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License version 3 as
* published by the Free Software Foundation.
* Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
* or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
****************************************************************************
*/
#ifndef FF_MPMCQUEUE_HPP
#define FF_MPMCQUEUE_HPP
/*
* This file contains Multi-Producer/Multi-Consumer queue implementations.
*
* * MPMC_Ptr_Queue bounded MPMC queue by Dmitry Vyukov
* * uMPMC_Ptr_Queue unbounded MPMC queue by Massimo Torquati
* * MSqueue unbounded MPMC queue by Michael & Scott
*
* - Author:
* Massimo Torquati <torquati@di.unipi.it> <massimotor@gmail.com>
*
* - History
* 10 Jul 2012: M. Aldinucci: Minor fixes
* 4 Oct 2015: M. Aldinucci: cleaning related to better c++11 compliance
*/
#include <cstdlib>
#include <new>
#include <vector>
#include <ff/buffer.hpp>
#include <ff/sysdep.h>
#include <ff/allocator.hpp>
#include <ff/platforms/platform.h>
#include <ff/mpmc/asm/abstraction_dcas.h>
#include <ff/spin-lock.hpp>
/*
* NOTE: You should define NO_STD_C0X if you want to avoid c++0x and c++11
*
*/
#if ( (!defined(NO_STD_C0X)) && !(__cplusplus >= 201103L))
#pragma message ("Define -DNO_STD_C0X to use a non c++0x/c++11 compiler")
#endif
//#define NO_STD_C0X
// // Check for g++ version >= 4.5
// #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
// #include <atomic>
// #else
// // Check for g++ version >= 4.4
// #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
// #include <cstdatomic>
// #else
// #define USE_STD_0X
// #endif
// #endif
//#endif // USE_STD_C0X
#define CAS abstraction_cas
namespace ff {
/*
* In the following we implement two kinds of queues:
* - the MPMC_Ptr_Queue is an implementation of the ** bounded **
* Multi-Producer/Multi-Consumer queue algorithm by Dmitry Vyukov
* (www.1024cores.net). It stores pointers.
*
* - the uMPMC_Ptr_Queue implements an ** unbounded **
* Multi-Producer/Multi-Consumer queue which does not require
* any special memory allocator to avoid dangling pointers.
* The implementation blends together the MPMC_Ptr_Queue and the
* uSWSR_Ptr_Buffer.
*
*/
/*!
* \class MPMC_Ptr_Queue
* \ingroup aux_classes
*
* \brief An implementation of the \a bounded Multi-Producer/Multi-Consumer queue. Not currently used.
*
* This class describes an implementation of the MPMC queue inspired by the solution
* proposed by <a href="https://sites.google.com/site/1024cores/home/lock-free-algorithms/queues/bounded-mpmc-queue" target="_blank">Dmitry Vyukov</a>. \n
*
* \note There are two versions 1) with atomic operations 2) using new C++0X standard (compile with -DUSE_STD_C0X).
*
*
*/
#if !defined(NO_STD_C0X)
#include <atomic>
class MPMC_Ptr_Queue {
private:
    /* One slot of the ring. 'seq' implements Vyukov's ticket protocol:
     *   seq == index     -> slot is free, writable by ticket 'index'
     *   seq == index + 1 -> slot holds data, readable by ticket 'index'
     */
    struct element_t {
        std::atomic<unsigned long> seq;
        void * data;
    };
public:
    /**
     * \brief Constructor
     *
     * The queue is not usable until init() has been called.
     */
    MPMC_Ptr_Queue(): buf(NULL), mask(0) {}
    /**
     * \brief Destructor
     *
     * Safe even if init() was never called: buf is NULL-initialized in
     * the constructor (the original left it uninitialized, which made
     * this check undefined behavior).
     */
    ~MPMC_Ptr_Queue() {
        if (buf) {
            delete [] buf;
            buf=NULL;
        }
    }
    /* | data | seq | | data | seq | | data | seq |
     * | NULL | 0 | ------ | NULL | 1 | ------ | NULL | ... |
     * |||||||||||||||| |||||||||||||||| ||||||||||||||||
     * |
     * |
     * |
     * pwrite pread
     */
    /**
     * \brief init: allocates the ring buffer.
     *
     * \param size requested capacity; rounded up to the next power of 2
     *             (minimum 2) so that (index & mask) implements modulo
     * \return true on success, false if the allocation fails
     */
    inline bool init(size_t size) {
        if (size<2) size=2;
        // we need a size that is a power of 2 in order to set the mask
        if (!isPowerOf2(size)) size = nextPowerOf2(size);
        mask = size-1;
        // allow re-initialization without leaking a previous buffer
        if (buf) { delete [] buf; buf = NULL; }
        // nothrow form so that the NULL check below is meaningful
        // (plain new would throw std::bad_alloc instead of returning NULL)
        buf = new (std::nothrow) element_t[size];
        if (!buf) return false;
        for(size_t i=0;i<size;++i) {
            buf[i].data = NULL;
            // slot i is initially writable by ticket i
            buf[i].seq.store(i,std::memory_order_relaxed);
        }
        pwrite.store(0,std::memory_order_relaxed);
        pread.store(0,std::memory_order_relaxed);
        return true;
    }
    /**
     * \brief push: enqueue data
     *
     * Non-blocking; costs one successful CAS per operation.
     *
     * \return false if the queue is full, true otherwise
     */
    inline bool push(void *const data) {
        unsigned long pw, seq;
        element_t * node;
        unsigned long bk = BACKOFF_MIN;
        do {
            pw = pwrite.load(std::memory_order_relaxed);
            node = &buf[pw & mask];
            // acquire pairs with the release store in pop()/init()
            seq = node->seq.load(std::memory_order_acquire);
            if (pw == seq) { // slot free for this ticket: try to claim it
                if (pwrite.compare_exchange_weak(pw, pw+1, std::memory_order_relaxed))
                    break;
                // CAS failed: exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            } else
                if (pw > seq) return false; // queue full
            // pw < seq: another producer advanced pwrite, just retry
        } while(1);
        node->data = data;
        // publish: make the slot readable by ticket pw
        node->seq.store(seq+1,std::memory_order_release);
        return true;
    }
    /**
     * \brief pop: dequeue data from the queue.
     *
     * Non-blocking.
     *
     * \return false if the queue is empty, true otherwise
     */
    inline bool pop(void** data) {
        unsigned long pr, seq;
        element_t * node;
        unsigned long bk = BACKOFF_MIN;
        do {
            pr = pread.load(std::memory_order_relaxed);
            node = &buf[pr & mask];
            seq = node->seq.load(std::memory_order_acquire);
            long diff = seq - (pr+1); // <0 empty, ==0 readable, >0 lost a race
            if (diff == 0) { // slot readable: try to claim the ticket
                if (pread.compare_exchange_weak(pr, (pr+1), std::memory_order_relaxed))
                    break;
                // CAS failed: exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            } else {
                if (diff < 0) return false; // queue empty
            }
        } while(1);
        *data = node->data;
        // mark the slot writable again for the next lap (index + capacity)
        node->seq.store((pr+mask+1), std::memory_order_release);
        return true;
    }
private:
    // tickets are padded to separate cache lines to avoid false sharing
    // between producers and consumers
    union {
        std::atomic<unsigned long> pwrite; /// ticket of the next slot to write
        char padding1[CACHE_LINE_SIZE];
    };
    union {
        std::atomic<unsigned long> pread;  /// ticket of the next slot to read
        char padding2[CACHE_LINE_SIZE];
    };
    element_t * buf;
    unsigned long mask;
};
#else // using internal atomic operations
#include <ff/mpmc/asm/atomic.h>
class MPMC_Ptr_Queue {
protected:
    /* One slot of the ring. 'seq' implements Vyukov's ticket protocol:
     *   seq == index     -> slot is free, writable by ticket 'index'
     *   seq == index + 1 -> slot holds data, readable by ticket 'index'
     */
    struct element_t {
        atomic_long_t seq;
        void * data;
    };
public:
    /**
     * \brief Constructor
     *
     * The queue is not usable until init() has been called.
     */
    MPMC_Ptr_Queue(): buf(NULL), mask(0) {}
    /**
     * \brief Destructor
     *
     * Safe even if init() was never called: buf is NULL-initialized in
     * the constructor (the original left it uninitialized, which made
     * this check undefined behavior).
     */
    ~MPMC_Ptr_Queue() {
        if (buf) {
            freeAlignedMemory(buf);
            buf = NULL;
        }
    }
    /* | data | seq | | data | seq | | data | seq |
     * | NULL | 0 | ------ | NULL | 1 | ------ | NULL | ... |
     * |||||||||||||||| |||||||||||||||| ||||||||||||||||
     * |
     * |
     * |
     * pwrite pread
     */
    /**
     * \brief init: allocates the ring buffer.
     *
     * \param size requested capacity; rounded up to the next power of 2
     *             (minimum 2) so that (index & mask) implements modulo
     * \return true on success, false if the allocation fails
     */
    inline bool init(size_t size) {
        if (size<2) size=2;
        // we need a size that is a power of 2 in order to set the mask
        if (!isPowerOf2(size)) size = nextPowerOf2(size);
        mask = (unsigned long) (size-1);
        buf=(element_t*)getAlignedMemory(longxCacheLine*sizeof(long),size*sizeof(element_t));
        if (!buf) return false;
        for(size_t i=0;i<size;++i) {
            buf[i].data = NULL;
            // slot i is initially writable by ticket i
            atomic_long_set(&buf[i].seq,long(i));
        }
        atomic_long_set(&pwrite,0);
        atomic_long_set(&pread,0);
        return true;
    }
    /**
     * Push method: enqueue data in the queue.
     *
     * This method is non-blocking and costs one CAS per operation.
     *
     * \return false if the queue is full, true otherwise
     */
    inline bool push(void *const data) {
        unsigned long pw, seq;
        element_t * node;
        unsigned long bk = BACKOFF_MIN;
        do {
            pw = atomic_long_read(&pwrite);
            node = &buf[pw & mask];
            seq = atomic_long_read(&node->seq);
            if (pw == seq) { // slot free for this ticket: try to claim it
                if (abstraction_cas((volatile atom_t*)&pwrite, (atom_t)(pw+1), (atom_t)pw)==(atom_t)pw)
                    break;
                // CAS failed: exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            } else
                if (pw > seq) return false; // queue full
            // pw < seq: another producer advanced pwrite, just retry
        } while(1);
        node->data = data;
        // publish: make the slot readable by ticket pw
        atomic_long_set(&node->seq, (seq+1));
        return true;
    }
    /**
     * Pop method: dequeue data from the queue.
     *
     * This is a non-blocking method.
     *
     * \return false if the queue is empty, true otherwise
     */
    inline bool pop(void** data) {
        unsigned long pr , seq;
        element_t * node;
        unsigned long bk = BACKOFF_MIN;
        do {
            pr = atomic_long_read(&pread);
            node = &buf[pr & mask];
            seq = atomic_long_read(&node->seq);
            long diff = seq - (pr+1); // <0 empty, ==0 readable, >0 lost a race
            if (diff == 0) { // slot readable: try to claim the ticket
                if (abstraction_cas((volatile atom_t*)&pread, (atom_t)(pr+1), (atom_t)pr)==(atom_t)pr)
                    break;
                // CAS failed: exponential delay with max value
                for(volatile unsigned i=0;i<bk;++i) ;
                bk <<= 1;
                bk &= BACKOFF_MAX;
            } else {
                if (diff < 0) return false; // queue empty
            }
        } while(1);
        *data = node->data;
        // mark the slot writable again for the next lap (index + capacity)
        atomic_long_set(&node->seq,(pr+mask+1));
        return true;
    }
private:
    // WARNING: on the 64-bit Windows platform unsigned long is only
    // 32 bits wide, so the tickets wrap sooner than on LP64 systems.
    // tickets are padded to separate cache lines to avoid false sharing
    union {
        atomic_long_t pwrite;   // ticket of the next slot to write
        char padding1[CACHE_LINE_SIZE];
    };
    union {
        atomic_long_t pread;    // ticket of the next slot to read
        char padding2[CACHE_LINE_SIZE];
    };
protected:
    element_t * buf;
    unsigned long mask;
};
/*!
* \class uMPMC_Ptr_Queue
* \ingroup building_blocks
*
* \brief An implementation of the \a unbounded Multi-Producer/Multi-Consumer queue
*
* This class implements an \a unbounded MPMC queue which does not require
* any special memory allocator to avoid dangling pointers. The implementation blends
* together the MPMC_Ptr_Queue and the uSWSR_Ptr_Buffer. \n
*
* It uses internal atomic operations.
*
* This class is defined in \ref MPMCqueues.hpp
*
*/
class uMPMC_Ptr_Queue {
protected:
enum {DEFAULT_NUM_QUEUES=4, DEFAULT_uSPSC_SIZE=2048};
typedef void * data_element_t;
typedef atomic_long_t sequenceP_t;
typedef atomic_long_t sequenceC_t;
public:
/**
* \brief Constructor
*/
uMPMC_Ptr_Queue() {}
/**
* \brief Destructor
*/
~uMPMC_Ptr_Queue() {
if (buf) {
for(size_t i=0;i<(mask+1);++i) {
if (buf[i]) delete (uSWSR_Ptr_Buffer*)(buf[i]);
}
freeAlignedMemory(buf);
buf = NULL;
}
if (seqP) freeAlignedMemory(seqP);
if (seqC) freeAlignedMemory(seqC);
}
/**
* \brief init
*/
inline bool init(unsigned long nqueues=DEFAULT_NUM_QUEUES, size_t size=DEFAULT_uSPSC_SIZE) {
if (nqueues<2) nqueues=2;
if (!isPowerOf2(nqueues)) nqueues = nextPowerOf2(nqueues);
mask = nqueues-1;
buf=(data_element_t*)getAlignedMemory(longxCacheLine*sizeof(long),nqueues*sizeof(data_element_t));
seqP=(sequenceP_t*)getAlignedMemory(longxCacheLine*sizeof(long),nqueues*sizeof(sequenceP_t));
seqC=(sequenceP_t*)getAlignedMemory(longxCacheLine*sizeof(long),nqueues*sizeof(sequenceC_t));
for(size_t i=0;i<nqueues;++i) {
buf[i]= new uSWSR_Ptr_Buffer(size);
((uSWSR_Ptr_Buffer*)(buf[i]))->init();
atomic_long_set(&(seqP[i]),long(i));
atomic_long_set(&(seqC[i]),long(i));
}
atomic_long_set(&preadP,0);
atomic_long_set(&preadC,0);
return true;
}
/**
* \brief nonblocking push
*
* \return It always returns true
*/
inline bool push(void *const data) {
unsigned long pw,seq,idx;
unsigned long bk = BACKOFF_MIN;
do {
pw = atomic_long_read(&preadP);
idx = pw & mask;
seq = atomic_long_read(&seqP[idx]);
if (pw == seq) {
if (abstraction_cas((volatile atom_t*)&preadP, (atom_t)(pw+1), (atom_t)pw)==(atom_t)pw)
break;
// exponential delay with max value
for(volatile unsigned i=0;i<bk;++i) ;
bk <<= 1;
bk &= BACKOFF_MAX;
}
} while(1);
((uSWSR_Ptr_Buffer*)(buf[idx]))->push(data); // cannot fail
atomic_long_set(&seqP[idx],(pw+mask+1));
return true;
}
/**
* \brieg nonblocking pop
*
*/
inline bool pop(void ** data) {
unsigned long pr,idx;
long seq;
unsigned long bk = BACKOFF_MIN;
do {
pr = atomic_long_read(&preadC);
idx = pr & mask;
seq = atomic_long_read(&seqC[idx]);
if (pr == (unsigned long)seq) {
if (atomic_long_read(&seqP[idx]) <= (unsigned long)seq) return false; // queue
if (abstraction_cas((volatile atom_t*)&preadC, (atom_t)(pr+1), (atom_t)pr)==(atom_t)pr)
break;
// exponential delay with max value
for(volatile unsigned i=0;i<bk;++i) ;
bk <<= 1;
bk &= BACKOFF_MAX;
}
} while(1);
((uSWSR_Ptr_Buffer*)(buf[idx]))->pop(data);
atomic_long_set(&seqC[idx],(pr+mask+1));
return true;
}
private:
union {
atomic_long_t preadP;
char padding1[CACHE_LINE_SIZE];
};
union {
atomic_long_t preadC;
char padding2[CACHE_LINE_SIZE];
};
protected:
data_element_t * buf;
sequenceP_t * seqP;
sequenceC_t * seqC;
unsigned long mask;
};
/*!
* \class MSqueue
* \ingroup aux_classes
*
* \brief Michael and Scott MPMC. Not currently used.
*
* See: M. Michael and M. Scott, "Simple, Fast, and Practical
* Non-Blocking and Blocking Concurrent Queue Algorithms", PODC 1996.
*
* The MSqueue implementation is inspired by the one in the \p liblfds
* library, which is a portable, license-free, lock-free data structure
* library written in C. The liblfds implementation uses double-word CAS
* (aka DCAS) whereas this implementation uses only single-word CAS
* since it relies on a implementation of a memory allocator (used to
* allocate internal queue nodes) which implements a deferred reclamation
* algorithm able to solve both the ABA problem and the dangling pointer
* problem.
*
* More info about liblfds can be found at http://www.liblfds.org
*
*/
class MSqueue {
private:
    enum {MSQUEUE_PTR=0 };
    // forward decl of Node type
    struct Node;
    /* A single machine word holding a Node*. The wrapper provides the
     * conversions needed to hand the word to the single-word CAS
     * (abstraction_cas) without explicit casts: the unions below
     * re-interpret the storage as atom_t.
     * NOTE(review): no ABA counter is kept here; per the class
     * description, ABA avoidance is delegated to the deferred
     * reclamation performed by the FFAllocator.
     */
    struct Pointer {
        Pointer() { ptr[MSQUEUE_PTR]=0;}
        // true iff the stored pointer is NULL
        inline bool operator !() {
            return (ptr[MSQUEUE_PTR]==0);
        }
        inline Pointer& operator=(const Pointer & p) {
            ptr[MSQUEUE_PTR]=p.ptr[MSQUEUE_PTR];
            return *this;
        }
        inline Pointer& operator=(Node & node) {
            ptr[MSQUEUE_PTR]=&node;
            return *this;
        }
        // next-field of the pointed-to node; precondition: non-NULL pointer
        inline Pointer & getNodeNext() {
            return ptr[MSQUEUE_PTR]->next;
        }
        inline Node * getNode() { return ptr[MSQUEUE_PTR]; }
        inline bool operator==( const Pointer& r ) const {
            return ((ptr[MSQUEUE_PTR]==r.ptr[MSQUEUE_PTR]));
        }
        // address of the word, as required by CAS (volatile flavor);
        // the union avoids a strict-aliasing cast
        inline operator volatile atom_t * () const {
            union { Node* const volatile* p1; volatile atom_t * p2;} pn;
            pn.p1 = ptr;
            return pn.p2;
        }
        // address of the word (non-volatile flavor)
        inline operator atom_t * () const {
            union { Node* const volatile* p1; atom_t * p2;} pn;
            pn.p1 = ptr;
            return pn.p2;
        }
        // the word value itself, used as CAS compare/exchange operand
        inline operator atom_t () const {
            union { Node* volatile p1; atom_t p2;} pn;
            pn.p1 = ptr[MSQUEUE_PTR];
            return pn.p2;
        }
        inline void set(Node & node) {
            ptr[MSQUEUE_PTR]=&node;
        }
        // data payload of the pointed-to node; precondition: non-NULL pointer
        inline void * getData() const { return ptr[MSQUEUE_PTR]->getData(); }
        Node * volatile ptr[1];
    } ALIGN_TO_POST(ALIGN_SINGLE_POINTER);
    /* Queue node: payload plus single-word next pointer. */
    struct Node {
        Node():data(0) { next.ptr[MSQUEUE_PTR]=0;}
        Node(void * data):data(data) {
            next.ptr[MSQUEUE_PTR]=0;
        }
        inline operator atom_t * () const { return (atom_t *)next; }
        inline void setData(void * const d) { data=d;}
        inline void * getData() const { return data; }
        Pointer next;
        void * data;
    } ALIGN_TO_POST(ALIGN_DOUBLE_POINTER);
    // head and tail are separated by padding to avoid false sharing
    // between producers (touching tail) and consumers (touching head)
    Pointer head;
    long padding1[longxCacheLine-1];
    Pointer tail;
    long padding2[longxCacheLine-1];;
    // allocator with deferred reclamation for the queue nodes (solves
    // the ABA and dangling-pointer problems, see class description)
    FFAllocator *delayedAllocator;
private:
    // allocate and placement-construct a new Node holding 'data';
    // aborts the program on allocation failure
    inline void allocnode(Pointer & p, void * data) {
        union { Node * p1; void * p2;} pn;
        if (delayedAllocator->posix_memalign((void**)&pn.p2,ALIGN_DOUBLE_POINTER,sizeof(Node))!=0) {
            abort();
        }
        new (pn.p2) Node(data);
        p.set(*pn.p1);
    }
    // destroy and hand a node back to the deferred-reclamation allocator
    inline void deallocnode( Node * n) {
        n->~Node();
        delayedAllocator->free(n);
    }
public:
    MSqueue(): delayedAllocator(NULL) { }
    ~MSqueue() {
        // NOTE(review): nodes still in the queue (including the dummy)
        // are not drained here; presumably they are reclaimed when the
        // FFAllocator is deleted -- confirm against allocator.hpp.
        if (delayedAllocator) {
            delete delayedAllocator;
            delayedAllocator = NULL;
        }
    }
    // NOTE(review): copies only head/tail; delayedAllocator keeps its
    // current value, so the target still frees nodes through its own
    // allocator -- confirm this is the intended (shallow) semantics.
    MSqueue& operator=(const MSqueue& v) {
        head=v.head;
        tail=v.tail;
        return *this;
    }
    /** initialize the MSqueue
     *
     *  Creates the allocator and the initial dummy node (so head/tail
     *  always point to a valid node and the queue is never truly empty).
     *  \return 1 on success, -1 on allocator failure, 0 if already
     *          initialized (idempotent).
     */
    int init() {
        if (delayedAllocator) return 0;
        delayedAllocator = new FFAllocator(2);
        if (!delayedAllocator) {
            error("MSqueue::init, cannot allocate FFAllocator\n");
            return -1;
        }
        // create the first NULL node
        // so the queue is never really empty
        Pointer dummy;
        allocnode(dummy,NULL);
        head = dummy;
        tail = dummy;
        return 1;
    }
    // insert method, it never fails
    inline bool push(void * const data) {
        bool done = false;
        Pointer tailptr ALIGN_TO_POST(ALIGN_SINGLE_POINTER);
        Pointer next    ALIGN_TO_POST(ALIGN_SINGLE_POINTER);
        Pointer node    ALIGN_TO_POST(ALIGN_SINGLE_POINTER);
        allocnode(node,data);
        do {
            tailptr = tail;
            next = tailptr.getNodeNext();
            if (tailptr == tail) {       // is tail still consistent?
                if (!next) { // tail was pointing to the last node
                    // try to link the new node after the current last one.
                    // NOTE(review): argument order assumed to follow the
                    // liblfds convention CAS(dest, exchange, compare),
                    // returning the previous value -- confirm against
                    // abstraction_dcas.h.
                    done = (CAS((volatile atom_t *)(tailptr.getNodeNext()),
                                (atom_t)node,
                                (atom_t)next) == (atom_t)next);
                } else { // tail was not pointing to the last node
                    // help the lagging producer: swing tail forward
                    CAS((volatile atom_t *)tail, (atom_t)next, (atom_t)tailptr);
                }
            }
        } while(!done);
        // try to swing tail to the new node; failure is fine because
        // another thread will help (see the branch above)
        CAS((volatile atom_t *)tail, (atom_t)node, (atom_t) tailptr);
        return true;
    }
    // extract method, it returns false if the queue is empty
    inline bool pop(void ** data) {
        bool done = false;
        ALIGN_TO_PRE(ALIGN_SINGLE_POINTER) Pointer headptr;
        ALIGN_TO_PRE(ALIGN_SINGLE_POINTER) Pointer tailptr;
        ALIGN_TO_PRE(ALIGN_SINGLE_POINTER) Pointer next;
        do {
            headptr = head;
            tailptr = tail;
            next = headptr.getNodeNext();
            if (head == headptr) {       // is head still consistent?
                if (headptr.getNode() == tailptr.getNode()) {
                    if (!next) return false; // queue is empty
                    // tail is lagging behind: help moving it forward
                    CAS((volatile atom_t *)tail, (atom_t)next, (atom_t)tailptr);
                } else {
                    // read the payload BEFORE the CAS: afterwards the
                    // node may be recycled by a concurrent pop
                    *data = next.getData();
                    done = (CAS((volatile atom_t *)head, (atom_t)next, (atom_t)headptr) == (atom_t)headptr);
                }
            }
        } while(!done);
        // the old head (dummy) node can now be handed back to the
        // deferred-reclamation allocator
        deallocnode(headptr.getNode());
        return true;
    }
    // return true if the queue is empty
    inline bool empty() {
        if ((head.getNode() == tail.getNode()) && !(head.getNodeNext()))
            return true;
        return false;
    }
};
/* ---------------------- experimental code -------------------------- */
class multiSWSR {
protected:
enum {DEFAULT_NUM_QUEUES=4, DEFAULT_uSPSC_SIZE=2048};
public:
multiSWSR() {}
~multiSWSR() {
if (buf) {
for(size_t i=0;i<(mask+1);++i) {
if (buf[i]) delete buf[i];
}
freeAlignedMemory(buf);
buf = NULL;
}
if (PLock) freeAlignedMemory(PLock);
if (CLock) freeAlignedMemory(CLock);
}
inline bool init(unsigned long nqueues=DEFAULT_NUM_QUEUES, size_t size=DEFAULT_uSPSC_SIZE) {
if (nqueues<2) nqueues=2;
if (!isPowerOf2(nqueues)) nqueues = nextPowerOf2(nqueues);
mask = nqueues-1;
buf=(uSWSR_Ptr_Buffer**)getAlignedMemory(CACHE_LINE_SIZE,nqueues*sizeof(uSWSR_Ptr_Buffer*));
PLock=(CLHSpinLock*)getAlignedMemory(CACHE_LINE_SIZE,nqueues*sizeof(CLHSpinLock));
CLock=(CLHSpinLock*)getAlignedMemory(CACHE_LINE_SIZE,nqueues*sizeof(CLHSpinLock));
for(size_t i=0;i<nqueues;++i) {
buf[i]= new uSWSR_Ptr_Buffer(size);
buf[i]->init();
PLock[i].init();
CLock[i].init();
}
atomic_long_set(&count, 0);
atomic_long_set(&enqueue,0);
atomic_long_set(&dequeue,0);
return true;
}
// it always returns true
inline bool push(void *const data, int tid) {
long q = atomic_long_inc_return(&enqueue) & mask;
PLock[q].spin_lock(tid);
buf[q]->push(data);
PLock[q].spin_unlock(tid);
atomic_long_inc(&count);
return true;
}
// non-blocking pop
inline bool pop(void ** data, int tid) {
if (!atomic_long_read(&count)) return false; // empty
long q = atomic_long_inc_return(&dequeue) & mask;
CLock[q].spin_lock(tid);
bool r = buf[q]->pop(data);
CLock[q].spin_unlock(tid);
if (r) { atomic_long_dec(&count); return true;}
return false;
}
private:
union {
atomic_long_t enqueue;
char padding1[CACHE_LINE_SIZE];
};
union {
atomic_long_t dequeue;
char padding2[CACHE_LINE_SIZE];
};
union {
atomic_long_t count;
char padding3[CACHE_LINE_SIZE];
};
protected:
uSWSR_Ptr_Buffer **buf;
CLHSpinLock *PLock;
CLHSpinLock *CLock;
size_t mask;
};
/*
* Simple and scalable Multi-Producer/Multi-Consumer queue.
* By defining at compile time MULTI_MPMC_RELAX_FIFO_ORDERING it is possible
* to improve performance relaxing FIFO ordering in the pop method.
*
* The underling MPMC queue (the Q template parameter) should export at least
* the following methods:
*
* bool push(T)
* bool pop(T&)
* bool empty()
*
*
*/
template <typename Q>
class scalableMPMCqueue {
public:
    enum {DEFAULT_POOL_SIZE=4};
    // Constructor: resets the tickets and the element counter.
    scalableMPMCqueue() {
        //enqueue.store(0);
        //count.store(0);
        atomic_long_set(&enqueue,0);
        atomic_long_set(&count,0);
#if !defined(MULTI_MPMC_RELAX_FIFO_ORDERING)
        // NOTE: dequeue must start from 1 because enqueue is incremented
        // using atomic_long_inc_return which first increments and then
        // returns the value.
        //dequeue.store(1);
        atomic_long_set(&dequeue,1);
#else
        //dequeue.store(0);
        atomic_long_set(&dequeue,0);
#endif
    }
    // Sets the number of internal queues (never shrinks the pool).
    // Always returns 1.
    int init(size_t poolsize = DEFAULT_POOL_SIZE) {
        if (poolsize > pool.size()) {
            pool.resize(poolsize);
        }
        // WARNING: depending on Q, pool elements may need to be initialized
        // (e.g. multiMSqueue calls pool[i].init() on each element after
        // invoking this method)
        return 1;
    }
    // insert method, it never fails if data is not NULL
    // (round-robin over the internal queues via the enqueue ticket)
    inline bool push(void * const data) {
        //long q = (1 + enqueue.fetch_add(1)) % pool.size();
        long q = atomic_long_inc_return(&enqueue) % pool.size();
        bool r = pool[q].push(data);
        if (r) atomic_long_inc(&count);
        //if (r) count.fetch_add(1);
        return r;
    }
    // extract method, it returns false if the queue is empty
    inline bool pop(void ** data) {
        if (!atomic_long_read(&count)) return false; // empty
        //if (!count.load()) return false;
#if !defined(MULTI_MPMC_RELAX_FIFO_ORDERING)
        unsigned long bk = BACKOFF_MIN;
        //
        // enforce FIFO ordering for the consumers: take a dequeue ticket
        // with a CAS so consumers drain the internal queues in the same
        // round-robin order the producers filled them
        //
        long q, q1;
        do {
            q = atomic_long_read(&dequeue), q1 = atomic_long_read(&enqueue);
            //q = dequeue.load(); q1 = enqueue.load();
            if (q > q1) return false;  // no completed push for this ticket yet
            if (CAS((volatile atom_t *)&dequeue, (atom_t)(q+1), (atom_t)q) == (atom_t)q) break;
            //if(dequeue.compare_exchange_strong(<#long &__e#>, <#long __d#>)
            // CAS failed: exponential delay with max value
            for(volatile unsigned i=0;i<bk;++i) ;
            bk <<= 1;
            bk &= BACKOFF_MAX;
        } while(1);
        q %= pool.size();
        if (pool[q].pop(data)) {
            atomic_long_dec(&count);
            //count.fetch_sub(1);
            return true;
        }
        return false;
#else // MULTI_MPMC_RELAX_FIFO_ORDERING
        // relaxed ordering: just take the next queue round-robin
        long q = atomic_long_inc_return(&dequeue) % pool.size();
        bool r = pool[q].pop(data);
        if (r) { atomic_long_dec(&count); return true;}
        return false;
#endif
    }
    // check if the queue is empty
    inline bool empty() {
        for(size_t i=0;i<pool.size();++i)
            if (!pool[i].empty()) return false;
        return true;
    }
private:
    // tickets and counter, each followed by padding.
    // NOTE(review): the array length subtracts sizeof(atomic_long_t)
    // (a byte count) from longxCacheLine (a count of longs), so the
    // padding is smaller than one full cache line -- confirm whether
    // this mixed-unit arithmetic is intended.
    // std::atomic<long> enqueue;
    atomic_long_t enqueue;
    long padding1[longxCacheLine-sizeof(atomic_long_t)];
    //std::atomic<long> dequeue;
    atomic_long_t dequeue;
    long padding2[longxCacheLine-sizeof(atomic_long_t)];
    //std::atomic<long> count;
    atomic_long_t count;
    long padding3[longxCacheLine-sizeof(atomic_long_t)];
protected:
    std::vector<Q> pool;   // the underlying MPMC queues
};
/*
* multiMSqueue is a specialization of the scalableMPMCqueue which uses the MSqueue
*/
/*
 * multiMSqueue: scalableMPMCqueue specialized on MSqueue.
 * The constructor builds the pool and initializes every MSqueue,
 * aborting the program on any failure.
 */
class multiMSqueue: public scalableMPMCqueue<MSqueue> {
public:
    multiMSqueue(size_t poolsize = scalableMPMCqueue<MSqueue>::DEFAULT_POOL_SIZE) {
        // set up the pool itself first
        const bool ok = scalableMPMCqueue<MSqueue>::init(poolsize) != 0;
        if (!ok) {
            error("multiMSqueue init ERROR\n");
            abort();
        }
        // then initialize each MSqueue element in turn
        size_t idx = 0;
        while (idx < poolsize) {
            if (pool[idx].init() < 0) {
                error("multiMSqueue init ERROR\n");
                abort();
            }
            ++idx;
        }
    }
};
#endif // USE_STD_C0X
/* ---------------------- MaX experimental code -------------------------- */
#if 0
/*
*
* bool push(T)
* bool pop(T&)
* bool empty()
*
*
*/
/* Node of a per-slot singly-linked list: payload and next stored as
 * words, padded to an assumed 64-byte cache line.
 * NOTE(review): 'next' is declared unsigned long but is used as a
 * utMPMC_list_node_t* by the code below; as written the region only
 * "compiles" because it is excluded by the surrounding #if 0. */
typedef struct{
    unsigned long data;
    unsigned long next;
    long padding1[64-2*sizeof(unsigned long)];
}utMPMC_list_node_t;
/* Head and tail of one per-slot list, each on its own (assumed 64-byte)
 * cache line to keep producer and consumer accesses apart. */
typedef struct{
    /*HEAD*/
    utMPMC_list_node_t* head;
    long padding0[64-sizeof(unsigned long)];
    /*TAIL*/
    utMPMC_list_node_t* tail;
    long padding1[64-sizeof(unsigned long)];
}utMPMC_list_info_t;
/* One slot of the utMPMC_VB vector: a list plus its ticket status word. */
typedef struct{
    /*address*/
    utMPMC_list_info_t l;
    /*status*/
    unsigned long s;
    long padding0[64-sizeof(unsigned long)];
}utMPMC_VB_note_t;
// round A up to the next power of two (GCC builtin; A must be > 1)
#if !defined(NEXT_SMALLEST_2_POW)
#define NEXT_SMALLEST_2_POW(A) (1 << (32 - __builtin_clz((A)-1)))
#endif
#if !defined(VOLATILE_READ)
// force a fresh read from memory (GNU 'typeof' extension)
#define VOLATILE_READ(X) (*(volatile typeof(X)*)&X)
// NOTE(review): this guard is nested INSIDE the VOLATILE_READ guard, so
// everything down to the final #endif is skipped whenever VOLATILE_READ
// is already defined -- the #endif placement looks wrong; confirm.
#if !defined(OPTIMIZED_MOD_ON_2_POW)
// X mod 2^k, where Y == 2^k - 1 is the precomputed mask
#define OPTIMIZED_MOD_ON_2_POW(X,Y) ((X) & (Y))
#endif
// ticket protocol: a slot is writable when its status equals the
// producer ticket, readable when it equals the consumer ticket + 1
#define IS_WRITABLE(STATUS,MYEQC) (STATUS==MYEQC)
#define WRITABLE_STATUS(STATUS,MYEQC) (MYEQC)
#define UPDATE_AFTER_WRITE(STATUS) (STATUS+1)
#define IS_READABLE(STATUS,MYDQC) (STATUS==MYDQC+1)
#define READABLE_STATUS(STATUS,MYDQC) (MYDQC+1)
#define UPDATE_AFTER_READ(STATUS,LEN) (STATUS+LEN-1)
#endif
template <typename Q>
class utMPMC_VB {
public:
enum {DEFAULT_POOL_SIZE=4};
utMPMC_VB() {
dqc =0;
eqc = 0;
/*
* Both push and pop start from index 0
*/
dqc = 0;
eqc = 0;
}
int init(size_t vector_len) {
len_v = NEXT_SMALLEST_2_POW(vector_len);
len_v_minus_one = len_v-1;
/*
* Allocation and Init of the Vector
*/
int done = posix_memalign((void **) v, longxCacheLine,
sizeof(utMPMC_VB_note_t) * len_v);
if (done != 0) {
return 0;
}
int i = 0;
for (i = 0; i < len_v; i++) {
v[i].s = i;
utMPMC_list_node_t * new_node;
do{new_node = (utMPMC_list_node_t *)
malloc (sizeof(utMPMC_list_node_t));}while(new_node);
new_node->data=NULL;
new_node->next=NULL;
v[i].l.tail=new_node;
v[i].l.head=new_node;
}
return 1;
}
// insert method, it never fails!!
inline bool push(void * const p) {
utMPMC_list_node_t * new_node;
do{new_node = (utMPMC_list_node_t *)
malloc (sizeof(utMPMC_list_node_t));}while(new_node);
new_node->data= (unsigned long) p;
new_node->next=NULL;
unsigned long myEQC = __sync_fetch_and_add (&eqc, 1UL);;
unsigned long myI = OPTIMIZED_MOD_ON_2_POW(myEQC, len_v_minus_one);
unsigned long target_status = WRITABLE_STATUS(target_status, myEQC);
do{}while(VOLATILE_READ(v[myI].s) != target_status);
/* List Stuff TODO*/
v[myI].l.tail->next = new_node;
v[myI].l.tail = new_node;
target_status = UPDATE_AFTER_WRITE(target_status);
/*barrier*/
__sync_synchronize();
v[myI].s = target_status;
return true;
}
// extract method, it returns false if the queue is empty
inline bool pop(void ** ret_val) {
for (;;) {
unsigned long myDQC = VOLATILE_READ(dqc);
unsigned long myI = OPTIMIZED_MOD_ON_2_POW(myDQC, len_v_minus_one);
unsigned long target_status = v[myI].s;
if (IS_READABLE(target_status,myDQC) && (v[myI].l.tail!=v[myI].l.head)) {
int atomic_result = __sync_bool_compare_and_swap(&dqc, myDQC,
myDQC + 1);
if (atomic_result) {
/*
* that is my lucky day!! I've fished something...
*/
utMPMC_list_node_t* to_be_remoed = v[myI].l.head;
/* First Advance */
v[myI].l.head = v[myI].l.head->next;
/* Secondly Extract elem */
*ret_val = v[myI].l.head->data;
/* update the rest */
target_status = UPDATE_AFTER_READ(target_status,len_v);
__sync_synchronize();
v[myI].s = target_status;
free(to_be_remoed);
return true;
} else {
continue;
}
} else {
/*
* Check if someone changed the card while I was playing
*/
if (myDQC != VOLATILE_READ(dqc)) {
continue;
}
if (VOLATILE_READ(eqc) != VOLATILE_READ(dqc)) {
continue;
}
/*
* Sorry.. no space for you...
*/
return false;
}
}
/*
* Impossible to reach this point!!!
*/
return true;
}
// inline bool empty() {
// for(size_t i=0;i<pool.size();++i)
// if (!pool[i].empty()) return false;
// return true;
// }
private:
long padding0[64 - sizeof(unsigned long)];
unsigned long eqc;
long padding1[64 - sizeof(unsigned long)];
unsigned long dqc;
long padding2[64 - sizeof(unsigned long)];
unsigned long len_v;
unsigned long len_v_minus_one;
utMPMC_VB_note_t * v;
long padding3[64 - 3*sizeof(unsigned long)];
};
// /*
// * multiMSqueue is a specialization of the scalableMPMCqueue which uses the MSqueue
// */
// class multiMSqueue: public scalableMPMCqueue<MSqueue> {
// public:
// multiMSqueue(size_t poolsize = scalableMPMCqueue<MSqueue>::DEFAULT_POOL_SIZE) {
// if (! scalableMPMCqueue<MSqueue>::init(poolsize)) {
// std::cerr << "multiMSqueue init ERROR, abort....\n";
// abort();
// }
// for(size_t i=0;i<poolsize;++i)
// if (pool[i].init()<0) {
// std::cerr << "ERROR initializing MSqueue, abort....\n";
// abort();
// }
// }
// };
#endif
} // namespace
#endif /* FF_MPMCQUEUE_HPP */