mesytec-mnode/external/taskflow-3.8.0/3rd-party/ff/mapper.hpp

/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */

/*!
 *  \link
 *  \file mapper.hpp
 *  \ingroup shared_memory_fastflow
 *
 *  \brief This file contains the thread mapper definition used in FastFlow
 */

#ifndef __THREAD_MAPPER_HPP_
#define __THREAD_MAPPER_HPP_

/* ***************************************************************************
 *
 *  FastFlow is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License version 3 as
 *  published by the Free Software Foundation.
 *  Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
 *  or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
 *
 *  This program is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 *  License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software Foundation,
 *  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 ****************************************************************************
 */

#include <stdlib.h>
#include <ff/config.hpp>
#include <ff/svector.hpp>
#include <ff/utils.hpp>
#include <ff/mapping_utils.hpp>
#include <vector>
#if defined(MAMMUT)
#include <mammut/mammut.hpp>
#endif


#if defined(FF_CUDA) 
#include <cuda.h>
#endif

#if 0
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#include <ff/ocl/clEnvironment.hpp>
#endif


#endif

namespace ff {

/*!
 *  \ingroup shared_memory_fastflow
 *
 *  @{
 */

/*! 
 * \class threadMapper
 * \ingroup shared_memory_fastflow
 *
 * \brief The thread mapper maps threads to specific cores using a
 * predefined mapping policy.
 *
 * The threadMapper stores a list of CPU ids. By default the list is simply a
 * linear sequence of the core ids of the system; for example, in a quad-core
 * system the default list is 0 1 2 3. The default list can be changed with
 * the method setMappingList by passing a string of space-separated
 * (or comma-separated) CPU ids. The policy implemented by the threadMapper
 * is to pick the next CPU id from the list in round-robin order.
 *
 * This class is defined in \ref mapper.hpp
 *
 */
class threadMapper {
public:
	/**
	 * Get a static instance of the threadMapper object
	 *
	 * \return TODO
	 */
	static inline threadMapper* instance() {
		static threadMapper thm;
		return &thm;
	}

	/**
	 * Default constructor.
	 */
	threadMapper() :
			rrcnt(-1), mask(0) {
        unsigned int size = -1;
#if defined(MAMMUT)
        mammut::Mammut m;
        std::vector<mammut::topology::Cpu*> cpus = m.getInstanceTopology()->getCpus();
		if (cpus.size()<=0 || cpus[0]->getPhysicalCores().size() <=0) {
            error("threadMapper: invalid number of cores\n");
            return ;
        }
        size_t virtualPerPhysical = cpus[0]->getPhysicalCores()[0]->getVirtualCores().size();
        for(size_t k = 0; k < virtualPerPhysical; k++){
            for(size_t i = 0; i < cpus.size(); i++){
                std::vector<mammut::topology::PhysicalCore*> phyCores = cpus.at(i)->getPhysicalCores();
                for(size_t j = 0; j < phyCores.size(); j++){
                    std::vector<mammut::topology::VirtualCore*> virtCores = phyCores.at(j)->getVirtualCores();
                    CList.push_back(virtCores[k]->getVirtualCoreId());
                }
            }
        }
        int nc;
        size = nc = num_cores = CList.size();
		// usually num_cores is a power of two....!
		if (!isPowerOf2(size)) {
			size = nextPowerOf2(size);
            for(size_t i =CList.size(), j = 0; i< size; ++i, j++)
                CList.push_back(CList[j]);            
        }        
#else
        const std::string ff_mapping_string = FF_MAPPING_STRING;
        if (ff_mapping_string.length()) {
            num_cores = setMappingList(ff_mapping_string.c_str());
            assert(isPowerOf2(CList.size()));
            size = CList.size();
        } else {
            int nc = ff_numCores();
            if (nc <= 0) {
                error("threadMapper: invalid num_cores\n");
                return;
            }
            size = num_cores = nc;
            CList.reserve(size);
            for (int i = 0; i < nc; ++i)
                CList.push_back(i);

            // usually num_cores is a power of two....!
            if (!isPowerOf2(size)) {
                size = nextPowerOf2(size);
                for(size_t i =CList.size(), j = 0; i< size; ++i, j++)
                    CList.push_back(CList[j]);       
            }
        }
#endif /* MAMMUT */

        mask = size - 1;
		rrcnt = 0;
        /*
          printf("CList:\n");
          for(size_t i =0 ; i < CList.size(); ++i) {
          printf("%ld ", CList[i]);
          }
          printf("\n");
        */


#if 0
		const int max_supported_platforms = 10;
		const int max_supported_devices = 10;
		cl_uint n_platforms;
		cl_platform_id platforms[max_supported_platforms];
		cl_device_id devices[max_supported_devices];
		cl_int status = clGetPlatformIDs(max_supported_platforms, platforms,
				&n_platforms); //TODO max 10 platforms
		checkResult(status, "clGetPlatformIDs");
		for (cl_uint i = 0; i < n_platforms; ++i) {
			cl_uint n_devices;
			//GPUs
			status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU,
					max_supported_devices, devices, &n_devices);
			//checkResult(status, "clGetDeviceIDs GPU");
			if(!status)
			for (cl_uint j = 0; j < n_devices; ++j)
				ocl_gpus.push_back(devices[j]);
			//CPUs
			status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU,
					max_supported_devices, devices, &n_devices);
			//checkResult(status, "clGetDeviceIDs CPU");
			if(!status)
			for (cl_uint j = 0; j < n_devices; ++j)
				ocl_cpus.push_back(devices[j]);
			//accelerators
			status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ACCELERATOR,
                                    max_supported_devices, devices, &n_devices);
			//checkResult(status, "clGetDeviceIDs Accelerators");
			if(!status)
                for (cl_uint j = 0; j < n_devices; ++j)
                    ocl_accelerators.push_back(devices[j]);
		}
		ocl_cpu_id = ocl_gpu_id = ocl_accelerator_id = 0;
#endif
	}

	/**
	 * It allows to set a new list of CPU ids.
	 *
	 * The str variable should contain a space-separated or a comma-separated
	 * list of CPU ids. For example if the string str is "0 1 1 2 3", then the
	 * first thread will be bind to CPU 0, the second to CPU 1, the third to
	 * CPU 1, the fourth to CPU 2, the fifth to CPU 3. Then it follows the same
	 * rule for the subsequent threads.
	 *
	 * \return -1 for errors, otherwise it returns the number of elements in str
     *
	 */    
	int setMappingList(const char* str) {
		rrcnt = 0;        // reset rrcnt

		if (str == NULL) return -1; // use the previous mapping list
		char* _str = const_cast<char*>(str), *_str_end;
		svector<int> List(64);
		do {
			while (*_str == ' ' || *_str == '\t' || *_str == ',')
				++_str;
			unsigned cpuid = strtoul(_str, &_str_end, 0);
			if (_str == _str_end) {
				error("setMapping, invalid mapping string\n");
				return -1;
			}
			if (cpuid > (num_cores - 1)) {
				error("setMapping, invalid cpu id in the mapping string\n");
				return -1;
			}
			_str = _str_end;
			List.push_back(cpuid);

			if (*_str == '\0')
				break;
		} while (1);

		unsigned int size = (unsigned int) List.size();
        int ret = size;
		if (!isPowerOf2(size)) {
			size = nextPowerOf2(size);
			List.reserve(size);
		}
		mask = size - 1;
		for (size_t i = List.size(), j = 0; i < size; ++i, j++)
			List.push_back(List[j]);
		CList = List;
        return ret;
	}

    void setMappingList(const std::vector<size_t> &mapping) {
		rrcnt = 0;        // reset rrcnt

		if ((mapping.size() > (mask+1)) || (mapping.size()==0)) {
            error("Invalid pinng vector: ignoring it\n");
			return; // use the previous mapping list
        }
		svector<int> List(mask + 1);
        for (size_t i=0; i<mapping.size(); ++i) {
			auto cpuid = mapping[i];
            if (cpuid > (num_cores - 1)) {
				error("setMapping, invalid cpu id in the mapping string\n");
				return;
            }
            List.push_back(cpuid);
        }  
        
		unsigned int size = (unsigned int) List.size();
		if (!isPowerOf2(size)) {
			size = nextPowerOf2(size);
			List.reserve(size);
		}
		mask = size - 1;
		for (size_t i = List.size(), j = 0; i < size; ++i, j++)
			List.push_back(List[j]);
		CList = List;
	}

    
	/**
	 *  Returns the next CPU id using a round-robin mapping access on the mapping list. 
	 *
	 *  \return The identifier of the core.
	 */
	int getCoreId() {
		assert(rrcnt >= 0);
		int id = CList[rrcnt++];
		rrcnt &= mask;
		return id;
	}

	/**
	 * It is used for debugging.
	 *
	 * \return TODO
	 */
	unsigned int getMask() {
		return mask;
	}

	/**
	 * It is used for debugging.
	 *
	 * \return TODO
	 */
	unsigned int getCListSize() {
		return (unsigned int) CList.size();
	}

	/**
	 * It is used to get the identifier of the core.
	 *
	 * \return The identifier of the core.
	 */
	ssize_t getCoreId(unsigned int tid) {
		ssize_t id = CList[tid & mask];
		//std::cerr << "Mask is " << mask << "\n";
		//int id = CList[tid % (mask+1)];
		return id;
	}

	/**
	 * It checks whether the taken core is within the range of the cores
	 * available on the machine.
	 *
	 * \return It will return either \p true of \p false.
	 */
	inline bool checkCPUId(const int cpuId) const {
		return ((unsigned) cpuId < num_cores);
	}

#if defined(FF_CUDA) 
	inline int getNumCUDADevices() const {
		int deviceCount = 0;
		cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
		if (error_id != cudaSuccess) {
			error("getNumCUDADevices: cannot get the number of cuda devices\n");
			return -1;
		}
		return deviceCount;
	}
#endif

#if 0
	cl_device_id getOCLcpu() {
		cl_device_id res = ocl_cpus[(ocl_cpu_id++) % ocl_cpus.size()];
		char tmp[1024];
		clGetDeviceInfo(res, CL_DEVICE_NAME, 1024 * sizeof(char), tmp, NULL);
		std::cerr << "picked CPU device: " << tmp << std::endl;
		return res;
	}

	cl_device_id getOCLgpu() {
		cl_device_id res = ocl_gpus[(ocl_gpu_id++) % ocl_gpus.size()];
		char tmp[1024];
		clGetDeviceInfo(res, CL_DEVICE_NAME, 1024 * sizeof(char), tmp, NULL);
		std::cerr << "picked GPU device: " << tmp << std::endl;
		return res;
	}

	cl_device_id getOCLaccelerator() {
		cl_device_id res = ocl_accelerators[(ocl_accelerator_id++) % ocl_accelerators.size()];
		char tmp[1024];
		clGetDeviceInfo(res, CL_DEVICE_NAME, 1024 * sizeof(char), tmp, NULL);
		std::cerr << "picked Accelerator device: " << tmp << std::endl;
		return res;
	}
#endif

protected:
	long rrcnt;
	unsigned int mask;
	unsigned int num_cores;
	svector<int> CList;
#if 0
	svector<cl_device_id> ocl_cpus, ocl_gpus, ocl_accelerators;
	std::atomic<unsigned int> ocl_cpu_id, ocl_gpu_id, ocl_accelerator_id;
#endif
};

} // namespace ff

/*!
 *
 * @}
 * \link
 */

#endif /* __THREAD_MAPPER_HPP_ */
add taskflow-3.8.0 2025-01-04 01:25:05 +01:00			`/* -- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -- */`

			`/*!`
			`* \link`
			`* \file mapper.hpp`
			`* \ingroup shared_memory_fastflow`
			`*`
			`* \brief This file contains the thread mapper definition used in FastFlow`
			`*/`

			`#ifndef __THREAD_MAPPER_HPP_`
			`#define __THREAD_MAPPER_HPP_`

			`/* ***************************************************************************`
			`*`
			`* FastFlow is free software; you can redistribute it and/or modify it`
			`* under the terms of the GNU Lesser General Public License version 3 as`
			`* published by the Free Software Foundation.`
			`* Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3`
			`* or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)`
			`*`
			`* This program is distributed in the hope that it will be useful, but WITHOUT`
			`* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or`
			`* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`* License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public License`
			`* along with this program; if not, write to the Free Software Foundation,`
			`* Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.`
			`*`
			`****************************************************************************`
			`*/`

			`#include <stdlib.h>`
			`#include <ff/config.hpp>`
			`#include <ff/svector.hpp>`
			`#include <ff/utils.hpp>`
			`#include <ff/mapping_utils.hpp>`
			`#include <vector>`
			`#if defined(MAMMUT)`
			`#include <mammut/mammut.hpp>`
			`#endif`


			`#if defined(FF_CUDA)`
			`#include <cuda.h>`
			`#endif`

			`#if 0`
			`#ifdef __APPLE__`
			`#include <OpenCL/opencl.h>`
			`#else`
			`#include <CL/opencl.h>`
			`#include <ff/ocl/clEnvironment.hpp>`
			`#endif`





			`#endif`

			`namespace ff {`

			`/*!`
			`* \ingroup shared_memory_fastflow`
			`*`
			`* @{`
			`*/`

			`/*!`
			`* \class threadMapper`
			`* \ingroup shared_memory_fastflow`
			`*`
			`* \brief The thread mapper allows to map threads to specific core using a`
			`* predefined mapping policy.`
			`*`
			`* The threadMapper stores a list of CPU ids. By default the list is simply a`
			`* linear sequence of core ids of the system, for example in a quad-core`
			`* system the default list is 0 1 2 3. It is possible to change the default`
			`* list using the method setMappingList by passing a string of space-serated`
			`* (or comma-separated) CPU ids. The policy implemented in the threadManager`
			`* is to pick up a CPU id from the list using a round-robin policy.`
			`*`
			`* This class is defined in \ref mapper.hpp`
			`*`
			`*/`
			`class threadMapper {`
			`public:`
			`/**`
			`* Get a static instance of the threadMapper object`
			`*`
			`* \return TODO`
			`*/`
			`static inline threadMapper* instance() {`
			`static threadMapper thm;`
			`return &thm;`
			`}`

			`/**`
			`* Default constructor.`
			`*/`
			`threadMapper() :`
			`rrcnt(-1), mask(0) {`
			`unsigned int size = -1;`
			`#if defined(MAMMUT)`
			`mammut::Mammut m;`
			`std::vector<mammut::topology::Cpu*> cpus = m.getInstanceTopology()->getCpus();`
			`if (cpus.size()<=0 \|\| cpus[0]->getPhysicalCores().size() <=0) {`
			`error("threadMapper: invalid number of cores\n");`
			`return ;`
			`}`
			`size_t virtualPerPhysical = cpus[0]->getPhysicalCores()[0]->getVirtualCores().size();`
			`for(size_t k = 0; k < virtualPerPhysical; k++){`
			`for(size_t i = 0; i < cpus.size(); i++){`
			`std::vector<mammut::topology::PhysicalCore*> phyCores = cpus.at(i)->getPhysicalCores();`
			`for(size_t j = 0; j < phyCores.size(); j++){`
			`std::vector<mammut::topology::VirtualCore*> virtCores = phyCores.at(j)->getVirtualCores();`
			`CList.push_back(virtCores[k]->getVirtualCoreId());`
			`}`
			`}`
			`}`
			`int nc;`
			`size = nc = num_cores = CList.size();`
			`// usually num_cores is a power of two....!`
			`if (!isPowerOf2(size)) {`
			`size = nextPowerOf2(size);`
			`for(size_t i =CList.size(), j = 0; i< size; ++i, j++)`
			`CList.push_back(CList[j]);`
			`}`
			`#else`
			`const std::string ff_mapping_string = FF_MAPPING_STRING;`
			`if (ff_mapping_string.length()) {`
			`num_cores = setMappingList(ff_mapping_string.c_str());`
			`assert(isPowerOf2(CList.size()));`
			`size = CList.size();`
			`} else {`
			`int nc = ff_numCores();`
			`if (nc <= 0) {`
			`error("threadMapper: invalid num_cores\n");`
			`return;`
			`}`
			`size = num_cores = nc;`
			`CList.reserve(size);`
			`for (int i = 0; i < nc; ++i)`
			`CList.push_back(i);`

			`// usually num_cores is a power of two....!`
			`if (!isPowerOf2(size)) {`
			`size = nextPowerOf2(size);`
			`for(size_t i =CList.size(), j = 0; i< size; ++i, j++)`
			`CList.push_back(CList[j]);`
			`}`
			`}`
			`#endif /* MAMMUT */`

			`mask = size - 1;`
			`rrcnt = 0;`
			`/*`
			`printf("CList:\n");`
			`for(size_t i =0 ; i < CList.size(); ++i) {`
			`printf("%ld ", CList[i]);`
			`}`
			`printf("\n");`
			`*/`


			`#if 0`
			`const int max_supported_platforms = 10;`
			`const int max_supported_devices = 10;`
			`cl_uint n_platforms;`
			`cl_platform_id platforms[max_supported_platforms];`
			`cl_device_id devices[max_supported_devices];`
			`cl_int status = clGetPlatformIDs(max_supported_platforms, platforms,`
			`&n_platforms); //TODO max 10 platforms`
			`checkResult(status, "clGetPlatformIDs");`
			`for (cl_uint i = 0; i < n_platforms; ++i) {`
			`cl_uint n_devices;`
			`//GPUs`
			`status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU,`
			`max_supported_devices, devices, &n_devices);`
			`//checkResult(status, "clGetDeviceIDs GPU");`
			`if(!status)`
			`for (cl_uint j = 0; j < n_devices; ++j)`
			`ocl_gpus.push_back(devices[j]);`
			`//CPUs`
			`status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU,`
			`max_supported_devices, devices, &n_devices);`
			`//checkResult(status, "clGetDeviceIDs CPU");`
			`if(!status)`
			`for (cl_uint j = 0; j < n_devices; ++j)`
			`ocl_cpus.push_back(devices[j]);`
			`//accelerators`
			`status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ACCELERATOR,`
			`max_supported_devices, devices, &n_devices);`
			`//checkResult(status, "clGetDeviceIDs Accelerators");`
			`if(!status)`
			`for (cl_uint j = 0; j < n_devices; ++j)`
			`ocl_accelerators.push_back(devices[j]);`
			`}`
			`ocl_cpu_id = ocl_gpu_id = ocl_accelerator_id = 0;`
			`#endif`
			`}`

			`/**`
			`* It allows to set a new list of CPU ids.`
			`*`
			`* The str variable should contain a space-separated or a comma-separated`
			`* list of CPU ids. For example if the string str is "0 1 1 2 3", then the`
			`* first thread will be bind to CPU 0, the second to CPU 1, the third to`
			`* CPU 1, the fourth to CPU 2, the fifth to CPU 3. Then it follows the same`
			`* rule for the subsequent threads.`
			`*`
			`* \return -1 for errors, otherwise it returns the number of elements in str`
			`*`
			`*/`
			`int setMappingList(const char* str) {`
			`rrcnt = 0; // reset rrcnt`

			`if (str == NULL) return -1; // use the previous mapping list`
			`char* _str = const_cast<char>(str), _str_end;`
			`svector<int> List(64);`
			`do {`
			`while (_str == ' ' \|\| _str == '\t' \|\| *_str == ',')`
			`++_str;`
			`unsigned cpuid = strtoul(_str, &_str_end, 0);`
			`if (_str == _str_end) {`
			`error("setMapping, invalid mapping string\n");`
			`return -1;`
			`}`
			`if (cpuid > (num_cores - 1)) {`
			`error("setMapping, invalid cpu id in the mapping string\n");`
			`return -1;`
			`}`
			`_str = _str_end;`
			`List.push_back(cpuid);`

			`if (*_str == '\0')`
			`break;`
			`} while (1);`

			`unsigned int size = (unsigned int) List.size();`
			`int ret = size;`
			`if (!isPowerOf2(size)) {`
			`size = nextPowerOf2(size);`
			`List.reserve(size);`
			`}`
			`mask = size - 1;`
			`for (size_t i = List.size(), j = 0; i < size; ++i, j++)`
			`List.push_back(List[j]);`
			`CList = List;`
			`return ret;`
			`}`

			`void setMappingList(const std::vector<size_t> &mapping) {`
			`rrcnt = 0; // reset rrcnt`

			`if ((mapping.size() > (mask+1)) \|\| (mapping.size()==0)) {`
			`error("Invalid pinng vector: ignoring it\n");`
			`return; // use the previous mapping list`
			`}`
			`svector<int> List(mask + 1);`
			`for (size_t i=0; i<mapping.size(); ++i) {`
			`auto cpuid = mapping[i];`
			`if (cpuid > (num_cores - 1)) {`
			`error("setMapping, invalid cpu id in the mapping string\n");`
			`return;`
			`}`
			`List.push_back(cpuid);`
			`}`

			`unsigned int size = (unsigned int) List.size();`
			`if (!isPowerOf2(size)) {`
			`size = nextPowerOf2(size);`
			`List.reserve(size);`
			`}`
			`mask = size - 1;`
			`for (size_t i = List.size(), j = 0; i < size; ++i, j++)`
			`List.push_back(List[j]);`
			`CList = List;`
			`}`


			`/**`
			`* Returns the next CPU id using a round-robin mapping access on the mapping list.`
			`*`
			`* \return The identifier of the core.`
			`*/`
			`int getCoreId() {`
			`assert(rrcnt >= 0);`
			`int id = CList[rrcnt++];`
			`rrcnt &= mask;`
			`return id;`
			`}`

			`/**`
			`* It is used for debugging.`
			`*`
			`* \return TODO`
			`*/`
			`unsigned int getMask() {`
			`return mask;`
			`}`

			`/**`
			`* It is used for debugging.`
			`*`
			`* \return TODO`
			`*/`
			`unsigned int getCListSize() {`
			`return (unsigned int) CList.size();`
			`}`

			`/**`
			`* It is used to get the identifier of the core.`
			`*`
			`* \return The identifier of the core.`
			`*/`
			`ssize_t getCoreId(unsigned int tid) {`
			`ssize_t id = CList[tid & mask];`
			`//std::cerr << "Mask is " << mask << "\n";`
			`//int id = CList[tid % (mask+1)];`
			`return id;`
			`}`

			`/**`
			`* It checks whether the taken core is within the range of the cores`
			`* available on the machine.`
			`*`
			`* \return It will return either \p true of \p false.`
			`*/`
			`inline bool checkCPUId(const int cpuId) const {`
			`return ((unsigned) cpuId < num_cores);`
			`}`

			`#if defined(FF_CUDA)`
			`inline int getNumCUDADevices() const {`
			`int deviceCount = 0;`
			`cudaError_t error_id = cudaGetDeviceCount(&deviceCount);`
			`if (error_id != cudaSuccess) {`
			`error("getNumCUDADevices: cannot get the number of cuda devices\n");`
			`return -1;`
			`}`
			`return deviceCount;`
			`}`
			`#endif`

			`#if 0`
			`cl_device_id getOCLcpu() {`
			`cl_device_id res = ocl_cpus[(ocl_cpu_id++) % ocl_cpus.size()];`
			`char tmp[1024];`
			`clGetDeviceInfo(res, CL_DEVICE_NAME, 1024 * sizeof(char), tmp, NULL);`
			`std::cerr << "picked CPU device: " << tmp << std::endl;`
			`return res;`
			`}`

			`cl_device_id getOCLgpu() {`
			`cl_device_id res = ocl_gpus[(ocl_gpu_id++) % ocl_gpus.size()];`
			`char tmp[1024];`
			`clGetDeviceInfo(res, CL_DEVICE_NAME, 1024 * sizeof(char), tmp, NULL);`
			`std::cerr << "picked GPU device: " << tmp << std::endl;`
			`return res;`
			`}`

			`cl_device_id getOCLaccelerator() {`
			`cl_device_id res = ocl_accelerators[(ocl_accelerator_id++) % ocl_accelerators.size()];`
			`char tmp[1024];`
			`clGetDeviceInfo(res, CL_DEVICE_NAME, 1024 * sizeof(char), tmp, NULL);`
			`std::cerr << "picked Accelerator device: " << tmp << std::endl;`
			`return res;`
			`}`
			`#endif`

			`protected:`
			`long rrcnt;`
			`unsigned int mask;`
			`unsigned int num_cores;`
			`svector<int> CList;`
			`#if 0`
			`svector<cl_device_id> ocl_cpus, ocl_gpus, ocl_accelerators;`
			`std::atomic<unsigned int> ocl_cpu_id, ocl_gpu_id, ocl_accelerator_id;`
			`#endif`
			`};`

			`} // namespace ff`

			`/*!`
			`*`
			`* @}`
			`* \link`
			`*/`

			`#endif /* __THREAD_MAPPER_HPP_ */`