/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */

/*!
 * \link
 * \file mapper.hpp
 * \ingroup shared_memory_fastflow
 *
 * \brief This file contains the thread mapper definition used in FastFlow
 */

#ifndef __THREAD_MAPPER_HPP_
#define __THREAD_MAPPER_HPP_

/* ***************************************************************************
 *
 *  FastFlow is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License version 3 as
 *  published by the Free Software Foundation.
 *  Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
 *  or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
 *
 *  This program is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 *  License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software Foundation,
 *  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 ****************************************************************************
 */

#include <stdlib.h>
#include <ff/config.hpp>
#include <ff/svector.hpp>
#include <ff/utils.hpp>
#include <ff/mapping_utils.hpp>
#include <vector>
#if defined(MAMMUT)
#include <mammut/mammut.hpp>
#endif

#if defined(FF_CUDA)
#include <cuda.h>
#endif

#if 0
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#include <ff/ocl/clEnvironment.hpp>
#endif
#endif

namespace ff {

/*!
 * \ingroup shared_memory_fastflow
 *
 * @{
 */

/*!
 * \class threadMapper
 * \ingroup shared_memory_fastflow
 *
 * \brief The thread mapper allows threads to be mapped to specific cores
 * according to a predefined mapping policy.
 *
 * The threadMapper stores a list of CPU ids. By default the list is simply
 * the linear sequence of core ids of the system; for example, on a quad-core
 * system the default list is 0 1 2 3. The default list can be changed with
 * the method setMappingList by passing a string of space-separated
 * (or comma-separated) CPU ids. The policy implemented in the threadMapper
 * is to pick CPU ids from the list in round-robin order.
 * A usage sketch is provided right after the class definition.
 *
 * This class is defined in \ref mapper.hpp
 *
 */
class threadMapper {
public:

    /**
     * Get a static instance of the threadMapper object.
     *
     * \return A pointer to the (lazily constructed) singleton threadMapper.
     */
    static inline threadMapper* instance() {
        static threadMapper thm;
        return &thm;
    }

    /**
     * Default constructor.
     */
    threadMapper() : rrcnt(-1), mask(0) {
        unsigned int size = -1;
#if defined(MAMMUT)
        mammut::Mammut m;
        std::vector<mammut::topology::Cpu*> cpus = m.getInstanceTopology()->getCpus();
        if (cpus.size() <= 0 || cpus[0]->getPhysicalCores().size() <= 0) {
            error("threadMapper: invalid number of cores\n");
            return;
        }
        size_t virtualPerPhysical = cpus[0]->getPhysicalCores()[0]->getVirtualCores().size();
        for (size_t k = 0; k < virtualPerPhysical; k++) {
            for (size_t i = 0; i < cpus.size(); i++) {
                std::vector<mammut::topology::PhysicalCore*> phyCores = cpus.at(i)->getPhysicalCores();
                for (size_t j = 0; j < phyCores.size(); j++) {
                    std::vector<mammut::topology::VirtualCore*> virtCores = phyCores.at(j)->getVirtualCores();
                    CList.push_back(virtCores[k]->getVirtualCoreId());
                }
            }
        }
        size = num_cores = CList.size();
        // usually num_cores is a power of two....!
        if (!isPowerOf2(size)) {
            size = nextPowerOf2(size);
            for (size_t i = CList.size(), j = 0; i < size; ++i, j++)
                CList.push_back(CList[j]);
        }
#else
        const std::string ff_mapping_string = FF_MAPPING_STRING;
        if (ff_mapping_string.length()) {
            num_cores = setMappingList(ff_mapping_string.c_str());
            assert(isPowerOf2(CList.size()));
            size = CList.size();
        } else {
            int nc = ff_numCores();
            if (nc <= 0) {
                error("threadMapper: invalid num_cores\n");
                return;
            }
            size = num_cores = nc;
            CList.reserve(size);
            for (int i = 0; i < nc; ++i)
                CList.push_back(i);

            // usually num_cores is a power of two....!
            if (!isPowerOf2(size)) {
                size = nextPowerOf2(size);
                for (size_t i = CList.size(), j = 0; i < size; ++i, j++)
                    CList.push_back(CList[j]);
            }
        }
#endif /* MAMMUT */

        mask  = size - 1;
        rrcnt = 0;
        /*
        printf("CList:\n");
        for(size_t i = 0; i < CList.size(); ++i) {
            printf("%ld ", CList[i]);
        }
        printf("\n");
        */

#if 0
        const int max_supported_platforms = 10;
        const int max_supported_devices   = 10;
        cl_uint n_platforms;
        cl_platform_id platforms[max_supported_platforms];
        cl_device_id devices[max_supported_devices];
        cl_int status = clGetPlatformIDs(max_supported_platforms, platforms,
                                         &n_platforms); //TODO max 10 platforms
        checkResult(status, "clGetPlatformIDs");
        for (cl_uint i = 0; i < n_platforms; ++i) {
            cl_uint n_devices;
            //GPUs
            status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU,
                                    max_supported_devices, devices, &n_devices);
            //checkResult(status, "clGetDeviceIDs GPU");
            if (!status)
                for (cl_uint j = 0; j < n_devices; ++j)
                    ocl_gpus.push_back(devices[j]);
            //CPUs
            status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU,
                                    max_supported_devices, devices, &n_devices);
            //checkResult(status, "clGetDeviceIDs CPU");
            if (!status)
                for (cl_uint j = 0; j < n_devices; ++j)
                    ocl_cpus.push_back(devices[j]);
            //accelerators
            status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ACCELERATOR,
                                    max_supported_devices, devices, &n_devices);
            //checkResult(status, "clGetDeviceIDs Accelerators");
            if (!status)
                for (cl_uint j = 0; j < n_devices; ++j)
                    ocl_accelerators.push_back(devices[j]);
        }
        ocl_cpu_id = ocl_gpu_id = ocl_accelerator_id = 0;
#endif
    }

    /**
     * Allows a new list of CPU ids to be set.
     *
     * The str variable should contain a space-separated or comma-separated
     * list of CPU ids. For example, if the string str is "0 1 1 2 3", the
     * first thread will be bound to CPU 0, the second to CPU 1, the third to
     * CPU 1, the fourth to CPU 2 and the fifth to CPU 3. Subsequent threads
     * follow the same list in round-robin order.
     *
     * \return -1 on error, otherwise the number of elements in str.
     *
     */
    int setMappingList(const char* str) {
        rrcnt = 0; // reset rrcnt

        if (str == NULL) return -1; // use the previous mapping list
        char* _str = const_cast<char*>(str), *_str_end;
        svector<int> List(64);
        do {
            while (*_str == ' ' || *_str == '\t' || *_str == ',')
                ++_str;
            unsigned cpuid = strtoul(_str, &_str_end, 0);
            if (_str == _str_end) {
                error("setMapping, invalid mapping string\n");
                return -1;
            }
            if (cpuid > (num_cores - 1)) {
                error("setMapping, invalid cpu id in the mapping string\n");
                return -1;
            }
            _str = _str_end;
            List.push_back(cpuid);

            if (*_str == '\0')
                break;
        } while (1);

        unsigned int size = (unsigned int) List.size();
        int ret = size;
        if (!isPowerOf2(size)) {
            size = nextPowerOf2(size);
            List.reserve(size);
        }
        mask = size - 1;
        for (size_t i = List.size(), j = 0; i < size; ++i, j++)
            List.push_back(List[j]);
        CList = List;
        return ret;
    }
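
#if 0
    /*
     * Illustrative sketch, not part of the original code.  Assuming a machine
     * with at least three cores, "0 2 1" parses three ids; internally the
     * list is padded to the next power of two by cycling from its beginning
     * (0 2 1 0, mask == 3) and the number of ids found in the string (3) is
     * returned.
     */
    static void mappingStringExample() {
        threadMapper* tm = threadMapper::instance();
        int n  = tm->setMappingList("0 2 1");   // n == 3
        int c1 = tm->getCoreId();               // 0
        int c2 = tm->getCoreId();               // 2
        (void)n; (void)c1; (void)c2;
    }
#endif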

    void setMappingList(const std::vector<size_t> &mapping) {
        rrcnt = 0; // reset rrcnt

        if ((mapping.size() > (mask + 1)) || (mapping.size() == 0)) {
            error("Invalid pinning vector: ignoring it\n");
            return; // use the previous mapping list
        }
        svector<int> List(mask + 1);
        for (size_t i = 0; i < mapping.size(); ++i) {
            auto cpuid = mapping[i];
            if (cpuid > (num_cores - 1)) {
                error("setMapping, invalid cpu id in the mapping string\n");
                return;
            }
            List.push_back(cpuid);
        }

        unsigned int size = (unsigned int) List.size();
        if (!isPowerOf2(size)) {
            size = nextPowerOf2(size);
            List.reserve(size);
        }
        mask = size - 1;
        for (size_t i = List.size(), j = 0; i < size; ++i, j++)
            List.push_back(List[j]);
        CList = List;
    }

    /**
     * Returns the next CPU id, selected with a round-robin policy over the
     * mapping list.
     *
     * \return The identifier of the core.
     */
    int getCoreId() {
        assert(rrcnt >= 0);
        int id = CList[rrcnt++];
        rrcnt &= mask;
        return id;
    }
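
#if 0
    /*
     * Illustrative sketch, not part of the original code.  With the default
     * list on a quad-core machine, CList = {0,1,2,3} and mask == 3, so four
     * consecutive calls return 0,1,2,3 and the fifth wraps around to 0.
     */
    static void roundRobinExample() {
        threadMapper* tm = threadMapper::instance();
        int ids[5];
        for (int i = 0; i < 5; ++i)
            ids[i] = tm->getCoreId();   // 0, 1, 2, 3, 0 on a quad-core machine
        (void)ids;
    }
#endif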

    /**
     * It is used for debugging.
     *
     * \return The bit mask used to wrap around the mapping list.
     */
    unsigned int getMask() {
        return mask;
    }

    /**
     * It is used for debugging.
     *
     * \return The number of entries in the mapping list.
     */
    unsigned int getCListSize() {
        return (unsigned int) CList.size();
    }

    /**
     * It is used to get the identifier of the core assigned to thread tid.
     *
     * \return The identifier of the core.
     */
    ssize_t getCoreId(unsigned int tid) {
        ssize_t id = CList[tid & mask];
        //std::cerr << "Mask is " << mask << "\n";
        //int id = CList[tid % (mask+1)];
        return id;
    }
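
    /*
     * Worked example (illustrative, assuming the default list on a quad-core
     * machine, i.e. CList = {0,1,2,3} and mask == 3): getCoreId(0) returns
     * CList[0] = 0 and getCoreId(5) returns CList[5 & 3] = CList[1] = 1.
     * Unlike getCoreId(), this overload is deterministic: the same tid always
     * maps to the same core id.
     */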

    /**
     * It checks whether the given core id is within the range of the cores
     * available on the machine.
     *
     * \return Either \p true or \p false.
     */
    inline bool checkCPUId(const int cpuId) const {
        return ((unsigned) cpuId < num_cores);
    }

#if defined(FF_CUDA)
    inline int getNumCUDADevices() const {
        int deviceCount = 0;
        cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
        if (error_id != cudaSuccess) {
            error("getNumCUDADevices: cannot get the number of cuda devices\n");
            return -1;
        }
        return deviceCount;
    }
#endif

#if 0
    cl_device_id getOCLcpu() {
        cl_device_id res = ocl_cpus[(ocl_cpu_id++) % ocl_cpus.size()];
        char tmp[1024];
        clGetDeviceInfo(res, CL_DEVICE_NAME, 1024 * sizeof(char), tmp, NULL);
        std::cerr << "picked CPU device: " << tmp << std::endl;
        return res;
    }

    cl_device_id getOCLgpu() {
        cl_device_id res = ocl_gpus[(ocl_gpu_id++) % ocl_gpus.size()];
        char tmp[1024];
        clGetDeviceInfo(res, CL_DEVICE_NAME, 1024 * sizeof(char), tmp, NULL);
        std::cerr << "picked GPU device: " << tmp << std::endl;
        return res;
    }

    cl_device_id getOCLaccelerator() {
        cl_device_id res = ocl_accelerators[(ocl_accelerator_id++) % ocl_accelerators.size()];
        char tmp[1024];
        clGetDeviceInfo(res, CL_DEVICE_NAME, 1024 * sizeof(char), tmp, NULL);
        std::cerr << "picked Accelerator device: " << tmp << std::endl;
        return res;
    }
#endif

protected:
    long         rrcnt;
    unsigned int mask;
    unsigned int num_cores;
    svector<int> CList;
#if 0
    svector<cl_device_id> ocl_cpus, ocl_gpus, ocl_accelerators;
    std::atomic<unsigned int> ocl_cpu_id, ocl_gpu_id, ocl_accelerator_id;
#endif
};
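
#if 0
/*
 * Usage sketch, illustrative and not part of the original code.  A thread
 * asks the singleton mapper for the next CPU id and pins itself to it; the
 * mapping string below and the use of ff_mapThreadToCpu() (here assumed to be
 * the pinning helper declared in mapping_utils.hpp) are assumptions made for
 * the example.
 */
static inline void threadMapperUsageSketch() {
    threadMapper* tm = threadMapper::instance();
    tm->setMappingList("0,2,1,3");      // optional: override the default linear list
    int cpuid = tm->getCoreId();        // next CPU id, round-robin over the list
    if (tm->checkCPUId(cpuid))
        ff_mapThreadToCpu(cpuid);       // pin the calling thread to that core
}
#endif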

} // namespace ff

/*!
 *
 * @}
 * \link
 */

#endif /* __THREAD_MAPPER_HPP_ */