mesytec-mnode/external/taskflow-3.8.0/taskflow/cuda/cuda_device.hpp

343 lines
8.8 KiB
C++
Raw Normal View History

2025-01-04 01:25:05 +01:00
#pragma once
#include "cuda_error.hpp"
/**
@file cuda_device.hpp
@brief CUDA device utilities include file
*/
namespace tf {
/**
@brief queries the number of available devices

@return the count written by @c cudaGetDeviceCount, converted to @c size_t

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_num_devices() {
  int count{0};
  TF_CHECK_CUDA(
    cudaGetDeviceCount(&count), "failed to get device count"
  );
  return static_cast<size_t>(count);
}
/**
@brief gets the current device associated with the caller thread

@return the device id written by @c cudaGetDevice

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_device() {
  // zero-initialised like the other query helpers in this file; the value is
  // overwritten by cudaGetDevice before it is returned
  int current = 0;
  TF_CHECK_CUDA(
    cudaGetDevice(&current), "failed to get current device id"
  );
  return current;
}
/**
@brief switches to a given device context

@param id device id handed to @c cudaSetDevice

A failing switch is reported through @c TF_CHECK_CUDA.
*/
inline void cuda_set_device(int id) {
  TF_CHECK_CUDA(
    cudaSetDevice(id), "failed to switch to device ", id
  );
}
/**
@brief obtains the device property

@param i device id to query
@param p property structure filled in by @c cudaGetDeviceProperties

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline void cuda_get_device_property(int i, cudaDeviceProp& p) {
  TF_CHECK_CUDA(cudaGetDeviceProperties(&p, i), "failed to get property of device ", i);
}
/**
@brief obtains the device property

@param i device id to query
@return the property structure filled in by @c cudaGetDeviceProperties

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline cudaDeviceProp cuda_get_device_property(int i) {
  cudaDeviceProp prop;
  TF_CHECK_CUDA(cudaGetDeviceProperties(&prop, i), "failed to get property of device ", i);
  return prop;
}
/**
@brief dumps the device property

@param os output stream receiving the report
@param p device property structure to print

Writes one "label: value" line per field, in a fixed order. The block and grid
limits are printed as three components joined by 'x'.
*/
inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) {

  // prints a three-component limit as "AxBxC" followed by a newline
  auto dump_dims = [&os](const int (&dims)[3]) {
    os << dims[0] << 'x' << dims[1] << 'x' << dims[2] << '\n';
  };

  os << "Major revision number: " << p.major << '\n';
  os << "Minor revision number: " << p.minor << '\n';
  os << "Name: " << p.name << '\n';
  os << "Total global memory: " << p.totalGlobalMem << '\n';
  os << "Total shared memory per block: " << p.sharedMemPerBlock << '\n';
  os << "Total registers per block: " << p.regsPerBlock << '\n';
  os << "Warp size: " << p.warpSize << '\n';
  os << "Maximum memory pitch: " << p.memPitch << '\n';
  os << "Maximum threads per block: " << p.maxThreadsPerBlock << '\n';

  os << "Maximum dimension of block: ";
  dump_dims(p.maxThreadsDim);

  os << "Maximum dimension of grid: ";
  dump_dims(p.maxGridSize);

  os << "Clock rate: " << p.clockRate << '\n';
  os << "Total constant memory: " << p.totalConstMem << '\n';
  os << "Texture alignment: " << p.textureAlignment << '\n';
  os << "Concurrent copy and execution: " << p.deviceOverlap << '\n';
  os << "Number of multiprocessors: " << p.multiProcessorCount << '\n';
  os << "Kernel execution timeout: " << p.kernelExecTimeoutEnabled << '\n';
  os << "GPU sharing Host Memory: " << p.integrated << '\n';
  os << "Host page-locked mem mapping: " << p.canMapHostMemory << '\n';
  os << "Alignment for Surfaces: " << p.surfaceAlignment << '\n';
  os << "Device has ECC support: " << p.ECCEnabled << '\n';
  os << "Unified Addressing (UVA): " << p.unifiedAddressing << '\n';
}
/**
@brief queries the maximum threads per block on a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrMaxThreadsPerBlock for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_threads_per_block(int d) {
  int threads = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, d),
    "failed to query the maximum threads per block on device ", d
  );
  // the runtime reports an int while the interface returns size_t
  return static_cast<size_t>(threads);
}
/**
@brief queries the maximum x-dimension per block on a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrMaxBlockDimX for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_x_dim_per_block(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimX, d),
    "failed to query the maximum x-dimension per block on device ", d
  );
  // the runtime reports an int while the interface returns size_t
  return static_cast<size_t>(dim);
}
/**
@brief queries the maximum y-dimension per block on a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrMaxBlockDimY for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_y_dim_per_block(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimY, d),
    "failed to query the maximum y-dimension per block on device ", d
  );
  // the runtime reports an int while the interface returns size_t
  return static_cast<size_t>(dim);
}
/**
@brief queries the maximum z-dimension per block on a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrMaxBlockDimZ for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_z_dim_per_block(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimZ, d),
    "failed to query the maximum z-dimension per block on device ", d
  );
  // the runtime reports an int while the interface returns size_t
  return static_cast<size_t>(dim);
}
/**
@brief queries the maximum x-dimension per grid on a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrMaxGridDimX for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_x_dim_per_grid(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimX, d),
    "failed to query the maximum x-dimension per grid on device ", d
  );
  // the runtime reports an int while the interface returns size_t
  return static_cast<size_t>(dim);
}
/**
@brief queries the maximum y-dimension per grid on a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrMaxGridDimY for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_y_dim_per_grid(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimY, d),
    "failed to query the maximum y-dimension per grid on device ", d
  );
  // the runtime reports an int while the interface returns size_t
  return static_cast<size_t>(dim);
}
/**
@brief queries the maximum z-dimension per grid on a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrMaxGridDimZ for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_z_dim_per_grid(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimZ, d),
    "failed to query the maximum z-dimension per grid on device ", d
  );
  // the runtime reports an int while the interface returns size_t
  return static_cast<size_t>(dim);
}
/**
@brief queries the maximum shared memory size in bytes per block on a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrMaxSharedMemoryPerBlock for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_shm_per_block(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrMaxSharedMemoryPerBlock, d),
    "failed to query the maximum shared memory per block on device ", d
  );
  // the runtime reports an int while the interface returns size_t
  return static_cast<size_t>(num);
}
/**
@brief queries the warp size on a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrWarpSize for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_warp_size(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrWarpSize, d),
    "failed to query the warp size per block on device ", d
  );
  // the runtime reports an int while the interface returns size_t
  return static_cast<size_t>(num);
}
/**
@brief queries the major number of compute capability of a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrComputeCapabilityMajor for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_device_compute_capability_major(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMajor, d),
    "failed to query the major number of compute capability of device ", d
  );
  return num;
}
/**
@brief queries the minor number of compute capability of a device

@param d device id passed to @c cudaDeviceGetAttribute
@return the value of @c cudaDevAttrComputeCapabilityMinor for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_device_compute_capability_minor(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMinor, d),
    "failed to query the minor number of compute capability of device ", d
  );
  return num;
}
/**
@brief queries if the device supports unified addressing

@param d device id passed to @c cudaDeviceGetAttribute
@return @c true when @c cudaDevAttrUnifiedAddressing is non-zero for device @c d

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline bool cuda_get_device_unified_addressing(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrUnifiedAddressing, d),
    "failed to query unified addressing status on device ", d
  );
  // spell out the int -> bool conversion instead of relying on the implicit one
  return num != 0;
}
// ----------------------------------------------------------------------------
// CUDA Version
// ----------------------------------------------------------------------------
/**
@brief queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

@return the version number written by @c cudaDriverGetVersion

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_driver_version() {
  int version{0};
  TF_CHECK_CUDA(cudaDriverGetVersion(&version), "failed to query the latest cuda version supported by the driver");
  return version;
}
/**
@brief queries the CUDA Runtime version (1000 * major + 10 * minor)

@return the version number written by @c cudaRuntimeGetVersion

A failing query is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_runtime_version() {
  int version{0};
  TF_CHECK_CUDA(cudaRuntimeGetVersion(&version), "failed to query cuda runtime version");
  return version;
}
// ----------------------------------------------------------------------------
// cudaScopedDevice
// ----------------------------------------------------------------------------
/** @class cudaScopedDevice

@brief class to create an RAII-styled context switch

The constructor switches the calling thread to the requested device and the
destructor switches back to the device that was current before.

Sample usage:

@code{.cpp}
{
  tf::cudaScopedDevice device(1);  // switch to the device context 1

  // create a stream under device context 1
  cudaStream_t stream;
  cudaStreamCreate(&stream);

}  // leaving the scope switches back to the previous device context
@endcode

%cudaScopedDevice is neither movable nor copyable.
*/
class cudaScopedDevice {

  public:

    /**
    @brief constructs a RAII-styled device switcher

    @param device device context to scope in the guard
    */
    explicit cudaScopedDevice(int device);

    /**
    @brief destructs the guard and switches back to the previous device context
    */
    ~cudaScopedDevice();

  private:

    // device that was current when the guard was constructed; the constructor
    // stores -1 instead when the requested device was already current, and the
    // destructor then leaves the device untouched
    int _p;

    // a guard must always name a device and must never be duplicated
    cudaScopedDevice() = delete;
    cudaScopedDevice(const cudaScopedDevice&) = delete;
    cudaScopedDevice(cudaScopedDevice&&) = delete;
};
// Constructor: records the device that is current on entry and switches to
// `dev` only when the two differ.
inline cudaScopedDevice::cudaScopedDevice(int dev) {
  TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");

  if(_p != dev) {
    // _p keeps the previous device so the destructor can restore it
    TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);
    return;
  }

  // already on the requested device: -1 tells the destructor to do nothing
  _p = -1;
}
// Destructor: restores the device recorded by the constructor, unless the
// constructor stored -1 because no switch was made.
inline cudaScopedDevice::~cudaScopedDevice() {
  if(_p == -1) {
    return;
  }
  // The result of cudaSetDevice is deliberately not passed to TF_CHECK_CUDA
  // here (presumably to keep error reporting out of the destructor; confirm
  // against the macro's definition before adding a check).
  cudaSetDevice(_p);
}
} // end of namespace tf -----------------------------------------------------