342 lines
8.8 KiB
C++
342 lines
8.8 KiB
C++
#pragma once
|
|
|
|
#include "cuda_error.hpp"
|
|
|
|
/**
|
|
@file cuda_device.hpp
|
|
@brief CUDA device utilities include file
|
|
*/
|
|
|
|
namespace tf {
|
|
|
|
/**
|
|
@brief queries the number of available devices
|
|
*/
|
|
inline size_t cuda_get_num_devices() {
|
|
int N = 0;
|
|
TF_CHECK_CUDA(cudaGetDeviceCount(&N), "failed to get device count");
|
|
return static_cast<size_t>(N);
|
|
}
|
|
|
|
/**
|
|
@brief gets the current device associated with the caller thread
|
|
*/
|
|
inline int cuda_get_device() {
|
|
int id;
|
|
TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id");
|
|
return id;
|
|
}
|
|
|
|
/**
|
|
@brief switches to a given device context
|
|
*/
|
|
inline void cuda_set_device(int id) {
|
|
TF_CHECK_CUDA(cudaSetDevice(id), "failed to switch to device ", id);
|
|
}
|
|
|
|
/**
|
|
@brief obtains the device property
|
|
*/
|
|
inline void cuda_get_device_property(int i, cudaDeviceProp& p) {
|
|
TF_CHECK_CUDA(
|
|
cudaGetDeviceProperties(&p, i), "failed to get property of device ", i
|
|
);
|
|
}
|
|
|
|
/**
|
|
@brief obtains the device property
|
|
*/
|
|
inline cudaDeviceProp cuda_get_device_property(int i) {
|
|
cudaDeviceProp p;
|
|
TF_CHECK_CUDA(
|
|
cudaGetDeviceProperties(&p, i), "failed to get property of device ", i
|
|
);
|
|
return p;
|
|
}
|
|
|
|
/**
|
|
@brief dumps the device property
|
|
*/
|
|
inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) {
|
|
|
|
os << "Major revision number: " << p.major << '\n'
|
|
<< "Minor revision number: " << p.minor << '\n'
|
|
<< "Name: " << p.name << '\n'
|
|
<< "Total global memory: " << p.totalGlobalMem << '\n'
|
|
<< "Total shared memory per block: " << p.sharedMemPerBlock << '\n'
|
|
<< "Total registers per block: " << p.regsPerBlock << '\n'
|
|
<< "Warp size: " << p.warpSize << '\n'
|
|
<< "Maximum memory pitch: " << p.memPitch << '\n'
|
|
<< "Maximum threads per block: " << p.maxThreadsPerBlock << '\n';
|
|
|
|
os << "Maximum dimension of block: ";
|
|
for (int i = 0; i < 3; ++i) {
|
|
if(i) os << 'x';
|
|
os << p.maxThreadsDim[i];
|
|
}
|
|
os << '\n';
|
|
|
|
os << "Maximum dimension of grid: ";
|
|
for (int i = 0; i < 3; ++i) {
|
|
if(i) os << 'x';
|
|
os << p.maxGridSize[i];;
|
|
}
|
|
os << '\n';
|
|
|
|
os << "Clock rate: " << p.clockRate << '\n'
|
|
<< "Total constant memory: " << p.totalConstMem << '\n'
|
|
<< "Texture alignment: " << p.textureAlignment << '\n'
|
|
<< "Concurrent copy and execution: " << p.deviceOverlap << '\n'
|
|
<< "Number of multiprocessors: " << p.multiProcessorCount << '\n'
|
|
<< "Kernel execution timeout: " << p.kernelExecTimeoutEnabled << '\n'
|
|
<< "GPU sharing Host Memory: " << p.integrated << '\n'
|
|
<< "Host page-locked mem mapping: " << p.canMapHostMemory << '\n'
|
|
<< "Alignment for Surfaces: " << p.surfaceAlignment << '\n'
|
|
<< "Device has ECC support: " << p.ECCEnabled << '\n'
|
|
<< "Unified Addressing (UVA): " << p.unifiedAddressing << '\n';
|
|
}
|
|
|
|
/**
|
|
@brief queries the maximum threads per block on a device
|
|
*/
|
|
inline size_t cuda_get_device_max_threads_per_block(int d) {
|
|
int threads = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, d),
|
|
"failed to query the maximum threads per block on device ", d
|
|
)
|
|
return threads;
|
|
}
|
|
|
|
/**
|
|
@brief queries the maximum x-dimension per block on a device
|
|
*/
|
|
inline size_t cuda_get_device_max_x_dim_per_block(int d) {
|
|
int dim = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimX, d),
|
|
"failed to query the maximum x-dimension per block on device ", d
|
|
)
|
|
return dim;
|
|
}
|
|
|
|
/**
|
|
@brief queries the maximum y-dimension per block on a device
|
|
*/
|
|
inline size_t cuda_get_device_max_y_dim_per_block(int d) {
|
|
int dim = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimY, d),
|
|
"failed to query the maximum y-dimension per block on device ", d
|
|
)
|
|
return dim;
|
|
}
|
|
|
|
/**
|
|
@brief queries the maximum z-dimension per block on a device
|
|
*/
|
|
inline size_t cuda_get_device_max_z_dim_per_block(int d) {
|
|
int dim = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimZ, d),
|
|
"failed to query the maximum z-dimension per block on device ", d
|
|
)
|
|
return dim;
|
|
}
|
|
|
|
/**
|
|
@brief queries the maximum x-dimension per grid on a device
|
|
*/
|
|
inline size_t cuda_get_device_max_x_dim_per_grid(int d) {
|
|
int dim = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimX, d),
|
|
"failed to query the maximum x-dimension per grid on device ", d
|
|
)
|
|
return dim;
|
|
}
|
|
|
|
/**
|
|
@brief queries the maximum y-dimension per grid on a device
|
|
*/
|
|
inline size_t cuda_get_device_max_y_dim_per_grid(int d) {
|
|
int dim = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimY, d),
|
|
"failed to query the maximum y-dimension per grid on device ", d
|
|
)
|
|
return dim;
|
|
}
|
|
|
|
/**
|
|
@brief queries the maximum z-dimension per grid on a device
|
|
*/
|
|
inline size_t cuda_get_device_max_z_dim_per_grid(int d) {
|
|
int dim = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimZ, d),
|
|
"failed to query the maximum z-dimension per grid on device ", d
|
|
)
|
|
return dim;
|
|
}
|
|
|
|
/**
|
|
@brief queries the maximum shared memory size in bytes per block on a device
|
|
*/
|
|
inline size_t cuda_get_device_max_shm_per_block(int d) {
|
|
int num = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&num, cudaDevAttrMaxSharedMemoryPerBlock, d),
|
|
"failed to query the maximum shared memory per block on device ", d
|
|
)
|
|
return num;
|
|
}
|
|
|
|
/**
|
|
@brief queries the warp size on a device
|
|
*/
|
|
inline size_t cuda_get_device_warp_size(int d) {
|
|
int num = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&num, cudaDevAttrWarpSize, d),
|
|
"failed to query the warp size per block on device ", d
|
|
)
|
|
return num;
|
|
}
|
|
|
|
/**
|
|
@brief queries the major number of compute capability of a device
|
|
*/
|
|
inline int cuda_get_device_compute_capability_major(int d) {
|
|
int num = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMajor, d),
|
|
"failed to query the major number of compute capability of device ", d
|
|
)
|
|
return num;
|
|
}
|
|
|
|
/**
|
|
@brief queries the minor number of compute capability of a device
|
|
*/
|
|
inline int cuda_get_device_compute_capability_minor(int d) {
|
|
int num = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMinor, d),
|
|
"failed to query the minor number of compute capability of device ", d
|
|
)
|
|
return num;
|
|
}
|
|
|
|
/**
|
|
@brief queries if the device supports unified addressing
|
|
*/
|
|
inline bool cuda_get_device_unified_addressing(int d) {
|
|
int num = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDeviceGetAttribute(&num, cudaDevAttrUnifiedAddressing, d),
|
|
"failed to query unified addressing status on device ", d
|
|
)
|
|
return num;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// CUDA Version
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/**
|
|
@brief queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
|
|
*/
|
|
inline int cuda_get_driver_version() {
|
|
int num = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaDriverGetVersion(&num),
|
|
"failed to query the latest cuda version supported by the driver"
|
|
);
|
|
return num;
|
|
}
|
|
|
|
/**
|
|
@brief queries the CUDA Runtime version (1000 * major + 10 * minor)
|
|
*/
|
|
inline int cuda_get_runtime_version() {
|
|
int num = 0;
|
|
TF_CHECK_CUDA(
|
|
cudaRuntimeGetVersion(&num), "failed to query cuda runtime version"
|
|
);
|
|
return num;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaScopedDevice
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/** @class cudaScopedDevice
|
|
|
|
@brief class to create an RAII-styled context switch
|
|
|
|
Sample usage:
|
|
|
|
@code{.cpp}
|
|
{
|
|
tf::cudaScopedDevice device(1); // switch to the device context 1
|
|
|
|
// create a stream under device context 1
|
|
cudaStream_t stream;
|
|
cudaStreamCreate(&stream);
|
|
|
|
} // leaving the scope and goes back to the previous device context
|
|
@endcode
|
|
|
|
%cudaScopedDevice is neither movable nor copyable.
|
|
*/
|
|
class cudaScopedDevice {
|
|
|
|
public:
|
|
|
|
/**
|
|
@brief constructs a RAII-styled device switcher
|
|
|
|
@param device device context to scope in the guard
|
|
*/
|
|
explicit cudaScopedDevice(int device);
|
|
|
|
/**
|
|
@brief destructs the guard and switches back to the previous device context
|
|
*/
|
|
~cudaScopedDevice();
|
|
|
|
private:
|
|
|
|
cudaScopedDevice() = delete;
|
|
cudaScopedDevice(const cudaScopedDevice&) = delete;
|
|
cudaScopedDevice(cudaScopedDevice&&) = delete;
|
|
|
|
int _p;
|
|
};
|
|
|
|
// Constructor
|
|
inline cudaScopedDevice::cudaScopedDevice(int dev) {
|
|
TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");
|
|
if(_p == dev) {
|
|
_p = -1;
|
|
}
|
|
else {
|
|
TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);
|
|
}
|
|
}
|
|
|
|
// Destructor
|
|
inline cudaScopedDevice::~cudaScopedDevice() {
|
|
if(_p != -1) {
|
|
cudaSetDevice(_p);
|
|
//TF_CHECK_CUDA(cudaSetDevice(_p), "failed to scope back to device ", _p);
|
|
}
|
|
}
|
|
|
|
} // end of namespace cuda ---------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|