mesytec-mnode/external/taskflow-3.8.0/taskflow/cuda/cuda_device.hpp

#pragma once

#include "cuda_error.hpp"

/**
@file cuda_device.hpp
@brief CUDA device utilities include file
*/

namespace tf {

/**
@brief queries the number of available devices

@return the count written by @c cudaGetDeviceCount, converted to @c size_t

A failing query is handed to @c TF_CHECK_CUDA together with the message
"failed to get device count".
*/
inline size_t cuda_get_num_devices() {
  int count = 0;
  TF_CHECK_CUDA(
    cudaGetDeviceCount(&count), "failed to get device count"
  );
  return static_cast<size_t>(count);
}

/**
@brief gets the current device associated with the caller thread

@return the device id written by @c cudaGetDevice

A failing query is handed to @c TF_CHECK_CUDA together with the message
"failed to get current device id".
*/
inline int cuda_get_device() {
  // out-parameter filled in by cudaGetDevice
  int current;
  TF_CHECK_CUDA(
    cudaGetDevice(&current), "failed to get current device id"
  );
  return current;
}

/**
@brief switches to a given device context

@param id id of the device passed to @c cudaSetDevice

A failing switch is handed to @c TF_CHECK_CUDA together with the message
"failed to switch to device " followed by @c id.
*/
inline void cuda_set_device(int id) {
  TF_CHECK_CUDA(
    cudaSetDevice(id),
    "failed to switch to device ", id
  );
}

/**
@brief obtains the device property into a caller-provided structure

@param i id of the device to query
@param p structure filled in by @c cudaGetDeviceProperties

A failing query is handed to @c TF_CHECK_CUDA together with the message
"failed to get property of device " followed by @c i.
*/
inline void cuda_get_device_property(int i, cudaDeviceProp& p) {
  TF_CHECK_CUDA(
    cudaGetDeviceProperties(&p, i),
    "failed to get property of device ", i
  );
}

/**
@brief obtains the device property and returns it by value

@param i id of the device to query
@return the structure filled in by @c cudaGetDeviceProperties

A failing query is handed to @c TF_CHECK_CUDA together with the message
"failed to get property of device " followed by @c i.
*/
inline cudaDeviceProp cuda_get_device_property(int i) {
  cudaDeviceProp prop;
  TF_CHECK_CUDA(
    cudaGetDeviceProperties(&prop, i),
    "failed to get property of device ", i
  );
  return prop;
}

/**
@brief dumps the device property

Writes a human-readable summary of selected fields of @c p to @c os, one
"label value" pair per line. The three-element block and grid limits are
printed as @c AxBxC.

@param os output stream receiving the text
@param p device property to print
*/
inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) {

  os << "Major revision number:         " << p.major << '\n'
     << "Minor revision number:         " << p.minor << '\n'
     << "Name:                          " << p.name  << '\n'
     << "Total global memory:           " << p.totalGlobalMem << '\n'
     << "Total shared memory per block: " << p.sharedMemPerBlock << '\n'
     << "Total registers per block:     " << p.regsPerBlock << '\n'
     << "Warp size:                     " << p.warpSize << '\n'
     << "Maximum memory pitch:          " << p.memPitch << '\n'
     << "Maximum threads per block:     " << p.maxThreadsPerBlock << '\n';

  os << "Maximum dimension of block:    ";
  for (int i = 0; i < 3; ++i) {
    if(i) os << 'x';   // separator between dimensions, not before the first
    os << p.maxThreadsDim[i];
  }
  os << '\n';

  // Label padded to the same width as every other label so that the value
  // column stays aligned (it used to be one space short).
  os << "Maximum dimension of grid:     ";
  for (int i = 0; i < 3; ++i) {
    if(i) os << 'x';   // separator between dimensions, not before the first
    os << p.maxGridSize[i];
  }
  os << '\n';

  os << "Clock rate:                    " << p.clockRate << '\n'
     << "Total constant memory:         " << p.totalConstMem << '\n'
     << "Texture alignment:             " << p.textureAlignment << '\n'
     << "Concurrent copy and execution: " << p.deviceOverlap << '\n'
     << "Number of multiprocessors:     " << p.multiProcessorCount << '\n'
     << "Kernel execution timeout:      " << p.kernelExecTimeoutEnabled << '\n'
     << "GPU sharing Host Memory:       " << p.integrated << '\n'
     << "Host page-locked mem mapping:  " << p.canMapHostMemory << '\n'
     << "Alignment for Surfaces:        " << p.surfaceAlignment << '\n'
     << "Device has ECC support:        " << p.ECCEnabled << '\n'
     << "Unified Addressing (UVA):      " << p.unifiedAddressing << '\n';
}

/**
@brief queries the maximum threads per block on a device

@param d id of the device to query
@return value of the @c cudaDevAttrMaxThreadsPerBlock attribute as @c size_t
*/
inline size_t cuda_get_device_max_threads_per_block(int d) {
  int max_threads = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&max_threads, cudaDevAttrMaxThreadsPerBlock, d),
    "failed to query the maximum threads per block on device ", d
  );
  return static_cast<size_t>(max_threads);
}

/**
@brief queries the maximum x-dimension per block on a device

@param d id of the device to query
@return value of the @c cudaDevAttrMaxBlockDimX attribute as @c size_t
*/
inline size_t cuda_get_device_max_x_dim_per_block(int d) {
  int max_x = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&max_x, cudaDevAttrMaxBlockDimX, d),
    "failed to query the maximum x-dimension per block on device ", d
  );
  return static_cast<size_t>(max_x);
}

/**
@brief queries the maximum y-dimension per block on a device

@param d id of the device to query
@return value of the @c cudaDevAttrMaxBlockDimY attribute as @c size_t
*/
inline size_t cuda_get_device_max_y_dim_per_block(int d) {
  int max_y = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&max_y, cudaDevAttrMaxBlockDimY, d),
    "failed to query the maximum y-dimension per block on device ", d
  );
  return static_cast<size_t>(max_y);
}

/**
@brief queries the maximum z-dimension per block on a device

@param d id of the device to query
@return value of the @c cudaDevAttrMaxBlockDimZ attribute as @c size_t
*/
inline size_t cuda_get_device_max_z_dim_per_block(int d) {
  int max_z = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&max_z, cudaDevAttrMaxBlockDimZ, d),
    "failed to query the maximum z-dimension per block on device ", d
  );
  return static_cast<size_t>(max_z);
}

/**
@brief queries the maximum x-dimension per grid on a device

@param d id of the device to query
@return value of the @c cudaDevAttrMaxGridDimX attribute as @c size_t
*/
inline size_t cuda_get_device_max_x_dim_per_grid(int d) {
  int max_x = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&max_x, cudaDevAttrMaxGridDimX, d),
    "failed to query the maximum x-dimension per grid on device ", d
  );
  return static_cast<size_t>(max_x);
}

/**
@brief queries the maximum y-dimension per grid on a device

@param d id of the device to query
@return value of the @c cudaDevAttrMaxGridDimY attribute as @c size_t
*/
inline size_t cuda_get_device_max_y_dim_per_grid(int d) {
  int max_y = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&max_y, cudaDevAttrMaxGridDimY, d),
    "failed to query the maximum y-dimension per grid on device ", d
  );
  return static_cast<size_t>(max_y);
}

/**
@brief queries the maximum z-dimension per grid on a device

@param d id of the device to query
@return value of the @c cudaDevAttrMaxGridDimZ attribute as @c size_t
*/
inline size_t cuda_get_device_max_z_dim_per_grid(int d) {
  int max_z = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&max_z, cudaDevAttrMaxGridDimZ, d),
    "failed to query the maximum z-dimension per grid on device ", d
  );
  return static_cast<size_t>(max_z);
}

/**
@brief queries the maximum shared memory size in bytes per block on a device

@param d id of the device to query
@return value of the @c cudaDevAttrMaxSharedMemoryPerBlock attribute as @c size_t
*/
inline size_t cuda_get_device_max_shm_per_block(int d) {
  int shm_bytes = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&shm_bytes, cudaDevAttrMaxSharedMemoryPerBlock, d),
    "failed to query the maximum shared memory per block on device ", d
  );
  return static_cast<size_t>(shm_bytes);
}

/**
@brief queries the warp size on a device

@param d id of the device to query
@return value of the @c cudaDevAttrWarpSize attribute as @c size_t

A failing query is handed to @c TF_CHECK_CUDA together with the message
"failed to query the warp size on device " followed by @c d.
*/
inline size_t cuda_get_device_warp_size(int d) {
  int warp_size = 0;
  // The message previously read "warp size per block" (copied from the
  // per-block queries); the attribute queried here is the warp size itself.
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, d),
    "failed to query the warp size on device ", d
  );
  return static_cast<size_t>(warp_size);
}

/**
@brief queries the major number of compute capability of a device

@param d id of the device to query
@return value of the @c cudaDevAttrComputeCapabilityMajor attribute
*/
inline int cuda_get_device_compute_capability_major(int d) {
  int major = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, d),
    "failed to query the major number of compute capability of device ", d
  );
  return major;
}

/**
@brief queries the minor number of compute capability of a device

@param d id of the device to query
@return value of the @c cudaDevAttrComputeCapabilityMinor attribute
*/
inline int cuda_get_device_compute_capability_minor(int d) {
  int minor = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, d),
    "failed to query the minor number of compute capability of device ", d
  );
  return minor;
}

/**
@brief queries if the device supports unified addressing

@param d id of the device to query
@return @c true when the @c cudaDevAttrUnifiedAddressing attribute is non-zero
*/
inline bool cuda_get_device_unified_addressing(int d) {
  int flag = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&flag, cudaDevAttrUnifiedAddressing, d),
    "failed to query unified addressing status on device ", d
  );
  // the attribute is reported as an int; any non-zero value maps to true
  return flag != 0;
}

// ----------------------------------------------------------------------------
// CUDA Version
// ----------------------------------------------------------------------------

/**
@brief queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

@return the version number written by @c cudaDriverGetVersion
*/
inline int cuda_get_driver_version() {
  int version = 0;
  TF_CHECK_CUDA(
    cudaDriverGetVersion(&version),
    "failed to query the latest cuda version supported by the driver"
  );
  return version;
}

/**
@brief queries the CUDA Runtime version (1000 * major + 10 * minor)

@return the version number written by @c cudaRuntimeGetVersion
*/
inline int cuda_get_runtime_version() {
  int version = 0;
  TF_CHECK_CUDA(
    cudaRuntimeGetVersion(&version),
    "failed to query cuda runtime version"
  );
  return version;
}

// ----------------------------------------------------------------------------
// cudaScopedDevice
// ----------------------------------------------------------------------------

/** @class cudaScopedDevice

@brief class to create an RAII-styled context switch

The constructor records the device that is current for the calling thread
and switches to the requested one; the destructor switches back to the
recorded device. When the requested device is already current, no switch
is performed in either direction.

Sample usage:

@code{.cpp}
{
  tf::cudaScopedDevice device(1);  // switch to the device context 1

  // create a stream under device context 1
  cudaStream_t stream;
  cudaStreamCreate(&stream);

}  // leaving the scope and goes back to the previous device context
@endcode

%cudaScopedDevice is neither movable nor copyable.
*/
class cudaScopedDevice {

  public:

    /**
    @brief constructs a RAII-styled device switcher

    @param device device context to scope in the guard
    */
    explicit cudaScopedDevice(int device);

    /**
    @brief destructs the guard and switches back to the previous device context
    */
    ~cudaScopedDevice();

  private:

    // A guard always needs a target device, and it must not be duplicated
    // or transferred: construction, copy and move are all disabled.
    cudaScopedDevice() = delete;
    cudaScopedDevice(const cudaScopedDevice&) = delete;
    cudaScopedDevice(cudaScopedDevice&&) = delete;

    // Assignment is already unavailable because the move constructor is
    // user-declared; spelled out here so the intent is visible.
    cudaScopedDevice& operator = (const cudaScopedDevice&) = delete;
    cudaScopedDevice& operator = (cudaScopedDevice&&) = delete;

    // Device that was current when the guard was constructed, or -1 when
    // the requested device was already current and nothing has to be
    // restored on destruction.
    int _p;
};

// Constructor: records the calling thread's current device in _p and
// switches to `dev` unless it is already the current device.
inline cudaScopedDevice::cudaScopedDevice(int dev) {
  TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");

  // Already on the requested device: store the -1 sentinel so that the
  // destructor performs no switch back.
  if(_p == dev) {
    _p = -1;
    return;
  }

  TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);
}

// Destructor: switches back to the device recorded by the constructor.
inline cudaScopedDevice::~cudaScopedDevice() {
  // -1 means the constructor did not switch devices; nothing to restore.
  if(_p == -1) {
    return;
  }

  // Best effort: the status returned by cudaSetDevice is intentionally not
  // checked here (the TF_CHECK_CUDA variant was left disabled in this
  // destructor).
  cudaSetDevice(_p);
}

}  // end of namespace tf -----------------------------------------------------
add taskflow-3.8.0 2025-01-04 01:25:05 +01:00			`#pragma once`

			`#include "cuda_error.hpp"`

			`/**`
			`@file cuda_device.hpp`
			`@brief CUDA device utilities include file`
			`*/`

			`namespace tf {`

			`/**`
			`@brief queries the number of available devices`
			`*/`
			`inline size_t cuda_get_num_devices() {`
			`int N = 0;`
			`TF_CHECK_CUDA(cudaGetDeviceCount(&N), "failed to get device count");`
			`return static_cast<size_t>(N);`
			`}`

			`/**`
			`@brief gets the current device associated with the caller thread`
			`*/`
			`inline int cuda_get_device() {`
			`int id;`
			`TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id");`
			`return id;`
			`}`

			`/**`
			`@brief switches to a given device context`
			`*/`
			`inline void cuda_set_device(int id) {`
			`TF_CHECK_CUDA(cudaSetDevice(id), "failed to switch to device ", id);`
			`}`

			`/**`
			`@brief obtains the device property`
			`*/`
			`inline void cuda_get_device_property(int i, cudaDeviceProp& p) {`
			`TF_CHECK_CUDA(`
			`cudaGetDeviceProperties(&p, i), "failed to get property of device ", i`
			`);`
			`}`

			`/**`
			`@brief obtains the device property`
			`*/`
			`inline cudaDeviceProp cuda_get_device_property(int i) {`
			`cudaDeviceProp p;`
			`TF_CHECK_CUDA(`
			`cudaGetDeviceProperties(&p, i), "failed to get property of device ", i`
			`);`
			`return p;`
			`}`

			`/**`
			`@brief dumps the device property`
			`*/`
			`inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) {`

			`os << "Major revision number: " << p.major << '\n'`
			`<< "Minor revision number: " << p.minor << '\n'`
			`<< "Name: " << p.name << '\n'`
			`<< "Total global memory: " << p.totalGlobalMem << '\n'`
			`<< "Total shared memory per block: " << p.sharedMemPerBlock << '\n'`
			`<< "Total registers per block: " << p.regsPerBlock << '\n'`
			`<< "Warp size: " << p.warpSize << '\n'`
			`<< "Maximum memory pitch: " << p.memPitch << '\n'`
			`<< "Maximum threads per block: " << p.maxThreadsPerBlock << '\n';`

			`os << "Maximum dimension of block: ";`
			`for (int i = 0; i < 3; ++i) {`
			`if(i) os << 'x';`
			`os << p.maxThreadsDim[i];`
			`}`
			`os << '\n';`

			`os << "Maximum dimension of grid: ";`
			`for (int i = 0; i < 3; ++i) {`
			`if(i) os << 'x';`
			`os << p.maxGridSize[i];;`
			`}`
			`os << '\n';`

			`os << "Clock rate: " << p.clockRate << '\n'`
			`<< "Total constant memory: " << p.totalConstMem << '\n'`
			`<< "Texture alignment: " << p.textureAlignment << '\n'`
			`<< "Concurrent copy and execution: " << p.deviceOverlap << '\n'`
			`<< "Number of multiprocessors: " << p.multiProcessorCount << '\n'`
			`<< "Kernel execution timeout: " << p.kernelExecTimeoutEnabled << '\n'`
			`<< "GPU sharing Host Memory: " << p.integrated << '\n'`
			`<< "Host page-locked mem mapping: " << p.canMapHostMemory << '\n'`
			`<< "Alignment for Surfaces: " << p.surfaceAlignment << '\n'`
			`<< "Device has ECC support: " << p.ECCEnabled << '\n'`
			`<< "Unified Addressing (UVA): " << p.unifiedAddressing << '\n';`
			`}`

			`/**`
			`@brief queries the maximum threads per block on a device`
			`*/`
			`inline size_t cuda_get_device_max_threads_per_block(int d) {`
			`int threads = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, d),`
			`"failed to query the maximum threads per block on device ", d`
			`)`
			`return threads;`
			`}`

			`/**`
			`@brief queries the maximum x-dimension per block on a device`
			`*/`
			`inline size_t cuda_get_device_max_x_dim_per_block(int d) {`
			`int dim = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimX, d),`
			`"failed to query the maximum x-dimension per block on device ", d`
			`)`
			`return dim;`
			`}`

			`/**`
			`@brief queries the maximum y-dimension per block on a device`
			`*/`
			`inline size_t cuda_get_device_max_y_dim_per_block(int d) {`
			`int dim = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimY, d),`
			`"failed to query the maximum y-dimension per block on device ", d`
			`)`
			`return dim;`
			`}`

			`/**`
			`@brief queries the maximum z-dimension per block on a device`
			`*/`
			`inline size_t cuda_get_device_max_z_dim_per_block(int d) {`
			`int dim = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimZ, d),`
			`"failed to query the maximum z-dimension per block on device ", d`
			`)`
			`return dim;`
			`}`

			`/**`
			`@brief queries the maximum x-dimension per grid on a device`
			`*/`
			`inline size_t cuda_get_device_max_x_dim_per_grid(int d) {`
			`int dim = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimX, d),`
			`"failed to query the maximum x-dimension per grid on device ", d`
			`)`
			`return dim;`
			`}`

			`/**`
			`@brief queries the maximum y-dimension per grid on a device`
			`*/`
			`inline size_t cuda_get_device_max_y_dim_per_grid(int d) {`
			`int dim = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimY, d),`
			`"failed to query the maximum y-dimension per grid on device ", d`
			`)`
			`return dim;`
			`}`

			`/**`
			`@brief queries the maximum z-dimension per grid on a device`
			`*/`
			`inline size_t cuda_get_device_max_z_dim_per_grid(int d) {`
			`int dim = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimZ, d),`
			`"failed to query the maximum z-dimension per grid on device ", d`
			`)`
			`return dim;`
			`}`

			`/**`
			`@brief queries the maximum shared memory size in bytes per block on a device`
			`*/`
			`inline size_t cuda_get_device_max_shm_per_block(int d) {`
			`int num = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&num, cudaDevAttrMaxSharedMemoryPerBlock, d),`
			`"failed to query the maximum shared memory per block on device ", d`
			`)`
			`return num;`
			`}`

			`/**`
			`@brief queries the warp size on a device`
			`*/`
			`inline size_t cuda_get_device_warp_size(int d) {`
			`int num = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&num, cudaDevAttrWarpSize, d),`
			`"failed to query the warp size per block on device ", d`
			`)`
			`return num;`
			`}`

			`/**`
			`@brief queries the major number of compute capability of a device`
			`*/`
			`inline int cuda_get_device_compute_capability_major(int d) {`
			`int num = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMajor, d),`
			`"failed to query the major number of compute capability of device ", d`
			`)`
			`return num;`
			`}`

			`/**`
			`@brief queries the minor number of compute capability of a device`
			`*/`
			`inline int cuda_get_device_compute_capability_minor(int d) {`
			`int num = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMinor, d),`
			`"failed to query the minor number of compute capability of device ", d`
			`)`
			`return num;`
			`}`

			`/**`
			`@brief queries if the device supports unified addressing`
			`*/`
			`inline bool cuda_get_device_unified_addressing(int d) {`
			`int num = 0;`
			`TF_CHECK_CUDA(`
			`cudaDeviceGetAttribute(&num, cudaDevAttrUnifiedAddressing, d),`
			`"failed to query unified addressing status on device ", d`
			`)`
			`return num;`
			`}`

			`// ----------------------------------------------------------------------------`
			`// CUDA Version`
			`// ----------------------------------------------------------------------------`

			`/**`
			`@brief queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver`
			`*/`
			`inline int cuda_get_driver_version() {`
			`int num = 0;`
			`TF_CHECK_CUDA(`
			`cudaDriverGetVersion(&num),`
			`"failed to query the latest cuda version supported by the driver"`
			`);`
			`return num;`
			`}`

			`/**`
			`@brief queries the CUDA Runtime version (1000 * major + 10 * minor)`
			`*/`
			`inline int cuda_get_runtime_version() {`
			`int num = 0;`
			`TF_CHECK_CUDA(`
			`cudaRuntimeGetVersion(&num), "failed to query cuda runtime version"`
			`);`
			`return num;`
			`}`

			`// ----------------------------------------------------------------------------`
			`// cudaScopedDevice`
			`// ----------------------------------------------------------------------------`

			`/** @class cudaScopedDevice`

			`@brief class to create an RAII-styled context switch`

			`Sample usage:`

			`@code{.cpp}`
			`{`
			`tf::cudaScopedDevice device(1); // switch to the device context 1`

			`// create a stream under device context 1`
			`cudaStream_t stream;`
			`cudaStreamCreate(&stream);`

			`} // leaving the scope and goes back to the previous device context`
			`@endcode`

			`%cudaScopedDevice is neither movable nor copyable.`
			`*/`
			`class cudaScopedDevice {`

			`public:`

			`/**`
			`@brief constructs a RAII-styled device switcher`

			`@param device device context to scope in the guard`
			`*/`
			`explicit cudaScopedDevice(int device);`

			`/**`
			`@brief destructs the guard and switches back to the previous device context`
			`*/`
			`~cudaScopedDevice();`

			`private:`

			`cudaScopedDevice() = delete;`
			`cudaScopedDevice(const cudaScopedDevice&) = delete;`
			`cudaScopedDevice(cudaScopedDevice&&) = delete;`

			`int _p;`
			`};`

			`// Constructor`
			`inline cudaScopedDevice::cudaScopedDevice(int dev) {`
			`TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");`
			`if(_p == dev) {`
			`_p = -1;`
			`}`
			`else {`
			`TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);`
			`}`
			`}`

			`// Destructor`
			`inline cudaScopedDevice::~cudaScopedDevice() {`
			`if(_p != -1) {`
			`cudaSetDevice(_p);`
			`//TF_CHECK_CUDA(cudaSetDevice(_p), "failed to scope back to device ", _p);`
			`}`
			`}`

			`} // end of namespace cuda ---------------------------------------------------`