#pragma once

#include "cuda_error.hpp"

/**
@file cuda_device.hpp
@brief CUDA device utilities include file
*/

namespace tf {

/**
@brief queries the number of available devices

@return the device count reported by @c cudaGetDeviceCount, as a @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_num_devices() {
  int count = 0;
  TF_CHECK_CUDA(
    cudaGetDeviceCount(&count), "failed to get device count"
  );
  return static_cast<size_t>(count);
}

/**
@brief gets the current device associated with the caller thread

@return the device id written by @c cudaGetDevice

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_device() {
  // initialized like every other query in this file so that the returned
  // value is never indeterminate, whatever TF_CHECK_CUDA does on failure
  int id = 0;
  TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id");
  return id;
}

/**
@brief switches to a given device context

@param id identifier of the device passed to @c cudaSetDevice

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline void cuda_set_device(int id) {
  TF_CHECK_CUDA(
    cudaSetDevice(id), "failed to switch to device ", id
  );
}

/**
@brief obtains the device property into a caller-provided structure

@param i identifier of the device to query
@param p structure filled in by @c cudaGetDeviceProperties

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline void cuda_get_device_property(int i, cudaDeviceProp& p) {
  TF_CHECK_CUDA(
    cudaGetDeviceProperties(&p, i),
    "failed to get property of device ", i
  );
}

/**
@brief obtains the device property by value

@param i identifier of the device to query

@return the structure filled in by @c cudaGetDeviceProperties

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline cudaDeviceProp cuda_get_device_property(int i) {
  cudaDeviceProp prop;
  TF_CHECK_CUDA(
    cudaGetDeviceProperties(&prop, i),
    "failed to get property of device ", i
  );
  return prop;
}

/**
@brief dumps the device property

Writes one "label: value" line per field of @c p to @c os. Every label is
padded to the same width so that the values line up in one column.

@param os output stream to write to
@param p device property to dump

NOTE(review): some of the fields printed here (e.g. @c clockRate,
@c deviceOverlap, @c kernelExecTimeoutEnabled) appear to be deprecated in
recent CUDA toolkits - confirm against the toolkit version in use.
*/
inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) {

  // prints a 3-component extent as "XxYxZ"
  auto dump_extent = [&os](const int (&extent)[3]) {
    for(int i = 0; i < 3; ++i) {
      if(i) os << 'x';
      os << extent[i];
    }
  };

  os << "Major revision number:         " << p.major << '\n'
     << "Minor revision number:         " << p.minor << '\n'
     << "Name:                          " << p.name  << '\n'
     << "Total global memory:           " << p.totalGlobalMem << '\n'
     << "Total shared memory per block: " << p.sharedMemPerBlock << '\n'
     << "Total registers per block:     " << p.regsPerBlock << '\n'
     << "Warp size:                     " << p.warpSize << '\n'
     << "Maximum memory pitch:          " << p.memPitch << '\n'
     << "Maximum threads per block:     " << p.maxThreadsPerBlock << '\n';

  os << "Maximum dimension of block:    ";
  dump_extent(p.maxThreadsDim);
  os << '\n';

  // this label used to be one column short, which misaligned its value
  os << "Maximum dimension of grid:     ";
  dump_extent(p.maxGridSize);
  os << '\n';

  os << "Clock rate:                    " << p.clockRate << '\n'
     << "Total constant memory:         " << p.totalConstMem << '\n'
     << "Texture alignment:             " << p.textureAlignment << '\n'
     << "Concurrent copy and execution: " << p.deviceOverlap << '\n'
     << "Number of multiprocessors:     " << p.multiProcessorCount << '\n'
     << "Kernel execution timeout:      " << p.kernelExecTimeoutEnabled << '\n'
     << "GPU sharing Host Memory:       " << p.integrated << '\n'
     << "Host page-locked mem mapping:  " << p.canMapHostMemory << '\n'
     << "Alignment for Surfaces:        " << p.surfaceAlignment << '\n'
     << "Device has ECC support:        " << p.ECCEnabled << '\n'
     << "Unified Addressing (UVA):      " << p.unifiedAddressing << '\n';
}

/**
@brief queries the maximum threads per block on a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrMaxThreadsPerBlock converted to @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_threads_per_block(int d) {
  int value = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, d),
    "failed to query the maximum threads per block on device ", d
  );
  return static_cast<size_t>(value);
}

/**
@brief queries the maximum x-dimension per block on a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrMaxBlockDimX converted to @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_x_dim_per_block(int d) {
  int extent = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&extent, cudaDevAttrMaxBlockDimX, d),
    "failed to query the maximum x-dimension per block on device ", d
  );
  return static_cast<size_t>(extent);
}

/**
@brief queries the maximum y-dimension per block on a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrMaxBlockDimY converted to @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_y_dim_per_block(int d) {
  int extent = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&extent, cudaDevAttrMaxBlockDimY, d),
    "failed to query the maximum y-dimension per block on device ", d
  );
  return static_cast<size_t>(extent);
}

/**
@brief queries the maximum z-dimension per block on a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrMaxBlockDimZ converted to @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_z_dim_per_block(int d) {
  int extent = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&extent, cudaDevAttrMaxBlockDimZ, d),
    "failed to query the maximum z-dimension per block on device ", d
  );
  return static_cast<size_t>(extent);
}

/**
@brief queries the maximum x-dimension per grid on a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrMaxGridDimX converted to @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_x_dim_per_grid(int d) {
  int extent = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&extent, cudaDevAttrMaxGridDimX, d),
    "failed to query the maximum x-dimension per grid on device ", d
  );
  return static_cast<size_t>(extent);
}

/**
@brief queries the maximum y-dimension per grid on a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrMaxGridDimY converted to @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_y_dim_per_grid(int d) {
  int extent = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&extent, cudaDevAttrMaxGridDimY, d),
    "failed to query the maximum y-dimension per grid on device ", d
  );
  return static_cast<size_t>(extent);
}

/**
@brief queries the maximum z-dimension per grid on a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrMaxGridDimZ converted to @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_z_dim_per_grid(int d) {
  int extent = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&extent, cudaDevAttrMaxGridDimZ, d),
    "failed to query the maximum z-dimension per grid on device ", d
  );
  return static_cast<size_t>(extent);
}

/**
@brief queries the maximum shared memory size in bytes per block on a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrMaxSharedMemoryPerBlock converted to
        @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_shm_per_block(int d) {
  int bytes = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&bytes, cudaDevAttrMaxSharedMemoryPerBlock, d),
    "failed to query the maximum shared memory per block on device ", d
  );
  return static_cast<size_t>(bytes);
}

/**
@brief queries the warp size on a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrWarpSize converted to @c size_t

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_warp_size(int d) {
  int lanes = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&lanes, cudaDevAttrWarpSize, d),
    "failed to query the warp size per block on device ", d
  );
  return static_cast<size_t>(lanes);
}

/**
@brief queries the major number of compute capability of a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrComputeCapabilityMajor

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_device_compute_capability_major(int d) {
  int major = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, d),
    "failed to query the major number of compute capability of device ", d
  );
  return major;
}

/**
@brief queries the minor number of compute capability of a device

@param d identifier of the device to query

@return the value of @c cudaDevAttrComputeCapabilityMinor

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_device_compute_capability_minor(int d) {
  int minor = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, d),
    "failed to query the minor number of compute capability of device ", d
  );
  return minor;
}

/**
@brief queries if the device supports unified addressing

@param d identifier of the device to query

@return @c true if @c cudaDevAttrUnifiedAddressing is reported as non-zero,
        @c false otherwise

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline bool cuda_get_device_unified_addressing(int d) {
  int flag = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&flag, cudaDevAttrUnifiedAddressing, d),
    "failed to query unified addressing status on device ", d
  );
  return flag != 0;
}

// ----------------------------------------------------------------------------
// CUDA Version
// ----------------------------------------------------------------------------

/**
@brief queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

@return the version number written by @c cudaDriverGetVersion

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_driver_version() {
  int version = 0;
  TF_CHECK_CUDA(
    cudaDriverGetVersion(&version),
    "failed to query the latest cuda version supported by the driver"
  );
  return version;
}

/**
@brief queries the CUDA Runtime version (1000 * major + 10 * minor)

@return the version number written by @c cudaRuntimeGetVersion

A failing runtime call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_runtime_version() {
  int version = 0;
  TF_CHECK_CUDA(
    cudaRuntimeGetVersion(&version),
    "failed to query cuda runtime version"
  );
  return version;
}

// ----------------------------------------------------------------------------
// cudaScopedDevice
// ----------------------------------------------------------------------------

/** @class cudaScopedDevice

@brief guard object that switches to a device context on construction and
       switches back to the previous one on destruction

Sample usage:

@code{.cpp}
{
  tf::cudaScopedDevice device(1);  // switch to the device context 1

  // create a stream under device context 1
  cudaStream_t stream;
  cudaStreamCreate(&stream);

}  // leaving the scope switches back to the previous device context
@endcode

%cudaScopedDevice is neither movable nor copyable.
*/
class cudaScopedDevice {

  public:

    // a guard always needs a target device and can be neither copied nor moved
    cudaScopedDevice() = delete;
    cudaScopedDevice(const cudaScopedDevice&) = delete;
    cudaScopedDevice(cudaScopedDevice&&) = delete;

    /**
    @brief constructs a guard that switches to the given device context

    @param device device context to scope in the guard
    */
    explicit cudaScopedDevice(int device);

    /**
    @brief destructs the guard and switches back to the previous device context
    */
    ~cudaScopedDevice();

  private:

    // device that was current when the guard was constructed, or -1 when that
    // device already was the requested one and nothing has to be restored
    int _p;
};

// Constructor: records the current device in _p and switches to dev when the
// two differ; otherwise marks the guard as inactive by storing -1 in _p.
inline cudaScopedDevice::cudaScopedDevice(int dev) {
  TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");

  if(_p != dev) {
    // a real switch: _p keeps the device to restore in the destructor
    TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);
    return;
  }

  // already on the requested device: nothing to restore later
  _p = -1;
}

// Destructor: switches back to the device recorded at construction, unless the
// guard is inactive (_p == -1).
inline cudaScopedDevice::~cudaScopedDevice() {
  if(_p == -1) {
    return;
  }
  // The return code is not checked here (no TF_CHECK_CUDA), so a failure to
  // switch back goes unreported.
  (void)cudaSetDevice(_p);
}

}  // end of namespace tf -----------------------------------------------------