287 lines
7.2 KiB
C++
287 lines
7.2 KiB
C++
#pragma once
|
|
|
|
#include <cstddef>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "cuda_error.hpp"
|
|
|
|
namespace tf {
|
|
|
|
/**
@brief per-thread pool that recycles CUDA device objects

@tparam H native handle type stored in each pooled object
@tparam C function object whose call operator creates a handle
@tparam D function object whose call operator destroys a handle

Handles such as @c cudaStream_t or @c cublasHandle_t are bound to a device.
Creating one is typically expensive (e.g., 10-200 ms) and destroying one
may trigger an implicit device synchronization, so applications that use
them heavily benefit from reusing them.

Within a process, each CUDA device of the CUDA Runtime API corresponds to
exactly one CUcontext of the CUDA Driver API, called the primary context of
that device. From the perspective of the CUDA Runtime API, a device and its
primary context are synonymous.

The pool is decentralized and has two parts:

  + a global pool, shared by every instance of this template specialization,
    that only remembers weak references to objects that may be reused, and
  + a per-thread footprint that holds shared ownership of the objects
    released through this pool instance.

The global pool never owns an object and therefore never destroys one.
An object is destroyed when the last shared owner goes away, which is
intended to be a worker thread rather than the master thread. The goal of
this design is to avoid destroying device objects after the CUDA context
has already been torn down during driver shutdown.
*/
template <typename H, typename C, typename D>
class cudaPerThreadDeviceObjectPool {

  public:

  /**
  @brief a pooled handle together with the device it was created for

  Objects are neither copyable nor movable; they are shared through
  @c std::shared_ptr.
  */
  struct Object {

    /** @brief device id given at construction */
    int device;

    /** @brief handle produced by @c C and later passed to @c D */
    H value;

    /** @brief creates the handle for device @c d */
    Object(int d);

    /** @brief destroys the handle */
    ~Object();

    Object(const Object&) = delete;
    Object(Object&&) = delete;
  };

  private:

  // The master thread holds the storage of this pool. Because of
  // destruction ordering, the CUDA context may already be gone by the time
  // the master thread would destroy a CUDA object. The pool therefore only
  // keeps weak references for reuse and leaves destruction to the threads
  // that hold shared ownership.
  struct cudaGlobalDeviceObjectPool {

    // takes a still-alive object of the given device out of the pool,
    // or returns nullptr if there is none
    std::shared_ptr<Object> acquire(int);

    // records a weak reference to an object of the given device
    void release(int, std::weak_ptr<Object>);

    // guards pool
    std::mutex mutex;

    // device id -> weak references to reusable objects
    std::unordered_map<int, std::vector<std::weak_ptr<Object>>> pool;
  };

  public:

  /**
  @brief default constructor
  */
  cudaPerThreadDeviceObjectPool() = default;

  /**
  @brief acquires a device object with shared ownership

  @param d device id

  Reuses a live object from the global pool when one is available and
  creates a new one otherwise.
  */
  std::shared_ptr<Object> acquire(int d);

  /**
  @brief releases a device object with moved ownership

  @param ptr object to give back; its ownership moves into the footprint
             of this pool
  */
  void release(std::shared_ptr<Object>&& ptr);

  /**
  @brief queries the number of device objects with shared ownership
  */
  size_t footprint_size() const;

  private:

  // one global pool per template specialization
  inline static cudaGlobalDeviceObjectPool _shared_pool;

  // objects released through this instance, kept alive by shared ownership
  std::unordered_set<std::shared_ptr<Object>> _footprint;
};
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaPerThreadDeviceObjectPool::Object definition
|
|
// ----------------------------------------------------------------------------
|
|
|
|
template <typename H, typename C, typename D>
|
|
cudaPerThreadDeviceObjectPool<H, C, D>::Object::Object(int d) :
|
|
device {d} {
|
|
cudaScopedDevice ctx(device);
|
|
value = C{}();
|
|
}
|
|
|
|
template <typename H, typename C, typename D>
|
|
cudaPerThreadDeviceObjectPool<H, C, D>::Object::~Object() {
|
|
cudaScopedDevice ctx(device);
|
|
D{}(value);
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaPerThreadDeviceObjectPool::cudaGlobalDeviceObjectPool definition
|
|
// ----------------------------------------------------------------------------
|
|
|
|
template <typename H, typename C, typename D>
|
|
std::shared_ptr<typename cudaPerThreadDeviceObjectPool<H, C, D>::Object>
|
|
cudaPerThreadDeviceObjectPool<H, C, D>::cudaGlobalDeviceObjectPool::acquire(int d) {
|
|
std::scoped_lock<std::mutex> lock(mutex);
|
|
if(auto itr = pool.find(d); itr != pool.end()) {
|
|
while(!itr->second.empty()) {
|
|
auto sptr = itr->second.back().lock();
|
|
itr->second.pop_back();
|
|
if(sptr) {
|
|
return sptr;
|
|
}
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
template <typename H, typename C, typename D>
|
|
void cudaPerThreadDeviceObjectPool<H, C, D>::cudaGlobalDeviceObjectPool::release(
|
|
int d, std::weak_ptr<Object> ptr
|
|
) {
|
|
std::scoped_lock<std::mutex> lock(mutex);
|
|
pool[d].push_back(ptr);
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaPerThreadDeviceObjectPool definition
|
|
// ----------------------------------------------------------------------------
|
|
|
|
template <typename H, typename C, typename D>
|
|
std::shared_ptr<typename cudaPerThreadDeviceObjectPool<H, C, D>::Object>
|
|
cudaPerThreadDeviceObjectPool<H, C, D>::acquire(int d) {
|
|
|
|
auto ptr = _shared_pool.acquire(d);
|
|
|
|
if(!ptr) {
|
|
ptr = std::make_shared<Object>(d);
|
|
}
|
|
|
|
return ptr;
|
|
}
|
|
|
|
template <typename H, typename C, typename D>
|
|
void cudaPerThreadDeviceObjectPool<H, C, D>::release(
|
|
std::shared_ptr<Object>&& ptr
|
|
) {
|
|
_shared_pool.release(ptr->device, ptr);
|
|
_footprint.insert(std::move(ptr));
|
|
}
|
|
|
|
template <typename H, typename C, typename D>
|
|
size_t cudaPerThreadDeviceObjectPool<H, C, D>::footprint_size() const {
|
|
return _footprint.size();
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaObject
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/**
@class cudaObject

@brief class to create an RAII-styled and move-only wrapper for CUDA objects

@tparam T native handle type; it must be assignable from @c nullptr,
          which this class uses as the "empty" value
@tparam C function object whose call operator creates and returns a @c T
@tparam D function object whose call operator destroys a @c T; it is also
          invoked on an empty (@c nullptr) handle and must accept it
*/
template <typename T, typename C, typename D>
class cudaObject {

  public:

  /**
  @brief constructs a CUDA object from the given one

  The wrapper takes over @c obj and passes it to @c D when the wrapper is
  destroyed, reassigned, or reset.
  */
  explicit cudaObject(T obj) : object(obj) {}

  /**
  @brief constructs a new CUDA object
  */
  cudaObject() : object{ C{}() } {}

  /**
  @brief disabled copy constructor
  */
  cudaObject(const cudaObject&) = delete;

  /**
  @brief move constructor

  Takes the handle of @c rhs and leaves @c rhs empty.
  */
  cudaObject(cudaObject&& rhs) : object{rhs.object} {
    rhs.object = nullptr;
  }

  /**
  @brief destructs the CUDA object
  */
  ~cudaObject() { D{}(object); }

  /**
  @brief disabled copy assignment
  */
  cudaObject& operator = (const cudaObject&) = delete;

  /**
  @brief move assignment

  Destroys the currently held handle, takes the handle of @c rhs, and
  leaves @c rhs empty. Assigning an object to itself is a no-op.
  */
  cudaObject& operator = (cudaObject&& rhs) {
    // without this guard a self-move would destroy the live handle
    if(this != &rhs) {
      D {} (object);
      object = rhs.object;
      rhs.object = nullptr;
    }
    return *this;
  }

  /**
  @brief implicit conversion to the native CUDA object

  Returns the underlying handle of type @c T without giving up ownership.
  */
  operator T () const {
    return object;
  }

  /**
  @brief deletes the current CUDA object (if any) and creates a new one
  */
  void create() {
    D {} (object);
    // stay empty while creating: if C throws, the destructor must not
    // destroy the old handle a second time
    object = nullptr;
    object = C{}();
  }

  /**
  @brief resets this CUDA object to the given one

  The currently held handle is destroyed first; @c new_obj must therefore
  not be the handle this wrapper already holds.
  */
  void reset(T new_obj) {
    D {} (object);
    object = new_obj;
  }

  /**
  @brief deletes the current CUDA object
  */
  void clear() {
    reset(nullptr);
  }

  /**
  @brief releases the ownership of the CUDA object

  @return the handle that was held; the wrapper is empty afterwards and the
          caller becomes responsible for destroying the handle
  */
  T release() {
    auto tmp = object;
    object = nullptr;
    return tmp;
  }

  protected:

  /**
  @brief the CUDA object
  */
  T object;
};
|
|
|
|
} // end of namespace tf -----------------------------------------------------
|
|
|
|
|
|
|