404 lines
11 KiB
C++
404 lines
11 KiB
C++
#pragma once
|
|
|
|
#include "cuda_graph.hpp"
|
|
|
|
/**
|
|
@file cuda_optimizer.hpp
|
|
@brief %cudaFlow capturing algorithms include file
|
|
*/
|
|
|
|
namespace tf {
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaFlowOptimizerBase
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/**
|
|
@private
|
|
|
|
@brief class to provide helper common methods for optimization algorithms
|
|
*/
|
|
class cudaFlowOptimizerBase {
|
|
|
|
protected:
|
|
|
|
std::vector<cudaFlowNode*> _toposort(cudaFlowGraph&);
|
|
std::vector<std::vector<cudaFlowNode*>> _levelize(cudaFlowGraph&);
|
|
};
|
|
|
|
// Function: _toposort
|
|
inline std::vector<cudaFlowNode*> cudaFlowOptimizerBase::_toposort(cudaFlowGraph& graph) {
|
|
|
|
std::vector<cudaFlowNode*> res;
|
|
std::queue<cudaFlowNode*> bfs;
|
|
|
|
res.reserve(graph._nodes.size());
|
|
|
|
// insert the first level of nodes into the queue
|
|
for(auto& u : graph._nodes) {
|
|
|
|
auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle);
|
|
hu->level = u->_dependents.size();
|
|
|
|
if(hu->level == 0) {
|
|
bfs.push(u.get());
|
|
}
|
|
}
|
|
|
|
// levelize the graph using bfs
|
|
while(!bfs.empty()) {
|
|
|
|
auto u = bfs.front();
|
|
bfs.pop();
|
|
|
|
res.push_back(u);
|
|
|
|
for(auto v : u->_successors) {
|
|
auto hv = std::get_if<cudaFlowNode::Capture>(&v->_handle);
|
|
if(--hv->level == 0) {
|
|
bfs.push(v);
|
|
}
|
|
}
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
// Function: _levelize
|
|
inline std::vector<std::vector<cudaFlowNode*>>
|
|
cudaFlowOptimizerBase::_levelize(cudaFlowGraph& graph) {
|
|
|
|
std::queue<cudaFlowNode*> bfs;
|
|
|
|
size_t max_level = 0;
|
|
|
|
// insert the first level of nodes into the queue
|
|
for(auto& u : graph._nodes) {
|
|
|
|
auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle);
|
|
hu->level = u->_dependents.size();
|
|
|
|
if(hu->level == 0) {
|
|
bfs.push(u.get());
|
|
}
|
|
}
|
|
|
|
// levelize the graph using bfs
|
|
while(!bfs.empty()) {
|
|
|
|
auto u = bfs.front();
|
|
bfs.pop();
|
|
|
|
auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle);
|
|
|
|
for(auto v : u->_successors) {
|
|
auto hv = std::get_if<cudaFlowNode::Capture>(&v->_handle);
|
|
if(--hv->level == 0) {
|
|
hv->level = hu->level + 1;
|
|
if(hv->level > max_level) {
|
|
max_level = hv->level;
|
|
}
|
|
bfs.push(v);
|
|
}
|
|
}
|
|
}
|
|
|
|
// set level_graph and each node's idx
|
|
std::vector<std::vector<cudaFlowNode*>> level_graph(max_level+1);
|
|
for(auto& u : graph._nodes) {
|
|
auto hu = std::get_if<cudaFlowNode::Capture>(&u->_handle);
|
|
hu->lid = level_graph[hu->level].size();
|
|
level_graph[hu->level].emplace_back(u.get());
|
|
|
|
//for(auto s : u->_successors) {
|
|
// assert(hu.level < std::get_if<cudaFlowNode::Capture>(&s->_handle)->level);
|
|
//}
|
|
}
|
|
|
|
return level_graph;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// class definition: cudaFlowSequentialOptimizer
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/**
|
|
@class cudaFlowSequentialOptimizer
|
|
|
|
@brief class to capture a CUDA graph using a sequential stream
|
|
|
|
A sequential capturing algorithm finds a topological order of
|
|
the described graph and captures dependent GPU tasks using a single stream.
|
|
All GPU tasks run sequentially without breaking inter dependencies.
|
|
*/
|
|
class cudaFlowSequentialOptimizer : public cudaFlowOptimizerBase {
|
|
|
|
friend class cudaFlowCapturer;
|
|
|
|
public:
|
|
|
|
/**
|
|
@brief constructs a sequential optimizer
|
|
*/
|
|
cudaFlowSequentialOptimizer() = default;
|
|
|
|
private:
|
|
|
|
cudaGraph_t _optimize(cudaFlowGraph& graph);
|
|
};
|
|
|
|
inline cudaGraph_t cudaFlowSequentialOptimizer::_optimize(cudaFlowGraph& graph) {
|
|
|
|
// acquire per-thread stream and turn it into capture mode
|
|
// we must use ThreadLocal mode to avoid clashing with CUDA global states
|
|
|
|
cudaStream stream;
|
|
|
|
stream.begin_capture(cudaStreamCaptureModeThreadLocal);
|
|
|
|
auto ordered = _toposort(graph);
|
|
for(auto node : ordered) {
|
|
std::get_if<cudaFlowNode::Capture>(&node->_handle)->work(stream);
|
|
}
|
|
|
|
return stream.end_capture();
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// class definition: cudaFlowLinearOptimizer
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/**
|
|
@class cudaFlowLinearOptimizer
|
|
|
|
@brief class to capture a linear CUDA graph using a sequential stream
|
|
|
|
A linear capturing algorithm is a special case of tf::cudaFlowSequentialOptimizer
|
|
and assumes the input task graph to be a single linear chain of tasks
|
|
(i.e., a straight line).
|
|
This assumption allows faster optimization during the capturing process.
|
|
If the input task graph is not a linear chain, the behavior is undefined.
|
|
*/
|
|
class cudaFlowLinearOptimizer : public cudaFlowOptimizerBase {
|
|
|
|
friend class cudaFlowCapturer;
|
|
|
|
public:
|
|
|
|
/**
|
|
@brief constructs a linear optimizer
|
|
*/
|
|
cudaFlowLinearOptimizer() = default;
|
|
|
|
private:
|
|
|
|
cudaGraph_t _optimize(cudaFlowGraph& graph);
|
|
};
|
|
|
|
inline cudaGraph_t cudaFlowLinearOptimizer::_optimize(cudaFlowGraph& graph) {
|
|
|
|
// acquire per-thread stream and turn it into capture mode
|
|
// we must use ThreadLocal mode to avoid clashing with CUDA global states
|
|
cudaStream stream;
|
|
|
|
stream.begin_capture(cudaStreamCaptureModeThreadLocal);
|
|
|
|
// find the source node
|
|
cudaFlowNode* src {nullptr};
|
|
for(auto& u : graph._nodes) {
|
|
if(u->_dependents.size() == 0) {
|
|
src = u.get();
|
|
while(src) {
|
|
std::get_if<cudaFlowNode::Capture>(&src->_handle)->work(stream);
|
|
src = src->_successors.empty() ? nullptr : src->_successors[0];
|
|
}
|
|
break;
|
|
}
|
|
// ideally, there should be only one source
|
|
}
|
|
|
|
return stream.end_capture();
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// class definition: cudaFlowRoundRobinOptimizer
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/**
|
|
@class cudaFlowRoundRobinOptimizer
|
|
|
|
@brief class to capture a CUDA graph using a round-robin algorithm
|
|
|
|
A round-robin capturing algorithm levelizes the user-described graph
|
|
and assign streams to nodes in a round-robin order level by level.
|
|
The algorithm is based on the following paper published in Euro-Par 2021:
|
|
+ Dian-Lun Lin and Tsung-Wei Huang, "Efficient GPU Computation using %Task Graph Parallelism," <i>European Conference on Parallel and Distributed Computing (Euro-Par)</i>, 2021
|
|
|
|
The round-robin optimization algorithm is best suited for large %cudaFlow graphs
|
|
that compose hundreds of or thousands of GPU operations
|
|
(e.g., kernels and memory copies) with many of them being able to run in parallel.
|
|
You can configure the number of streams to the optimizer to adjust the
|
|
maximum kernel currency in the captured CUDA graph.
|
|
*/
|
|
class cudaFlowRoundRobinOptimizer : public cudaFlowOptimizerBase {
|
|
|
|
friend class cudaFlowCapturer;
|
|
|
|
public:
|
|
|
|
/**
|
|
@brief constructs a round-robin optimizer with 4 streams by default
|
|
*/
|
|
cudaFlowRoundRobinOptimizer() = default;
|
|
|
|
/**
|
|
@brief constructs a round-robin optimizer with the given number of streams
|
|
*/
|
|
explicit cudaFlowRoundRobinOptimizer(size_t num_streams);
|
|
|
|
/**
|
|
@brief queries the number of streams used by the optimizer
|
|
*/
|
|
size_t num_streams() const;
|
|
|
|
/**
|
|
@brief sets the number of streams used by the optimizer
|
|
*/
|
|
void num_streams(size_t n);
|
|
|
|
private:
|
|
|
|
size_t _num_streams {4};
|
|
|
|
cudaGraph_t _optimize(cudaFlowGraph& graph);
|
|
|
|
void _reset(std::vector<std::vector<cudaFlowNode*>>& graph);
|
|
|
|
};
|
|
|
|
// Constructor
|
|
inline cudaFlowRoundRobinOptimizer::cudaFlowRoundRobinOptimizer(size_t num_streams) :
|
|
_num_streams {num_streams} {
|
|
|
|
if(num_streams == 0) {
|
|
TF_THROW("number of streams must be at least one");
|
|
}
|
|
}
|
|
|
|
// Function: num_streams
|
|
inline size_t cudaFlowRoundRobinOptimizer::num_streams() const {
|
|
return _num_streams;
|
|
}
|
|
|
|
// Procedure: num_streams
|
|
inline void cudaFlowRoundRobinOptimizer::num_streams(size_t n) {
|
|
if(n == 0) {
|
|
TF_THROW("number of streams must be at least one");
|
|
}
|
|
_num_streams = n;
|
|
}
|
|
|
|
inline void cudaFlowRoundRobinOptimizer::_reset(
|
|
std::vector<std::vector<cudaFlowNode*>>& graph
|
|
) {
|
|
//level == global id
|
|
//idx == stream id we want to skip
|
|
size_t id{0};
|
|
for(auto& each_level: graph) {
|
|
for(auto& node: each_level) {
|
|
auto hn = std::get_if<cudaFlowNode::Capture>(&node->_handle);
|
|
hn->level = id++;
|
|
hn->idx = _num_streams;
|
|
hn->event = nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Function: _optimize
|
|
inline cudaGraph_t cudaFlowRoundRobinOptimizer::_optimize(cudaFlowGraph& graph) {
|
|
|
|
// levelize the graph
|
|
auto levelized = _levelize(graph);
|
|
|
|
// initialize the data structure
|
|
_reset(levelized);
|
|
|
|
// begin to capture
|
|
std::vector<cudaStream> streams(_num_streams);
|
|
|
|
streams[0].begin_capture(cudaStreamCaptureModeThreadLocal);
|
|
|
|
// reserve space for scoped events
|
|
std::vector<cudaEvent> events;
|
|
events.reserve((_num_streams >> 1) + levelized.size());
|
|
|
|
// fork
|
|
cudaEvent_t fork_event = events.emplace_back();
|
|
streams[0].record(fork_event);
|
|
|
|
for(size_t i = 1; i < streams.size(); ++i) {
|
|
streams[i].wait(fork_event);
|
|
}
|
|
|
|
// assign streams to levelized nodes in a round-robin manner
|
|
for(auto& each_level: levelized) {
|
|
for(auto& node: each_level) {
|
|
auto hn = std::get_if<cudaFlowNode::Capture>(&node->_handle);
|
|
size_t sid = hn->lid % _num_streams;
|
|
|
|
//wait events
|
|
cudaFlowNode* wait_node{nullptr};
|
|
for(auto& pn: node->_dependents) {
|
|
auto phn = std::get_if<cudaFlowNode::Capture>(&pn->_handle);
|
|
size_t psid = phn->lid % _num_streams;
|
|
|
|
//level == global id
|
|
//idx == stream id we want to skip
|
|
if(psid == hn->idx) {
|
|
if(wait_node == nullptr ||
|
|
std::get_if<cudaFlowNode::Capture>(&wait_node->_handle)->level < phn->level) {
|
|
wait_node = pn;
|
|
}
|
|
}
|
|
else if(psid != sid) {
|
|
streams[sid].wait(phn->event);
|
|
}
|
|
}
|
|
|
|
if(wait_node != nullptr) {
|
|
assert(std::get_if<cudaFlowNode::Capture>(&wait_node->_handle)->event);
|
|
streams[sid].wait(std::get_if<cudaFlowNode::Capture>(&wait_node->_handle)->event);
|
|
}
|
|
|
|
//capture
|
|
hn->work(streams[sid]);
|
|
|
|
//create/record stream
|
|
for(auto& sn: node->_successors) {
|
|
auto shn = std::get_if<cudaFlowNode::Capture>(&sn->_handle);
|
|
size_t ssid = shn->lid % _num_streams;
|
|
if(ssid != sid) {
|
|
if(!hn->event) {
|
|
hn->event = events.emplace_back();
|
|
streams[sid].record(hn->event);
|
|
}
|
|
//idx == stream id we want to skip
|
|
shn->idx = sid;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// join
|
|
for(size_t i=1; i<_num_streams; ++i) {
|
|
cudaEvent_t join_event = events.emplace_back();
|
|
streams[i].record(join_event);
|
|
streams[0].wait(join_event);
|
|
}
|
|
|
|
return streams[0].end_capture();
|
|
}
|
|
|
|
|
|
} // end of namespace tf -----------------------------------------------------
|
|
|