// mesytec-mnode/external/taskflow-3.8.0/taskflow/cuda/cuda_graph.hpp
#pragma once
#include "cuda_memory.hpp"
#include "cuda_stream.hpp"
#include "cuda_meta.hpp"
#include "../utility/traits.hpp"
namespace tf {
// ----------------------------------------------------------------------------
// cudaGraph_t routines
// ----------------------------------------------------------------------------
/**
@brief gets the memcpy node parameter of a copy task
*/
template <typename T,
std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
>
cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) {
using U = std::decay_t<T>;
cudaMemcpy3DParms p;
p.srcArray = nullptr;
p.srcPos = ::make_cudaPos(0, 0, 0);
p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);
p.dstArray = nullptr;
p.dstPos = ::make_cudaPos(0, 0, 0);
p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
p.kind = cudaMemcpyDefault;
return p;
}
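// Example (illustrative sketch, not part of the library interface): the
// parameters returned by cuda_get_copy_parms can be attached to a native CUDA
// graph with cudaGraphAddMemcpyNode; the graph and buffer names below are
// hypothetical.
//
//   cudaGraph_t g;
//   TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create graph");
//   float *dst, *src;
//   TF_CHECK_CUDA(cudaMalloc(&dst, 1024*sizeof(float)), "failed to allocate dst");
//   TF_CHECK_CUDA(cudaMalloc(&src, 1024*sizeof(float)), "failed to allocate src");
//   auto parms = tf::cuda_get_copy_parms(dst, src, 1024);
//   cudaGraphNode_t copy_node;
//   TF_CHECK_CUDA(
//     cudaGraphAddMemcpyNode(&copy_node, g, nullptr, 0, &parms),
//     "failed to add a memcpy node"
//   );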
/**
@brief gets the memcpy node parameter of a memcpy task (untyped)
*/
inline cudaMemcpy3DParms cuda_get_memcpy_parms(
void* tgt, const void* src, size_t bytes
) {
// Parameters in cudaPitchedPtr
// d - Pointer to allocated memory
// p - Pitch of allocated memory in bytes
// xsz - Logical width of allocation in elements
// ysz - Logical height of allocation in elements
cudaMemcpy3DParms p;
p.srcArray = nullptr;
p.srcPos = ::make_cudaPos(0, 0, 0);
p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
p.dstArray = nullptr;
p.dstPos = ::make_cudaPos(0, 0, 0);
p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
p.extent = ::make_cudaExtent(bytes, 1, 1);
p.kind = cudaMemcpyDefault;
return p;
}
/**
@brief gets the memset node parameter of a memset task (untyped)
*/
inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) {
cudaMemsetParams p;
p.dst = dst;
p.value = ch;
p.pitch = 0;
//p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
//p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
p.elementSize = 1; // either 1, 2, or 4
p.width = count;
p.height = 1;
return p;
}
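// Example (illustrative sketch, hypothetical names): the cudaMemsetParams
// returned by cuda_get_memset_parms can back a memset node added through
// cudaGraphAddMemsetNode.
//
//   cudaGraph_t g;
//   TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create graph");
//   unsigned char* buf;
//   TF_CHECK_CUDA(cudaMalloc(&buf, 256), "failed to allocate buf");
//   auto parms = tf::cuda_get_memset_parms(buf, 0xFF, 256);  // set 256 bytes to 0xFF
//   cudaGraphNode_t memset_node;
//   TF_CHECK_CUDA(
//     cudaGraphAddMemsetNode(&memset_node, g, nullptr, 0, &parms),
//     "failed to add a memset node"
//   );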
/**
@brief gets the memset node parameter of a fill task (typed)
*/
template <typename T, std::enable_if_t<
is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
>
cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) {
cudaMemsetParams p;
p.dst = dst;
// perform bit-wise copy
p.value = 0; // crucial
static_assert(sizeof(T) <= sizeof(p.value), "internal error");
std::memcpy(&p.value, &value, sizeof(T));
p.pitch = 0;
p.elementSize = sizeof(T); // either 1, 2, or 4
p.width = count;
p.height = 1;
return p;
}
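// Example (illustrative sketch, assuming g is an existing cudaGraph_t): because
// cuda_get_fill_parms bit-copies the typed value into cudaMemsetParams::value,
// a memset node can replicate a 4-byte pattern such as a float across a buffer.
//
//   float* data;
//   TF_CHECK_CUDA(cudaMalloc(&data, 1024*sizeof(float)), "failed to allocate data");
//   auto parms = tf::cuda_get_fill_parms(data, 3.14f, 1024);  // every element becomes 3.14f
//   cudaGraphNode_t fill_node;
//   TF_CHECK_CUDA(
//     cudaGraphAddMemsetNode(&fill_node, g, nullptr, 0, &parms),
//     "failed to add a fill node"
//   );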
/**
@brief gets the memset node parameter of a zero task (typed)
*/
template <typename T, std::enable_if_t<
is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
>
cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) {
cudaMemsetParams p;
p.dst = dst;
p.value = 0;
p.pitch = 0;
p.elementSize = sizeof(T); // either 1, 2, or 4
p.width = count;
p.height = 1;
return p;
}
/**
@brief queries the number of root nodes in a native CUDA graph
*/
inline size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph) {
size_t num_nodes;
TF_CHECK_CUDA(
cudaGraphGetRootNodes(graph, nullptr, &num_nodes),
"failed to get native graph root nodes"
);
return num_nodes;
}
/**
@brief queries the number of nodes in a native CUDA graph
*/
inline size_t cuda_graph_get_num_nodes(cudaGraph_t graph) {
size_t num_nodes;
TF_CHECK_CUDA(
cudaGraphGetNodes(graph, nullptr, &num_nodes),
"failed to get native graph nodes"
);
return num_nodes;
}
/**
@brief queries the number of edges in a native CUDA graph
*/
inline size_t cuda_graph_get_num_edges(cudaGraph_t graph) {
size_t num_edges;
TF_CHECK_CUDA(
cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges),
"failed to get native graph edges"
);
return num_edges;
}
/**
@brief acquires the nodes in a native CUDA graph
*/
inline std::vector<cudaGraphNode_t> cuda_graph_get_nodes(cudaGraph_t graph) {
size_t num_nodes = cuda_graph_get_num_nodes(graph);
std::vector<cudaGraphNode_t> nodes(num_nodes);
TF_CHECK_CUDA(
cudaGraphGetNodes(graph, nodes.data(), &num_nodes),
"failed to get native graph nodes"
);
return nodes;
}
/**
@brief acquires the root nodes in a native CUDA graph
*/
inline std::vector<cudaGraphNode_t> cuda_graph_get_root_nodes(cudaGraph_t graph) {
size_t num_nodes = cuda_graph_get_num_root_nodes(graph);
std::vector<cudaGraphNode_t> nodes(num_nodes);
TF_CHECK_CUDA(
cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes),
"failed to get native graph nodes"
);
return nodes;
}
/**
@brief acquires the edges in a native CUDA graph
*/
inline std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>
cuda_graph_get_edges(cudaGraph_t graph) {
size_t num_edges = cuda_graph_get_num_edges(graph);
std::vector<cudaGraphNode_t> froms(num_edges), tos(num_edges);
TF_CHECK_CUDA(
cudaGraphGetEdges(graph, froms.data(), tos.data(), &num_edges),
"failed to get native graph edges"
);
std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> edges(num_edges);
for(size_t i=0; i<num_edges; i++) {
edges[i] = std::make_pair(froms[i], tos[i]);
}
return edges;
}
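// Example (illustrative sketch, assuming g is an existing cudaGraph_t and
// <unordered_map> is included): the node and edge queries above can rebuild the
// graph's adjacency on the host side.
//
//   std::unordered_map<cudaGraphNode_t, std::vector<cudaGraphNode_t>> adj;
//   for(auto node : tf::cuda_graph_get_nodes(g)) {
//     adj[node];  // make sure sink nodes appear even without outgoing edges
//   }
//   for(auto& [from, to] : tf::cuda_graph_get_edges(g)) {
//     adj[from].push_back(to);
//   }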
/**
@brief queries the type of a native CUDA graph node
valid type values are:
+ cudaGraphNodeTypeKernel = 0x00
+ cudaGraphNodeTypeMemcpy = 0x01
+ cudaGraphNodeTypeMemset = 0x02
+ cudaGraphNodeTypeHost = 0x03
+ cudaGraphNodeTypeGraph = 0x04
+ cudaGraphNodeTypeEmpty = 0x05
+ cudaGraphNodeTypeWaitEvent = 0x06
+ cudaGraphNodeTypeEventRecord = 0x07
*/
inline cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node) {
cudaGraphNodeType type;
TF_CHECK_CUDA(
cudaGraphNodeGetType(node, &type), "failed to get native graph node type"
);
return type;
}
/**
@brief converts the type of a native CUDA graph node to a readable string
*/
inline const char* cuda_graph_node_type_to_string(cudaGraphNodeType type) {
switch(type) {
case cudaGraphNodeTypeKernel : return "kernel";
case cudaGraphNodeTypeMemcpy : return "memcpy";
case cudaGraphNodeTypeMemset : return "memset";
case cudaGraphNodeTypeHost : return "host";
case cudaGraphNodeTypeGraph : return "graph";
case cudaGraphNodeTypeEmpty : return "empty";
case cudaGraphNodeTypeWaitEvent : return "event_wait";
case cudaGraphNodeTypeEventRecord : return "event_record";
default : return "undefined";
}
}
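// Example (illustrative sketch, assuming g is an existing cudaGraph_t and
// <iostream> is included): the two helpers above combine into a quick textual
// summary of a graph's nodes.
//
//   for(auto node : tf::cuda_graph_get_nodes(g)) {
//     std::cout << node << ": "
//               << tf::cuda_graph_node_type_to_string(tf::cuda_get_graph_node_type(node))
//               << '\n';
//   }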
/**
@brief dumps a native CUDA graph and all associated child graphs to a DOT format
@tparam T output stream target
@param os target output stream
@param g native CUDA graph
*/
template <typename T>
void cuda_dump_graph(T& os, cudaGraph_t g) {
os << "digraph cudaGraph {\n";
std::stack<std::tuple<cudaGraph_t, cudaGraphNode_t, int>> stack;
stack.push(std::make_tuple(g, nullptr, 1));
int pl = 0;
while(stack.empty() == false) {
auto [graph, parent, l] = stack.top();
stack.pop();
for(int i=0; i<pl-l+1; i++) {
os << "}\n";
}
os << "subgraph cluster_p" << graph << " {\n"
<< "label=\"cudaGraph-L" << l << "\";\n"
<< "color=\"purple\";\n";
auto nodes = cuda_graph_get_nodes(graph);
auto edges = cuda_graph_get_edges(graph);
for(auto& [from, to] : edges) {
os << 'p' << from << " -> " << 'p' << to << ";\n";
}
for(auto& node : nodes) {
auto type = cuda_get_graph_node_type(node);
if(type == cudaGraphNodeTypeGraph) {
cudaGraph_t child_graph;
TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &child_graph), "failed to get child graph");
stack.push(std::make_tuple(child_graph, node, l+1));
os << 'p' << node << "["
<< "shape=folder, style=filled, fontcolor=white, fillcolor=purple, "
<< "label=\"cudaGraph-L" << l+1
<< "\"];\n";
}
else {
os << 'p' << node << "[label=\""
<< cuda_graph_node_type_to_string(type)
<< "\"];\n";
}
}
// connect nodes without outgoing edges to the parent child-graph node
if(parent != nullptr) {
std::unordered_set<cudaGraphNode_t> successors;  // nodes with at least one successor
for(const auto& p : edges) {
successors.insert(p.first);
}
for(auto node : nodes) {
if(successors.find(node) == successors.end()) {
os << 'p' << node << " -> " << 'p' << parent << ";\n";
}
}
}
// set the previous level
pl = l;
}
for(int i=0; i<=pl; i++) {
os << "}\n";
}
}
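// Example (illustrative sketch, assuming g is an existing cudaGraph_t and
// <fstream> is included): cuda_dump_graph writes GraphViz DOT text to any
// std::ostream-like target.
//
//   std::ofstream ofs("graph.dot");
//   tf::cuda_dump_graph(ofs, g);
//   // render offline, e.g.: dot -Tpng graph.dot -o graph.png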
// ----------------------------------------------------------------------------
// cudaGraph
// ----------------------------------------------------------------------------
/**
@private
*/
struct cudaGraphCreator {
cudaGraph_t operator () () const {
cudaGraph_t g;
TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create a CUDA native graph");
return g;
}
};
/**
@private
*/
struct cudaGraphDeleter {
void operator () (cudaGraph_t g) const {
if(g) {
cudaGraphDestroy(g);
}
}
};
/**
@class cudaGraph
@brief class to create an RAII-styled wrapper over a native CUDA graph
A cudaGraph object is an RAII-styled wrapper over
a native CUDA graph (@c cudaGraph_t).
A cudaGraph object is move-only.
*/
class cudaGraph :
public cudaObject<cudaGraph_t, cudaGraphCreator, cudaGraphDeleter> {
public:
/**
@brief constructs an RAII-styled object from the given native CUDA graph
Constructs a cudaGraph object from the given CUDA graph @c native.
*/
explicit cudaGraph(cudaGraph_t native) : cudaObject(native) { }
/**
@brief constructs a cudaGraph object with a new CUDA graph
*/
cudaGraph() = default;
};
// ----------------------------------------------------------------------------
// cudaGraphExec
// ----------------------------------------------------------------------------
/**
@private
*/
struct cudaGraphExecCreator {
cudaGraphExec_t operator () () const { return nullptr; }
};
/**
@private
*/
struct cudaGraphExecDeleter {
void operator () (cudaGraphExec_t executable) const {
if(executable) {
cudaGraphExecDestroy(executable);
}
}
};
/**
@class cudaGraphExec
@brief class to create an RAII-styled wrapper over a CUDA executable graph
A cudaGraphExec object is an RAII-styled wrapper over
a native CUDA executable graph (@c cudaGraphExec_t).
A cudaGraphExec object is move-only.
*/
class cudaGraphExec :
public cudaObject<cudaGraphExec_t, cudaGraphExecCreator, cudaGraphExecDeleter> {
public:
/**
@brief constructs an RAII-styled object from the given CUDA executable graph
Constructs a cudaGraphExec object which owns @c exec.
*/
explicit cudaGraphExec(cudaGraphExec_t exec) : cudaObject(exec) { }
/**
@brief default constructor
*/
cudaGraphExec() = default;
/**
@brief instantiates the executable from the given CUDA graph
*/
void instantiate(cudaGraph_t graph) {
cudaGraphExecDeleter {} (object);
TF_CHECK_CUDA(
cudaGraphInstantiate(&object, graph, nullptr, nullptr, 0),
"failed to create an executable graph"
);
}
/**
@brief updates the executable from the given CUDA graph
*/
cudaGraphExecUpdateResult update(cudaGraph_t graph) {
cudaGraphNode_t error_node;
cudaGraphExecUpdateResult error_result;
cudaGraphExecUpdate(object, graph, &error_node, &error_result);
return error_result;
}
/**
@brief launches the executable graph via the given stream
*/
void launch(cudaStream_t stream) {
TF_CHECK_CUDA(
cudaGraphLaunch(object, stream), "failed to launch a CUDA executable graph"
);
}
};
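// Example (illustrative sketch, not taken from the library documentation): a
// typical lifetime of the two RAII wrappers above, relying on cudaObject's
// implicit conversion to the native handle as used elsewhere in this header.
//
//   tf::cudaGraph graph;                 // creates an empty cudaGraph_t
//   // ... add nodes, e.g. via cudaGraphAddMemsetNode(..., graph, ...)
//   tf::cudaGraphExec exec;
//   exec.instantiate(graph);             // build the executable graph
//   cudaStream_t stream;
//   TF_CHECK_CUDA(cudaStreamCreate(&stream), "failed to create stream");
//   exec.launch(stream);                 // run the graph on the stream
//   TF_CHECK_CUDA(cudaStreamSynchronize(stream), "failed to synchronize stream");
//   TF_CHECK_CUDA(cudaStreamDestroy(stream), "failed to destroy stream");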
// ----------------------------------------------------------------------------
// cudaFlowGraph class
// ----------------------------------------------------------------------------
// class: cudaFlowGraph
class cudaFlowGraph {
friend class cudaFlowNode;
friend class cudaTask;
friend class cudaFlowCapturer;
friend class cudaFlow;
friend class cudaFlowOptimizerBase;
friend class cudaFlowSequentialOptimizer;
friend class cudaFlowLinearOptimizer;
friend class cudaFlowRoundRobinOptimizer;
friend class Taskflow;
friend class Executor;
constexpr static int OFFLOADED = 0x01;
constexpr static int CHANGED = 0x02;
constexpr static int UPDATED = 0x04;
public:
cudaFlowGraph() = default;
~cudaFlowGraph() = default;
cudaFlowGraph(const cudaFlowGraph&) = delete;
cudaFlowGraph(cudaFlowGraph&&) = default;
cudaFlowGraph& operator = (const cudaFlowGraph&) = delete;
cudaFlowGraph& operator = (cudaFlowGraph&&) = default;
template <typename... ArgsT>
cudaFlowNode* emplace_back(ArgsT&&...);
bool empty() const;
void clear();
void dump(std::ostream&, const void*, const std::string&) const;
private:
int _state{CHANGED};
cudaGraph _native_handle {nullptr};
std::vector<std::unique_ptr<cudaFlowNode>> _nodes;
};
// ----------------------------------------------------------------------------
// cudaFlowNode class
// ----------------------------------------------------------------------------
/**
@private
@class cudaFlowNode
*/
class cudaFlowNode {
friend class cudaFlowGraph;
friend class cudaTask;
friend class cudaFlow;
friend class cudaFlowCapturer;
friend class cudaFlowOptimizerBase;
friend class cudaFlowSequentialOptimizer;
friend class cudaFlowLinearOptimizer;
friend class cudaFlowRoundRobinOptimizer;
friend class Taskflow;
friend class Executor;
// Empty handle
struct Empty {
};
// Host handle
struct Host {
template <typename C>
Host(C&&);
std::function<void()> func;
static void callback(void*);
};
// Memset handle
struct Memset {
};
// Memcpy handle
struct Memcpy {
};
// Kernel handle
struct Kernel {
template <typename F>
Kernel(F&& f);
void* func {nullptr};
};
// Subflow handle
struct Subflow {
cudaFlowGraph cfg;
};
// Capture
struct Capture {
template <typename C>
Capture(C&&);
std::function<void(cudaStream_t)> work;
cudaEvent_t event;
size_t level;
size_t lid;
size_t idx;
};
using handle_t = std::variant<
Empty,
Host,
Memset,
Memcpy,
Kernel,
Subflow,
Capture
>;
public:
// variant index
constexpr static auto EMPTY = get_index_v<Empty, handle_t>;
constexpr static auto HOST = get_index_v<Host, handle_t>;
constexpr static auto MEMSET = get_index_v<Memset, handle_t>;
constexpr static auto MEMCPY = get_index_v<Memcpy, handle_t>;
constexpr static auto KERNEL = get_index_v<Kernel, handle_t>;
constexpr static auto SUBFLOW = get_index_v<Subflow, handle_t>;
constexpr static auto CAPTURE = get_index_v<Capture, handle_t>;
cudaFlowNode() = delete;
template <typename... ArgsT>
cudaFlowNode(cudaFlowGraph&, ArgsT&&...);
private:
cudaFlowGraph& _cfg;
std::string _name;
handle_t _handle;
cudaGraphNode_t _native_handle {nullptr};
SmallVector<cudaFlowNode*> _successors;
SmallVector<cudaFlowNode*> _dependents;
void _precede(cudaFlowNode*);
};
// ----------------------------------------------------------------------------
// cudaFlowNode definitions
// ----------------------------------------------------------------------------
// Host handle constructor
template <typename C>
cudaFlowNode::Host::Host(C&& c) : func {std::forward<C>(c)} {
}
// Host callback
inline void cudaFlowNode::Host::callback(void* data) {
static_cast<Host*>(data)->func();
}
// Kernel handle constructor
template <typename F>
cudaFlowNode::Kernel::Kernel(F&& f) :
func {std::forward<F>(f)} {
}
// Capture handle constructor
template <typename C>
cudaFlowNode::Capture::Capture(C&& c) :
work {std::forward<C>(c)} {
}
// Constructor
template <typename... ArgsT>
cudaFlowNode::cudaFlowNode(cudaFlowGraph& graph, ArgsT&&... args) :
_cfg {graph},
_handle {std::forward<ArgsT>(args)...} {
}
// Procedure: _precede
inline void cudaFlowNode::_precede(cudaFlowNode* v) {
_cfg._state |= cudaFlowGraph::CHANGED;
_successors.push_back(v);
v->_dependents.push_back(this);
// capture node doesn't have the native graph yet
if(_handle.index() != cudaFlowNode::CAPTURE) {
TF_CHECK_CUDA(
cudaGraphAddDependencies(
_cfg._native_handle, &_native_handle, &v->_native_handle, 1
),
"failed to add a preceding link ", this, "->", v
);
}
}
// ----------------------------------------------------------------------------
// cudaFlowGraph definitions
// ----------------------------------------------------------------------------
// Function: empty
inline bool cudaFlowGraph::empty() const {
return _nodes.empty();
}
// Procedure: clear
inline void cudaFlowGraph::clear() {
_state |= cudaFlowGraph::CHANGED;
_nodes.clear();
_native_handle.clear();
}
// Function: emplace_back
template <typename... ArgsT>
cudaFlowNode* cudaFlowGraph::emplace_back(ArgsT&&... args) {
_state |= cudaFlowGraph::CHANGED;
auto node = std::make_unique<cudaFlowNode>(std::forward<ArgsT>(args)...);
_nodes.emplace_back(std::move(node));
return _nodes.back().get();
// TODO: use object pool to save memory
//auto node = new cudaFlowNode(std::forward<ArgsT>(args)...);
//_nodes.push_back(node);
//return node;
}
// Procedure: dump the graph to a DOT format
inline void cudaFlowGraph::dump(
std::ostream& os, const void* root, const std::string& root_name
) const {
// recursive dump with stack
std::stack<std::tuple<const cudaFlowGraph*, const cudaFlowNode*, int>> stack;
stack.push(std::make_tuple(this, nullptr, 1));
int pl = 0;
while(!stack.empty()) {
auto [graph, parent, l] = stack.top();
stack.pop();
for(int i=0; i<pl-l+1; i++) {
os << "}\n";
}
if(parent == nullptr) {
if(root) {
os << "subgraph cluster_p" << root << " {\nlabel=\"cudaFlow: ";
if(root_name.empty()) os << 'p' << root;
else os << root_name;
os << "\";\n" << "color=\"purple\"\n";
}
else {
os << "digraph cudaFlow {\n";
}
}
else {
os << "subgraph cluster_p" << parent << " {\nlabel=\"cudaSubflow: ";
if(parent->_name.empty()) os << 'p' << parent;
else os << parent->_name;
os << "\";\n" << "color=\"purple\"\n";
}
for(auto& node : graph->_nodes) {
auto v = node.get();
os << 'p' << v << "[label=\"";
if(v->_name.empty()) {
os << 'p' << v << "\"";
}
else {
os << v->_name << "\"";
}
switch(v->_handle.index()) {
case cudaFlowNode::KERNEL:
os << " style=\"filled\""
<< " color=\"white\" fillcolor=\"black\""
<< " fontcolor=\"white\""
<< " shape=\"box3d\"";
break;
case cudaFlowNode::SUBFLOW:
stack.push(std::make_tuple(
&(std::get_if<cudaFlowNode::Subflow>(&v->_handle)->cfg), v, l+1)
);
os << " style=\"filled\""
<< " color=\"black\" fillcolor=\"purple\""
<< " fontcolor=\"white\""
<< " shape=\"folder\"";
break;
default:
break;
}
os << "];\n";
for(const auto s : v->_successors) {
os << 'p' << v << " -> " << 'p' << s << ";\n";
}
if(v->_successors.size() == 0) {
if(parent == nullptr) {
if(root) {
os << 'p' << v << " -> p" << root << ";\n";
}
}
else {
os << 'p' << v << " -> p" << parent << ";\n";
}
}
}
// set the previous level
pl = l;
}
for(int i=0; i<pl; i++) {
os << "}\n";
}
}
} // end of namespace tf -----------------------------------------------------