#pragma once #include "cuda_memory.hpp" #include "cuda_stream.hpp" #include "cuda_meta.hpp" #include "../utility/traits.hpp" namespace tf { // ---------------------------------------------------------------------------- // cudaGraph_t routines // ---------------------------------------------------------------------------- /** @brief gets the memcpy node parameter of a copy task */ template , void>* = nullptr > cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) { using U = std::decay_t; cudaMemcpy3DParms p; p.srcArray = nullptr; p.srcPos = ::make_cudaPos(0, 0, 0); p.srcPtr = ::make_cudaPitchedPtr(const_cast(src), num*sizeof(U), num, 1); p.dstArray = nullptr; p.dstPos = ::make_cudaPos(0, 0, 0); p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1); p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1); p.kind = cudaMemcpyDefault; return p; } /** @brief gets the memcpy node parameter of a memcpy task (untyped) */ inline cudaMemcpy3DParms cuda_get_memcpy_parms( void* tgt, const void* src, size_t bytes ) { // Parameters in cudaPitchedPtr // d - Pointer to allocated memory // p - Pitch of allocated memory in bytes // xsz - Logical width of allocation in elements // ysz - Logical height of allocation in elements cudaMemcpy3DParms p; p.srcArray = nullptr; p.srcPos = ::make_cudaPos(0, 0, 0); p.srcPtr = ::make_cudaPitchedPtr(const_cast(src), bytes, bytes, 1); p.dstArray = nullptr; p.dstPos = ::make_cudaPos(0, 0, 0); p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1); p.extent = ::make_cudaExtent(bytes, 1, 1); p.kind = cudaMemcpyDefault; return p; } /** @brief gets the memset node parameter of a memcpy task (untyped) */ inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) { cudaMemsetParams p; p.dst = dst; p.value = ch; p.pitch = 0; //p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1; //p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count; p.elementSize = 1; // either 1, 2, or 4 p.width = count; p.height = 1; return p; } /** @brief gets the memset node parameter of a fill task (typed) */ template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr > cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) { cudaMemsetParams p; p.dst = dst; // perform bit-wise copy p.value = 0; // crucial static_assert(sizeof(T) <= sizeof(p.value), "internal error"); std::memcpy(&p.value, &value, sizeof(T)); p.pitch = 0; p.elementSize = sizeof(T); // either 1, 2, or 4 p.width = count; p.height = 1; return p; } /** @brief gets the memset node parameter of a zero task (typed) */ template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr > cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) { cudaMemsetParams p; p.dst = dst; p.value = 0; p.pitch = 0; p.elementSize = sizeof(T); // either 1, 2, or 4 p.width = count; p.height = 1; return p; } /** @brief queries the number of root nodes in a native CUDA graph */ inline size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph) { size_t num_nodes; TF_CHECK_CUDA( cudaGraphGetRootNodes(graph, nullptr, &num_nodes), "failed to get native graph root nodes" ); return num_nodes; } /** @brief queries the number of nodes in a native CUDA graph */ inline size_t cuda_graph_get_num_nodes(cudaGraph_t graph) { size_t num_nodes; TF_CHECK_CUDA( cudaGraphGetNodes(graph, nullptr, &num_nodes), "failed to get native graph nodes" ); return num_nodes; } /** @brief queries the number of edges in a native CUDA graph */ inline size_t cuda_graph_get_num_edges(cudaGraph_t graph) { size_t num_edges; TF_CHECK_CUDA( cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges), "failed to get native graph edges" ); return num_edges; } /** @brief acquires the nodes in a native CUDA graph */ inline std::vector cuda_graph_get_nodes(cudaGraph_t graph) { size_t num_nodes = cuda_graph_get_num_nodes(graph); std::vector nodes(num_nodes); TF_CHECK_CUDA( cudaGraphGetNodes(graph, nodes.data(), &num_nodes), "failed to get native graph nodes" ); return nodes; } /** @brief acquires the root nodes in a native CUDA graph */ inline std::vector cuda_graph_get_root_nodes(cudaGraph_t graph) { size_t num_nodes = cuda_graph_get_num_root_nodes(graph); std::vector nodes(num_nodes); TF_CHECK_CUDA( cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes), "failed to get native graph nodes" ); return nodes; } /** @brief acquires the edges in a native CUDA graph */ inline std::vector> cuda_graph_get_edges(cudaGraph_t graph) { size_t num_edges = cuda_graph_get_num_edges(graph); std::vector froms(num_edges), tos(num_edges); TF_CHECK_CUDA( cudaGraphGetEdges(graph, froms.data(), tos.data(), &num_edges), "failed to get native graph edges" ); std::vector> edges(num_edges); for(size_t i=0; i void cuda_dump_graph(T& os, cudaGraph_t g) { os << "digraph cudaGraph {\n"; std::stack> stack; stack.push(std::make_tuple(g, nullptr, 1)); int pl = 0; while(stack.empty() == false) { auto [graph, parent, l] = stack.top(); stack.pop(); for(int i=0; i " << 'p' << to << ";\n"; } for(auto& node : nodes) { auto type = cuda_get_graph_node_type(node); if(type == cudaGraphNodeTypeGraph) { cudaGraph_t child_graph; TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &child_graph), ""); stack.push(std::make_tuple(child_graph, node, l+1)); os << 'p' << node << "[" << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, " << "label=\"cudaGraph-L" << l+1 << "\"];\n"; } else { os << 'p' << node << "[label=\"" << cuda_graph_node_type_to_string(type) << "\"];\n"; } } // precede to parent if(parent != nullptr) { std::unordered_set successors; for(const auto& p : edges) { successors.insert(p.first); } for(auto node : nodes) { if(successors.find(node) == successors.end()) { os << 'p' << node << " -> " << 'p' << parent << ";\n"; } } } // set the previous level pl = l; } for(int i=0; i<=pl; i++) { os << "}\n"; } } // ---------------------------------------------------------------------------- // cudaGraph // ---------------------------------------------------------------------------- /** @private */ struct cudaGraphCreator { cudaGraph_t operator () () const { cudaGraph_t g; TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create a CUDA native graph"); return g; } }; /** @private */ struct cudaGraphDeleter { void operator () (cudaGraph_t g) const { if(g) { cudaGraphDestroy(g); } } }; /** @class cudaGraph @brief class to create an RAII-styled wrapper over a CUDA executable graph A cudaGraph object is an RAII-styled wrapper over a native CUDA graph (@c cudaGraph_t). A cudaGraph object is move-only. */ class cudaGraph : public cudaObject { public: /** @brief constructs an RAII-styled object from the given CUDA exec Constructs a cudaGraph object from the given CUDA graph @c native. */ explicit cudaGraph(cudaGraph_t native) : cudaObject(native) { } /** @brief constructs a cudaGraph object with a new CUDA graph */ cudaGraph() = default; }; // ---------------------------------------------------------------------------- // cudaGraphExec // ---------------------------------------------------------------------------- /** @private */ struct cudaGraphExecCreator { cudaGraphExec_t operator () () const { return nullptr; } }; /** @private */ struct cudaGraphExecDeleter { void operator () (cudaGraphExec_t executable) const { if(executable) { cudaGraphExecDestroy(executable); } } }; /** @class cudaGraphExec @brief class to create an RAII-styled wrapper over a CUDA executable graph A cudaGraphExec object is an RAII-styled wrapper over a native CUDA executable graph (@c cudaGraphExec_t). A cudaGraphExec object is move-only. */ class cudaGraphExec : public cudaObject { public: /** @brief constructs an RAII-styled object from the given CUDA exec Constructs a cudaGraphExec object which owns @c exec. */ explicit cudaGraphExec(cudaGraphExec_t exec) : cudaObject(exec) { } /** @brief default constructor */ cudaGraphExec() = default; /** @brief instantiates the executable from the given CUDA graph */ void instantiate(cudaGraph_t graph) { cudaGraphExecDeleter {} (object); TF_CHECK_CUDA( cudaGraphInstantiate(&object, graph, nullptr, nullptr, 0), "failed to create an executable graph" ); } /** @brief updates the executable from the given CUDA graph */ cudaGraphExecUpdateResult update(cudaGraph_t graph) { cudaGraphNode_t error_node; cudaGraphExecUpdateResult error_result; cudaGraphExecUpdate(object, graph, &error_node, &error_result); return error_result; } /** @brief launches the executable graph via the given stream */ void launch(cudaStream_t stream) { TF_CHECK_CUDA( cudaGraphLaunch(object, stream), "failed to launch a CUDA executable graph" ); } }; // ---------------------------------------------------------------------------- // cudaFlowGraph class // ---------------------------------------------------------------------------- // class: cudaFlowGraph class cudaFlowGraph { friend class cudaFlowNode; friend class cudaTask; friend class cudaFlowCapturer; friend class cudaFlow; friend class cudaFlowOptimizerBase; friend class cudaFlowSequentialOptimizer; friend class cudaFlowLinearOptimizer; friend class cudaFlowRoundRobinOptimizer; friend class Taskflow; friend class Executor; constexpr static int OFFLOADED = 0x01; constexpr static int CHANGED = 0x02; constexpr static int UPDATED = 0x04; public: cudaFlowGraph() = default; ~cudaFlowGraph() = default; cudaFlowGraph(const cudaFlowGraph&) = delete; cudaFlowGraph(cudaFlowGraph&&) = default; cudaFlowGraph& operator = (const cudaFlowGraph&) = delete; cudaFlowGraph& operator = (cudaFlowGraph&&) = default; template cudaFlowNode* emplace_back(ArgsT&&...); bool empty() const; void clear(); void dump(std::ostream&, const void*, const std::string&) const ; private: int _state{CHANGED}; cudaGraph _native_handle {nullptr}; std::vector> _nodes; }; // ---------------------------------------------------------------------------- // cudaFlowNode class // ---------------------------------------------------------------------------- /** @private @class: cudaFlowNode */ class cudaFlowNode { friend class cudaFlowGraph; friend class cudaTask; friend class cudaFlow; friend class cudaFlowCapturer; friend class cudaFlowOptimizerBase; friend class cudaFlowSequentialOptimizer; friend class cudaFlowLinearOptimizer; friend class cudaFlowRoundRobinOptimizer; friend class Taskflow; friend class Executor; // Empty handle struct Empty { }; // Host handle struct Host { template Host(C&&); std::function func; static void callback(void*); }; // Memset handle struct Memset { }; // Memcpy handle struct Memcpy { }; // Kernel handle struct Kernel { template Kernel(F&& f); void* func {nullptr}; }; // Subflow handle struct Subflow { cudaFlowGraph cfg; }; // Capture struct Capture { template Capture(C&&); std::function work; cudaEvent_t event; size_t level; size_t lid; size_t idx; }; using handle_t = std::variant< Empty, Host, Memset, Memcpy, Kernel, Subflow, Capture >; public: // variant index constexpr static auto EMPTY = get_index_v; constexpr static auto HOST = get_index_v; constexpr static auto MEMSET = get_index_v; constexpr static auto MEMCPY = get_index_v; constexpr static auto KERNEL = get_index_v; constexpr static auto SUBFLOW = get_index_v; constexpr static auto CAPTURE = get_index_v; cudaFlowNode() = delete; template cudaFlowNode(cudaFlowGraph&, ArgsT&&...); private: cudaFlowGraph& _cfg; std::string _name; handle_t _handle; cudaGraphNode_t _native_handle {nullptr}; SmallVector _successors; SmallVector _dependents; void _precede(cudaFlowNode*); }; // ---------------------------------------------------------------------------- // cudaFlowNode definitions // ---------------------------------------------------------------------------- // Host handle constructor template cudaFlowNode::Host::Host(C&& c) : func {std::forward(c)} { } // Host callback inline void cudaFlowNode::Host::callback(void* data) { static_cast(data)->func(); }; // Kernel handle constructor template cudaFlowNode::Kernel::Kernel(F&& f) : func {std::forward(f)} { } // Capture handle constructor template cudaFlowNode::Capture::Capture(C&& c) : work {std::forward(c)} { } // Constructor template cudaFlowNode::cudaFlowNode(cudaFlowGraph& graph, ArgsT&&... args) : _cfg {graph}, _handle {std::forward(args)...} { } // Procedure: _precede inline void cudaFlowNode::_precede(cudaFlowNode* v) { _cfg._state |= cudaFlowGraph::CHANGED; _successors.push_back(v); v->_dependents.push_back(this); // capture node doesn't have the native graph yet if(_handle.index() != cudaFlowNode::CAPTURE) { TF_CHECK_CUDA( cudaGraphAddDependencies( _cfg._native_handle, &_native_handle, &v->_native_handle, 1 ), "failed to add a preceding link ", this, "->", v ); } } // ---------------------------------------------------------------------------- // cudaGraph definitions // ---------------------------------------------------------------------------- // Function: empty inline bool cudaFlowGraph::empty() const { return _nodes.empty(); } // Procedure: clear inline void cudaFlowGraph::clear() { _state |= cudaFlowGraph::CHANGED; _nodes.clear(); _native_handle.clear(); } // Function: emplace_back template cudaFlowNode* cudaFlowGraph::emplace_back(ArgsT&&... args) { _state |= cudaFlowGraph::CHANGED; auto node = std::make_unique(std::forward(args)...); _nodes.emplace_back(std::move(node)); return _nodes.back().get(); // TODO: use object pool to save memory //auto node = new cudaFlowNode(std::forward(args)...); //_nodes.push_back(node); //return node; } // Procedure: dump the graph to a DOT format inline void cudaFlowGraph::dump( std::ostream& os, const void* root, const std::string& root_name ) const { // recursive dump with stack std::stack> stack; stack.push(std::make_tuple(this, nullptr, 1)); int pl = 0; while(!stack.empty()) { auto [graph, parent, l] = stack.top(); stack.pop(); for(int i=0; i_name.empty()) os << 'p' << parent; else os << parent->_name; os << "\";\n" << "color=\"purple\"\n"; } for(auto& node : graph->_nodes) { auto v = node.get(); os << 'p' << v << "[label=\""; if(v->_name.empty()) { os << 'p' << v << "\""; } else { os << v->_name << "\""; } switch(v->_handle.index()) { case cudaFlowNode::KERNEL: os << " style=\"filled\"" << " color=\"white\" fillcolor=\"black\"" << " fontcolor=\"white\"" << " shape=\"box3d\""; break; case cudaFlowNode::SUBFLOW: stack.push(std::make_tuple( &(std::get_if(&v->_handle)->cfg), v, l+1) ); os << " style=\"filled\"" << " color=\"black\" fillcolor=\"purple\"" << " fontcolor=\"white\"" << " shape=\"folder\""; break; default: break; } os << "];\n"; for(const auto s : v->_successors) { os << 'p' << v << " -> " << 'p' << s << ";\n"; } if(v->_successors.size() == 0) { if(parent == nullptr) { if(root) { os << 'p' << v << " -> p" << root << ";\n"; } } else { os << 'p' << v << " -> p" << parent << ";\n"; } } } // set the previous level pl = l; } for(int i=0; i