#pragma once #include "cuda_task.hpp" #include "cuda_optimizer.hpp" /** @file cuda_capturer.hpp @brief %cudaFlow capturer include file */ namespace tf { // ---------------------------------------------------------------------------- // class definition: cudaFlowCapturer // ---------------------------------------------------------------------------- /** @class cudaFlowCapturer @brief class to create a %cudaFlow graph using stream capture The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous CUDA operations through the given stream. The following example creates a CUDA graph that captures two kernel tasks, @c task_1 and @c task_2, where @c task_1 runs before @c task_2. @code{.cpp} taskflow.emplace([](tf::cudaFlowCapturer& capturer){ // capture my_kernel_1 through the given stream managed by the capturer auto task_1 = capturer.on([&](cudaStream_t stream){ my_kernel_1<<>>(my_parameters_1); }); // capture my_kernel_2 through the given stream managed by the capturer auto task_2 = capturer.on([&](cudaStream_t stream){ my_kernel_2<<>>(my_parameters_2); }); task_1.precede(task_2); }); @endcode Similar to tf::cudaFlow, a %cudaFlowCapturer is a task (tf::Task) created from tf::Taskflow and will be run by @em one worker thread in the executor. That is, the callable that describes a %cudaFlowCapturer will be executed sequentially. Inside a %cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run in parallel depending on the selected optimization algorithm. By default, we use tf::cudaFlowRoundRobinOptimizer to transform a user-level graph into a native CUDA graph. Please refer to @ref GPUTaskingcudaFlowCapturer for details. */ class cudaFlowCapturer { friend class cudaFlow; friend class Executor; // created by user struct External { cudaFlowGraph graph; }; // created from cudaFlow struct Internal { }; using handle_t = std::variant; using Optimizer = std::variant< cudaFlowRoundRobinOptimizer, cudaFlowSequentialOptimizer, cudaFlowLinearOptimizer >; public: /** @brief constructs a standalone cudaFlowCapturer A standalone %cudaFlow capturer does not go through any taskflow and can be run by the caller thread using tf::cudaFlowCapturer::run. */ cudaFlowCapturer() = default; /** @brief destructs the cudaFlowCapturer */ ~cudaFlowCapturer() = default; /** @brief default move constructor */ cudaFlowCapturer(cudaFlowCapturer&&) = default; /** @brief default move assignment operator */ cudaFlowCapturer& operator = (cudaFlowCapturer&&) = default; /** @brief queries the emptiness of the graph */ bool empty() const; /** @brief queries the number of tasks */ size_t num_tasks() const; /** @brief clear this %cudaFlow capturer */ void clear(); /** @brief dumps the %cudaFlow graph into a DOT format through an output stream */ void dump(std::ostream& os) const; /** @brief dumps the native captured graph into a DOT format through an output stream */ void dump_native_graph(std::ostream& os) const; // ------------------------------------------------------------------------ // basic methods // ------------------------------------------------------------------------ /** @brief captures a sequential CUDA operations from the given callable @tparam C callable type constructible with @c std::function @param callable a callable to capture CUDA operations with the stream This methods applies a stream created by the flow to capture a sequence of CUDA operations defined in the callable. */ template , void>* = nullptr > cudaTask on(C&& callable); /** @brief updates a capture task to another sequential CUDA operations The method is similar to cudaFlowCapturer::on but operates on an existing task. */ template , void>* = nullptr > void on(cudaTask task, C&& callable); /** @brief captures a no-operation task @return a tf::cudaTask handle An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of @c n nodes with a barrier between them can be represented using an empty node and @c 2*n dependency edges, rather than no empty node and @c n^2 dependency edges. */ cudaTask noop(); /** @brief updates a task to a no-operation task The method is similar to tf::cudaFlowCapturer::noop but operates on an existing task. */ void noop(cudaTask task); /** @brief copies data between host and device asynchronously through a stream @param dst destination memory address @param src source memory address @param count size in bytes to copy The method captures a @c cudaMemcpyAsync operation through an internal stream. */ cudaTask memcpy(void* dst, const void* src, size_t count); /** @brief updates a capture task to a memcpy operation The method is similar to cudaFlowCapturer::memcpy but operates on an existing task. */ void memcpy(cudaTask task, void* dst, const void* src, size_t count); /** @brief captures a copy task of typed data @tparam T element type (non-void) @param tgt pointer to the target memory block @param src pointer to the source memory block @param num number of elements to copy @return cudaTask handle A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs. */ template , void>* = nullptr > cudaTask copy(T* tgt, const T* src, size_t num); /** @brief updates a capture task to a copy operation The method is similar to cudaFlowCapturer::copy but operates on an existing task. */ template , void>* = nullptr > void copy(cudaTask task, T* tgt, const T* src, size_t num); /** @brief initializes or sets GPU memory to the given value byte by byte @param ptr pointer to GPU memory @param v value to set for each byte of the specified memory @param n size in bytes to set The method captures a @c cudaMemsetAsync operation through an internal stream to fill the first @c count bytes of the memory area pointed to by @c devPtr with the constant byte value @c value. */ cudaTask memset(void* ptr, int v, size_t n); /** @brief updates a capture task to a memset operation The method is similar to cudaFlowCapturer::memset but operates on an existing task. */ void memset(cudaTask task, void* ptr, int value, size_t n); /** @brief captures a kernel @tparam F kernel function type @tparam ArgsT kernel function parameters type @param g configured grid @param b configured block @param s configured shared memory size in bytes @param f kernel function @param args arguments to forward to the kernel function by copy @return cudaTask handle */ template cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT&&... args); /** @brief updates a capture task to a kernel operation The method is similar to cudaFlowCapturer::kernel but operates on an existing task. */ template void kernel( cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args ); // ------------------------------------------------------------------------ // generic algorithms // ------------------------------------------------------------------------ /** @brief capturers a kernel to runs the given callable with only one thread @tparam C callable type @param c callable to run by a single kernel thread */ template cudaTask single_task(C c); /** @brief updates a capture task to a single-threaded kernel This method is similar to cudaFlowCapturer::single_task but operates on an existing task. */ template void single_task(cudaTask task, C c); /** @brief captures a kernel that applies a callable to each dereferenced element of the data array @tparam I iterator type @tparam C callable type @param first iterator to the beginning @param last iterator to the end @param callable a callable object to apply to the dereferenced iterator @return cudaTask handle This method is equivalent to the parallel execution of the following loop on a GPU: @code{.cpp} for(auto itr = first; itr != last; i++) { callable(*itr); } @endcode */ template cudaTask for_each(I first, I last, C callable); /** @brief updates a capture task to a for-each kernel task This method is similar to cudaFlowCapturer::for_each but operates on an existing task. */ template void for_each(cudaTask task, I first, I last, C callable); /** @brief captures a kernel that applies a callable to each index in the range with the step size @tparam I index type @tparam C callable type @param first beginning index @param last last index @param step step size @param callable the callable to apply to each element in the data array @return cudaTask handle This method is equivalent to the parallel execution of the following loop on a GPU: @code{.cpp} // step is positive [first, last) for(auto i=first; ilast; i+=step) { callable(i); } @endcode */ template cudaTask for_each_index(I first, I last, I step, C callable); /** @brief updates a capture task to a for-each-index kernel task This method is similar to cudaFlowCapturer::for_each_index but operates on an existing task. */ template void for_each_index( cudaTask task, I first, I last, I step, C callable ); /** @brief captures a kernel that transforms an input range to an output range @tparam I input iterator type @tparam O output iterator type @tparam C unary operator type @param first iterator to the beginning of the input range @param last iterator to the end of the input range @param output iterator to the beginning of the output range @param op unary operator to apply to transform each item in the range @return cudaTask handle This method is equivalent to the parallel execution of the following loop on a GPU: @code{.cpp} while (first != last) { *output++ = op(*first++); } @endcode */ template cudaTask transform(I first, I last, O output, C op); /** @brief updates a capture task to a transform kernel task This method is similar to cudaFlowCapturer::transform but operates on an existing task. */ template void transform(cudaTask task, I first, I last, O output, C op); /** @brief captures a kernel that transforms two input ranges to an output range @tparam I1 first input iterator type @tparam I2 second input iterator type @tparam O output iterator type @tparam C unary operator type @param first1 iterator to the beginning of the input range @param last1 iterator to the end of the input range @param first2 iterato @param output iterator to the beginning of the output range @param op binary operator to apply to transform each pair of items in the two input ranges @return cudaTask handle This method is equivalent to the parallel execution of the following loop on a GPU: @code{.cpp} while (first1 != last1) { *output++ = op(*first1++, *first2++); } @endcode */ template cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op); /** @brief updates a capture task to a transform kernel task This method is similar to cudaFlowCapturer::transform but operates on an existing task. */ template void transform( cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op ); // ------------------------------------------------------------------------ // Capturing methods // ------------------------------------------------------------------------ /** @brief selects a different optimization algorithm @tparam OPT optimizer type @tparam ArgsT arguments types @param args arguments to forward to construct the optimizer @return a reference to the optimizer We currently supports the following optimization algorithms to capture a user-described %cudaFlow: + tf::cudaFlowSequentialOptimizer + tf::cudaFlowRoundRobinOptimizer + tf::cudaFlowLinearOptimizer By default, tf::cudaFlowCapturer uses the round-robin optimization algorithm with four streams to transform a user-level graph into a native CUDA graph. */ template OPT& make_optimizer(ArgsT&&... args); /** @brief captures the cudaFlow and turns it into a CUDA Graph */ cudaGraph_t capture(); // ------------------------------------------------------------------------ // offload methods // ------------------------------------------------------------------------ /** @brief offloads the %cudaFlowCapturer onto a GPU asynchronously via a stream @param stream stream for performing this operation Offloads the present %cudaFlowCapturer onto a GPU asynchronously via the given stream. An offloaded %cudaFlowCapturer forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology but update node parameters. */ void run(cudaStream_t stream); /** @brief acquires a reference to the underlying CUDA graph */ cudaGraph_t native_graph(); /** @brief acquires a reference to the underlying CUDA graph executable */ cudaGraphExec_t native_executable(); private: cudaFlowGraph _cfg; Optimizer _optimizer; cudaGraphExec _exe {nullptr}; }; // Function: empty inline bool cudaFlowCapturer::empty() const { return _cfg.empty(); } // Function: num_tasks inline size_t cudaFlowCapturer::num_tasks() const { return _cfg._nodes.size(); } // Procedure: clear inline void cudaFlowCapturer::clear() { _exe.clear(); _cfg.clear(); } // Procedure: dump inline void cudaFlowCapturer::dump(std::ostream& os) const { _cfg.dump(os, nullptr, ""); } // Procedure: dump_native_graph inline void cudaFlowCapturer::dump_native_graph(std::ostream& os) const { cuda_dump_graph(os, _cfg._native_handle); } // Function: capture template , void>* > cudaTask cudaFlowCapturer::on(C&& callable) { auto node = _cfg.emplace_back(_cfg, std::in_place_type_t{}, std::forward(callable) ); return cudaTask(node); } // Function: noop inline cudaTask cudaFlowCapturer::noop() { return on([](cudaStream_t){}); } // Function: noop inline void cudaFlowCapturer::noop(cudaTask task) { on(task, [](cudaStream_t){}); } // Function: memcpy inline cudaTask cudaFlowCapturer::memcpy( void* dst, const void* src, size_t count ) { return on([dst, src, count] (cudaStream_t stream) mutable { TF_CHECK_CUDA( cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), "failed to capture memcpy" ); }); } // Function: copy template , void>*> cudaTask cudaFlowCapturer::copy(T* tgt, const T* src, size_t num) { return on([tgt, src, num] (cudaStream_t stream) mutable { TF_CHECK_CUDA( cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream), "failed to capture copy" ); }); } // Function: memset inline cudaTask cudaFlowCapturer::memset(void* ptr, int v, size_t n) { return on([ptr, v, n] (cudaStream_t stream) mutable { TF_CHECK_CUDA( cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset" ); }); } // Function: kernel template cudaTask cudaFlowCapturer::kernel( dim3 g, dim3 b, size_t s, F f, ArgsT&&... args ) { return on([g, b, s, f, args...] (cudaStream_t stream) mutable { f<<>>(args...); }); } // Function: capture inline cudaGraph_t cudaFlowCapturer::capture() { return std::visit( [this](auto&& opt){ return opt._optimize(_cfg); }, _optimizer ); } // Procedure: run inline void cudaFlowCapturer::run(cudaStream_t stream) { // If the topology got changed, we need to destroy the executable // and create a new one if(_cfg._state & cudaFlowGraph::CHANGED) { _cfg._native_handle.reset(capture()); _exe.instantiate(_cfg._native_handle); } // if the graph is just updated (i.e., topology does not change), // we can skip part of the optimization and just update the executable // with the new captured graph else if(_cfg._state & cudaFlowGraph::UPDATED) { // TODO: skip part of the optimization (e.g., levelization) _cfg._native_handle.reset(capture()); if(_exe.update(_cfg._native_handle) != cudaGraphExecUpdateSuccess) { _exe.instantiate(_cfg._native_handle); } } // run the executable (should exist) _exe.launch(stream); _cfg._state = cudaFlowGraph::OFFLOADED; } // Function: native_graph inline cudaGraph_t cudaFlowCapturer::native_graph() { return _cfg._native_handle; } // Function: native_executable inline cudaGraphExec_t cudaFlowCapturer::native_executable() { return _exe; } // Function: on template , void>* > void cudaFlowCapturer::on(cudaTask task, C&& callable) { if(task.type() != cudaTaskType::CAPTURE) { TF_THROW("invalid cudaTask type (must be CAPTURE)"); } _cfg._state |= cudaFlowGraph::UPDATED; std::get_if(&task._node->_handle)->work = std::forward(callable); } // Function: memcpy inline void cudaFlowCapturer::memcpy( cudaTask task, void* dst, const void* src, size_t count ) { on(task, [dst, src, count](cudaStream_t stream) mutable { TF_CHECK_CUDA( cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), "failed to capture memcpy" ); }); } // Function: copy template , void>* > void cudaFlowCapturer::copy( cudaTask task, T* tgt, const T* src, size_t num ) { on(task, [tgt, src, num] (cudaStream_t stream) mutable { TF_CHECK_CUDA( cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream), "failed to capture copy" ); }); } // Function: memset inline void cudaFlowCapturer::memset( cudaTask task, void* ptr, int v, size_t n ) { on(task, [ptr, v, n] (cudaStream_t stream) mutable { TF_CHECK_CUDA( cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset" ); }); } // Function: kernel template void cudaFlowCapturer::kernel( cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args ) { on(task, [g, b, s, f, args...] (cudaStream_t stream) mutable { f<<>>(args...); }); } // Function: make_optimizer template OPT& cudaFlowCapturer::make_optimizer(ArgsT&&... args) { return _optimizer.emplace(std::forward(args)...); } } // end of namespace tf -----------------------------------------------------