namespace tf { /** @page GPUTaskingcudaFlowCapturer GPU Tasking (%cudaFlowCapturer) You can create a %cudaFlow through stream capture, which allows you to implicitly capture a CUDA graph using stream-based interface. Compared to explicit CUDA %Graph construction (tf::cudaFlow), implicit CUDA %Graph capturing (tf::cudaFlowCapturer) is more flexible in building GPU task graphs. @tableofcontents @section GPUTaskingcudaFlowCapturerIncludeTheHeader Include the Header You need to include the header file, `%taskflow/cuda/cudaflow.hpp`, for capturing a GPU task graph using tf::cudaFlowCapturer. @code{.cpp} #include @endcode @section Capture_a_cudaFlow Capture a cudaFlow When your program has no access to direct kernel calls but can only invoke them through a stream-based interface (e.g., @cuBLAS and @cuDNN library functions), you can use tf::cudaFlowCapturer to capture the hidden GPU operations into a CUDA graph. A %cudaFlowCapturer is similar to a %cudaFlow except it constructs a GPU task graph through stream capture. You use the method tf::cudaFlowCapturer::on to capture a sequence of @em asynchronous GPU operations through the given stream. The following example creates a CUDA graph that captures two kernel tasks, @c task_1 (@c my_kernel_1) and @c task_2 (@c my_kernel_2) , where @c task_1 runs before @c task_2. @code{.cpp} // create a cudaFlow capturer to run a CUDA graph using stream capturing tf::cudaFlowCapturer capturer; // capture my_kernel_1 through a stream managed by capturer tf::cudaTask task_1 = capturer.on([&](cudaStream_t stream){ my_kernel_1<<>>(my_parameters_1); }).name("my_kernel_1"); // capture my_kernel_2 through a stream managed by capturer tf::cudaTask task_2 = capturer.on([&](cudaStream_t stream){ my_kernel_2<<>>(my_parameters_2); }).name("my_kernel_2"); // my_kernel_1 runs before my_kernel_2 task_1.precede(task_2); // offload captured GPU tasks using the CUDA Graph execution model tf::cudaStream stream; capturer.run(stream); stream.synchronize(); // dump the cudaFlow to a DOT format through std::cout capturer.dump(std::cout) @endcode @dotfile images/cudaflow_capturer_1.dot @warning Inside tf::cudaFlowCapturer::on, you should @em NOT modify the properties of the stream argument but only use it to capture @em asynchronous GPU operations (e.g., @c kernel, @c cudaMemcpyAsync). The stream argument is internal to the capturer use only. @section CommonCaptureMethods Common Capture Methods tf::cudaFlowCapturer defines a set of methods for capturing common GPU operations, such as tf::cudaFlowCapturer::kernel, tf::cudaFlowCapturer::memcpy, tf::cudaFlowCapturer::memset, and so on. For example, the following code snippet uses these pre-defined methods to construct a GPU task graph of one host-to-device copy, kernel, and one device-to-host copy, in this order of their dependencies. @code{.cpp} tf::cudaFlowCapturer capturer; // copy data from host_data to gpu_data tf::cudaTask h2d = capturer.memcpy(gpu_data, host_data, bytes) .name("h2d"); // capture my_kernel to do computation on gpu_data tf::cudaTask kernel = capturer.kernel(grid, block, shm_size, kernel, kernel_args); .name("my_kernel"); // copy data from gpu_data to host_data tf::cudaTask d2h = capturer.memcpy(host_data, gpu_data, bytes) .name("d2h"); // build task dependencies h2d.precede(kernel); kernel.precede(d2h); @endcode @dotfile images/cudaflow_capturer_2.dot @section CreateACapturerOnASpecificGPU Create a Capturer on a Specific GPU You can run a %cudaFlow capturer on a specific GPU by switching to the context of that GPU using tf::cudaScopedDevice, following the CUDA convention of multi-GPU programming. The example below creates a %cudaFlow capturer and runs it on GPU @c 2: @code{.cpp} { // create an RAII-styled switcher to the context of GPU 2 tf::cudaScopedDevice context(2); // create a cudaFlow capturer under GPU 2 tf::cudaFlowCapturer capturer; // ... // create a stream under GPU 2 and offload the capturer to that GPU tf::cudaStream stream; capturer.run(stream); stream.synchronize(); } @endcode tf::cudaScopedDevice is an RAII-styled wrapper to perform @em scoped switch to the given GPU context. When the scope is destroyed, it switches back to the original context. @note By default, a %cudaFlow capturer runs on the current GPU associated with the caller, which is typically @c 0. @section CreateACapturerWithinAcudaFlow Create a Capturer from a cudaFlow Within a parent %cudaFlow, you can capture a %cudaFlow to form a subflow that eventually becomes a @em child node in the underlying CUDA task graph. The following example defines a captured flow @c task2 of two dependent tasks, @c task2_1 and @c task2_2, and @c task2 runs after @c task1. @code{.cpp} tf::cudaFlow cudaflow; tf::cudaTask task1 = cudaflow.kernel(grid, block, shm, my_kernel, args...) .name("kernel"); // task2 forms a subflow as a child node in the underlying CUDA graph tf::cudaTask task2 = cudaflow.capture([&](tf::cudaFlowCapturer& capturer){ // capture kernel_1 using the given stream tf::cudaTask task2_1 = capturer.on([&](cudaStream_t stream){ kernel_2<<>>(args1...); }).name("kernel_1"); // capture kernel_2 using the given stream tf::cudaTask task2_2 = capturer.on([&](cudaStream_t stream){ kernel_2<<>>(args2...); }).name("kernel_2"); // kernel_1 runs before kernel_2 task2_1.precede(task2_2); }).name("capturer"); task1.precede(task2); @endcode @dotfile images/cudaflow_capturer_3.dot @section OffloadAcudaFlowCapturer Offload a cudaFlow Capturer When you offload a %cudaFlow capturer using tf::cudaFlowCapturer::run, the runtime transforms that capturer (i.e., application GPU task graph) into a native CUDA graph and an executable instance both optimized for maximum kernel concurrency. Depending on the optimization algorithm, the application GPU task graph may be different from the actual executable graph submitted to the CUDA runtime. @code{.cpp} tf::cudaStream stream; // launch a cudaflow capturer asynchronously through a stream capturer.run(stream); // wait for the cudaflow to finish stream.synchronize(); @endcode @section UpdateAcudaFlowCapturer Update a cudaFlow Capturer Between successive offloads (i.e., executions of a %cudaFlow capturer), you can update the captured task with a different set of parameters. Every task-creation method in tf::cudaFlowCapturer has an overload to update the parameters of a created task by that method. The following example creates a kernel task and updates its parameter between successive runs: @code{.cpp} tf::cudaStream stream; tf::cudaFlowCapturer cf; // create a kernel task tf::cudaTask task = cf.kernel(grid1, block1, shm1, kernel, kernel_args_1); cf.run(stream); stream.synchronize(); // update the created kernel task with different parameters cf.kernel(task, grid2, block2, shm2, kernel, kernel_args_2); cf.run(stream); stream.synchronize(); @endcode When you run a updated %cudaFlow capturer, %Taskflow will try to update the underlying executable with the newly captured graph first. If that update is unsuccessful, %Taskflow will destroy the executable graph and re-instantiate a new one from the newly captured graph. @section IntegrateCudaFlowCapturerIntoTaskflow Integrate a cudaFlow Capturer into Taskflow You can create a task to enclose a %cudaFlow capturer and run it from a worker thread. The usage of the capturer remains the same except that the capturer is run by a worker thread from a taskflow task. The following example runs a %cudaFlow capturer from a static task: @code{.cpp} tf::Executor executor; tf::Taskflow taskflow; taskflow.emplace([](){ // create a cudaFlow capturer inside a static task tf::cudaFlowCapturer capturer; // ... capture a GPU task graph capturer.kernel(...); // run the capturer through a stream tf::cudaStream stream; capturer.run(stream); stream.synchronize(); }); @endcode */ }