248 lines
8.1 KiB
Text
248 lines
8.1 KiB
Text
namespace tf {
|
|
|
|
/** @page GPUTaskingcudaFlowCapturer GPU Tasking (%cudaFlowCapturer)
|
|
|
|
You can create a %cudaFlow through <i>stream capture</i>, which allows you
|
|
to implicitly capture a CUDA graph using stream-based interface.
|
|
Compared to explicit CUDA %Graph construction (tf::cudaFlow),
|
|
implicit CUDA %Graph capturing (tf::cudaFlowCapturer) is more flexible
|
|
in building GPU task graphs.
|
|
|
|
@tableofcontents
|
|
|
|
@section GPUTaskingcudaFlowCapturerIncludeTheHeader Include the Header
|
|
|
|
You need to include the header file, `%taskflow/cuda/cudaflow.hpp`,
|
|
for capturing a GPU task graph using tf::cudaFlowCapturer.
|
|
|
|
@code{.cpp}
|
|
#include <taskflow/cuda/cudaflow.hpp>
|
|
@endcode
|
|
|
|
@section Capture_a_cudaFlow Capture a cudaFlow
|
|
|
|
When your program has no access to direct kernel calls but can only
|
|
invoke them through a stream-based interface (e.g., @cuBLAS and @cuDNN library functions),
|
|
you can use tf::cudaFlowCapturer to capture the hidden GPU operations into a CUDA graph.
|
|
A %cudaFlowCapturer is similar to a %cudaFlow except it constructs a GPU task graph
|
|
through <i>stream capture</i>.
|
|
You use the method tf::cudaFlowCapturer::on
|
|
to capture a sequence of @em asynchronous GPU operations through the given stream.
|
|
The following example creates a CUDA graph that captures two kernel tasks,
|
|
@c task_1 (@c my_kernel_1)
|
|
and
|
|
@c task_2 (@c my_kernel_2) ,
|
|
where @c task_1 runs before @c task_2.
|
|
|
|
@code{.cpp}
|
|
// create a cudaFlow capturer to run a CUDA graph using stream capturing
|
|
tf::cudaFlowCapturer capturer;
|
|
|
|
// capture my_kernel_1 through a stream managed by capturer
|
|
tf::cudaTask task_1 = capturer.on([&](cudaStream_t stream){
|
|
my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1);
|
|
}).name("my_kernel_1");
|
|
|
|
// capture my_kernel_2 through a stream managed by capturer
|
|
tf::cudaTask task_2 = capturer.on([&](cudaStream_t stream){
|
|
my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2);
|
|
}).name("my_kernel_2");
|
|
|
|
// my_kernel_1 runs before my_kernel_2
|
|
task_1.precede(task_2);
|
|
|
|
// offload captured GPU tasks using the CUDA Graph execution model
|
|
tf::cudaStream stream;
|
|
capturer.run(stream);
|
|
stream.synchronize();
|
|
|
|
// dump the cudaFlow to a DOT format through std::cout
|
|
capturer.dump(std::cout)
|
|
@endcode
|
|
|
|
@dotfile images/cudaflow_capturer_1.dot
|
|
|
|
@warning
|
|
Inside tf::cudaFlowCapturer::on, you should @em NOT modify the properties of
|
|
the stream argument but only use it to capture @em asynchronous GPU operations
|
|
(e.g., @c kernel, @c cudaMemcpyAsync).
|
|
The stream argument is internal to the capturer use only.
|
|
|
|
@section CommonCaptureMethods Common Capture Methods
|
|
|
|
tf::cudaFlowCapturer defines a set of methods for capturing common GPU operations,
|
|
such as tf::cudaFlowCapturer::kernel, tf::cudaFlowCapturer::memcpy,
|
|
tf::cudaFlowCapturer::memset, and so on.
|
|
For example, the following code snippet uses these pre-defined methods
|
|
to construct a GPU task graph of one host-to-device copy, kernel,
|
|
and one device-to-host copy, in this order of their dependencies.
|
|
|
|
@code{.cpp}
|
|
tf::cudaFlowCapturer capturer;
|
|
|
|
// copy data from host_data to gpu_data
|
|
tf::cudaTask h2d = capturer.memcpy(gpu_data, host_data, bytes)
|
|
.name("h2d");
|
|
|
|
// capture my_kernel to do computation on gpu_data
|
|
tf::cudaTask kernel = capturer.kernel(grid, block, shm_size, kernel, kernel_args);
|
|
.name("my_kernel");
|
|
|
|
// copy data from gpu_data to host_data
|
|
tf::cudaTask d2h = capturer.memcpy(host_data, gpu_data, bytes)
|
|
.name("d2h");
|
|
|
|
// build task dependencies
|
|
h2d.precede(kernel);
|
|
kernel.precede(d2h);
|
|
@endcode
|
|
|
|
@dotfile images/cudaflow_capturer_2.dot
|
|
|
|
@section CreateACapturerOnASpecificGPU Create a Capturer on a Specific GPU
|
|
|
|
You can run a %cudaFlow capturer on a specific GPU by switching to the context
|
|
of that GPU using tf::cudaScopedDevice, following the CUDA convention of multi-GPU programming.
|
|
The example below creates a %cudaFlow capturer and runs it on GPU @c 2:
|
|
|
|
@code{.cpp}
|
|
{
|
|
// create an RAII-styled switcher to the context of GPU 2
|
|
tf::cudaScopedDevice context(2);
|
|
|
|
// create a cudaFlow capturer under GPU 2
|
|
tf::cudaFlowCapturer capturer;
|
|
// ...
|
|
|
|
// create a stream under GPU 2 and offload the capturer to that GPU
|
|
tf::cudaStream stream;
|
|
capturer.run(stream);
|
|
stream.synchronize();
|
|
}
|
|
@endcode
|
|
|
|
tf::cudaScopedDevice is an RAII-styled wrapper to perform @em scoped switch
|
|
to the given GPU context.
|
|
When the scope is destroyed, it switches back to the original context.
|
|
|
|
@note
|
|
By default, a %cudaFlow capturer runs on the current GPU associated with the caller,
|
|
which is typically @c 0.
|
|
|
|
@section CreateACapturerWithinAcudaFlow Create a Capturer from a cudaFlow
|
|
|
|
Within a parent %cudaFlow, you can capture a %cudaFlow to form a subflow that
|
|
eventually becomes a @em child node in the underlying CUDA task graph.
|
|
The following example defines a captured flow @c task2 of two dependent tasks,
|
|
@c task2_1 and @c task2_2, and @c task2 runs after @c task1.
|
|
|
|
@code{.cpp}
|
|
tf::cudaFlow cudaflow;
|
|
|
|
tf::cudaTask task1 = cudaflow.kernel(grid, block, shm, my_kernel, args...)
|
|
.name("kernel");
|
|
|
|
// task2 forms a subflow as a child node in the underlying CUDA graph
|
|
tf::cudaTask task2 = cudaflow.capture([&](tf::cudaFlowCapturer& capturer){
|
|
|
|
// capture kernel_1 using the given stream
|
|
tf::cudaTask task2_1 = capturer.on([&](cudaStream_t stream){
|
|
kernel_2<<<grid1, block1, shm_size1, stream>>>(args1...);
|
|
}).name("kernel_1");
|
|
|
|
// capture kernel_2 using the given stream
|
|
tf::cudaTask task2_2 = capturer.on([&](cudaStream_t stream){
|
|
kernel_2<<<grid2, block2, shm_size2, stream>>>(args2...);
|
|
}).name("kernel_2");
|
|
|
|
// kernel_1 runs before kernel_2
|
|
task2_1.precede(task2_2);
|
|
}).name("capturer");
|
|
|
|
task1.precede(task2);
|
|
@endcode
|
|
|
|
@dotfile images/cudaflow_capturer_3.dot
|
|
|
|
|
|
@section OffloadAcudaFlowCapturer Offload a cudaFlow Capturer
|
|
|
|
When you offload a %cudaFlow capturer using tf::cudaFlowCapturer::run,
|
|
the runtime transforms that capturer (i.e., application GPU task graph)
|
|
into a native CUDA graph and an executable instance
|
|
both optimized for maximum kernel concurrency.
|
|
Depending on the optimization algorithm,
|
|
the application GPU task graph may be different
|
|
from the actual executable graph submitted to the CUDA runtime.
|
|
|
|
@code{.cpp}
|
|
tf::cudaStream stream;
|
|
// launch a cudaflow capturer asynchronously through a stream
|
|
capturer.run(stream);
|
|
// wait for the cudaflow to finish
|
|
stream.synchronize();
|
|
@endcode
|
|
|
|
@section UpdateAcudaFlowCapturer Update a cudaFlow Capturer
|
|
|
|
Between successive offloads (i.e., executions of a %cudaFlow capturer),
|
|
you can update the captured task with a different set of parameters.
|
|
Every task-creation method in tf::cudaFlowCapturer has an overload
|
|
to update the parameters of a created task by that method.
|
|
The following example creates a kernel task and updates its parameter
|
|
between successive runs:
|
|
|
|
@code{.cpp}
|
|
tf::cudaStream stream;
|
|
tf::cudaFlowCapturer cf;
|
|
|
|
// create a kernel task
|
|
tf::cudaTask task = cf.kernel(grid1, block1, shm1, kernel, kernel_args_1);
|
|
cf.run(stream);
|
|
stream.synchronize();
|
|
|
|
// update the created kernel task with different parameters
|
|
cf.kernel(task, grid2, block2, shm2, kernel, kernel_args_2);
|
|
cf.run(stream);
|
|
stream.synchronize();
|
|
@endcode
|
|
|
|
|
|
When you run a updated %cudaFlow capturer,
|
|
%Taskflow will try to update the underlying executable
|
|
with the newly captured graph first.
|
|
If that update is unsuccessful,
|
|
%Taskflow will destroy the executable graph and re-instantiate
|
|
a new one from the newly captured graph.
|
|
|
|
@section IntegrateCudaFlowCapturerIntoTaskflow Integrate a cudaFlow Capturer into Taskflow
|
|
|
|
You can create a task to enclose a %cudaFlow capturer and run it from a worker thread.
|
|
The usage of the capturer remains the same except that the capturer is run by a worker thread
|
|
from a taskflow task.
|
|
The following example runs a %cudaFlow capturer from a static task:
|
|
|
|
@code{.cpp}
|
|
tf::Executor executor;
|
|
tf::Taskflow taskflow;
|
|
|
|
taskflow.emplace([](){
|
|
// create a cudaFlow capturer inside a static task
|
|
tf::cudaFlowCapturer capturer;
|
|
|
|
// ... capture a GPU task graph
|
|
capturer.kernel(...);
|
|
|
|
// run the capturer through a stream
|
|
tf::cudaStream stream;
|
|
capturer.run(stream);
|
|
stream.synchronize();
|
|
});
|
|
@endcode
|
|
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|