// mesytec-mnode/external/taskflow-3.8.0/benchmarks/hetero_traversal/tbb.cu
#include "graph.hpp"
#include <tbb/global_control.h>
#include <tbb/flow_graph.h>
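
// This benchmark traverses a heterogeneous task dependency graph with a TBB
// flow graph: CPU nodes run a vector addition on the host, and GPU nodes
// build, instantiate, and launch a CUDA graph (memset -> H2D copy -> kernel
// -> D2H copy) on their target device.

// A pool of CUDA streams, N per device; per_thread_stream maps the calling
// thread's id onto one of the pooled streams so TBB worker threads can
// launch and synchronize GPU work without sharing a single stream.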
struct cudaStream {

  std::vector<std::vector<cudaStream_t>> streams;

  cudaStream(unsigned N) : streams(tf::cuda_get_num_devices()) {
    for(size_t i=0; i<streams.size(); ++i) {
      streams[i].resize(N);
      tf::cudaScopedDevice ctx(i);
      for(unsigned j=0; j<N; ++j) {
        TF_CHECK_CUDA(cudaStreamCreate(&streams[i][j]), "failed to create a stream on ", i);
      }
    }
  }

  ~cudaStream() {
    for(size_t i=0; i<streams.size(); ++i) {
      tf::cudaScopedDevice ctx(i);
      for(unsigned j=0; j<streams[i].size(); ++j) {
        cudaStreamDestroy(streams[i][j]);
      }
    }
  }

  cudaStream_t per_thread_stream(int device) {
    auto id = std::hash<std::thread::id>()(std::this_thread::get_id()) % streams[device].size();
    return streams[device][id];
  }

};
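
// Runs the traversal benchmark. Each node of the input Graph becomes a TBB
// continue_node: CPU nodes (v.g == -1) perform a host-side vector addition,
// while GPU nodes assemble and launch a CUDA graph on their assigned device.
// An atomic counter verifies that every node executed exactly once.
//
// The add kernel is declared in graph.hpp; a minimal sketch of what such an
// element-wise addition presumably looks like (an assumption, not the actual
// definition):
//
//   template <typename T>
//   __global__ void add(T* x, T* y, T* z, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if(i < n) {
//       z[i] = x[i] + y[i];
//     }
//   }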
void TBB(const Graph& g, unsigned num_cpus, unsigned num_gpus) {

  using namespace tbb;
  using namespace tbb::flow;

  tbb::global_control c(
    tbb::global_control::max_allowed_parallelism, num_cpus + num_gpus
  );

  tbb::flow::graph G;

  cudaStream streams(num_cpus + num_gpus);

  std::atomic<int> counter {0};

  // host buffers
  int* cx = new int[N];
  int* cy = new int[N];
  int* cz = new int[N];

  // managed device buffers
  int* gx = nullptr;
  int* gy = nullptr;
  int* gz = nullptr;
  TF_CHECK_CUDA(cudaMallocManaged(&gx, N*sizeof(int)), "failed at cudaMallocManaged");
  TF_CHECK_CUDA(cudaMallocManaged(&gy, N*sizeof(int)), "failed at cudaMallocManaged");
  TF_CHECK_CUDA(cudaMallocManaged(&gz, N*sizeof(int)), "failed at cudaMallocManaged");

  std::vector<std::unique_ptr<continue_node<continue_msg>>> tasks(g.num_nodes);
  std::vector<size_t> indegree(g.num_nodes, 0);

  auto source = std::make_unique<continue_node<continue_msg>>(
    G, [](const continue_msg&){}
  );
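
  // The source node does no work itself; it exists so that a single
  // try_put() can broadcast a continue_msg to every root (indegree-zero)
  // task at once.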
  // create a task for each node
  for(const auto& v : g.nodes) {
    // cpu task
    if(v.g == -1) {
      tasks[v.v] = std::make_unique<continue_node<continue_msg>>(G,
        [&](const continue_msg&){
          for(int i=0; i<N; ++i) {
            cz[i] = cx[i] + cy[i];
          }
          ++counter;
        }
      );
    }
    // gpu task
    else {
      tasks[v.v] = std::make_unique<continue_node<continue_msg>>(G,
        [&, tgt_device=v.g](const continue_msg&){

          ++counter;

          int src_device = -1;
          TF_CHECK_CUDA(cudaGetDevice(&src_device), "get device failed");
          TF_CHECK_CUDA(cudaSetDevice(tgt_device), "set device failed");

          cudaGraph_t cuda_graph;
          TF_CHECK_CUDA(cudaGraphCreate(&cuda_graph, 0), "cudaGraphCreate failed");

          // memset parameter
          cudaMemsetParams msetp;
          msetp.value = 0;
          msetp.pitch = 0;
          msetp.elementSize = sizeof(int);  // either 1, 2, or 4
          msetp.width = N;
          msetp.height = 1;

          // sgx
          cudaGraphNode_t sgx;
          msetp.dst = gx;
          TF_CHECK_CUDA(cudaGraphAddMemsetNode(&sgx, cuda_graph, 0, 0, &msetp), "sgx failed");

          // sgy
          cudaGraphNode_t sgy;
          msetp.dst = gy;
          TF_CHECK_CUDA(cudaGraphAddMemsetNode(&sgy, cuda_graph, 0, 0, &msetp), "sgy failed");

          // sgz
          cudaGraphNode_t sgz;
          msetp.dst = gz;
          TF_CHECK_CUDA(cudaGraphAddMemsetNode(&sgz, cuda_graph, 0, 0, &msetp), "sgz failed");

          // copy parameter (host-to-device)
          cudaMemcpy3DParms h2dp;
          h2dp.srcArray = nullptr;
          h2dp.srcPos = ::make_cudaPos(0, 0, 0);
          h2dp.dstArray = nullptr;
          h2dp.dstPos = ::make_cudaPos(0, 0, 0);
          h2dp.extent = ::make_cudaExtent(N*sizeof(int), 1, 1);
          h2dp.kind = cudaMemcpyDefault;

          // h2d_gx
          cudaGraphNode_t h2d_gx;
          h2dp.srcPtr = ::make_cudaPitchedPtr(cx, N*sizeof(int), N, 1);
          h2dp.dstPtr = ::make_cudaPitchedPtr(gx, N*sizeof(int), N, 1);
          TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&h2d_gx, cuda_graph, 0, 0, &h2dp), "h2d_gx failed");

          // h2d_gy
          cudaGraphNode_t h2d_gy;
          h2dp.srcPtr = ::make_cudaPitchedPtr(cy, N*sizeof(int), N, 1);
          h2dp.dstPtr = ::make_cudaPitchedPtr(gy, N*sizeof(int), N, 1);
          TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&h2d_gy, cuda_graph, 0, 0, &h2dp), "h2d_gy failed");

          // h2d_gz
          cudaGraphNode_t h2d_gz;
          h2dp.srcPtr = ::make_cudaPitchedPtr(cz, N*sizeof(int), N, 1);
          h2dp.dstPtr = ::make_cudaPitchedPtr(gz, N*sizeof(int), N, 1);
          TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&h2d_gz, cuda_graph, 0, 0, &h2dp), "h2d_gz failed");

          // kernel
          cudaKernelNodeParams kp;
          void* arguments[4] = { (void*)(&gx), (void*)(&gy), (void*)(&gz), (void*)(&N) };
          kp.func = (void*)add<int>;
          kp.gridDim = (N+255)/256;
          kp.blockDim = 256;
          kp.sharedMemBytes = 0;
          kp.kernelParams = arguments;
          kp.extra = nullptr;

          cudaGraphNode_t kernel;
          TF_CHECK_CUDA(cudaGraphAddKernelNode(&kernel, cuda_graph, 0, 0, &kp), "kernel failed");

          // copy parameter (device-to-host)
          cudaMemcpy3DParms d2hp;
          d2hp.srcArray = nullptr;
          d2hp.srcPos = ::make_cudaPos(0, 0, 0);
          d2hp.dstArray = nullptr;
          d2hp.dstPos = ::make_cudaPos(0, 0, 0);
          d2hp.extent = ::make_cudaExtent(N*sizeof(int), 1, 1);
          d2hp.kind = cudaMemcpyDefault;

          // d2h_gx
          cudaGraphNode_t d2h_gx;
          d2hp.srcPtr = ::make_cudaPitchedPtr(gx, N*sizeof(int), N, 1);
          d2hp.dstPtr = ::make_cudaPitchedPtr(cx, N*sizeof(int), N, 1);
          TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&d2h_gx, cuda_graph, 0, 0, &d2hp), "d2h_gx failed");

          // d2h_gy
          cudaGraphNode_t d2h_gy;
          d2hp.srcPtr = ::make_cudaPitchedPtr(gy, N*sizeof(int), N, 1);
          d2hp.dstPtr = ::make_cudaPitchedPtr(cy, N*sizeof(int), N, 1);
          TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&d2h_gy, cuda_graph, 0, 0, &d2hp), "d2h_gy failed");

          // d2h_gz
          cudaGraphNode_t d2h_gz;
          d2hp.srcPtr = ::make_cudaPitchedPtr(gz, N*sizeof(int), N, 1);
          d2hp.dstPtr = ::make_cudaPitchedPtr(cz, N*sizeof(int), N, 1);
          TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&d2h_gz, cuda_graph, 0, 0, &d2hp), "d2h_gz failed");

          // add dependencies: memset -> h2d copy -> kernel -> d2h copy
          TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &sgx, &h2d_gx, 1), "sgx->h2d_gx");
          TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &sgy, &h2d_gy, 1), "sgy->h2d_gy");
          TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &sgz, &h2d_gz, 1), "sgz->h2d_gz");
          TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &h2d_gx, &kernel, 1), "h2d_gx->kernel");
          TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &h2d_gy, &kernel, 1), "h2d_gy->kernel");
          TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &h2d_gz, &kernel, 1), "h2d_gz->kernel");
          TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &kernel, &d2h_gx, 1), "kernel->d2h_gx");
          TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &kernel, &d2h_gy, 1), "kernel->d2h_gy");
          TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &kernel, &d2h_gz, 1), "kernel->d2h_gz");

          // instantiate and launch the graph, then block until it completes
          cudaGraphExec_t exe;
          auto pts = streams.per_thread_stream(tgt_device);
          TF_CHECK_CUDA(cudaGraphInstantiate(&exe, cuda_graph, 0, 0, 0), "inst failed");
          TF_CHECK_CUDA(cudaGraphLaunch(exe, pts), "failed to launch cudaGraph");
          TF_CHECK_CUDA(cudaStreamSynchronize(pts), "failed to sync cudaStream");
          TF_CHECK_CUDA(cudaGraphExecDestroy(exe), "destroy exe failed");
          TF_CHECK_CUDA(cudaGraphDestroy(cuda_graph), "cudaGraphDestroy failed");

          // restore the caller's device
          TF_CHECK_CUDA(cudaSetDevice(src_device), "set device failed");
        }
      );
    }
  }
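
  // wire flow-graph edges according to the input graph, tracking indegrees
  // so the roots can be attached to the source node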
  for(const auto& e : g.edges) {
    make_edge(*tasks[e.u], *tasks[e.v]);
    indegree[e.v]++;
  }

  for(size_t i=0; i<indegree.size(); ++i) {
    if(indegree[i] == 0) {
      make_edge(*source, *tasks[i]);
    }
  }

  source->try_put(continue_msg());
  G.wait_for_all();

  delete [] cx;
  delete [] cy;
  delete [] cz;

  TF_CHECK_CUDA(cudaFree(gx), "failed at cudaFree");
  TF_CHECK_CUDA(cudaFree(gy), "failed at cudaFree");
  TF_CHECK_CUDA(cudaFree(gz), "failed at cudaFree");

  if(counter != g.num_nodes) {
    throw std::runtime_error("wrong result");
  }
}
std::chrono::microseconds measure_time_tbb(
  const Graph& g, unsigned num_cpus, unsigned num_gpus
) {
  auto beg = std::chrono::high_resolution_clock::now();
  TBB(g, num_cpus, num_gpus);
  auto end = std::chrono::high_resolution_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(end - beg);
}
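
// A minimal driver sketch (hypothetical; the benchmark's actual main() and
// the Graph type live elsewhere in this directory):
//
//   Graph g = ...;                                // build or load a DAG
//   auto elapsed = measure_time_tbb(g, 4, 1);     // 4 CPU + 1 GPU workers
//   std::cout << elapsed.count() << " us\n";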