#include "graph.hpp" #include #include struct cudaStream { std::vector> streams; cudaStream(unsigned N) : streams(tf::cuda_get_num_devices()) { for(size_t i=0; i()(std::this_thread::get_id()) % streams[device].size(); return streams[device][id]; } }; void TBB(const Graph& g, unsigned num_cpus, unsigned num_gpus) { using namespace tbb; using namespace tbb::flow; tbb::global_control c( tbb::global_control::max_allowed_parallelism, num_cpus + num_gpus ); tbb::flow::graph G; cudaStream streams(num_cpus + num_gpus); std::atomic counter {0}; int* cx = new int[N]; int* cy = new int[N]; int* cz = new int[N]; int* gx = nullptr; int* gy = nullptr; int* gz = nullptr; TF_CHECK_CUDA(cudaMallocManaged(&gx, N*sizeof(int)), "failed at cudaMallocManaged"); TF_CHECK_CUDA(cudaMallocManaged(&gy, N*sizeof(int)), "failed at cudaMallocManaged"); TF_CHECK_CUDA(cudaMallocManaged(&gz, N*sizeof(int)), "failed at cudaMallocManaged"); std::vector>> tasks(g.num_nodes); std::vector indegree(g.num_nodes, 0); auto source = std::make_unique>( G, [](const continue_msg&){} ); // create a task for each node for(const auto& v : g.nodes) { // cpu task if(v.g == -1) { tasks[v.v] = std::make_unique>(G, [&](const continue_msg&){ for(int i=0; i>(G, [&, tgt_device=v.g](const continue_msg&){ ++counter; int src_device = -1; TF_CHECK_CUDA(cudaGetDevice(&src_device), "get device failed"); TF_CHECK_CUDA(cudaSetDevice(tgt_device), "set device failed"); cudaGraph_t cuda_graph; TF_CHECK_CUDA(cudaGraphCreate(&cuda_graph, 0), "cudaGraphCreate failed"); // memset parameter cudaMemsetParams msetp; msetp.value = 0; msetp.pitch = 0; msetp.elementSize = sizeof(int); // either 1, 2, or 4 msetp.width = N; msetp.height = 1; // sgx cudaGraphNode_t sgx; msetp.dst = gx; TF_CHECK_CUDA(cudaGraphAddMemsetNode(&sgx, cuda_graph, 0, 0, &msetp), "sgx failed"); // sgy cudaGraphNode_t sgy; msetp.dst = gy; TF_CHECK_CUDA(cudaGraphAddMemsetNode(&sgy, cuda_graph, 0, 0, &msetp), "sgy failed"); // sgz cudaGraphNode_t sgz; msetp.dst = gz; TF_CHECK_CUDA(cudaGraphAddMemsetNode(&sgz, cuda_graph, 0, 0, &msetp), "sgz failed"); // copy parameter cudaMemcpy3DParms h2dp; h2dp.srcArray = nullptr; h2dp.srcPos = ::make_cudaPos(0, 0, 0); h2dp.dstArray = nullptr; h2dp.dstPos = ::make_cudaPos(0, 0, 0); h2dp.extent = ::make_cudaExtent(N*sizeof(int), 1, 1); h2dp.kind = cudaMemcpyDefault; // h2d_gx cudaGraphNode_t h2d_gx; h2dp.srcPtr = ::make_cudaPitchedPtr(cx, N*sizeof(int), N, 1); h2dp.dstPtr = ::make_cudaPitchedPtr(gx, N*sizeof(int), N, 1); TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&h2d_gx, cuda_graph, 0, 0, &h2dp), "h2d_gx failed"); // h2d_gy cudaGraphNode_t h2d_gy; h2dp.srcPtr = ::make_cudaPitchedPtr(cy, N*sizeof(int), N, 1); h2dp.dstPtr = ::make_cudaPitchedPtr(gy, N*sizeof(int), N, 1); TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&h2d_gy, cuda_graph, 0, 0, &h2dp), "h2d_gy failed"); // h2d_gz cudaGraphNode_t h2d_gz; h2dp.srcPtr = ::make_cudaPitchedPtr(cz, N*sizeof(int), N, 1); h2dp.dstPtr = ::make_cudaPitchedPtr(gz, N*sizeof(int), N, 1); TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&h2d_gz, cuda_graph, 0, 0, &h2dp), "h2d_gz failed"); // kernel cudaKernelNodeParams kp; void* arguments[4] = { (void*)(&gx), (void*)(&gy), (void*)(&gz), (void*)(&N) }; kp.func = (void*)add; kp.gridDim = (N+255)/256; kp.blockDim = 256; kp.sharedMemBytes = 0; kp.kernelParams = arguments; kp.extra = nullptr; cudaGraphNode_t kernel; TF_CHECK_CUDA(cudaGraphAddKernelNode(&kernel, cuda_graph, 0, 0, &kp), "kernel failed"); // d2hp cudaMemcpy3DParms d2hp; d2hp.srcArray = nullptr; d2hp.srcPos = ::make_cudaPos(0, 0, 0); d2hp.dstArray = nullptr; d2hp.dstPos = ::make_cudaPos(0, 0, 0); d2hp.extent = ::make_cudaExtent(N*sizeof(int), 1, 1); d2hp.kind = cudaMemcpyDefault; // d2h_gx cudaGraphNode_t d2h_gx; d2hp.srcPtr = ::make_cudaPitchedPtr(gx, N*sizeof(int), N, 1); d2hp.dstPtr = ::make_cudaPitchedPtr(cx, N*sizeof(int), N, 1); TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&d2h_gx, cuda_graph, 0, 0, &d2hp), "d2h_gx failed"); // d2h_gy cudaGraphNode_t d2h_gy; d2hp.srcPtr = ::make_cudaPitchedPtr(gy, N*sizeof(int), N, 1); d2hp.dstPtr = ::make_cudaPitchedPtr(cy, N*sizeof(int), N, 1); TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&d2h_gy, cuda_graph, 0, 0, &d2hp), "d2h_gy failed"); // d2h_gz cudaGraphNode_t d2h_gz; d2hp.srcPtr = ::make_cudaPitchedPtr(gz, N*sizeof(int), N, 1); d2hp.dstPtr = ::make_cudaPitchedPtr(cz, N*sizeof(int), N, 1); TF_CHECK_CUDA(cudaGraphAddMemcpyNode(&d2h_gz, cuda_graph, 0, 0, &d2hp), "d2h_gz failed"); // add dependency TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &sgx, &h2d_gx, 1), "sgx->h2d_gx"); TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &sgy, &h2d_gy, 1), "sgy->h2d_gy"); TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &sgz, &h2d_gz, 1), "sgz->h2d_gz"); TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &h2d_gx, &kernel, 1), "h2d_gz->kernel"); TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &h2d_gy, &kernel, 1), "h2d_gz->kernel"); TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &h2d_gz, &kernel, 1), "h2d_gz->kernel"); TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &kernel, &d2h_gx, 1), "kernel->d2h_gx"); TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &kernel, &d2h_gy, 1), "kernel->d2h_gy"); TF_CHECK_CUDA(cudaGraphAddDependencies(cuda_graph, &kernel, &d2h_gz, 1), "kernel->d2h_gz"); // launch the graph cudaGraphExec_t exe; auto pts = streams.per_thread_stream(tgt_device); TF_CHECK_CUDA(cudaGraphInstantiate(&exe, cuda_graph, 0, 0, 0), "inst failed"); TF_CHECK_CUDA(cudaGraphLaunch(exe, pts), "failed to launch cudaGraph"); TF_CHECK_CUDA(cudaStreamSynchronize(pts), "failed to sync cudaStream"); TF_CHECK_CUDA(cudaGraphExecDestroy(exe), "destroy exe failed"); TF_CHECK_CUDA(cudaGraphDestroy(cuda_graph), "cudaGraphDestroy failed"); TF_CHECK_CUDA(cudaSetDevice(src_device), "set device failed"); } ); } } for(const auto& e : g.edges) { make_edge(*tasks[e.u], *tasks[e.v]); indegree[e.v]++; } for(size_t i=0; itry_put(continue_msg()); G.wait_for_all(); delete [] cx; delete [] cy; delete [] cz; TF_CHECK_CUDA(cudaFree(gx), "failed at cudaFree"); TF_CHECK_CUDA(cudaFree(gy), "failed at cudaFree"); TF_CHECK_CUDA(cudaFree(gz), "failed at cudaFree"); if(counter != g.num_nodes) { throw std::runtime_error("wrong result"); } } std::chrono::microseconds measure_time_tbb( const Graph& g, unsigned num_cpus, unsigned num_gpus ) { auto beg = std::chrono::high_resolution_clock::now(); TBB(g, num_cpus, num_gpus); auto end = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast(end - beg); }