#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN

#include <doctest.h>
#include <taskflow/syclflow.hpp>

constexpr float eps = 0.0001f;

// ----------------------------------------------------------------------------
// for_each
// ----------------------------------------------------------------------------

template <typename T>
void for_each() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  sycl::queue queue;

  for(int n=1; n<=123456; n = n*2 + 1) {

    taskflow.clear();

    T* cpu = nullptr;
    T* gpu = nullptr;

    // allocate the host and device buffers
    auto cputask = taskflow.emplace([&](){
      cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
      gpu = sycl::malloc_device<T>(n, queue);
    });

    tf::Task gputask;

    gputask = taskflow.emplace_on([&](tf::syclFlow& cf) {
      // cf.copy(dst, src, count)
      auto d2h = cf.copy(cpu, gpu, n);
      auto h2d = cf.copy(gpu, cpu, n);
      auto kernel = cf.for_each(
        gpu, gpu+n, [] (T& val) { val = 65536; }
      );
      h2d.precede(kernel);
      d2h.succeed(kernel);
    }, queue);

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    for(int i=0; i<n; ++i) {
      REQUIRE(std::fabs(cpu[i] - (T)65536) < eps);
    }

    std::free(cpu);
    sycl::free(gpu, queue);
  }
}

TEST_CASE("syclFlow.for_each.int" * doctest::timeout(300)) {
  for_each<int>();
}

TEST_CASE("syclFlow.for_each.float" * doctest::timeout(300)) {
  for_each<float>();
}

TEST_CASE("syclFlow.for_each.double" * doctest::timeout(300)) {
  for_each<double>();
}

// --------------------------------------------------------
// Testcase: for_each_index
// --------------------------------------------------------

template <typename T>
void for_each_index() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  sycl::queue queue;

  for(int n=10; n<=123456; n = n*2 + 1) {

    taskflow.clear();

    T* cpu = nullptr;
    T* gpu = nullptr;

    auto cputask = taskflow.emplace([&](){
      cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
      gpu = sycl::malloc_device<T>(n, queue);
    });

    auto gputask = taskflow.emplace_on([&](tf::syclFlow& cf) {
      auto d2h = cf.copy(cpu, gpu, n);
      auto h2d = cf.copy(gpu, cpu, n);
      // even indices get 17, odd indices get -17
      auto kernel1 = cf.for_each_index(
        0, n, 2, [gpu] (int i) { gpu[i] = 17; }
      );
      auto kernel2 = cf.for_each_index(
        1, n, 2, [=] (int i) { gpu[i] = -17; }
      );
      h2d.precede(kernel1, kernel2);
      d2h.succeed(kernel1, kernel2);
    }, queue);

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    for(int i=0; i<n; ++i) {
      if(i % 2 == 0) {
        REQUIRE(std::fabs(cpu[i] - (T)17) < eps);
      }
      else {
        REQUIRE(std::fabs(cpu[i] - (T)(-17)) < eps);
      }
    }

    std::free(cpu);
    sycl::free(gpu, queue);
  }
}

TEST_CASE("syclFlow.for_each_index.int" * doctest::timeout(300)) {
  for_each_index<int>();
}

TEST_CASE("syclFlow.for_each_index.float" * doctest::timeout(300)) {
  for_each_index<float>();
}

TEST_CASE("syclFlow.for_each_index.double" * doctest::timeout(300)) {
  for_each_index<double>();
}

// ----------------------------------------------------------------------------
// reduce
// ----------------------------------------------------------------------------

template <typename T>
void reduce() {

  sycl::queue queue;

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(int N=1; N<=1000000; N += (N/10+1)) {

    taskflow.clear();

    T sum = 0;

    std::vector<T> cpu(N);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;
    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      gpu = sycl::malloc_shared<T>(N, queue);
      res = sycl::malloc_shared<T>(1, queue);
    });

    tf::Task gputask;

    gputask = taskflow.emplace_on([&](tf::syclFlow& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), N);
      // reduce accumulates into *res, so seed it with 1000
      auto set = cf.single_task([res] () {
        *res = 1000;
      });
      auto kernel = cf.reduce(
        gpu, gpu+N, res, std::plus<T>()
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    }, queue);

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    REQUIRE(std::fabs(sum-sol+1000) < 0.0001);

    // ------------------------------------------------------------------------
    // standard algorithms
    // ------------------------------------------------------------------------
    tf::syclDefaultExecutionPolicy p{queue};

    *res = 1000;
    tf::sycl_reduce(p, gpu, gpu+N, res, std::plus<T>{});

    REQUIRE(std::fabs(sum - *res + 1000) < 0.0001);

    sycl::free(gpu, queue);
    sycl::free(res, queue);
  }
}

TEST_CASE("syclFlow.reduce.int" * doctest::timeout(300)) {
  reduce<int>();
}

TEST_CASE("syclFlow.reduce.float" * doctest::timeout(300)) {
  reduce<float>();
}

TEST_CASE("syclFlow.reduce.double" * doctest::timeout(300)) {
  reduce<double>();
}

// ----------------------------------------------------------------------------
// uninitialized_reduce
// ----------------------------------------------------------------------------

template <typename T>
void uninitialized_reduce() {

  sycl::queue queue;

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(int N=1; N<=1000000; N += (N/10+1)) {

    taskflow.clear();

    T sum = 0;

    std::vector<T> cpu(N);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;
    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      gpu = sycl::malloc_shared<T>(N, queue);
      res = sycl::malloc_shared<T>(1, queue);
    });

    tf::Task gputask;

    gputask = taskflow.emplace_on([&](tf::syclFlow& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), N);
      // seed *res; uninitialized_reduce overwrites it instead of accumulating
      auto set = cf.single_task([res] () {
        *res = 1000;
      });
      auto kernel = cf.uninitialized_reduce(
        gpu, gpu+N, res, std::plus<T>()
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    }, queue);

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    REQUIRE(std::fabs(sum-sol) < 0.0001);

    // ------------------------------------------------------------------------
    // standard algorithms
    // ------------------------------------------------------------------------
    tf::syclDefaultExecutionPolicy p{queue};

    *res = 1000;
    tf::sycl_uninitialized_reduce(p, gpu, gpu+N, res, std::plus<T>{});

    REQUIRE(std::fabs(sum - *res) < 0.0001);

    sycl::free(gpu, queue);
    sycl::free(res, queue);
  }
}

TEST_CASE("syclFlow.uninitialized_reduce.int" * doctest::timeout(300)) {
  uninitialized_reduce<int>();
}

TEST_CASE("syclFlow.uninitialized_reduce.float" * doctest::timeout(300)) {
  uninitialized_reduce<float>();
}

TEST_CASE("syclFlow.uninitialized_reduce.double" * doctest::timeout(300)) {
  uninitialized_reduce<double>();
}

// ----------------------------------------------------------------------------
// transform
// ----------------------------------------------------------------------------

void transform() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  sycl::queue queue;

  for(unsigned n=1; n<=123456; n = n*2 + 1) {

    taskflow.clear();

    int* htgt = nullptr;
    int* tgt = nullptr;
    int* hsrc1 = nullptr;
    int* src1 = nullptr;
    float* hsrc2 = nullptr;
    float* src2 = nullptr;
    double* hsrc3 = nullptr;
    double* src3 = nullptr;

    auto htgttask = taskflow.emplace([&](){
      htgt  = static_cast<int*>(std::calloc(n, sizeof(int)));
      hsrc1 = static_cast<int*>(std::calloc(n, sizeof(int)));
      hsrc2 = static_cast<float*>(std::calloc(n, sizeof(float)));
      hsrc3 = static_cast<double*>(std::calloc(n, sizeof(double)));
      tgt  = sycl::malloc_device<int>(n, queue);
      src1 = sycl::malloc_device<int>(n, queue);
      src2 = sycl::malloc_device<float>(n, queue);
      src3 = sycl::malloc_device<double>(n, queue);
    });

    auto gputask = taskflow.emplace_on([&](tf::syclFlow& cf) {
      auto d2h  = cf.copy(htgt, tgt, n);
      auto d2h3 = cf.copy(hsrc3, src3, n);
      auto d2h2 = cf.copy(hsrc2, src2, n);
      auto d2h1 = cf.copy(hsrc1, src1, n);
      auto kernel = cf.transform(
        tgt, tgt+n,
        [] (int& v1, float& v2, double& v3) -> int {
          v1 = 1;
          v2 = 3.0f;
          v3 = 5.0;
          return 17;
        },
        src1, src2, src3
      );
      auto h2d = cf.copy(tgt, htgt, n);
      h2d.precede(kernel);
      kernel.precede(d2h, d2h1, d2h2, d2h3);
    }, queue);

    htgttask.precede(gputask);

    executor.run(taskflow).wait();

    for(unsigned i=0; i<n; ++i) {