#pragma once #include "../taskflow.hpp" #include "sycl_task.hpp" /** @file syclflow.hpp @brief main syclFlow include file */ namespace tf { // ---------------------------------------------------------------------------- // class definition: syclFlow // ---------------------------------------------------------------------------- /** @class syclFlow @brief class for building a SYCL task dependency graph */ class syclFlow { friend class Executor; struct External { syclGraph graph; }; struct Internal { Executor& executor; Internal(Executor& e) : executor {e} {} }; using handle_t = std::variant; public: /** @brief constructs a standalone %syclFlow from the given queue A standalone %syclFlow does not go through any taskflow and can be run by the caller thread using explicit offload methods (e.g., tf::syclFlow::offload). */ syclFlow(sycl::queue& queue); /** @brief destroys the %syclFlow */ ~syclFlow() = default; /** @brief queries the emptiness of the graph */ bool empty() const; /** @brief queries the number of tasks */ size_t num_tasks() const; /** @brief dumps the %syclFlow graph into a DOT format through an output stream */ void dump(std::ostream& os) const; /** @brief clear the associated graph */ void clear(); // ------------------------------------------------------------------------ // Generic device operations // ------------------------------------------------------------------------ /** @brief creates a task that launches the given command group function object @tparam F type of command group function object @param func function object that is constructible from std::function Creates a task that is associated from the given command group. In SYCL, each command group function object is given a unique command group handler object to perform all the necessary work required to correctly process data on a device using a kernel. */ template , void>* = nullptr > syclTask on(F&& func); /** @brief updates the task to the given command group function object Similar to tf::syclFlow::on but operates on an existing task. */ template , void>* = nullptr > void on(syclTask task, F&& func); /** @brief creates a memcpy task that copies untyped data in bytes @param tgt pointer to the target memory block @param src pointer to the source memory block @param bytes bytes to copy @return a tf::syclTask handle A memcpy task transfers @c bytes of data from a source locationA @c src to a target location @c tgt. Both @c src and @c tgt may be either host or USM pointers. */ syclTask memcpy(void* tgt, const void* src, size_t bytes); /** @brief creates a memset task that fills untyped data with a byte value @param ptr pointer to the destination device memory area @param value value to set for each byte of specified memory @param bytes number of bytes to set @return a tf::syclTask handle Fills @c bytes of memory beginning at address @c ptr with @c value. @c ptr must be a USM allocation. @c value is interpreted as an unsigned char. */ syclTask memset(void* ptr, int value, size_t bytes); /** @brief creates a fill task that fills typed data with the given value @tparam T trivially copyable value type @param ptr pointer to the memory to fill @param pattern pattern value to fill into the memory @param count number of items to fill the value Creates a task that fills the specified memory with the specified value. */ template syclTask fill(void* ptr, const T& pattern, size_t count); /** @brief creates a copy task that copies typed data from a source to a target memory block @tparam T trivially copyable value type @param target pointer to the memory to fill @param source pointer to the pattern value to fill into the memory @param count number of items to fill the value Creates a task that copies @c count items of type @c T from a source memory location to a target memory location. */ template , void>* = nullptr > syclTask copy(T* target, const T* source, size_t count); /** @brief creates a kernel task @tparam ArgsT arguments types @param args arguments to forward to the parallel_for methods defined in the handler object Creates a kernel task from a parallel_for method through the handler object associated with a command group. */ template syclTask parallel_for(ArgsT&&... args); // ------------------------------------------------------------------------ // algorithms // ------------------------------------------------------------------------ /** @brief invokes a SYCL kernel function using only one thread @tparam F kernel function type @param func kernel function Creates a task that launches the given function object using only one kernel thread. */ template syclTask single_task(F&& func); /** @brief applies a callable to each dereferenced element of the data array @tparam I iterator type @tparam C callable type @param first iterator to the beginning (inclusive) @param last iterator to the end (exclusive) @param callable a callable object to apply to the dereferenced iterator @return a tf::syclTask handle This method is equivalent to the parallel execution of the following loop on a GPU: @code{.cpp} for(auto itr = first; itr != last; itr++) { callable(*itr); } @endcode */ template syclTask for_each(I first, I last, C&& callable); /** @brief applies a callable to each index in the range with the step size @tparam I index type @tparam C callable type @param first beginning index @param last last index @param step step size @param callable the callable to apply to each element in the data array @return a tf::syclTask handle This method is equivalent to the parallel execution of the following loop on a GPU: @code{.cpp} // step is positive [first, last) for(auto i=first; ilast; i+=step) { callable(i); } @endcode */ template syclTask for_each_index(I first, I last, I step, C&& callable); /** @brief applies a callable to a source range and stores the result in a target range @tparam I iterator type @tparam C callable type @tparam S source types @param first iterator to the beginning (inclusive) @param last iterator to the end (exclusive) @param callable the callable to apply to each element in the range @param srcs iterators to the source ranges @return a tf::syclTask handle This method is equivalent to the parallel execution of the following loop on a SYCL device: @code{.cpp} while (first != last) { *first++ = callable(*src1++, *src2++, *src3++, ...); } @endcode */ template syclTask transform(I first, I last, C&& callable, S... srcs); /** @brief performs parallel reduction over a range of items @tparam I input iterator type @tparam T value type @tparam C callable type @param first iterator to the beginning (inclusive) @param last iterator to the end (exclusive) @param result pointer to the result with an initialized value @param op binary reduction operator @return a tf::syclTask handle This method is equivalent to the parallel execution of the following loop on a SYCL device: @code{.cpp} while (first != last) { *result = op(*result, *first++); } @endcode */ template syclTask reduce(I first, I last, T* result, C&& op); /** @brief similar to tf::syclFlow::reduce but does not assume any initial value to reduce This method is equivalent to the parallel execution of the following loop on a SYCL device: @code{.cpp} *result = *first++; // no initial values partitipcate in the loop while (first != last) { *result = op(*result, *first++); } @endcode */ template syclTask uninitialized_reduce(I first, I last, T* result, C&& op); // ------------------------------------------------------------------------ // offload methods // ------------------------------------------------------------------------ /** @brief offloads the %syclFlow onto a GPU and repeatedly runs it until the predicate becomes true @tparam P predicate type (a binary callable) @param predicate a binary predicate (returns @c true for stop) Repetitively executes the present %syclFlow through the given queue object until the predicate returns @c true. By default, if users do not offload the %syclFlow, the executor will offload it once. */ template void offload_until(P&& predicate); /** @brief offloads the %syclFlow and executes it by the given times @param N number of executions */ void offload_n(size_t N); /** @brief offloads the %syclFlow and executes it once */ void offload(); // ------------------------------------------------------------------------ // update methods // ------------------------------------------------------------------------ /** @brief rebinds the task to a memcpy task Similar to tf::syclFlow::memcpy but operates on an existing task. */ void memcpy(syclTask task, void* tgt, const void* src, size_t bytes); /** @brief rebinds the task to a memset task Similar to tf::syclFlow::memset but operates on an existing task. */ void memset(syclTask task, void* ptr, int value, size_t bytes); /** @brief rebinds the task to a fill task Similar to tf::syclFlow::fill but operates on an existing task. */ template void fill(syclTask task, void* ptr, const T& pattern, size_t count); /** @brief rebinds the task to a copy task Similar to tf::syclFlow::copy but operates on an existing task. */ template , void>* = nullptr > void copy(syclTask task, T* target, const T* source, size_t count); /** @brief rebinds the task to a parallel-for kernel task Similar to tf::syclFlow::parallel_for but operates on an existing task. */ template void parallel_for(syclTask task, ArgsT&&... args); /** @brief rebinds the task to a single-threaded kernel task Similar to tf::syclFlow::single_task but operates on an existing task. */ template void single_task(syclTask task, F&& func); private: syclFlow(Executor&, syclGraph&, sycl::queue&); sycl::queue& _queue; handle_t _handle; syclGraph& _graph; std::vector _tpg; std::queue _bfs; }; // constructor inline syclFlow::syclFlow(sycl::queue& queue) : _queue {queue}, _handle {std::in_place_type_t{}}, _graph {std::get_if(&_handle)->graph} { } // Construct the syclFlow from executor (internal graph) inline syclFlow::syclFlow(Executor& e, syclGraph& g, sycl::queue& queue) : _queue {queue}, _handle {std::in_place_type_t{}, e}, _graph {g} { } // Function: empty inline bool syclFlow::empty() const { return _graph._nodes.empty(); } // Function: num_tasks inline size_t syclFlow::num_tasks() const { return _graph._nodes.size(); } // Procedure: dump inline void syclFlow::dump(std::ostream& os) const { _graph.dump(os, nullptr, ""); } // Procedure: clear inline void syclFlow::clear() { _graph.clear(); } // Function: memcpy inline syclTask syclFlow::memcpy(void* tgt, const void* src, size_t bytes) { return on([=](sycl::handler& h){ h.memcpy(tgt, src, bytes); }); } // Function: memset inline syclTask syclFlow::memset(void* ptr, int value, size_t bytes) { return on([=](sycl::handler& h){ h.memset(ptr, value, bytes); }); } // Function: fill template syclTask syclFlow::fill(void* ptr, const T& pattern, size_t count) { return on([=](sycl::handler& h){ h.fill(ptr, pattern, count); }); } // Function: copy template , void>* > syclTask syclFlow::copy(T* target, const T* source, size_t count) { return on([=](sycl::handler& h){ h.memcpy(target, source, count*sizeof(T)); }); } // Function: on template , void>* > syclTask syclFlow::on(F&& f) { auto node = _graph.emplace_back(_graph, std::in_place_type_t{}, std::forward(f) ); return syclTask(node); } // Function: single_task template syclTask syclFlow::single_task(F&& func) { return on([f=std::forward(func)] (sycl::handler& h) { h.single_task(f); }); } // Function: parallel_for template syclTask syclFlow::parallel_for(ArgsT&&... args) { return on([args...] (sycl::handler& h) { h.parallel_for(args...); }); } // Procedure: offload_until template void syclFlow::offload_until(P&& predicate) { if(!(_graph._state & syclGraph::TOPOLOGY_CHANGED)) { goto offload; } // levelize the graph _tpg.clear(); // insert the first level of nodes into the queue for(auto& u : _graph._nodes) { u->_level = u->_dependents.size(); if(u->_level == 0) { _bfs.push(u.get()); } } while(!_bfs.empty()) { auto u = _bfs.front(); _bfs.pop(); _tpg.push_back(u); for(auto v : u->_successors) { if(--(v->_level) == 0) { v->_level = u->_level + 1; _bfs.push(v); } } } offload: // offload the syclFlow graph bool in_order = _queue.is_in_order(); while(!predicate()) { // traverse node in a topological order for(auto u : _tpg) { switch(u->_handle.index()) { // task type 1: command group handler case syclNode::COMMAND_GROUP_HANDLER: u->_event = _queue.submit([u, in_order](sycl::handler& h){ // wait on all predecessors if(!in_order) { for(auto p : u->_dependents) { h.depends_on(p->_event); } } std::get_if(&u->_handle)->work(h); }); break; } } // synchronize the execution _queue.wait(); } _graph._state = syclGraph::OFFLOADED; } // Procedure: offload_n inline void syclFlow::offload_n(size_t n) { offload_until([repeat=n] () mutable { return repeat-- == 0; }); } // Procedure: offload inline void syclFlow::offload() { offload_until([repeat=1] () mutable { return repeat-- == 0; }); } // Function: on template , void>* > void syclFlow::on(syclTask task, F&& f) { std::get_if(&task._node->_handle)->work = std::forward(f); } // Function: memcpy inline void syclFlow::memcpy( syclTask task, void* tgt, const void* src, size_t bytes ) { on(task, [=](sycl::handler& h){ h.memcpy(tgt, src, bytes); }); } // Function: memset inline void syclFlow::memset( syclTask task, void* ptr, int value, size_t bytes ) { on(task, [=](sycl::handler& h){ h.memset(ptr, value, bytes); }); } // Function: fill template void syclFlow::fill( syclTask task, void* ptr, const T& pattern, size_t count ) { on(task, [=](sycl::handler& h){ h.fill(ptr, pattern, count); }); } // Function: copy template , void>* > void syclFlow::copy( syclTask task, T* target, const T* source, size_t count ) { on(task, [=](sycl::handler& h){ h.memcpy(target, source, count*sizeof(T));} ); } // Function: parallel_for template void syclFlow::parallel_for(syclTask task, ArgsT&&... args) { on(task, [args...] (sycl::handler& h) { h.parallel_for(args...); }); } // Function: single_task template void syclFlow::single_task(syclTask task, F&& func) { on(task, [f=std::forward(func)] (sycl::handler& h) { h.single_task(f); }); } // ############################################################################ // Forward declaration: FlowBuilder // ############################################################################ // FlowBuilder::emplace_on template , void>*> Task FlowBuilder::emplace_on(C&& callable, Q&& q) { auto n = _graph._emplace_back( std::in_place_type_t{}, [c=std::forward(callable), queue=std::forward(q)] (Executor& e, Node* p) mutable { e._invoke_syclflow_task_entry(p, c, queue); }, std::make_unique() ); return Task(n); } // FlowBuilder::emplace template , void>*> Task FlowBuilder::emplace(C&& callable) { return emplace_on(std::forward(callable), sycl::queue{}); } // ############################################################################ // Forward declaration: Executor // ############################################################################ // Procedure: _invoke_syclflow_task_entry (syclFlow) template , void>* > void Executor::_invoke_syclflow_task_entry(Node* node, C&& c, Q& queue) { auto h = std::get_if(&node->_handle); syclGraph* g = dynamic_cast(h->graph.get()); g->clear(); syclFlow sf(*this, *g, queue); c(sf); if(!(g->_state & syclGraph::OFFLOADED)) { sf.offload(); } } } // end of namespace tf -----------------------------------------------------