tf tf::IsPod tf::SmallVectorBase tf::SmallVectorStorage tf::SmallVectorTemplateCommon tf::SmallVectorTemplateBase tf::SmallVectorTemplateBase< T, true > tf::SmallVectorImpl tf::SmallVectorStorage< T, 1 > tf::SmallVectorStorage< T, 0 > tf::SmallVector tf::Graph tf::Runtime tf::TaskParams tf::DefaultTaskParams tf::Node tf::UnboundedTaskQueue tf::BoundedTaskQueue tf::FlowBuilder tf::Subflow tf::Worker tf::WorkerView tf::Executor tf::Task tf::TaskView tf::AsyncTask tf::Semaphore tf::Taskflow tf::Future tf::Segment tf::Timeline tf::ProfileData tf::ObserverInterface tf::ChromeObserver tf::TFProfObserver tf::TFProfManager tf::DefaultClosureWrapper tf::IsPartitioner tf::PartitionerBase tf::GuidedPartitioner tf::DynamicPartitioner tf::StaticPartitioner tf::RandomPartitioner tf::DeferredPipeflow tf::Pipeflow tf::Pipe tf::Pipeline tf::ScalablePipeline tf::DataPipe tf::DataPipeline tf::cudaScopedDevice tf::cudaSharedMemory tf::cudaSharedMemory< int > tf::cudaSharedMemory< unsigned int > tf::cudaSharedMemory< char > tf::cudaSharedMemory< unsigned char > tf::cudaSharedMemory< short > tf::cudaSharedMemory< unsigned short > tf::cudaSharedMemory< long > tf::cudaSharedMemory< unsigned long > tf::cudaSharedMemory< bool > tf::cudaSharedMemory< float > tf::cudaSharedMemory< double > tf::cudaDeviceAllocator tf::cudaUSMAllocator tf::cudaDeviceVector tf::cudaStreamCreator tf::cudaStreamDeleter tf::cudaStream tf::cudaEventCreator tf::cudaEventDeleter tf::cudaEvent tf::cudaTask tf::cudaFlow tf::cudaFlowOptimizerBase tf::cudaFlowSequentialOptimizer tf::cudaFlowLinearOptimizer tf::cudaFlowRoundRobinOptimizer tf::cudaFlowCapturer tf::cudaExecutionPolicy tf::detail tf::pt int TaskType PLACEHOLDER = 0 placeholder task type STATIC static task type SUBFLOW dynamic (subflow) task type CONDITION condition task type MODULE module task type ASYNC asynchronous task type UNDEFINED undefined task type (for internal use only) enumeration of all task types int ObserverType TFPROF = 0 CHROME UNDEFINED 
enumeration of all observer types int PartitionerType STATIC static partitioner type DYNAMIC dynamic partitioner type enumeration of all partitioner types int PipeType PARALLEL = 1 parallel type SERIAL = 2 serial type enumeration of all pipe types int cudaTaskType EMPTY = 0 empty task type HOST host task type MEMSET memory set task type MEMCPY memory copy task type KERNEL kernel task type SUBFLOW subflow (child graph) task type CAPTURE capture task type UNDEFINED undefined task type enumeration of all cudaTask types NonblockingNotifierV2 using tf::DefaultNotifier = typedef NonblockingNotifierV2 DefaultNotifier std::chrono::time_point< std::chrono::steady_clock > using tf::observer_stamp_t = typedef std::chrono::time_point<std::chrono::steady_clock> observer_stamp_t default time point type of observers GuidedPartitioner<> using tf::DefaultPartitioner = typedef GuidedPartitioner<> DefaultPartitioner default partitioner set to tf::GuidedPartitioner Guided partitioning algorithm can achieve stable and decent performance for most parallel algorithms. 
cudaExecutionPolicy< 512, 7 > using tf::cudaDefaultExecutionPolicy = typedef cudaExecutionPolicy<512, 7> cudaDefaultExecutionPolicy default execution policy typename P constexpr bool constexpr bool tf::is_task_params_v is_task_params_v = std::is_same_v<std::decay_t<P>, TaskParams> || std::is_same_v<std::decay_t<P>, DefaultTaskParams> || std::is_constructible_v<std::string, P> determines if the given type is a task parameter type Task parameters can be specified in one of the following types: tf::TaskParams: assign the struct of defined parameters tf::DefaultTaskParams: assign nothing std::string: assign a name to the task constexpr std::array< TaskType, 6 > constexpr std::array<TaskType, 6> tf::TASK_TYPES TASK_TYPES = { TaskType::PLACEHOLDER, TaskType::STATIC, TaskType::SUBFLOW, TaskType::CONDITION, TaskType::MODULE, TaskType::ASYNC, } array of all task types (used for iterating task types) typename C constexpr bool constexpr bool tf::is_subflow_task_v is_subflow_task_v = std::is_invocable_r_v<void, C, Subflow&> && !std::is_invocable_r_v<void, C, Runtime&> determines if a callable is a dynamic task A dynamic task is a callable object constructible from std::function<void(Subflow&)>. typename C constexpr bool constexpr bool tf::is_condition_task_v is_condition_task_v = (std::is_invocable_r_v<int, C> || std::is_invocable_r_v<int, C, Runtime&>) && !is_subflow_task_v<C> determines if a callable is a condition task A condition task is a callable object constructible from std::function<int()> or std::function<int(tf::Runtime&)>. typename C constexpr bool constexpr bool tf::is_multi_condition_task_v is_multi_condition_task_v = (std::is_invocable_r_v<SmallVector<int>, C> || std::is_invocable_r_v<SmallVector<int>, C, Runtime&>) && !is_subflow_task_v<C> determines if a callable is a multi-condition task A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()> or std::function<tf::SmallVector<int>(tf::Runtime&)>. 
typename C constexpr bool constexpr bool tf::is_static_task_v is_static_task_v = (std::is_invocable_r_v<void, C> || std::is_invocable_r_v<void, C, Runtime&>) && !is_condition_task_v<C> && !is_multi_condition_task_v<C> && !is_subflow_task_v<C> determines if a callable is a static task A static task is a callable object constructible from std::function<void()> or std::function<void(tf::Runtime&)>. typename P constexpr bool constexpr bool tf::is_partitioner_v is_partitioner_v = std::is_base_of<IsPartitioner, P>::value determines if a type is a partitioner A partitioner is a derived type from tf::PartitionerBase. typename T unsigned N N size_t static size_t tf::capacity_in_bytes (const SmallVector< T, N > &X) capacity_in_bytes const SmallVector< T, N > & X typename... ArgsT ArgsT TF_FORCE_INLINE Node * TF_FORCE_INLINE Node* tf::animate (ArgsT &&... args) animate ArgsT &&... args TF_FORCE_INLINE void TF_FORCE_INLINE void tf::recycle (Node *ptr) recycle Node * ptr const char * const char* tf::to_string (TaskType type) to_string TaskType type convert a task type to a human-readable string The name of each task type is the lower-case string of its characters. TaskType::PLACEHOLDER->"placeholder" TaskType::STATIC->"static" TaskType::SUBFLOW->"subflow" TaskType::CONDITION->"condition" TaskType::MODULE->"module" TaskType::ASYNC->"async" std::ostream & std::ostream& tf::operator<< (std::ostream &os, const Task &task) operator<< std::ostream & os const Task & task overload of ostream inserter operator for Task typename I std::enable_if_t< std::is_same_v< deref_t< I >, Semaphore >, void > * nullptr bool bool tf::try_acquire (I first, I last) try_acquire I first I last tries to acquire all semaphores in the specified range I iterator type first iterator to the beginning (inclusive) last iterator to the end (exclusive) Tries to acquire all semaphores in the specified range. true if all semaphores are acquired, otherwise false typename... 
S S std::enable_if_t< all_same_v< Semaphore, std::decay_t< S >... >, void > * nullptr bool bool tf::try_acquire (S &&... semaphores) try_acquire S &&... semaphores tries to acquire all semaphores semaphores semaphores to acquire Tries to acquire all the semaphores. true if all semaphores are acquired, otherwise false typename I std::enable_if_t< std::is_same_v< deref_t< I >, Semaphore >, void > * nullptr void void tf::release (I first, I last) release I first I last releases all semaphores in the specified range I iterator type first iterator to the beginning (inclusive) last iterator to the end (exclusive) Releases all the semaphores in the given range. typename... S S std::enable_if_t< all_same_v< Semaphore, std::decay_t< S >... >, void > * nullptr void void tf::release (S &&... semaphores) release S &&... semaphores releases all semaphores semaphores semaphores to release Releases all the semaphores. const char * const char* tf::to_string (ObserverType type) to_string ObserverType type convert an observer type to a human-readable string typename Input typename Output typename C auto auto tf::make_data_pipe (PipeType d, C &&callable) make_data_pipe PipeType d C && callable function to construct a data pipe (tf::DataPipe) Input input data type Output output data type C callable type tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) in a data-parallel pipeline (tf::DataPipeline). The first argument specifies the direction of the data pipe, either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable to invoke by the pipeline scheduler. Input and output data types are specified via template parameters, which will always be decayed by the library to their original form for storage purposes. The callable must take the input data type in its first argument and return a value of the output data type. 
tf::make_data_pipe<int, std::string>( tf::PipeType::SERIAL, [](int& input){ return std::to_string(input + 100); } ); The callable can additionally take a reference of tf::Pipeflow, which allows you to query the runtime information of a stage task, such as its line number and token number. tf::make_data_pipe<int, std::string>( tf::PipeType::SERIAL, [](int& input, tf::Pipeflow& pf){ printf("token=%lu,line=%lu\n", pf.token(), pf.line()); return std::to_string(input + 100); } ); size_t size_t tf::cuda_get_num_devices () cuda_get_num_devices queries the number of available devices int int tf::cuda_get_device () cuda_get_device gets the current device associated with the caller thread void void tf::cuda_set_device (int id) cuda_set_device int id switches to a given device context void void tf::cuda_get_device_property (int i, cudaDeviceProp &p) cuda_get_device_property int i cudaDeviceProp & p obtains the device property cudaDeviceProp cudaDeviceProp tf::cuda_get_device_property (int i) cuda_get_device_property int i obtains the device property void void tf::cuda_dump_device_property (std::ostream &os, const cudaDeviceProp &p) cuda_dump_device_property std::ostream & os const cudaDeviceProp & p dumps the device property size_t size_t tf::cuda_get_device_max_threads_per_block (int d) cuda_get_device_max_threads_per_block int d queries the maximum threads per block on a device size_t size_t tf::cuda_get_device_max_x_dim_per_block (int d) cuda_get_device_max_x_dim_per_block int d queries the maximum x-dimension per block on a device size_t size_t tf::cuda_get_device_max_y_dim_per_block (int d) cuda_get_device_max_y_dim_per_block int d queries the maximum y-dimension per block on a device size_t size_t tf::cuda_get_device_max_z_dim_per_block (int d) cuda_get_device_max_z_dim_per_block int d queries the maximum z-dimension per block on a device size_t size_t tf::cuda_get_device_max_x_dim_per_grid (int d) cuda_get_device_max_x_dim_per_grid int d queries the maximum x-dimension per grid on a 
device size_t size_t tf::cuda_get_device_max_y_dim_per_grid (int d) cuda_get_device_max_y_dim_per_grid int d queries the maximum y-dimension per grid on a device size_t size_t tf::cuda_get_device_max_z_dim_per_grid (int d) cuda_get_device_max_z_dim_per_grid int d queries the maximum z-dimension per grid on a device size_t size_t tf::cuda_get_device_max_shm_per_block (int d) cuda_get_device_max_shm_per_block int d queries the maximum shared memory size in bytes per block on a device size_t size_t tf::cuda_get_device_warp_size (int d) cuda_get_device_warp_size int d queries the warp size on a device int int tf::cuda_get_device_compute_capability_major (int d) cuda_get_device_compute_capability_major int d queries the major number of compute capability of a device int int tf::cuda_get_device_compute_capability_minor (int d) cuda_get_device_compute_capability_minor int d queries the minor number of compute capability of a device bool bool tf::cuda_get_device_unified_addressing (int d) cuda_get_device_unified_addressing int d queries if the device supports unified addressing int int tf::cuda_get_driver_version () cuda_get_driver_version queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver int int tf::cuda_get_runtime_version () cuda_get_runtime_version queries the CUDA Runtime version (1000 * major + 10 * minor) size_t size_t tf::cuda_get_free_mem (int d) cuda_get_free_mem int d queries the free memory (expensive call) size_t size_t tf::cuda_get_total_mem (int d) cuda_get_total_mem int d queries the total available memory (expensive call) typename T T * T* tf::cuda_malloc_device (size_t N, int d) cuda_malloc_device size_t N int d allocates memory on the given device for holding N elements of type T The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory. 
typename T T * T* tf::cuda_malloc_device (size_t N) cuda_malloc_device size_t N allocates memory on the current device associated with the caller The function calls tf::cuda_malloc_device using the current device associated with the caller. typename T T * T* tf::cuda_malloc_shared (size_t N) cuda_malloc_shared size_t N allocates shared memory for holding N elements of type T The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory. typename T void void tf::cuda_free (T *ptr, int d) cuda_free T * ptr int d frees memory on the GPU device T pointer type ptr device pointer to memory to free d device context identifier This method calls cudaFree to free the memory space pointed to by ptr using the given device context. typename T void void tf::cuda_free (T *ptr) cuda_free T * ptr frees memory on the GPU device T pointer type ptr device pointer to memory to free This method calls cudaFree to free the memory space pointed to by ptr using the current device context of the caller. void void tf::cuda_memcpy_async (cudaStream_t stream, void *dst, const void *src, size_t count) cuda_memcpy_async cudaStream_t stream void * dst const void * src size_t count copies data between host and device asynchronously through a stream stream stream identifier dst destination memory address src source memory address count size in bytes to copy The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap. 
void void tf::cuda_memset_async (cudaStream_t stream, void *devPtr, int value, size_t count) cuda_memset_async cudaStream_t stream void * devPtr int value size_t count initializes or sets GPU memory to the given value byte by byte stream stream identifier devPtr pointer to GPU memory value value to set for each byte of the specified memory count size in bytes to set The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value. constexpr const char * constexpr const char* tf::to_string (cudaTaskType type) to_string cudaTaskType type convert a cuda_task type to a human-readable string std::ostream & std::ostream& tf::operator<< (std::ostream &os, const cudaTask &ct) operator<< std::ostream & os const cudaTask & ct overload of ostream inserter operator for cudaTask typename P typename C void void tf::cuda_single_task (P &&p, C c) cuda_single_task P && p C c runs a callable asynchronously using one kernel thread P execution policy type C closure type p execution policy c closure to run by one kernel thread The function launches a single kernel thread to run the given callable through the stream in the execution policy object. 
typename P typename I typename C void void tf::cuda_for_each (P &&p, I first, I last, C c) cuda_for_each P && p I first I last C c performs asynchronous parallel iterations over a range of items P execution policy type I input iterator type C unary operator type p execution policy object first iterator to the beginning of the range last iterator to the end of the range c unary operator to apply to each dereferenced iterator This function is equivalent to a parallel execution of the following loop on a GPU: for(auto itr = first; itr != last; itr++) { c(*itr); } typename P typename I typename C void void tf::cuda_for_each_index (P &&p, I first, I last, I inc, C c) cuda_for_each_index P && p I first I last I inc C c performs asynchronous parallel iterations over an index-based range of items P execution policy type I input index type C unary operator type p execution policy object first index to the beginning of the range last index to the end of the range inc step size between successive iterations c unary operator to apply to each index This function is equivalent to a parallel execution of the following loop on a GPU: // inc is positive: [first, last) for(auto i = first; i < last; i += inc) { c(i); } // inc is negative: [first, last) for(auto i = first; i > last; i += inc) { c(i); } typename C __global__ void __global__ void tf::cuda_single_task (C callable) cuda_single_task C callable typename P typename I typename O typename C void void tf::cuda_transform (P &&p, I first, I last, O output, C op) cuda_transform P && p I first I last O output C op performs asynchronous parallel transforms over a range of items P execution policy type I input iterator type O output iterator type C unary operator type p execution policy first iterator to the beginning of the range last iterator to the end of the range output iterator to the beginning of the output range op unary operator to apply to transform each item This method is equivalent to the parallel execution of the following loop on a GPU: 
while(first!=last){ *output++=op(*first++); } typename P typename I1 typename I2 typename O typename C void void tf::cuda_transform (P &&p, I1 first1, I1 last1, I2 first2, O output, C op) cuda_transform P && p I1 first1 I1 last1 I2 first2 O output C op performs asynchronous parallel transforms over two ranges of items P execution policy type I1 first input iterator type I2 second input iterator type O output iterator type C binary operator type p execution policy first1 iterator to the beginning of the first range last1 iterator to the end of the first range first2 iterator to the beginning of the second range output iterator to the beginning of the output range op binary operator to apply to transform each pair of items This method is equivalent to the parallel execution of the following loop on a GPU: while(first1!=last1){ *output++=op(*first1++,*first2++); } typename P typename I typename T typename O void void tf::cuda_reduce (P &&p, I first, I last, T *res, O op, void *buf) cuda_reduce P && p I first I last T * res O op void * buf performs asynchronous parallel reduction over a range of items P execution policy type I input iterator type T value type O binary operator type p execution policy first iterator to the beginning of the range last iterator to the end of the range res pointer to the result op binary operator to apply to reduce elements buf pointer to the temporary buffer This method is equivalent to the parallel execution of the following loop on a GPU: while(first!=last){ *result=op(*result,*first++); } typename P typename I typename T typename O void void tf::cuda_uninitialized_reduce (P &&p, I first, I last, T *res, O op, void *buf) cuda_uninitialized_reduce P && p I first I last T * res O op void * buf performs asynchronous parallel reduction over a range of items without an initial value P execution policy type I input iterator type T value type O binary operator type p execution policy first iterator to the beginning of the range last iterator 
to the end of the range res pointer to the result op binary operator to apply to reduce elements buf pointer to the temporary buffer This method is equivalent to the parallel execution of the following loop on a GPU: *result = *first++; // no initial values participate in the loop while(first != last) { *result = op(*result, *first++); } typename P typename I typename T typename O typename U void void tf::cuda_transform_reduce (P &&p, I first, I last, T *res, O bop, U uop, void *buf) cuda_transform_reduce P && p I first I last T * res O bop U uop void * buf performs asynchronous parallel reduction over a range of transformed items with an initial value P execution policy type I input iterator type T value type O binary operator type U unary operator type p execution policy first iterator to the beginning of the range last iterator to the end of the range res pointer to the result bop binary operator to apply to reduce elements uop unary operator to apply to transform elements buf pointer to the temporary buffer This method is equivalent to the parallel execution of the following loop on a GPU: while(first != last) { *result = bop(*result, uop(*first++)); } typename P typename I typename T typename O typename U void void tf::cuda_uninitialized_transform_reduce (P &&p, I first, I last, T *res, O bop, U uop, void *buf) cuda_uninitialized_transform_reduce P && p I first I last T * res O bop U uop void * buf performs asynchronous parallel reduction over a range of transformed items without an initial value P execution policy type I input iterator type T value type O binary operator type U unary operator type p execution policy first iterator to the beginning of the range last iterator to the end of the range res pointer to the result bop binary operator to apply to reduce elements uop unary operator to apply to transform elements buf pointer to the temporary buffer This method is equivalent to the parallel execution of the following loop on a GPU: 
*result = uop(*first++); // no initial values participate in the loop while(first != last) { *result = bop(*result, uop(*first++)); } typename P typename I typename O typename C void void tf::cuda_inclusive_scan (P &&p, I first, I last, O output, C op, void *buf) cuda_inclusive_scan P && p I first I last O output C op void * buf performs asynchronous inclusive scan over a range of items P execution policy type I input iterator O output iterator C binary operator type p execution policy first iterator to the beginning of the input range last iterator to the end of the input range output iterator to the beginning of the output range op binary operator to apply to scan buf pointer to the temporary buffer typename P typename I typename O typename C typename U void void tf::cuda_transform_inclusive_scan (P &&p, I first, I last, O output, C bop, U uop, void *buf) cuda_transform_inclusive_scan P && p I first I last O output C bop U uop void * buf performs asynchronous inclusive scan over a range of transformed items P execution policy type I input iterator O output iterator C binary operator type U unary operator type p execution policy first iterator to the beginning of the input range last iterator to the end of the input range output iterator to the beginning of the output range bop binary operator to apply to scan uop unary operator to apply to transform each item before scan buf pointer to the temporary buffer typename P typename I typename O typename C void void tf::cuda_exclusive_scan (P &&p, I first, I last, O output, C op, void *buf) cuda_exclusive_scan P && p I first I last O output C op void * buf performs asynchronous exclusive scan over a range of items P execution policy type I input iterator O output iterator C binary operator type p execution policy first iterator to the beginning of the input range last iterator to the end of the input range output iterator to the beginning of the output range op binary operator to apply to scan buf pointer to the temporary buffer 
typename P typename I typename O typename C typename U void void tf::cuda_transform_exclusive_scan (P &&p, I first, I last, O output, C bop, U uop, void *buf) cuda_transform_exclusive_scan P && p I first I last O output C bop U uop void * buf performs asynchronous exclusive scan over a range of transformed items P execution policy type I input iterator O output iterator C binary operator type U unary operator type p execution policy first iterator to the beginning of the input range last iterator to the end of the input range output iterator to the beginning of the output range bop binary operator to apply to scan uop unary operator to apply to transform each item before scan buf pointer to the temporary buffer typename P typename a_keys_it typename a_vals_it typename b_keys_it typename b_vals_it typename c_keys_it typename c_vals_it typename C void void tf::cuda_merge_by_key (P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void *buf) cuda_merge_by_key P && p a_keys_it a_keys_first a_keys_it a_keys_last a_vals_it a_vals_first b_keys_it b_keys_first b_keys_it b_keys_last b_vals_it b_vals_first c_keys_it c_keys_first c_vals_it c_vals_first C comp void * buf performs asynchronous key-value merge over a range of keys and values P execution policy type a_keys_it first key iterator type a_vals_it first value iterator type b_keys_it second key iterator type b_vals_it second value iterator type c_keys_it output key iterator type c_vals_it output value iterator type C comparator type p execution policy a_keys_first iterator to the beginning of the first key range a_keys_last iterator to the end of the first key range a_vals_first iterator to the beginning of the first value range b_keys_first iterator to the beginning of the second key range b_keys_last iterator to the end of the second key range b_vals_first iterator to the 
beginning of the second value range c_keys_first iterator to the beginning of the output key range c_vals_first iterator to the beginning of the output value range comp comparator buf pointer to the temporary buffer Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order. At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key. For example, assume: a_keys = {1, 8}; a_vals = {2, 1}; b_keys = {3, 7}; b_vals = {3, 4}; After the merge, we have: c_keys = {1, 3, 7, 8} c_vals = {2, 3, 4, 1} typename P typename a_keys_it typename b_keys_it typename c_keys_it typename C void void tf::cuda_merge (P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void *buf) cuda_merge P && p a_keys_it a_keys_first a_keys_it a_keys_last b_keys_it b_keys_first b_keys_it b_keys_last c_keys_it c_keys_first C comp void * buf performs asynchronous key-only merge over a range of keys P execution policy type a_keys_it first key iterator type b_keys_it second key iterator type c_keys_it output key iterator type C comparator type p execution policy a_keys_first iterator to the beginning of the first key range a_keys_last iterator to the end of the first key range b_keys_first iterator to the beginning of the second key range b_keys_last iterator to the end of the second key range c_keys_first iterator to the beginning of the output key range comp comparator buf pointer to the 
temporary buffer This function is equivalent to tf::cuda_merge_by_key without values. typename P typename K typename V cudaEmpty unsigned unsigned tf::cuda_sort_buffer_size (unsigned count) cuda_sort_buffer_size unsigned count queries the buffer size in bytes needed to call sort kernels for the given number of elements P execution policy type K key type V value type (default tf::cudaEmpty) count number of keys/values to sort The function is used to allocate a buffer for calling tf::cuda_sort. typename P typename K_it typename V_it typename C void void tf::cuda_sort_by_key (P &&p, K_it k_first, K_it k_last, V_it v_first, C comp, void *buf) cuda_sort_by_key P && p K_it k_first K_it k_last V_it v_first C comp void * buf performs asynchronous key-value sort on a range of items P execution policy type K_it key iterator type V_it value iterator type C comparator type p execution policy k_first iterator to the beginning of the key range k_last iterator to the end of the key range v_first iterator to the beginning of the value range comp binary comparator buf pointer to the temporary buffer Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false. 
For example, assume: keys are {1, 4, 2, 8, 5, 7} values are {'a', 'b', 'c', 'd', 'e', 'f'} After sort: keys are {1, 2, 4, 5, 7, 8} values are {'a', 'c', 'b', 'e', 'f', 'd'} typename P typename K_it typename C void void tf::cuda_sort (P &&p, K_it k_first, K_it k_last, C comp, void *buf) cuda_sort P && p K_it k_first K_it k_last C comp void * buf performs asynchronous key-only sort on a range of items P execution policy type K_it key iterator type C comparator type p execution policy k_first iterator to the beginning of the key range k_last iterator to the end of the key range comp binary comparator buf pointer to the temporary buffer This method is equivalent to tf::cuda_sort_by_key without values. typename P typename I typename U void void tf::cuda_find_if (P &&p, I first, I last, unsigned *idx, U op) cuda_find_if P && p I first I last unsigned * idx U op finds the index of the first element that satisfies the given criteria P execution policy type I input iterator type U unary operator type p execution policy first iterator to the beginning of the range last iterator to the end of the range idx pointer to the index of the found element op unary operator which returns true for the required element The function launches kernels asynchronously to find the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. 
This is equivalent to the parallel execution of the following loop: unsigned idx = 0; for(; first != last; ++first, ++idx) { if(op(*first)) { return idx; } } return idx; typename P typename I typename O void void tf::cuda_min_element (P &&p, I first, I last, unsigned *idx, O op, void *buf) cuda_min_element P && p I first I last unsigned * idx O op void * buf finds the index of the minimum element in a range P execution policy type I input iterator type O comparator type p execution policy object first iterator to the beginning of the range last iterator to the end of the range idx solution index of the minimum element op comparison function object buf pointer to the buffer The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_min_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop: if(first == last) { return 0; } auto smallest = first; for(++first; first != last; ++first) { if(op(*first, *smallest)) { smallest = first; } } return std::distance(first, smallest); typename P typename I typename O void void tf::cuda_max_element (P &&p, I first, I last, unsigned *idx, O op, void *buf) cuda_max_element P && p I first I last unsigned * idx O op void * buf finds the index of the maximum element in a range P execution policy type I input iterator type O comparator type p execution policy object first iterator to the beginning of the range last iterator to the end of the range idx solution index of the maximum element op comparison function object buf pointer to the buffer The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_max_element_bufsz bytes for internal use. 
The function is equivalent to a parallel execution of the following loop: if(first == last) { return 0; } auto largest = first; for(++first; first != last; ++first) { if(op(*largest, *first)) { largest = first; } } return std::distance(first, largest); constexpr const char * constexpr const char* tf::version () version queries the version information in a string format major.minor.patch Release notes are available here: https://taskflow.github.io/taskflow/Releases.html taskflow namespace