tf::detail tf::detail::cudaBlockReduce tf::detail::cudaScanResult tf::detail::cudaScanResult< T, vt, true > tf::detail::cudaBlockScan tf::detail::cudaMergePair tf::detail::cudaMergeRange tf::detail::cudaBlockSort tf::detail::cudaFindPair int cudaScanType EXCLUSIVE = 1 INCLUSIVE cudaMergeBoundType LOWER UPPER merge bound type constexpr unsigned constexpr unsigned tf::detail::cudaScanRecursionThreshold cudaScanRecursionThreshold = 8 uint64_t uint64_t tf::detail::NextCapacity (uint64_t A) NextCapacity uint64_t A NextCapacity - Returns the next power of two (in 64 bits) that is strictly greater than A. Returns zero on overflow. This function assumes A to be positive. size_t nt nt size_t vt vt typename I typename C __global__ void __global__ void tf::detail::cuda_for_each_kernel (I first, unsigned count, C c) cuda_for_each_kernel I first unsigned count C c size_t nt nt size_t vt vt typename I typename C __global__ void __global__ void tf::detail::cuda_for_each_index_kernel (I first, I inc, unsigned count, C c) cuda_for_each_index_kernel I first I inc unsigned count C c size_t nt nt size_t vt vt typename I typename O typename C __global__ void __global__ void tf::detail::cuda_transform_kernel (I first, unsigned count, O output, C op) cuda_transform_kernel I first unsigned count O output C op size_t nt nt size_t vt vt typename I1 typename I2 typename O typename C __global__ void __global__ void tf::detail::cuda_transform_kernel (I1 first1, I2 first2, unsigned count, O output, C op) cuda_transform_kernel I1 first1 I2 first2 unsigned count O output C op size_t nt nt size_t vt vt typename I typename T typename O __global__ void __global__ void tf::detail::cuda_reduce_kernel (I input, unsigned count, T *res, O op, void *ptr) cuda_reduce_kernel I input unsigned count T * res O op void * ptr typename P typename I typename T typename O void void tf::detail::cuda_reduce_loop (P &&p, I input, unsigned count, T *res, O op, void *ptr) cuda_reduce_loop P && p I input unsigned count T 
* res O op void * ptr size_t nt nt size_t vt vt typename I typename T typename O __global__ void __global__ void tf::detail::cuda_uninitialized_reduce_kernel (I input, unsigned count, T *res, O op, void *ptr) cuda_uninitialized_reduce_kernel I input unsigned count T * res O op void * ptr typename P typename I typename T typename O void void tf::detail::cuda_uninitialized_reduce_loop (P &&p, I input, unsigned count, T *res, O op, void *ptr) cuda_uninitialized_reduce_loop P && p I input unsigned count T * res O op void * ptr typename P typename I typename O typename C void void tf::detail::cuda_single_pass_scan (P &&p, cudaScanType scan_type, I input, unsigned count, O output, C op) cuda_single_pass_scan P && p cudaScanType scan_type I input unsigned count O output C op single-pass scan for small input typename P typename I typename O typename C void void tf::detail::cuda_scan_loop (P &&p, cudaScanType scan_type, I input, unsigned count, O output, C op, void *ptr) cuda_scan_loop P && p cudaScanType scan_type I input unsigned count O output C op void * ptr main scan loop cudaMergeBoundType bounds bounds cudaMergeBoundType::LOWER typename a_keys_it typename b_keys_it typename comp_t __device__ auto __device__ auto tf::detail::cuda_merge_path (a_keys_it a_keys, unsigned a_count, b_keys_it b_keys, unsigned b_count, unsigned diag, comp_t comp) cuda_merge_path a_keys_it a_keys unsigned a_count b_keys_it b_keys unsigned b_count unsigned diag comp_t comp cudaMergeBoundType bounds bounds typename keys_it typename comp_t __device__ auto __device__ auto tf::detail::cuda_merge_path (keys_it keys, cudaMergeRange range, unsigned diag, comp_t comp) cuda_merge_path keys_it keys cudaMergeRange range unsigned diag comp_t comp cudaMergeBoundType bounds bounds bool range_check range_check typename T typename comp_t __device__ bool __device__ bool tf::detail::cuda_merge_predicate (T a_key, T b_key, cudaMergeRange range, comp_t comp) cuda_merge_predicate T a_key T b_key cudaMergeRange 
range comp_t comp __device__ auto __device__ auto tf::detail::cuda_compute_merge_range (unsigned a_count, unsigned b_count, unsigned partition, unsigned spacing, unsigned mp0, unsigned mp1) cuda_compute_merge_range unsigned a_count unsigned b_count unsigned partition unsigned spacing unsigned mp0 unsigned mp1 unsigned nt nt unsigned vt vt typename T __device__ auto __device__ auto tf::detail::cuda_load_two_streams_reg (const T *a, unsigned a_count, const T *b, unsigned b_count, unsigned tid) cuda_load_two_streams_reg const T * a unsigned a_count const T * b unsigned b_count unsigned tid Specialization that emits just one LD instruction. Can only be reliably used with raw pointer types. Fixed not to use pointer arithmetic so that we don't get undefined behavior with unaligned types. unsigned nt nt unsigned vt vt typename T typename a_it typename b_it __device__ std::enable_if_t< !(std::is_pointer< a_it >::value &&std::is_pointer< b_it >::value), cudaArray< T, vt >> __device__ std::enable_if_t< !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), cudaArray<T, vt>> tf::detail::load_two_streams_reg (a_it a, unsigned a_count, b_it b, unsigned b_count, unsigned tid) load_two_streams_reg a_it a unsigned a_count b_it b unsigned b_count unsigned tid unsigned nt nt unsigned vt vt typename A typename B typename T unsigned S S __device__ void __device__ void tf::detail::cuda_load_two_streams_shared (A a, unsigned a_count, B b, unsigned b_count, unsigned tid, T(&shared)[S], bool sync=true) cuda_load_two_streams_shared A a unsigned a_count B b unsigned b_count unsigned tid T(&) shared [S] bool sync true unsigned nt nt unsigned vt vt typename T __device__ auto __device__ auto tf::detail::cuda_gather_two_streams_strided (const T *a, unsigned a_count, const T *b, unsigned b_count, cudaArray< unsigned, vt > indices, unsigned tid) cuda_gather_two_streams_strided const T * a unsigned a_count const T * b unsigned b_count cudaArray< unsigned, vt > indices unsigned tid unsigned 
nt nt unsigned vt vt typename T typename a_it typename b_it __device__ std::enable_if_t< !(std::is_pointer< a_it >::value &&std::is_pointer< b_it >::value), cudaArray< T, vt >> __device__ std::enable_if_t< !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), cudaArray<T, vt>> tf::detail::cuda_gather_two_streams_strided (a_it a, unsigned a_count, b_it b, unsigned b_count, cudaArray< unsigned, vt > indices, unsigned tid) cuda_gather_two_streams_strided a_it a unsigned a_count b_it b unsigned b_count cudaArray< unsigned, vt > indices unsigned tid unsigned nt nt unsigned vt vt typename a_it typename b_it typename c_it __device__ void __device__ void tf::detail::cuda_transfer_two_streams_strided (a_it a, unsigned a_count, b_it b, unsigned b_count, cudaArray< unsigned, vt > indices, unsigned tid, c_it c) cuda_transfer_two_streams_strided a_it a unsigned a_count b_it b unsigned b_count cudaArray< unsigned, vt > indices unsigned tid c_it c cudaMergeBoundType bounds bounds unsigned vt vt typename T typename comp_t __device__ auto __device__ auto tf::detail::cuda_serial_merge (const T *keys_shared, cudaMergeRange range, comp_t comp, bool sync=true) cuda_serial_merge const T * keys_shared cudaMergeRange range comp_t comp bool sync true This function must be able to dereference keys[a_begin] and keys[b_begin], no matter the indices for each. The caller should allocate at least nt * vt + 1 elements for keys_shared. cudaMergeBoundType bounds bounds unsigned nt nt unsigned vt vt typename a_it typename b_it typename T typename comp_t unsigned S S __device__ auto __device__ auto tf::detail::block_merge_from_mem (a_it a, b_it b, cudaMergeRange range_mem, unsigned tid, comp_t comp, T(&keys_shared)[S]) block_merge_from_mem a_it a b_it b cudaMergeRange range_mem unsigned tid comp_t comp T(&) keys_shared [S] Load arrays a and b from global memory and merge into register. 
cudaMergeBoundType bounds bounds typename P typename a_keys_it typename b_keys_it typename comp_t void void tf::detail::cuda_merge_path_partitions (P &&p, a_keys_it a, unsigned a_count, b_keys_it b, unsigned b_count, unsigned spacing, comp_t comp, unsigned *buf) cuda_merge_path_partitions P && p a_keys_it a unsigned a_count b_keys_it b unsigned b_count unsigned spacing comp_t comp unsigned * buf typename P typename a_keys_it typename a_vals_it typename b_keys_it typename b_vals_it typename c_keys_it typename c_vals_it typename comp_t void void tf::detail::cuda_merge_loop (P &&p, a_keys_it a_keys, a_vals_it a_vals, unsigned a_count, b_keys_it b_keys, b_vals_it b_vals, unsigned b_count, c_keys_it c_keys, c_vals_it c_vals, comp_t comp, void *ptr) cuda_merge_loop P && p a_keys_it a_keys a_vals_it a_vals unsigned a_count b_keys_it b_keys b_vals_it b_vals unsigned b_count c_keys_it c_keys c_vals_it c_vals comp_t comp void * ptr constexpr int constexpr int tf::detail::cuda_clz (int x) cuda_clz int x Counts the number of leading zeros starting from the most significant bit. constexpr int constexpr int tf::detail::cuda_find_log2 (int x, bool round_up=false) cuda_find_log2 int x bool round_up false Finds log2(x) and optionally rounds up to the next integer logarithm. 
typename T unsigned vt vt typename C __device__ auto __device__ auto tf::detail::cuda_odd_even_sort (cudaArray< T, vt > x, C comp, int flags=0) cuda_odd_even_sort cudaArray< T, vt > x C comp int flags 0 typename K typename V unsigned vt vt typename C __device__ auto __device__ auto tf::detail::cuda_odd_even_sort (cudaKVArray< K, V, vt > x, C comp, int flags=0) cuda_odd_even_sort cudaKVArray< K, V, vt > x C comp int flags 0 __device__ int __device__ int tf::detail::cuda_out_of_range_flags (int first, int vt, int count) cuda_out_of_range_flags int first int vt int count __device__ auto __device__ auto tf::detail::cuda_compute_merge_sort_frame (unsigned partition, unsigned coop, unsigned spacing) cuda_compute_merge_sort_frame unsigned partition unsigned coop unsigned spacing __device__ auto __device__ auto tf::detail::cuda_compute_merge_sort_range (unsigned count, unsigned partition, unsigned coop, unsigned spacing) cuda_compute_merge_sort_range unsigned count unsigned partition unsigned coop unsigned spacing __device__ auto __device__ auto tf::detail::cuda_compute_merge_sort_range (unsigned count, unsigned partition, unsigned coop, unsigned spacing, unsigned mp0, unsigned mp1) cuda_compute_merge_sort_range unsigned count unsigned partition unsigned coop unsigned spacing unsigned mp0 unsigned mp1 typename P typename K typename C void void tf::detail::cuda_merge_sort_partitions (P &&p, K keys, unsigned count, unsigned coop, unsigned spacing, C comp, unsigned *buf) cuda_merge_sort_partitions P && p K keys unsigned count unsigned coop unsigned spacing C comp unsigned * buf typename P typename K_it typename V_it typename C void void tf::detail::merge_sort_loop (P &&p, K_it keys_input, V_it vals_input, unsigned count, C comp, void *buf) merge_sort_loop P && p K_it keys_input V_it vals_input unsigned count C comp void * buf typename P typename I typename U void void tf::detail::cuda_find_if_loop (P &&p, I input, unsigned count, unsigned *idx, U pred) cuda_find_if_loop P && 
p I input unsigned count unsigned * idx U pred typename P typename I typename O void void tf::detail::cuda_min_element_loop (P &&p, I input, unsigned count, unsigned *idx, O op, void *ptr) cuda_min_element_loop P && p I input unsigned count unsigned * idx O op void * ptr typename P typename I typename O void void tf::detail::cuda_max_element_loop (P &&p, I input, unsigned count, unsigned *idx, O op, void *ptr) cuda_max_element_loop P && p I input unsigned count unsigned * idx O op void * ptr