#pragma once #include "sycl_execution_policy.hpp" namespace tf { // default warp size inline constexpr unsigned SYCL_WARP_SIZE = 32; // empty type struct syclEmpty { }; // ---------------------------------------------------------------------------- // iterator unrolling // ---------------------------------------------------------------------------- // Template unrolled looping construct. template struct syclIterate { template static void eval(F f) { f(i); syclIterate::eval(f); } }; template struct syclIterate { template static void eval(F) { } }; template void sycl_iterate(F f) { syclIterate::eval(f); } template void sycl_iterate(F f) { sycl_iterate<0, count>(f); } template T reduce(const T(&x)[count]) { T y; sycl_iterate([&](auto i) { y = i ? x[i] + y : x[i]; }); return y; } template void fill(T(&x)[count], T val) { sycl_iterate([&](auto i) { x[i] = val; }); } // Invoke unconditionally. template void sycl_strided_iterate(F f, unsigned tid) { sycl_iterate([=](auto i) { f(i, nt * i + tid); }); } // Check range. template void sycl_strided_iterate(F f, unsigned tid, unsigned count) { // Unroll the first vt0 elements of each thread. 
if(vt0 > 1 && count >= nt * vt0) { sycl_strided_iterate(f, tid); // No checking } else { sycl_iterate([=](auto i) { auto j = nt * i + tid; if(j < count) f(i, j); }); } // TODO: seems dummy when vt0 == vt sycl_iterate([=](auto i) { auto j = nt * i + tid; if(j < count) f(i, j); }); } template void sycl_thread_iterate(F f, unsigned tid) { sycl_iterate([=](auto i) { f(i, vt * tid + i); }); } // ---------------------------------------------------------------------------- // syclRange // ---------------------------------------------------------------------------- // syclRange struct syclRange { unsigned begin, end; unsigned size() const { return end - begin; } unsigned count() const { return size(); } bool valid() const { return end > begin; } }; inline syclRange sycl_get_tile(unsigned b, unsigned nv, unsigned count) { return syclRange { nv * b, std::min(count, nv * (b + 1)) }; } // ---------------------------------------------------------------------------- // syclArray // ---------------------------------------------------------------------------- template struct syclArray { T data[size]; T operator[](unsigned i) const { return data[i]; } T& operator[](unsigned i) { return data[i]; } syclArray() = default; syclArray(const syclArray&) = default; syclArray& operator=(const syclArray&) = default; // Fill the array with x. 
syclArray(T x) { sycl_iterate([&](unsigned i) { data[i] = x; }); } }; template struct syclArray { T operator[](unsigned) const { return T(); } T& operator[](unsigned) { return *(T*)nullptr; } }; template struct syclKVArray { syclArray keys; syclArray vals; }; // ---------------------------------------------------------------------------- // thread reg <-> global mem // ---------------------------------------------------------------------------- template auto sycl_mem_to_reg_strided(I mem, unsigned tid, unsigned count) { using T = typename std::iterator_traits::value_type; syclArray x; sycl_strided_iterate( [&](auto i, auto j) { x[i] = mem[j]; }, tid, count ); return x; } template void sycl_reg_to_mem_strided( syclArray x, unsigned tid, unsigned count, it_t mem) { sycl_strided_iterate( [=](auto i, auto j) { mem[j] = x[i]; }, tid, count ); } template auto sycl_transform_mem_to_reg_strided( I mem, unsigned tid, unsigned count, O op ) { using T = std::invoke_result_t::value_type>; syclArray x; sycl_strided_iterate( [&](auto i, auto j) { x[i] = op(mem[j]); }, tid, count ); return x; } // ---------------------------------------------------------------------------- // thread reg <-> shared // ---------------------------------------------------------------------------- //template //void sycl_reg_to_shared_thread( // syclArray x, unsigned tid, T (&shared)[shared_size], bool sync = true //) { // // static_assert(shared_size >= nt * vt, // "reg_to_shared_thread must have at least nt * vt storage"); // // sycl_thread_iterate([&](auto i, auto j) { shared[j] = x[i]; }, tid); // // if(sync) __syncthreads(); //} // //template //auto sycl_shared_to_reg_thread( // const T (&shared)[shared_size], unsigned tid, bool sync = true //) { // // static_assert(shared_size >= nt * vt, // "reg_to_shared_thread must have at least nt * vt storage"); // // syclArray x; // sycl_thread_iterate([&](auto i, auto j) { // x[i] = shared[j]; // }, tid); // // if(sync) __syncthreads(); // // return x; //} 
//
//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
//void sycl_reg_to_shared_strided(
//  syclArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
//) {
//
//  static_assert(shared_size >= nt * vt,
//    "reg_to_shared_strided must have at least nt * vt storage");
//
//  sycl_strided_iterate<nt, vt>(
//    [&](auto i, auto j) { shared[j] = x[i]; }, tid
//  );
//
//  if(sync) __syncthreads();
//}
//
//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
//auto sycl_shared_to_reg_strided(
//  const T (&shared)[shared_size], unsigned tid, bool sync = true
//) {
//
//  static_assert(shared_size >= nt * vt,
//    "shared_to_reg_strided must have at least nt * vt storage");
//
//  syclArray<T, vt> x;
//  sycl_strided_iterate<nt, vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid);
//  if(sync) __syncthreads();
//
//  return x;
//}
//
//template<
//  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
//  unsigned shared_size
//>
//auto sycl_reg_to_mem_thread(
//  syclArray<T, vt> x, unsigned tid,
//  unsigned count, it_t mem, T (&shared)[shared_size]
//) {
//  sycl_reg_to_shared_thread<nt>(x, tid, shared);
//  auto y = sycl_shared_to_reg_strided<nt, vt>(shared, tid);
//  sycl_reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem);
//}
//
//template<
//  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
//  unsigned shared_size
//>
//auto sycl_mem_to_reg_thread(
//  it_t mem, unsigned tid, unsigned count, T (&shared)[shared_size]
//) {
//
//  auto x = sycl_mem_to_reg_strided<nt, vt, vt0>(mem, tid, count);
//  sycl_reg_to_shared_strided<nt, vt>(x, tid, shared);
//  auto y = sycl_shared_to_reg_thread<nt, vt>(shared, tid);
//  return y;
//}
//
//template<unsigned nt, unsigned vt, typename T, unsigned S>
//auto sycl_shared_gather(
//  const T(&data)[S], syclArray<unsigned, vt> indices, bool sync = true
//) {
//
//  static_assert(S >= nt * vt,
//    "shared_gather must have at least nt * vt storage");
//
//  syclArray<T, vt> x;
//  sycl_iterate<vt>([&](auto i) { x[i] = data[indices[i]]; });
//
//  if(sync) __syncthreads();
//
//  return x;
//}
//
//
//
//// ----------------------------------------------------------------------------
//// reg<->reg
---------------------------------------------------------------------------- // //template //auto sycl_reg_thread_to_strided( // syclArray x, unsigned tid, T (&shared)[S] //) { // sycl_reg_to_shared_thread(x, tid, shared); // return sycl_shared_to_reg_strided(shared, tid); //} // //template //auto sycl_reg_strided_to_thread( // syclArray x, unsigned tid, T (&shared)[S] //) { // sycl_reg_to_shared_strided(x, tid, shared); // return sycl_shared_to_reg_thread(shared, tid); //} // ---------------------------------------------------------------------------- // syclLoadStoreIterator // ---------------------------------------------------------------------------- template struct syclLoadStoreIterator : std::iterator_traits { L load; S store; I base; syclLoadStoreIterator(L load_, S store_, I base_) : load(load_), store(store_), base(base_) { } struct assign_t { L load; S store; I index; assign_t& operator=(T rhs) { static_assert(!std::is_same::value, "load_iterator is being stored to."); store(rhs, index); return *this; } operator T() const { static_assert(!std::is_same::value, "store_iterator is being loaded from."); return load(index); } }; assign_t operator[](I index) const { return assign_t { load, store, base + index }; } assign_t operator*() const { return assign_t { load, store, base }; } syclLoadStoreIterator operator+(I offset) const { syclLoadStoreIterator cp = *this; cp += offset; return cp; } syclLoadStoreIterator& operator+=(I offset) { base += offset; return *this; } syclLoadStoreIterator operator-(I offset) const { syclLoadStoreIterator cp = *this; cp -= offset; return cp; } syclLoadStoreIterator& operator-=(I offset) { base -= offset; return *this; } }; //template //struct trivial_load_functor { // template // T operator()(I index) const { // return T(); // } //}; //template //struct trivial_store_functor { // template // void operator()(T v, I index) const { } //}; template auto sycl_make_load_store_iterator(L load, S store, I base = 0) { return 
syclLoadStoreIterator(load, store, base); } template auto sycl_make_load_iterator(L load, I base = 0) { return sycl_make_load_store_iterator(load, syclEmpty(), base); } template auto sycl_make_store_iterator(S store, I base = 0) { return sycl_make_load_store_iterator(syclEmpty(), store, base); } // ---------------------------------------------------------------------------- // swap // ---------------------------------------------------------------------------- template void sycl_swap(T& a, T& b) { auto c = a; a = b; b = c; } // ---------------------------------------------------------------------------- // launch kernel // ---------------------------------------------------------------------------- //template //__global__ void sycl_kernel(F f, args_t... args) { // f(threadIdx.x, blockIdx.x, args...); //} // ---------------------------------------------------------------------------- // operators // ---------------------------------------------------------------------------- template struct sycl_plus : public std::binary_function { T operator()(T a, T b) const { return a + b; } }; template struct sycl_minus : public std::binary_function { T operator()(T a, T b) const { return a - b; } }; template struct sycl_multiplies : public std::binary_function { T operator()(T a, T b) const { return a * b; } }; template struct sycl_maximum : public std::binary_function { T operator()(T a, T b) const { return a > b ? a : b; } }; template struct sycl_minimum : public std::binary_function { T operator()(T a, T b) const { return a < b ? 
a : b; } }; template struct sycl_less : public std::binary_function { T operator()(T a, T b) const { return a < b; } }; template struct sycl_greater : public std::binary_function { T operator()(T a, T b) const { return a > b; } }; // ---------------------------------------------------------------------------- // Memory Object // ---------------------------------------------------------------------------- /** @private */ template class syclScopedDeviceMemory { public: syclScopedDeviceMemory() = delete; syclScopedDeviceMemory(size_t N, sycl::queue& queue) : _queue {queue}, _N {N} { if(N) { _data = sycl::malloc_device(N, _queue); } } syclScopedDeviceMemory(syclScopedDeviceMemory&& rhs) : _queue{std::move(rhs._queue)}, _data{rhs._data}, _N {rhs._N} { rhs._data = nullptr; rhs._N = 0; } ~syclScopedDeviceMemory() { if(_data) { sycl::free(_data, _queue); } } syclScopedDeviceMemory& operator = (syclScopedDeviceMemory&& rhs) { if(_data) { sycl::free(_data, _queue); } _queue = std::move(rhs._queue); _data = rhs._data; _N = rhs._N; rhs._data = nullptr; rhs._N = 0; return *this; } size_t size() const { return _N; } T* data() { return _data; } const T* data() const { return _data; } syclScopedDeviceMemory(const syclScopedDeviceMemory&) = delete; syclScopedDeviceMemory& operator = (const syclScopedDeviceMemory&) = delete; private: sycl::queue& _queue; T* _data {nullptr}; size_t _N {0}; }; } // end of namespace tf -----------------------------------------------------