517 lines
14 KiB
C++
517 lines
14 KiB
C++
#pragma once
|
|
|
|
#include "sycl_execution_policy.hpp"
|
|
|
|
namespace tf {
|
|
|
|
// default warp size
|
|
// Compile-time default warp size (32 work-items).
// NOTE(review): nothing in this part of the file reads the constant; it
// presumably mirrors the CUDA warp width. SYCL sub-group sizes are device
// dependent, so confirm before relying on this value.
inline constexpr unsigned SYCL_WARP_SIZE = 32;
|
|
|
|
// empty type
|
|
// Empty placeholder type. syclLoadStoreIterator's static_asserts compare its
// load/store functor types against syclEmpty, and sycl_make_load_iterator /
// sycl_make_store_iterator pass syclEmpty() for the functor they do not supply.
struct syclEmpty { };
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// iterator unrolling
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// Compile-time unrolled loop.
//
// syclIterate<i, count>::eval(f) calls f(i), f(i + 1), ..., f(count - 1) in
// that order by recursive template instantiation. The third parameter defaults
// to (i < count); once it becomes false the recursion bottoms out in the
// primary template below, which does nothing.

// Terminating case (valid == false): no call is made.
template<unsigned i, unsigned count, bool valid = (i < count)>
struct syclIterate {
  template<typename F>
  static void eval(F) { }
};

// Recursive case (valid == true): invoke f for index i, then continue with
// index i + 1. The functor is passed by value at every step.
template<unsigned i, unsigned count>
struct syclIterate<i, count, true> {
  template<typename F>
  static void eval(F f) {
    f(i);
    syclIterate<i + 1, count>::eval(f);
  }
};
|
|
|
|
template<unsigned begin, unsigned end, typename F>
|
|
void sycl_iterate(F f) {
|
|
syclIterate<begin, end>::eval(f);
|
|
}
|
|
|
|
// Convenience overload: calls f(k) for k = 0, 1, ..., count - 1 by forwarding
// to the [begin, end) overload with begin fixed at zero.
template<unsigned count, typename F>
void sycl_iterate(F f) {
  constexpr unsigned first = 0;
  sycl_iterate<first, count>(f);
}
|
|
|
|
template<unsigned count, typename T>
|
|
T reduce(const T(&x)[count]) {
|
|
T y;
|
|
sycl_iterate<count>([&](auto i) { y = i ? x[i] + y : x[i]; });
|
|
return y;
|
|
}
|
|
|
|
template<unsigned count, typename T>
|
|
void fill(T(&x)[count], T val) {
|
|
sycl_iterate<count>([&](auto i) { x[i] = val; });
|
|
}
|
|
|
|
// Invoke unconditionally.
|
|
template<unsigned nt, unsigned vt, typename F>
|
|
void sycl_strided_iterate(F f, unsigned tid) {
|
|
sycl_iterate<vt>([=](auto i) { f(i, nt * i + tid); });
|
|
}
|
|
|
|
// Check range.
|
|
template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename F>
|
|
void sycl_strided_iterate(F f, unsigned tid, unsigned count) {
|
|
// Unroll the first vt0 elements of each thread.
|
|
if(vt0 > 1 && count >= nt * vt0) {
|
|
sycl_strided_iterate<nt, vt0>(f, tid); // No checking
|
|
} else {
|
|
sycl_iterate<vt0>([=](auto i) {
|
|
auto j = nt * i + tid;
|
|
if(j < count) f(i, j);
|
|
});
|
|
}
|
|
|
|
// TODO: seems dummy when vt0 == vt
|
|
sycl_iterate<vt0, vt>([=](auto i) {
|
|
auto j = nt * i + tid;
|
|
if(j < count) f(i, j);
|
|
});
|
|
}
|
|
|
|
template<unsigned vt, typename F>
|
|
void sycl_thread_iterate(F f, unsigned tid) {
|
|
sycl_iterate<vt>([=](auto i) { f(i, vt * tid + i); });
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// syclRange
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// Half-open index interval [begin, end) stored as two unsigned values.
// Kept as a plain aggregate so it can be brace-initialized.
struct syclRange {
  unsigned begin;
  unsigned end;

  // Number of indices in the interval (unsigned arithmetic: wraps around if
  // end < begin).
  unsigned size() const { return end - begin; }

  // Same value as size().
  unsigned count() const { return end - begin; }

  // True only for a non-empty interval.
  bool valid() const { return begin < end; }
};
|
|
|
|
// Returns the index range of tile b when `count` items are split into tiles of
// nv items each: [nv * b, min(count, nv * (b + 1))). The upper bound is clamped
// to count, so the last tile may be shorter.
inline syclRange sycl_get_tile(unsigned b, unsigned nv, unsigned count) {
  const unsigned first = nv * b;
  const unsigned last  = std::min(count, nv * (b + 1));
  return syclRange { first, last };
}
|
|
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// syclArray
|
|
// ----------------------------------------------------------------------------
|
|
|
|
template<typename T, unsigned size>
|
|
struct syclArray {
|
|
T data[size];
|
|
|
|
T operator[](unsigned i) const { return data[i]; }
|
|
T& operator[](unsigned i) { return data[i]; }
|
|
|
|
syclArray() = default;
|
|
syclArray(const syclArray&) = default;
|
|
syclArray& operator=(const syclArray&) = default;
|
|
|
|
// Fill the array with x.
|
|
syclArray(T x) {
|
|
sycl_iterate<size>([&](unsigned i) { data[i] = x; });
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
struct syclArray<T, 0> {
|
|
T operator[](unsigned) const { return T(); }
|
|
T& operator[](unsigned) { return *(T*)nullptr; }
|
|
};
|
|
|
|
template<typename T, typename V, unsigned size>
|
|
struct syclKVArray {
|
|
syclArray<T, size> keys;
|
|
syclArray<V, size> vals;
|
|
};
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// thread reg <-> global mem
|
|
// ----------------------------------------------------------------------------
|
|
|
|
template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I>
|
|
auto sycl_mem_to_reg_strided(I mem, unsigned tid, unsigned count) {
|
|
using T = typename std::iterator_traits<I>::value_type;
|
|
syclArray<T, vt> x;
|
|
sycl_strided_iterate<nt, vt, vt0>(
|
|
[&](auto i, auto j) { x[i] = mem[j]; }, tid, count
|
|
);
|
|
return x;
|
|
}
|
|
|
|
template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t>
|
|
void sycl_reg_to_mem_strided(
|
|
syclArray<T, vt> x, unsigned tid, unsigned count, it_t mem) {
|
|
|
|
sycl_strided_iterate<nt, vt, vt0>(
|
|
[=](auto i, auto j) { mem[j] = x[i]; }, tid, count
|
|
);
|
|
}
|
|
|
|
template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I, typename O>
|
|
auto sycl_transform_mem_to_reg_strided(
|
|
I mem, unsigned tid, unsigned count, O op
|
|
) {
|
|
using T = std::invoke_result_t<O, typename std::iterator_traits<I>::value_type>;
|
|
syclArray<T, vt> x;
|
|
sycl_strided_iterate<nt, vt, vt0>(
|
|
[&](auto i, auto j) { x[i] = op(mem[j]); }, tid, count
|
|
);
|
|
return x;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// thread reg <-> shared
|
|
// ----------------------------------------------------------------------------
|
|
|
|
//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
|
|
//void sycl_reg_to_shared_thread(
|
|
// syclArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
|
|
//) {
|
|
//
|
|
// static_assert(shared_size >= nt * vt,
|
|
// "reg_to_shared_thread must have at least nt * vt storage");
|
|
//
|
|
// sycl_thread_iterate<vt>([&](auto i, auto j) { shared[j] = x[i]; }, tid);
|
|
//
|
|
// if(sync) __syncthreads();
|
|
//}
|
|
//
|
|
//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
|
|
//auto sycl_shared_to_reg_thread(
|
|
// const T (&shared)[shared_size], unsigned tid, bool sync = true
|
|
//) {
|
|
//
|
|
// static_assert(shared_size >= nt * vt,
|
|
// "reg_to_shared_thread must have at least nt * vt storage");
|
|
//
|
|
// syclArray<T, vt> x;
|
|
// sycl_thread_iterate<vt>([&](auto i, auto j) {
|
|
// x[i] = shared[j];
|
|
// }, tid);
|
|
//
|
|
// if(sync) __syncthreads();
|
|
//
|
|
// return x;
|
|
//}
|
|
//
|
|
//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
|
|
//void sycl_reg_to_shared_strided(
|
|
// syclArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
|
|
//) {
|
|
//
|
|
// static_assert(shared_size >= nt * vt,
|
|
// "reg_to_shared_strided must have at least nt * vt storage");
|
|
//
|
|
// sycl_strided_iterate<nt, vt>(
|
|
// [&](auto i, auto j) { shared[j] = x[i]; }, tid
|
|
// );
|
|
//
|
|
// if(sync) __syncthreads();
|
|
//}
|
|
//
|
|
//template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
|
|
//auto sycl_shared_to_reg_strided(
|
|
// const T (&shared)[shared_size], unsigned tid, bool sync = true
|
|
//) {
|
|
//
|
|
// static_assert(shared_size >= nt * vt,
|
|
// "shared_to_reg_strided must have at least nt * vt storage");
|
|
//
|
|
// syclArray<T, vt> x;
|
|
// sycl_strided_iterate<nt, vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid);
|
|
// if(sync) __syncthreads();
|
|
//
|
|
// return x;
|
|
//}
|
|
//
|
|
//template<
|
|
// unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
|
|
// unsigned shared_size
|
|
//>
|
|
//auto sycl_reg_to_mem_thread(
|
|
// syclArray<T, vt> x, unsigned tid,
|
|
// unsigned count, it_t mem, T (&shared)[shared_size]
|
|
//) {
|
|
// sycl_reg_to_shared_thread<nt>(x, tid, shared);
|
|
// auto y = sycl_shared_to_reg_strided<nt, vt>(shared, tid);
|
|
// sycl_reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem);
|
|
//}
|
|
//
|
|
//template<
|
|
// unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
|
|
// unsigned shared_size
|
|
//>
|
|
//auto sycl_mem_to_reg_thread(
|
|
// it_t mem, unsigned tid, unsigned count, T (&shared)[shared_size]
|
|
//) {
|
|
//
|
|
// auto x = sycl_mem_to_reg_strided<nt, vt, vt0>(mem, tid, count);
|
|
// sycl_reg_to_shared_strided<nt, vt>(x, tid, shared);
|
|
// auto y = sycl_shared_to_reg_thread<nt, vt>(shared, tid);
|
|
// return y;
|
|
//}
|
|
//
|
|
//template<unsigned nt, unsigned vt, typename T, unsigned S>
|
|
//auto sycl_shared_gather(
|
|
// const T(&data)[S], syclArray<unsigned, vt> indices, bool sync = true
|
|
//) {
|
|
//
|
|
// static_assert(S >= nt * vt,
|
|
// "shared_gather must have at least nt * vt storage");
|
|
//
|
|
// syclArray<T, vt> x;
|
|
// sycl_iterate<vt>([&](auto i) { x[i] = data[indices[i]]; });
|
|
//
|
|
// if(sync) __syncthreads();
|
|
//
|
|
// return x;
|
|
//}
|
|
//
|
|
//
|
|
//
|
|
//// ----------------------------------------------------------------------------
|
|
//// reg<->reg
|
|
//// ----------------------------------------------------------------------------
|
|
//
|
|
//template<unsigned nt, unsigned vt, typename T, unsigned S>
|
|
//auto sycl_reg_thread_to_strided(
|
|
// syclArray<T, vt> x, unsigned tid, T (&shared)[S]
|
|
//) {
|
|
// sycl_reg_to_shared_thread<nt>(x, tid, shared);
|
|
// return sycl_shared_to_reg_strided<nt, vt>(shared, tid);
|
|
//}
|
|
//
|
|
//template<unsigned nt, unsigned vt, typename T, unsigned S>
|
|
//auto sycl_reg_strided_to_thread(
|
|
// syclArray<T, vt> x, unsigned tid, T (&shared)[S]
|
|
//) {
|
|
// sycl_reg_to_shared_strided<nt>(x, tid, shared);
|
|
// return sycl_shared_to_reg_thread<nt, vt>(shared, tid);
|
|
//}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// syclLoadStoreIterator
|
|
// ----------------------------------------------------------------------------
|
|
|
|
template<typename L, typename S, typename T, typename I>
|
|
struct syclLoadStoreIterator : std::iterator_traits<const T*> {
|
|
|
|
L load;
|
|
S store;
|
|
I base;
|
|
|
|
syclLoadStoreIterator(L load_, S store_, I base_) :
|
|
load(load_), store(store_), base(base_) { }
|
|
|
|
struct assign_t {
|
|
L load;
|
|
S store;
|
|
I index;
|
|
|
|
assign_t& operator=(T rhs) {
|
|
static_assert(!std::is_same<S, syclEmpty>::value,
|
|
"load_iterator is being stored to.");
|
|
store(rhs, index);
|
|
return *this;
|
|
}
|
|
operator T() const {
|
|
static_assert(!std::is_same<L, syclEmpty>::value,
|
|
"store_iterator is being loaded from.");
|
|
return load(index);
|
|
}
|
|
};
|
|
|
|
assign_t operator[](I index) const {
|
|
return assign_t { load, store, base + index };
|
|
}
|
|
assign_t operator*() const {
|
|
return assign_t { load, store, base };
|
|
}
|
|
|
|
syclLoadStoreIterator operator+(I offset) const {
|
|
syclLoadStoreIterator cp = *this;
|
|
cp += offset;
|
|
return cp;
|
|
}
|
|
|
|
syclLoadStoreIterator& operator+=(I offset) {
|
|
base += offset;
|
|
return *this;
|
|
}
|
|
|
|
syclLoadStoreIterator operator-(I offset) const {
|
|
syclLoadStoreIterator cp = *this;
|
|
cp -= offset;
|
|
return cp;
|
|
}
|
|
|
|
syclLoadStoreIterator& operator-=(I offset) {
|
|
base -= offset;
|
|
return *this;
|
|
}
|
|
};
|
|
|
|
//template<typename T>
|
|
//struct trivial_load_functor {
|
|
// template<typename I>
|
|
// T operator()(I index) const {
|
|
// return T();
|
|
// }
|
|
//};
|
|
|
|
//template<typename T>
|
|
//struct trivial_store_functor {
|
|
// template<typename I>
|
|
// void operator()(T v, I index) const { }
|
|
//};
|
|
|
|
template <typename T, typename I = int, typename L, typename S>
|
|
auto sycl_make_load_store_iterator(L load, S store, I base = 0) {
|
|
return syclLoadStoreIterator<L, S, T, I>(load, store, base);
|
|
}
|
|
|
|
template <typename T, typename I = int, typename L>
|
|
auto sycl_make_load_iterator(L load, I base = 0) {
|
|
return sycl_make_load_store_iterator<T>(load, syclEmpty(), base);
|
|
}
|
|
|
|
template <typename T, typename I = int, typename S>
|
|
auto sycl_make_store_iterator(S store, I base = 0) {
|
|
return sycl_make_load_store_iterator<T>(syclEmpty(), store, base);
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// swap
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// Exchanges the values of a and b using one copy and two copy-assignments
// (no move semantics and no dependency on std::swap).
template<typename T>
void sycl_swap(T& a, T& b) {
  auto saved = a;   // keep the old value of a
  a = b;
  b = saved;
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// launch kernel
|
|
// ----------------------------------------------------------------------------
|
|
|
|
//template<typename F, typename... args_t>
|
|
//__global__ void sycl_kernel(F f, args_t... args) {
|
|
// f(threadIdx.x, blockIdx.x, args...);
|
|
//}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// operators
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// Binary function object computing a + b.
//
// std::binary_function was removed in C++17 (which this header already
// requires), so the member typedefs it used to provide are declared directly.
template <typename T>
struct sycl_plus {
  using first_argument_type  = T;
  using second_argument_type = T;
  using result_type          = T;

  // Returns the sum of the two operands.
  T operator()(T a, T b) const { return a + b; }
};
|
|
|
|
// Binary function object computing a - b.
//
// std::binary_function was removed in C++17 (which this header already
// requires), so the member typedefs it used to provide are declared directly.
template <typename T>
struct sycl_minus {
  using first_argument_type  = T;
  using second_argument_type = T;
  using result_type          = T;

  // Returns the first operand minus the second.
  T operator()(T a, T b) const { return a - b; }
};
|
|
|
|
// Binary function object computing a * b.
//
// std::binary_function was removed in C++17 (which this header already
// requires), so the member typedefs it used to provide are declared directly.
template <typename T>
struct sycl_multiplies {
  using first_argument_type  = T;
  using second_argument_type = T;
  using result_type          = T;

  // Returns the product of the two operands.
  T operator()(T a, T b) const { return a * b; }
};
|
|
|
|
// Binary function object returning the larger operand (b when the two compare
// equal, since a is chosen only if a > b).
//
// std::binary_function was removed in C++17 (which this header already
// requires), so the member typedefs it used to provide are declared directly.
template <typename T>
struct sycl_maximum {
  using first_argument_type  = T;
  using second_argument_type = T;
  using result_type          = T;

  T operator()(T a, T b) const { return a > b ? a : b; }
};
|
|
|
|
// Binary function object returning the smaller operand (b when the two compare
// equal, since a is chosen only if a < b).
//
// std::binary_function was removed in C++17 (which this header already
// requires), so the member typedefs it used to provide are declared directly.
template <typename T>
struct sycl_minimum {
  using first_argument_type  = T;
  using second_argument_type = T;
  using result_type          = T;

  T operator()(T a, T b) const { return a < b ? a : b; }
};
|
|
|
|
// Binary predicate: true when a < b.
//
// The result type is bool. The previous version converted the comparison
// result to T, which only compiled for types constructible from bool. bool
// still converts implicitly to arithmetic T, so existing arithmetic callers
// keep working.
//
// std::binary_function was removed in C++17 (which this header already
// requires), so the member typedefs it used to provide are declared directly.
template <typename T>
struct sycl_less {
  using first_argument_type  = T;
  using second_argument_type = T;
  using result_type          = bool;

  bool operator()(T a, T b) const { return a < b; }
};
|
|
|
|
// Binary predicate: true when a > b.
//
// The result type is bool. The previous version converted the comparison
// result to T, which only compiled for types constructible from bool. bool
// still converts implicitly to arithmetic T, so existing arithmetic callers
// keep working.
//
// std::binary_function was removed in C++17 (which this header already
// requires), so the member typedefs it used to provide are declared directly.
template <typename T>
struct sycl_greater {
  using first_argument_type  = T;
  using second_argument_type = T;
  using result_type          = bool;

  bool operator()(T a, T b) const { return a > b; }
};
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Memory Object
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/**
|
|
@private
|
|
*/
|
|
template <typename T>
|
|
class syclScopedDeviceMemory {
|
|
|
|
public:
|
|
|
|
syclScopedDeviceMemory() = delete;
|
|
|
|
syclScopedDeviceMemory(size_t N, sycl::queue& queue) :
|
|
_queue {queue},
|
|
_N {N} {
|
|
if(N) {
|
|
_data = sycl::malloc_device<T>(N, _queue);
|
|
}
|
|
}
|
|
|
|
syclScopedDeviceMemory(syclScopedDeviceMemory&& rhs) :
|
|
_queue{std::move(rhs._queue)}, _data{rhs._data}, _N {rhs._N} {
|
|
rhs._data = nullptr;
|
|
rhs._N = 0;
|
|
}
|
|
|
|
~syclScopedDeviceMemory() {
|
|
if(_data) {
|
|
sycl::free(_data, _queue);
|
|
}
|
|
}
|
|
|
|
syclScopedDeviceMemory& operator = (syclScopedDeviceMemory&& rhs) {
|
|
if(_data) {
|
|
sycl::free(_data, _queue);
|
|
}
|
|
_queue = std::move(rhs._queue);
|
|
_data = rhs._data;
|
|
_N = rhs._N;
|
|
rhs._data = nullptr;
|
|
rhs._N = 0;
|
|
return *this;
|
|
}
|
|
|
|
size_t size() const { return _N; }
|
|
|
|
T* data() { return _data; }
|
|
const T* data() const { return _data; }
|
|
|
|
syclScopedDeviceMemory(const syclScopedDeviceMemory&) = delete;
|
|
syclScopedDeviceMemory& operator = (const syclScopedDeviceMemory&) = delete;
|
|
|
|
private:
|
|
|
|
sycl::queue& _queue;
|
|
|
|
T* _data {nullptr};
|
|
size_t _N {0};
|
|
};
|
|
|
|
|
|
} // end of namespace tf -----------------------------------------------------
|
|
|
|
|
|
|