/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*!
* \file stencilReduceOCL.hpp
* \ingroup high_level_patterns
*
* \brief StencilReduceLoop data-parallel pattern and derived data-parallel patterns
*
*/
/* ***************************************************************************
*
* FastFlow is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License version 3 as
* published by the Free Software Foundation.
* Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
* or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
****************************************************************************
*/
/*
* Authors:
* Maurizio Drocco
* Massimo Torquati
* Marco Aldinucci
*
*/
#ifndef FF_STENCILREDUCE_OCL_HPP
#define FF_STENCILREDUCE_OCL_HPP
#ifdef FF_OPENCL
#include <string>
#include <fstream>
#include <tuple>
#include <algorithm>
#include <ff/bitflags.hpp>
#include <ff/oclnode.hpp>
#include <ff/node.hpp>
#include <ff/oclallocator.hpp>
#include <ff/stencilReduceOCL_macros.hpp>
namespace ff {
enum reduceMode { REDUCE_INPUT, REDUCE_OUTPUT };
/**
* a task to be executed by a 1D stencilReduceLoop node.
*/
template<typename TaskT_, typename Tin_, typename Tout_ = Tin_>
class baseOCLTask {
public:
typedef TaskT_ TaskT;
typedef Tin_ Tin;
typedef Tout_ Tout;
baseOCLTask(): inPtr(NULL),outPtr(NULL),reduceVar(NULL),
size_in(0),size_out(0),iter(0),
tuple_in(std::make_tuple(true,false,false)),
tuple_out(std::make_tuple(true,false,false)) { }
virtual ~baseOCLTask() { }
// user must override this method using:
// - setInPtr for setting the host-pointer to the input array
// - setOutPtr for setting the host-pointer to the output array
// - setEnvPtr for adding to env-list the host-pointer to a read-only env
// - other methods from classes derived from baseOCLTask
// NOTE: order of setEnvPtr calls matters! TODO refine interface?
virtual void setTask(TaskT *t) = 0;
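// A minimal sketch of a setTask override (illustrative only; 'myTask' and
// its fields are hypothetical):
//
//   struct oclTask: public baseOCLTask<myTask, float> {
//       void setTask(myTask *t) {
//           setInPtr(t->A, t->size);        // host input array
//           setOutPtr(t->B, t->size);       // host output array
//           setEnvPtr(t->coeff, t->ncoeff); // read-only environment
//       }
//   };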
/* --- the user may override these methods --- */
// called at the end of the stencil-reduce loop. It may be used to
// perform per-task host memory cleanup (i.e. releasing the host memory
// previously allocated in the setTask function) or to execute a
// post-elaboration phase
virtual void releaseTask(TaskT *t) {}
// computes the loop iteration condition
virtual bool iterCondition(const Tout&, size_t) { return false; }
// host reduce function
virtual Tout combinator(const Tout&, const Tout&) { return Tout(); }
// step functions
virtual void incIter() { ++iter; }
virtual size_t getIter() const { return iter; }
virtual void resetIter(const size_t val=0) { iter = val; }
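// A sketch of the loop-control overrides for an iterative stencilReduceLoop
// (illustrative; assumes a floating-point sum-reduction with a residual test):
//
//   Tout combinator(const Tout &a, const Tout &b) { return a + b; }
//   bool iterCondition(const Tout &red, size_t i) { return red > 1e-6 && i < 100; }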
/* -------------------------------------------- */
void resetTask() {
envPtr.resize(0);
copyEnv.resize(0);
}
/**
* set the host-pointer to the input array.
*
* @param _inPtr the host-pointer
* @param sizeIn the number of elements in the input array
* @param copy whether the input array must be copied to device memory
* @param reuse whether a device buffer previously bound to the same host pointer can be reused
* @param release whether the device buffer must be released at the end of the computation
*/
void setInPtr(Tin* _inPtr, size_t sizeIn,
const CopyFlags copy =CopyFlags::COPY,
const ReuseFlags reuse =ReuseFlags::DONTREUSE,
const ReleaseFlags release=ReleaseFlags::DONTRELEASE) {
inPtr = _inPtr; size_in = sizeIn;
tuple_in = std::make_tuple(copy==CopyFlags::COPY,
reuse==ReuseFlags::REUSE,
release==ReleaseFlags::RELEASE);
}
/**
* set the host-pointer to the input array.
*
* @see setInPtr()
*/
void setInPtr(Tin* _inPtr, size_t sizeIn, const MemoryFlags &flags) {
inPtr = _inPtr; size_in = sizeIn;
tuple_in = std::make_tuple(flags.copy==CopyFlags::COPY,
flags.reuse==ReuseFlags::REUSE,
flags.release==ReleaseFlags::RELEASE);
}
/**
* set the host-pointer to the output array.
*
* @see setInPtr()
*/
void setOutPtr(Tout* _outPtr, size_t sizeOut,
const CopyFlags copyback =CopyFlags::COPY,
const ReuseFlags reuse =ReuseFlags::DONTREUSE,
const ReleaseFlags release =ReleaseFlags::DONTRELEASE) {
outPtr = _outPtr; size_out = sizeOut;
tuple_out = std::make_tuple(copyback==CopyFlags::COPY,
reuse==ReuseFlags::REUSE,
release==ReleaseFlags::RELEASE);
}
/**
* set the host-pointer to the output array.
*
* @see setInPtr()
*/
void setOutPtr(Tout* _outPtr, size_t sizeOut, const MemoryFlags &flags ) {
outPtr = _outPtr; size_out = sizeOut;
tuple_out = std::make_tuple(flags.copy==CopyFlags::COPY,
flags.reuse==ReuseFlags::REUSE,
flags.release==ReleaseFlags::RELEASE);
}
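// A sketch of the MemoryFlags variant (illustrative; assumes the DONTCOPY
// enumerator from ff/bitflags.hpp): keep the device buffer alive across
// tasks and skip the host-to-device copy:
//
//   MemoryFlags mf;
//   mf.copy    = CopyFlags::DONTCOPY;
//   mf.reuse   = ReuseFlags::REUSE;
//   mf.release = ReleaseFlags::DONTRELEASE;
//   task.setInPtr(ptr, n, mf);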
/**
* add to env-list the host-pointer to a read-only env.
*
* @see setInPtr()
*/
template<typename ptrT>
void setEnvPtr(const ptrT* _envPtr, size_t size,
const CopyFlags copy =CopyFlags::COPY,
const ReuseFlags reuse =ReuseFlags::DONTREUSE,
const ReleaseFlags release=ReleaseFlags::DONTRELEASE) {
assert(envPtr.size() == copyEnv.size());
envPtr.push_back(std::make_pair((void*)_envPtr,size*sizeof(ptrT)));
copyEnv.push_back(std::make_tuple(sizeof(ptrT),
copy==CopyFlags::COPY,
reuse==ReuseFlags::REUSE,
release==ReleaseFlags::RELEASE));
}
/**
* add to env-list the host-pointer to a read-only env.
*
* @see setInPtr()
*/
template<typename ptrT>
void setEnvPtr(const ptrT* _envPtr, size_t size, const MemoryFlags &flags) {
assert(envPtr.size() == copyEnv.size());
envPtr.push_back(std::make_pair((void*)_envPtr,size*sizeof(ptrT)));
copyEnv.push_back(std::make_tuple(sizeof(ptrT),
flags.copy==CopyFlags::COPY,
flags.reuse==ReuseFlags::REUSE,
flags.release==ReleaseFlags::RELEASE));
}
Tin * getInPtr() const { return inPtr; }
Tout * getOutPtr() const { return outPtr; }
template<typename ptrT>
void getEnvPtr(const size_t idx, ptrT *& ptr) const {
assert(idx < envPtr.size());
ptr = reinterpret_cast<ptrT*>(envPtr[idx].first);
}
size_t getEnvNum() const {
assert(envPtr.size() == copyEnv.size());
return envPtr.size();
}
bool getCopyEnv(const size_t idx) const {
assert(idx < copyEnv.size());
return std::get<1>(copyEnv[idx]);
}
bool getReuseEnv(const size_t idx) const {
assert(idx < copyEnv.size());
return std::get<2>(copyEnv[idx]);
}
bool getReleaseEnv(const size_t idx) const {
assert(idx < copyEnv.size());
return std::get<3>(copyEnv[idx]);
}
bool getCopyIn() const { return std::get<0>(tuple_in); }
bool getReuseIn() const { return std::get<1>(tuple_in); }
bool getReleaseIn() const { return std::get<2>(tuple_in); }
bool getCopyOut() const { return std::get<0>(tuple_out); }
bool getReuseOut() const { return std::get<1>(tuple_out); }
bool getReleaseOut() const { return std::get<2>(tuple_out); }
size_t getSizeIn() const { return size_in; }
size_t getSizeOut() const { return (size_out==0)?size_in:size_out; }
size_t getSizeEnv(const size_t idx) const {
assert(idx < copyEnv.size());
return std::get<0>(copyEnv[idx]);
}
size_t getBytesizeIn() const { return getSizeIn() * sizeof(Tin); }
size_t getBytesizeOut() const { return getSizeOut() * sizeof(Tout); }
size_t getBytesizeEnv(const size_t idx) const {
assert(idx < envPtr.size());
return envPtr[idx].second;
}
void setReduceVar(const Tout *r) { reduceVar = (Tout*)r; }
Tout *getReduceVar() const { return reduceVar; }
void writeReduceVar(const Tout &r) { *reduceVar = r; }
void setIdentityVal(const Tout &x) { identityVal = x;}
Tout getIdentityVal() const { return identityVal; }
bool iterCondition_aux() {
return iterCondition(*reduceVar, iter);
}
protected:
Tin *inPtr;
Tout *outPtr;
Tout *reduceVar, identityVal;
size_t size_in, size_out, iter;
std::tuple<bool,bool,bool> tuple_in;
std::tuple<bool,bool,bool> tuple_out;
std::vector<std::pair<void*,size_t> > envPtr; // pointer and byte-size
std::vector<std::tuple<size_t,bool,bool,bool> > copyEnv; // size and flags
};
/**
* a virtual OpenCL accelerator for 1D kernels.
*/
template<typename T, typename TOCL = T>
class ff_oclAccelerator {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
ff_oclAccelerator(ff_oclallocator *alloc, const size_t width_, const Tout &identityVal, const bool from_source=false) :
from_source(from_source), my_own_allocator(false), allocator(alloc), halo_half(width_), identityVal(identityVal), events_h2d(16), deviceId(NULL) {
wgsize_map_static = wgsize_reduce_static = 0;
wgsize_map_max = wgsize_reduce_max = 0;
inputBuffer = outputBuffer = reduceBuffer = NULL;
sizeInput = sizeInput_padded = 0;
lenInput = offset1_in = halo_in_left = halo_in_right = lenInput_global = 0;
sizeOutput = sizeOutput_padded = 0;
lenOutput = offset1_out = halo_out_left = halo_out_right = lenOutput_global = 0;
nevents_h2d = nevents_map = 0;
event_d2h = event_map = event_reduce1 = event_reduce2 = NULL;
wgsize_map = nthreads_map = 0;
wgsize_reduce = nthreads_reduce = nwg_reduce = wg_red_mem = 0;
reduceVar = identityVal;
kernel_map = kernel_reduce = kernel_init = NULL;
context = NULL;
program = NULL;
cmd_queue = NULL;
reduce_mode = REDUCE_OUTPUT;
if (!allocator) {
my_own_allocator = true;
allocator = new ff_oclallocator;
assert(allocator);
}
}
virtual ~ff_oclAccelerator() {
if (my_own_allocator) {
allocator->releaseAllBuffers(context);
delete allocator;
allocator = NULL;
my_own_allocator = false;
}
}
int init(cl_device_id dId, reduceMode m, const std::string &kernel_code, const std::string &kernel_name1,
const std::string &kernel_name2, const bool save_binary, const bool reuse_binary) {
#ifdef FF_OPENCL_LOG
fprintf(stderr, "initializing virtual accelerator @%p mapped to device:\n", this);
std::cerr << ff::clEnvironment::instance()->getDeviceInfo(dId) << std::endl;
#endif
reduce_mode = m;
//set OCL objects
deviceId = dId;
const oclParameter *param = clEnvironment::instance()->getParameter(deviceId);
assert(param);
context = param->context;
cmd_queue = param->commandQueue;
//build OCL kernels
cl_int status = buildKernels(kernel_code, kernel_name1,kernel_name2, from_source, save_binary, reuse_binary);
checkResult(status, "build kernels");
//compute static heuristics for kernel sizing
setSizingHeuristics();
return status == CL_SUCCESS;
}
void releaseAll() { if (deviceId) { svc_releaseOclObjects(); deviceId = NULL; }}
void releaseInput(const Tin *inPtr) {
if (allocator->releaseBuffer(inPtr, context, inputBuffer) != CL_SUCCESS)
checkResult(CL_INVALID_MEM_OBJECT, "releaseInput");
inputBuffer = NULL;
}
void releaseOutput(const Tout *outPtr) {
if (allocator->releaseBuffer(outPtr, context, outputBuffer) != CL_SUCCESS)
checkResult(CL_INVALID_MEM_OBJECT, "releaseOutput");
outputBuffer = NULL;
}
void releaseEnv(size_t idx, const void *envPtr) {
if (allocator->releaseBuffer(envPtr, context, envBuffer[idx].first) != CL_SUCCESS)
checkResult(CL_INVALID_MEM_OBJECT, "releaseEnv");
envBuffer[idx].first = NULL, envBuffer[idx].second = 0;
}
void swapBuffers() {
cl_mem tmp = inputBuffer;
inputBuffer = outputBuffer;
outputBuffer = tmp;
}
void setSizingHeuristics() {
cl_int status;
//get device-dependent max wg size
size_t max_device_wgsize;
status = clGetDeviceInfo(deviceId,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(max_device_wgsize),&max_device_wgsize, NULL);
checkResult(status, "clGetDeviceInfo (map)");
if(kernel_map) { //map kernel
//get kernel-dependent max wg size
size_t max_kernel_wgsize;
status = clGetKernelWorkGroupInfo(kernel_map, deviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_kernel_wgsize, 0);
checkResult(status, "GetKernelWorkGroupInfo (map)");
wgsize_map_max = std::min<size_t>(max_device_wgsize,max_kernel_wgsize);
//get size of the atomic scheduling unit (analogous to CUDA warp size)
//typical values are 16 or 32
size_t wg_multiple;
status = clGetKernelWorkGroupInfo(kernel_map, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wg_multiple, 0);
wgsize_map_static = std::max<size_t>(64, wg_multiple * 4); //64 or 128
wgsize_map_static = std::min<size_t>(wgsize_map_static,wgsize_map_max);
}
if(kernel_reduce) { //reduce kernel
//get kernel-dependent max wg size
size_t max_kernel_wgsize;
status = clGetKernelWorkGroupInfo(kernel_reduce, deviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_kernel_wgsize, 0);
checkResult(status, "GetKernelWorkGroupInfo (reduce)");
wgsize_reduce_max = std::min<size_t>(max_device_wgsize,max_kernel_wgsize);
//get size of the atomic scheduling unit (analogous to CUDA warp size)
//typical values are 16 or 32
size_t wg_multiple;
status = clGetKernelWorkGroupInfo(kernel_reduce, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wg_multiple, 0);
wgsize_reduce_static = std::max<size_t>(64, wg_multiple * 4); //64 or 128
wgsize_reduce_static = std::min<size_t>(wgsize_reduce_static,wgsize_reduce_max);
}
#ifdef FF_OPENCL_LOG
std::cerr << "[virtual accelerator @"<<this<<"]\n";
std::cerr << "+ static heuristics for kernel sizing parameters:\n";
std::cerr << "- MAP workgroup-size = " <<wgsize_map_static<< "\n";
std::cerr << "- MAP max workgroup-size = " <<wgsize_map_max<< "\n";
std::cerr << "- RED workgroup-size = " <<wgsize_reduce_static<< " \n";
std::cerr << "- RED max workgroup-size = " <<wgsize_reduce_max<< " \n";
#endif
}
//see comments for members
void adjustInputBufferOffset(const Tin *newPtr, const Tin *oldPtr, std::pair<size_t, size_t> &P, size_t len_global) {
offset1_in = P.first;
lenInput = P.second;
lenInput_global = len_global;
halo_in_left = (std::min)(halo_half, offset1_in);
halo_in_right = (std::min)(halo_half, lenInput_global - lenInput - offset1_in);
sizeInput = lenInput * sizeof(Tin);
sizeInput_padded = sizeInput + (halo_in_left + halo_in_right) * sizeof(Tin);
if (oldPtr != NULL) allocator->updateKey(oldPtr, newPtr, context);
}
void relocateInputBuffer(const Tin *inPtr, const bool reuseIn, const Tout *reducePtr) {
cl_int status;
if (reuseIn) {
inputBuffer = allocator->createBufferUnique(inPtr, context,
CL_MEM_READ_WRITE, sizeInput_padded, &status);
checkResult(status, "CreateBuffer(Unique) input");
} else {
if (inputBuffer) allocator->releaseBuffer(inPtr, context, inputBuffer);
//allocate input-size + pre/post-windows
inputBuffer = allocator->createBuffer(inPtr, context,
CL_MEM_READ_WRITE, sizeInput_padded, &status);
checkResult(status, "CreateBuffer input");
}
//set workgroup size and nthreads for map
// MA patch - map not defined => workgroup_size_map
//size_t wgsize = wgsize_map_static==0?wgsize_reduce_static:wgsize_map_static;
if(kernel_map) {
// if the output buffer is bigger than input buffer then we use the size of the
// output buffer to compute the number of threads
if (lenInput < lenOutput) {
wgsize_map = std::min<size_t>(lenOutput, wgsize_map_static);
nthreads_map = wgsize_map * ((lenOutput + wgsize_map - 1) / wgsize_map); //round up
}
else {
wgsize_map = std::min<size_t>(lenInput, wgsize_map_static);
nthreads_map = wgsize_map * ((lenInput + wgsize_map - 1) / wgsize_map); //round up
}
#ifdef FF_OPENCL_LOG
std::cerr << "[virtual accelerator @"<<this<<"]\n";
std::cerr << "+ computed MAP kernel sizing parameters:\n";
std::cerr << "- MAP workgroup-size = " << wgsize_map << "\n";
std::cerr << "- MAP n. threads = " << nthreads_map << " \n";
#endif
}
//set workgroup size and nthreads for reduce
if (kernel_reduce && reduce_mode == REDUCE_INPUT) {
resetReduce(lenInput, sizeof(Tin), (void *)reducePtr);
}
}
void adjustOutputBufferOffset(const Tout *newPtr, const Tout *oldPtr, std::pair<size_t, size_t> &P, size_t len_global) {
offset1_out = P.first;
lenOutput = P.second;
lenOutput_global = len_global;
halo_out_left = (std::min)(halo_half, offset1_out);
halo_out_right = (std::min)(halo_half, lenOutput_global - lenOutput - offset1_out);
sizeOutput = lenOutput * sizeof(Tout);
sizeOutput_padded = sizeOutput + (halo_out_left + halo_out_right) * sizeof(Tout);
if (oldPtr != NULL) allocator->updateKey(oldPtr, newPtr, context);
}
void relocateOutputBuffer(const Tout *outPtr, const Tout *reducePtr) {
cl_int status;
if (outputBuffer) allocator->releaseBuffer(outPtr, context, outputBuffer);
outputBuffer = allocator->createBuffer(outPtr, context,
CL_MEM_READ_WRITE, sizeOutput_padded,&status);
checkResult(status, "CreateBuffer output");
//set workgroup size and nthreads for reduce
if (kernel_reduce && reduce_mode == REDUCE_OUTPUT) {
resetReduce(lenOutput, sizeof(Tout), (void *)reducePtr);
}
}
void relocateEnvBuffer(const void *envptr, const bool reuseEnv, const size_t idx, const size_t envbytesize) {
cl_int status = CL_SUCCESS;
if (idx >= envBuffer.size()) {
cl_mem envb;
if (reuseEnv)
envb = allocator->createBufferUnique(envptr, context,
CL_MEM_READ_WRITE, envbytesize, &status);
else
envb = allocator->createBuffer(envptr, context,
CL_MEM_READ_WRITE, envbytesize, &status);
if (checkResult(status, "CreateBuffer envBuffer"))
envBuffer.push_back(std::make_pair(envb,envbytesize));
} else {
if (reuseEnv) {
envBuffer[idx].first = allocator->createBufferUnique(envptr, context,
CL_MEM_READ_WRITE, envbytesize, &status);
if (checkResult(status, "CreateBuffer envBuffer"))
envBuffer[idx].second = envbytesize;
} else {
if (envBuffer[idx].second < envbytesize) {
if (envBuffer[idx].first) allocator->releaseBuffer(envptr, context, envBuffer[idx].first);
envBuffer[idx].first = allocator->createBuffer(envptr, context,
CL_MEM_READ_WRITE, envbytesize, &status);
if (checkResult(status, "CreateBuffer envBuffer"))
envBuffer[idx].second = envbytesize;
}
}
}
}
void setInPlace(Tout *reducePtr) {
outputBuffer = inputBuffer;
lenOutput = lenInput;
lenOutput_global = lenInput_global;
halo_out_left = halo_in_left;
halo_out_right = halo_in_right;
offset1_out = offset1_in;
sizeOutput = sizeInput;
sizeOutput_padded = sizeInput_padded;
//set workgroup size and nthreads for reduce
if (kernel_reduce && reduce_mode == REDUCE_OUTPUT) {
resetReduce(lenOutput, sizeof(Tout), reducePtr);
}
}
void swap() {
cl_mem tmp = inputBuffer;
inputBuffer = outputBuffer;
outputBuffer = tmp;
//set iteration-dynamic MAP kernel args
cl_int status = clSetKernelArg(kernel_map, 0, sizeof(cl_mem), &inputBuffer);
checkResult(status, "setKernelArg input");
status = clSetKernelArg(kernel_map, 1, sizeof(cl_mem), &outputBuffer);
checkResult(status, "setKernelArg output");
}
virtual void setMapKernelArgs(const size_t envSize) {
cl_uint idx = 0;
//set iteration-dynamic MAP kernel args (init)
cl_int status = clSetKernelArg(kernel_map, idx++, sizeof(cl_mem), &inputBuffer);
checkResult(status, "setKernelArg input");
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_mem), &outputBuffer);
checkResult(status, "setKernelArg output");
//set iteration-invariant MAP kernel args
//(the kernel expects cl_uint args, while the host-side variables are size_t)
cl_uint len_global = (cl_uint)lenInput_global;
cl_uint len_local  = (cl_uint)lenOutput;
cl_uint offset     = (cl_uint)offset1_in;
cl_uint halo       = (cl_uint)halo_out_left;
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &len_global);
checkResult(status, "setKernelArg global input length");
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &len_local);
checkResult(status, "setKernelArg local input length");
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &offset);
checkResult(status, "setKernelArg offset");
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &halo); // CHECK !!!
checkResult(status, "setKernelArg pad");
for(size_t k=0; k < envSize; ++k) {
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_mem), &envBuffer[k].first);
checkResult(status, "setKernelArg env");
}
}
size_t asyncH2Dinput(Tin *p) {
if (nevents_h2d >= events_h2d.size()) events_h2d.resize(nevents_h2d + 1); //resize (not reserve): operator[] below needs a valid element
p += offset1_in - halo_in_left;
cl_int status = clEnqueueWriteBuffer(cmd_queue, inputBuffer, CL_FALSE, 0,
sizeInput_padded, p, 0, NULL, &events_h2d[nevents_h2d++]);
checkResult(status, "copying Task to device input-buffer");
return sizeInput_padded;
}
size_t asyncH2Denv(const size_t idx, char *p) {
if (nevents_h2d >= events_h2d.size()) events_h2d.resize(nevents_h2d + 1); //resize (not reserve): operator[] below needs a valid element
cl_int status = clEnqueueWriteBuffer(cmd_queue, envBuffer[idx].first, CL_FALSE, 0,
envBuffer[idx].second, p, 0, NULL, &events_h2d[nevents_h2d++]);
checkResult(status, "copying Task to device env-buffer");
return envBuffer[idx].second;
}
size_t asyncH2Dborders(Tout *p) {
size_t bytes = 0;
if (halo_out_left) {
cl_int status = clEnqueueWriteBuffer(cmd_queue, outputBuffer, CL_FALSE, 0,
halo_out_left * sizeof(Tout), p + offset1_out - halo_out_left, 0, NULL,
&events_h2d[nevents_h2d++]);
checkResult(status, "copying left border to device");
bytes += halo_out_left * sizeof(Tout);
}
if (halo_out_right) {
cl_int status = clEnqueueWriteBuffer(cmd_queue, outputBuffer, CL_FALSE,
(halo_out_left + lenOutput) * sizeof(Tout), halo_out_right * sizeof(Tout), // NOTE: in a loop Tin == Tout !!
p + offset1_out + lenOutput, 0, NULL, &events_h2d[nevents_h2d++]);
checkResult(status, "copying right border to device");
bytes += halo_out_right * sizeof(Tout);
}
return bytes; //both borders must be synced: no early return after the left one
}
size_t asyncD2Houtput(Tout *p) {
cl_int status = clEnqueueReadBuffer(cmd_queue, outputBuffer, CL_FALSE,
halo_out_left * sizeof(Tout), sizeOutput, p + offset1_out, 0, NULL, &event_d2h);
checkResult(status, "copying output back from device");
return sizeOutput;
}
size_t asyncD2Hborders(Tout *p) {
cl_int status = clEnqueueReadBuffer(cmd_queue, outputBuffer, CL_FALSE,
halo_out_left * sizeof(Tout), halo_half * sizeof(Tout), p + offset1_out, 0, NULL,
&events_h2d[0]);
checkResult(status, "copying border1 back from device");
++nevents_h2d;
status = clEnqueueReadBuffer(cmd_queue, outputBuffer, CL_FALSE,
(halo_out_left + lenOutput - halo_half) * sizeof(Tout), halo_half * sizeof(Tout),
p + offset1_out + lenOutput - halo_half, 0, NULL, &event_d2h);
checkResult(status, "copying border2 back from device");
return halo_half * sizeof(Tout);
}
//void initReduce(const Tout &initReduceVal, reduceMode m = REDUCE_OUTPUT) {
void initReduce() {
//set kernel args for reduce1
int idx = 0;
cl_mem tmp = (reduce_mode == REDUCE_OUTPUT) ? outputBuffer : inputBuffer;
cl_uint len  = (reduce_mode == REDUCE_OUTPUT) ? lenOutput : lenInput;
cl_uint halo = (cl_uint)halo_in_left; //the kernel expects a cl_uint, halo_in_left is a size_t
cl_int status = clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &tmp);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint), &halo);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &reduceBuffer);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint), (void *) &len);
status |= clSetKernelArg(kernel_reduce, idx++, wg_red_mem, NULL);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(Tout), (void *) &identityVal);
checkResult(status, "setKernelArg reduce-1");
}
void asyncExecMapKernel() {
//execute MAP kernel
cl_int status = clEnqueueNDRangeKernel(cmd_queue, kernel_map, 1, NULL,
&nthreads_map, &wgsize_map, 0, NULL, &event_map);
checkResult(status, "executing map kernel");
//std::cerr << "Exec map WI " << globalThreadsMap << " localThreadsMap " << localThreadsMap << "\n";
++nevents_map;
}
void asyncExecReduceKernel1() {
//std::cerr << "Exec reduce1 WI " << globalThreadsReduce << " localThreadsReduce " << localThreadsReduce << "\n";
cl_int status = clEnqueueNDRangeKernel(cmd_queue, kernel_reduce, 1, NULL,
&nthreads_reduce, &wgsize_reduce, nevents_map,
(nevents_map==0)?NULL:&event_map,
&event_reduce1);
checkResult(status, "exec kernel reduce-1");
nevents_map = 0;
}
void asyncExecReduceKernel2() {
cl_uint zeropad = 0;
cl_uint nwg = (cl_uint)nwg_reduce; //the kernel expects a cl_uint, nwg_reduce is a size_t
int idx = 0;
cl_int status = clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &reduceBuffer);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint), &zeropad);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &reduceBuffer);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint), (void*) &nwg);
status |= clSetKernelArg(kernel_reduce, idx++, wg_red_mem, NULL);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(Tout), (void *) &identityVal);
checkResult(status, "setKernelArg reduce-2");
//std::cerr << "Exec reduce2 WI " << globalThreadsReduce << " localThreadsReduce " << localThreadsReduce << "\n";
status = clEnqueueNDRangeKernel(cmd_queue, kernel_reduce, 1, NULL,
&wgsize_reduce, &wgsize_reduce, 1, &event_reduce1,
&event_reduce2);
checkResult(status, "exec kernel reduce-2");
}
Tout getReduceVar() {
cl_int status = clEnqueueReadBuffer(cmd_queue, reduceBuffer, CL_TRUE, 0,
sizeof(Tout), &reduceVar, 0, NULL, NULL);
checkResult(status, "d2h reduceVar");
return reduceVar;
}
void waitforh2d() {
if (nevents_h2d>0) {
cl_int status = clWaitForEvents(nevents_h2d, events_h2d.data());
checkResult(status, "h2d wait for");
nevents_h2d = 0;
}
}
void waitford2h() {
cl_int status = clWaitForEvents(1, &event_d2h);
checkResult(status, "d2h wait for");
}
void waitforreduce() {
cl_int status = clWaitForEvents(1, &event_reduce2);
checkResult(status, "wait for reduce");
}
void waitformap() {
cl_int status = clWaitForEvents(nevents_map, &event_map);
nevents_map = 0;
checkResult(status, "wait for map");
}
/**
* dynamically sets half-size of the 1D halo
*/
void setHaloHalf(const size_t h) {
halo_half = h;
}
private:
int buildProgram(cl_device_id dId) {
cl_int status = clBuildProgram(program, 1, &dId, /*"-cl-fast-relaxed-math"*/NULL, NULL,NULL);
checkResult(status, "building program");
// DEBUGGING CODE for checking OCL compilation errors
if (status != CL_SUCCESS) {
printf("\nFail to build the program\n");
size_t len;
clGetProgramBuildInfo(program, dId, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
printf("LOG len %ld\n", len);
char *buffer = (char*) calloc(len, sizeof(char));
assert(buffer);
clGetProgramBuildInfo(program, dId, CL_PROGRAM_BUILD_LOG, len * sizeof(char),
buffer, NULL);
printf("LOG: %s\n\n", buffer);
return -1;
}
return 0;
}
int buildKernelCode(const std::string &kc, cl_device_id dId) {
cl_int status;
size_t sourceSize = kc.length();
const char* code = kc.c_str();
#ifdef FF_OPENCL_LOG
printf("/* ------------------------------------------------------- */\n");
printf("buildKernelCode:\n%s\n", code);
printf("/* ------------------------------------------------------- */\n");
#endif
program = clCreateProgramWithSource(context, 1, &code, &sourceSize, &status);
if (!program) {
checkResult(status, "creating program with source");
return -1;
}
return buildProgram(dId);
}
// create the program with the binary file or from the source code
int createProgram(const std::string &filepath, cl_device_id dId, const bool save_binary, const bool reuse_binary) {
cl_int status, binaryStatus;
bool binary = false;
const std::string binpath = filepath + ".bin";
std::ifstream ifs;
if (reuse_binary) {
ifs.open(binpath, std::ios::binary );
if (!ifs.is_open()) { // try with filepath
ifs.open(filepath, std::ios::binary);
if (!ifs.is_open()) {
error("createProgram: cannot open %s (nor %s)\n", filepath.c_str(), binpath.c_str());
return -1;
}
} else binary = true;
} else {
ifs.open(filepath, std::ios::binary);
if (!ifs.is_open()) {
error("createProgram: cannot open source file %s\n", filepath.c_str());
return -1;
}
}
std::vector<char> buf((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
ifs.close();
size_t bufsize = buf.size();
const char *bufptr = buf.data();
status = CL_INVALID_BINARY;
if (binary) {
program = clCreateProgramWithBinary(context, 1, &dId, &bufsize,
reinterpret_cast<const unsigned char **>(&bufptr),
&binaryStatus, &status);
}
if (status != CL_SUCCESS) { // maybe it is not a binary file
program = clCreateProgramWithSource(context, 1,&bufptr, &bufsize, &status);
if (!program) {
checkResult(status, "creating program with source");
return -1;
}
if (buildProgram(dId)<0) return -1;
if (save_binary) { // TODO: the logical deviceId has to be attached to the file name !
size_t programBinarySize;
status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
sizeof(size_t) * 1,
&programBinarySize, NULL);
checkResult(status, "createProgram clGetProgramInfo (binary size)");
std::vector<char> binbuf(programBinarySize);
const char *binbufptr = binbuf.data();
status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char*) * 1,
&binbufptr, NULL);
checkResult(status, "createProgram clGetProgramInfo (binary data)");
std::ofstream ofs(binpath, std::ios::out | std::ios::binary);
ofs.write(binbuf.data(), binbuf.size());
ofs.close();
}
return 0;
}
return buildProgram(dId);
}
cl_int buildKernels(const std::string &kernel_code,
const std::string &kernel_name1, const std::string &kernel_name2,
const bool from_source, const bool save_binary, const bool reuse_binary) {
cl_int status_ = CL_SUCCESS;
if (!from_source) { //compile kernel on device
if (buildKernelCode(kernel_code, deviceId)<0) return -1;
} else { // kernel_code is the path to the (binary?) source
if (createProgram(kernel_code, deviceId, save_binary, reuse_binary)<0) return -1;
}
//create kernel objects
if (kernel_name1 != "") { //map kernel
cl_int status;
kernel_map = clCreateKernel(program, kernel_name1.c_str(), &status);
checkResult(status, "CreateKernel (map)");
status_ |= status;
}
if (kernel_name2 != "") { //reduce kernel
cl_int status;
kernel_reduce = clCreateKernel(program, kernel_name2.c_str(), &status);
checkResult(status, "CreateKernel (reduce)");
status_ |= status;
}
return status_;
}
void svc_releaseOclObjects() {
if (kernel_map) clReleaseKernel(kernel_map);
if (kernel_reduce) clReleaseKernel(kernel_reduce);
clReleaseProgram(program);
// if (inputBuffer) clReleaseMemObject(inputBuffer);
// for(size_t i=0; i < envBuffer.size(); ++i)
// clReleaseMemObject(envBuffer[i].first);
// if (outputBuffer && outputBuffer != inputBuffer)
// clReleaseMemObject(outputBuffer);
// if (reduceBuffer)
// clReleaseMemObject(reduceBuffer);
allocator->releaseAllBuffers(context);
}
void resetReduce(size_t lenReduceInput, size_t elem_size, const void *reducePtr) {
// 64 and 256 are the max number of blocks and threads we want to use
//getBlocksAndThreads(lenInput, 64, 256, nwg_reduce, wgsize_reduce);
//nthreads_reduce = nwg_reduce * wgsize_reduce;
nthreads_reduce = lenReduceInput;
if(!isPowerOf2(nthreads_reduce))
nthreads_reduce = nextPowerOf2(nthreads_reduce);
wgsize_reduce = std::min<size_t>(nthreads_reduce, wgsize_reduce_static);
nwg_reduce = nthreads_reduce / wgsize_reduce;
//compute size of per-workgroup working memory
wg_red_mem = (wgsize_reduce * elem_size)
+ (wgsize_reduce <= 32) * (wgsize_reduce * elem_size);
//compute size of global reduce working memory
size_t global_red_mem = nwg_reduce * elem_size;
//allocate global memory for storing intermediate per-workgroup reduce results
cl_int status;
if (reduceBuffer)
allocator->releaseBuffer(reducePtr, context, reduceBuffer);
reduceBuffer = allocator->createBuffer(reducePtr, context,
CL_MEM_READ_WRITE, global_red_mem, &status);
checkResult(status, "CreateBuffer reduce");
#ifdef FF_OPENCL_LOG
std::cerr << "[virtual accelerator @"<<this<<"]\n";
std::cerr << "+ computed REDUCE kernel sizing parameters:\n";
std::cerr << "- REDUCE workgroup-size = " <<wgsize_reduce<< " \n";
std::cerr << "- REDUCE n. threads = " <<nthreads_reduce<< " \n";
std::cerr << "- REDUCE n. workgroups = " <<nwg_reduce<< " \n";
std::cerr << "- REDUCE per-wg memory = " <<wg_red_mem<< " \n";
#endif
}
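// Worked example (illustrative): lenReduceInput = 1000, elem_size = 4,
// wgsize_reduce_static = 128:
//   nthreads_reduce = nextPowerOf2(1000) = 1024
//   wgsize_reduce   = min(1024, 128)     = 128
//   nwg_reduce      = 1024 / 128         = 8
//   wg_red_mem      = 128 * 4            = 512 bytes (no doubling since wgsize > 32)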
/*!
* Computes the number of threads and blocks to use for the reduction kernel.
*/
inline void getBlocksAndThreads(const size_t size, const size_t maxBlocks,
const size_t maxThreads, size_t & blocks, size_t &threads) {
const size_t half = (size + 1) / 2;
threads =
(size < maxThreads * 2) ?
(isPowerOf2(half) ? nextPowerOf2(half + 1) : nextPowerOf2(half)) :
maxThreads;
blocks = (size + (threads * 2 - 1)) / (threads * 2);
blocks = std::min(maxBlocks, blocks);
}
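// Worked example (illustrative): size = 1000, maxBlocks = 64, maxThreads = 256:
//   size >= 2*maxThreads, so threads = 256
//   blocks = min(64, (1000 + 511) / 512) = 2
// i.e. each thread combines two elements in the first reduction pass.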
protected:
cl_context context;
cl_program program;
cl_command_queue cmd_queue;
cl_mem reduceBuffer;
cl_kernel kernel_map, kernel_reduce, kernel_init;
protected:
const bool from_source;
bool my_own_allocator;
ff_oclallocator *allocator;
size_t halo_half; //half-size of the 1D halo
const Tout identityVal;
Tout reduceVar;
cl_mem inputBuffer, outputBuffer;
std::vector<std::pair<cl_mem, size_t> > envBuffer;
/*
* each accelerator works on the following subset of the input array:
* [left-border][input-portion][right-border]
* the input portion is accessed RW, the borders are accessed read-only
*/
size_t sizeInput; //byte-size of the input-portion
size_t sizeInput_padded; //byte-size of the input-portion plus left and right borders
size_t lenInput; //n. elements in the input-portion
size_t offset1_in; //left-offset (begin input-portion wrt to begin input)
size_t halo_in_left; //n. elements in the left-halo
size_t halo_in_right; //n. elements in the right-halo
size_t lenInput_global; //n. elements in the input
size_t sizeOutput, sizeOutput_padded;
size_t lenOutput, offset1_out, halo_out_left, halo_out_right, lenOutput_global;
//static input-independent estimation of workgroup sizing
size_t wgsize_map_static, wgsize_reduce_static;
//static input-independent upper bounds for workgroup sizing
size_t wgsize_map_max, wgsize_reduce_max;
//input-dependent workgroup sizing
size_t wgsize_map, wgsize_reduce;
//input-dependent number of threads
size_t nthreads_map, nthreads_reduce;
//number of workgroups executing first on-device reduce
size_t nwg_reduce;
//reduce workgroup-local memory
size_t wg_red_mem;
//OCL events
std::vector<cl_event> events_h2d;
size_t nevents_h2d, nevents_map;
cl_event event_d2h, event_map, event_reduce1, event_reduce2;
//switch for the input of the reduce
reduceMode reduce_mode;
//the OCL Id the accelerator is mapped to
cl_device_id deviceId;
};
/*!
* \class ff_stencilReduceLoopOCL_1D
* \ingroup high_level_patterns
*
* \brief The OpenCL-based StencilReduceLoop pattern in 1 dimension
*
* This class is defined in \ref stencilReduceOCL.hpp
*/
template<typename T, typename TOCL = T, typename accelerator_t = ff_oclAccelerator<T, TOCL> >
class ff_stencilReduceLoopOCL_1D: public ff_oclNode_t<T> {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
// build the program from the mapf and reducef functions
ff_stencilReduceLoopOCL_1D(const std::string &mapf, //OpenCL elemental function
const std::string &reducef = std::string(""), //OpenCL combinator function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1, const int width = 1) :
oneshot(false), saveBinary(false), reuseBinary(false),
accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS),
stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL),
oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) {
setcode(mapf, reducef);
for(size_t i = 0; i< NACCELERATORS; ++i)
accelerators[i]= new accelerator_t(allocator, width,identityVal);
#ifdef FF_OPENCL_LOG
fprintf(stderr,"[ff_stencilReduceLoopOCL_1D node @%p]\n",this);
fprintf(stderr,"map-kernel code:\n%s\n", mapf.c_str());
fprintf(stderr,"reduce-kernel code:\n%s\n", reducef.c_str());
#endif
}
// build the program from a source code file:
// first it attempts to load a cached binary file (kernels_source in this case is the path to the binary file);
// if that file is not available, it creates the program from source and stores the binary
// for future use with the extension ".bin"
ff_stencilReduceLoopOCL_1D(const std::string &kernels_source, // OpenCL source code path
const std::string &mapf_name, // name of the map function
const std::string &reducef_name, // name of the reduce function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1, const int width = 1) :
oneshot(false), saveBinary(false), reuseBinary(false),
accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS),
stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL),
oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) {
setsourcecode(kernels_source, mapf_name, reducef_name);
for(size_t i = 0; i< NACCELERATORS; ++i)
accelerators[i]= new accelerator_t(allocator, width, identityVal, true);
}
// the task is provided in the constructor -- one shot computation
ff_stencilReduceLoopOCL_1D(const T &task,
const std::string &mapf,
const std::string &reducef = std::string(""),
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1, const int width = 1) :
oneshot(true), saveBinary(false), reuseBinary(false),
accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS),
stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL),
oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) {
ff_node::skipfirstpop(true);
setcode(mapf, reducef);
setTask(const_cast<T&>(task));
for(size_t i = 0; i< NACCELERATORS; ++i)
accelerators[i]= new accelerator_t(allocator, width,identityVal);
#ifdef FF_OPENCL_LOG
fprintf(stderr,"[ff_stencilReduceLoopOCL_1D node @%p]\n",this);
fprintf(stderr,"map-kernel code:\n%s\n", mapf.c_str());
fprintf(stderr,"reduce-kernel code:\n%s\n", reducef.c_str());
#endif
}
// the task is provided in the constructor -- one shot computation
ff_stencilReduceLoopOCL_1D(const T &task,
const std::string &kernels_source, // OpenCL source code path
const std::string &mapf_name, // name of the map kernel function
const std::string &reducef_name, // name of the reduce kernel function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1, const int width = 1) :
oneshot(true), saveBinary(false), reuseBinary(false),
accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS),
stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL),
oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) {
setsourcecode(kernels_source, mapf_name, reducef_name);
setTask(const_cast<T&>(task));
for(size_t i = 0; i< NACCELERATORS; ++i)
accelerators[i]= new accelerator_t(allocator, width, identityVal, true);
}
virtual ~ff_stencilReduceLoopOCL_1D() {
for(size_t i = 0; i< accelerators.size(); ++i)
if (accelerators[i]) delete accelerators[i];
}
// used to set the task when in oneshot mode
void setTask(T &task) {
Task.resetTask();
Task.setTask(&task);
}
/**
* explicitly set the OpenCL devices to be used
*
* @param dev is the vector of devices (OpenCL Ids) to be used
*/
void setDevices(std::vector<cl_device_id> &dev) {
// if (dev.size() > accelerators.size()) {
// error("ff_stencilReduceLoopOCL_1D::setDevices: Too many devices provided, please increase the number of logical accelerators\n");
// return -1;
// }
devices = dev;
// return 0;
}
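// A sketch of explicit device selection (illustrative; assumes at least two
// logical devices and as many accelerators; 'node' is a hypothetical instance):
//
//   std::vector<cl_device_id> devs;
//   devs.push_back(clEnvironment::instance()->getDevice(0));
//   devs.push_back(clEnvironment::instance()->getDevice(1));
//   node.setDevices(devs);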
// force execution on the CPU
void pickCPU () {
ff_oclNode_t<T>::setDeviceType(CL_DEVICE_TYPE_CPU);
}
// force execution on the GPU - as many as requested by the co-allocation strategy
void pickGPU (size_t offset=0 /* referred to global list of devices */) {
offset_dev=offset; //TODO check numbering
ff_oclNode_t<T>::setDeviceType(CL_DEVICE_TYPE_GPU);
}
// after the compilation and building phases, the OpenCL program will be saved as a binary file;
// this action takes effect only if the program is compiled from a source file (i.e. not built from macros)
void saveBinaryFile() { saveBinary = true; }
// tells the run-time to re-use the binary file available
void reuseBinaryFile() { reuseBinary = true; }
virtual int run(bool = false) {
return ff_node::run();
}
virtual int wait() {
return ff_node::wait();
}
virtual int run_and_wait_end() {
if (run() < 0)
return -1;
if (wait() < 0)
return -1;
return 0;
}
virtual int run_then_freeze() {
if (ff_node::isfrozen()) {
ff_node::thaw(true);
return 0;
}
return ff_node::freeze_and_run();
}
virtual int wait_freezing() {
return ff_node::wait_freezing();
}
const T* getTask() const {
return &Task;
}
unsigned int getIter() {
return Task.getIter();
}
Tout *getReduceVar() {
assert(oneshot);
return Task.getReduceVar();
}
/**
* Performs a static allocation of OpenCL devices.
* Priority is given to GPU devices, falling back to CPU devices
* if needed (e.g. no GPU devices are available).
* Currently it does not mix GPU and CPU devices.
*/
int nodeInit() {
if (ff_oclNode_t<T>::oclId < 0) { //check if already initialized
ff_oclNode_t<T>::oclId = clEnvironment::instance()->getOCLID();
if (devices.size() == 0) { // the user didn't set any specific device
switch (ff_oclNode_t<T>::getDeviceType()) {
case CL_DEVICE_TYPE_ALL:
case CL_DEVICE_TYPE_GPU: {
// Retrieve multiple logical GPU devices (non-exclusive mode)
std::vector<ssize_t> logdev =
clEnvironment::instance()->coAllocateGPUDeviceRR(accelerators.size(),
(offset_dev == 0) ? -1 : offset_dev);
if (logdev.size() == 0) {
//could not fulfill allocation request
if(ff_oclNode_t<T>::getDeviceType() == CL_DEVICE_TYPE_GPU) {
error("not enough GPUs found !\n");
return -1;
}
//if the user did not explicitly require GPU devices, fall back to CPU
} else {
//convert retrieved logical devices into opencl Ids
devices.clear();
for (size_t i = 0; i < logdev.size(); ++i)
devices.push_back(clEnvironment::instance()->getDevice(logdev[i]));
for (size_t i = 0; i < devices.size(); ++i)
if (accelerators[i]->init(devices[i], getReduceMode(), kernel_code,
kernel_name1, kernel_name2, saveBinary, reuseBinary) < 0)
return -1;
break;
}
break;
}
case CL_DEVICE_TYPE_CPU: {
if (accelerators.size() > 1) {
error(
"Multiple (>1) virtual accelerators on CPU are requested. Not yet implemented.\n");
return -1;
} else {
//Retrieve the CPU device
devices.clear();
devices.push_back(clEnvironment::instance()->getDevice( //convert to OpenCL Id
clEnvironment::instance()->getCPUDevice())); //retrieve logical device
if (accelerators[0]->init(devices[0], getReduceMode(), kernel_code,
kernel_name1, kernel_name2, saveBinary, reuseBinary) < 0)
return -1;
}
}
break;
default: {
error(
"stencilReduceOCL::Other device. Not yet implemented.\n");
return -1;
}
} //end switch on ff_oclNode_t<T>::getDeviceType()
} else {
//user requested specific OpenCL devices
if (devices.size() > accelerators.size()) {
error(
"stencilReduceOCL::nodeInit: Too many devices requested, increase the number of accelerators!\n");
return -1;
}
// NOTE: the number of devices requested can be lower than the number of accelerators.
// TODO must be managed
for (size_t i = 0; i < devices.size(); ++i)
accelerators[i]->init(devices[i], getReduceMode(), kernel_code, kernel_name1,
kernel_name2, saveBinary, reuseBinary);
}
}
// for (size_t i = 0; i < devices.size(); ++i)
// std::cerr << "Using " << clEnvironment::instance()->getDeviceInfo(devices[i]) << std::endl;
return 0;
}
void nodeEnd() {}
#if defined(FF_REPARA)
/**
* Returns input data size
*/
size_t rpr_get_sizeIn() const { return ff_node::rpr_sizeIn; }
/**
* Returns output data size
*/
size_t rpr_get_sizeOut() const { return ff_node::rpr_sizeOut; }
#endif
protected:
virtual bool isPureMap() const { return false; }
virtual bool isPureReduce() const { return false; }
reduceMode getReduceMode() {
return isPureReduce() ? REDUCE_INPUT : REDUCE_OUTPUT;
}
virtual int svc_init() { return nodeInit(); }
#if 0
virtual int svc_init() {
if (ff_oclNode_t<T>::oclId < 0) {
ff_oclNode_t<T>::oclId = clEnvironment::instance()->getOCLID();
switch(ff_oclNode_t<T>::getDeviceType()) {
case CL_DEVICE_TYPE_ALL:
fprintf(stderr,"STATUS: requested ALL\n");
case CL_DEVICE_TYPE_GPU: {// One or more GPUs
// Not exclusive
// Retrive logical devices
std::vector<ssize_t> logdev = clEnvironment::instance()->coAllocateGPUDeviceRR(accelerators.size());
// Convert into opencl Ids
devices.clear();
for (size_t i = 0; i < logdev.size(); ++i)
devices.push_back(clEnvironment::instance()->getDevice(logdev[i]));
if (devices.size() == 0) {
error("stencilReduceOCL::svc_init:not enough GPUs found !\n");
return -1;
} else {
// Ok
for (size_t i = 0; i < devices.size(); ++i)
accelerators[i]->init(devices[i], kernel_code, kernel_name1,kernel_name2);
break;
}
}
case CL_DEVICE_TYPE_CPU: {
if (accelerators.size()>1) {
error ("Multiple (>1) virtual accelerators on CPU are requested. Not yet implemented.\n");
return -1;
} else {
// Ok
devices.clear();
devices.push_back(clEnvironment::instance()->getDevice(clEnvironment::instance()->getCPUDevice()));
accelerators[0]->init(devices[0], kernel_code, kernel_name1,kernel_name2);
}
} break;
default: {
error("stencilReduceOCL::Other device. Not yet implemented.\n");
} break;
}
}
return 0;
}
#endif
virtual void svc_end() {
if (!ff::ff_node::isfrozen()) nodeEnd();
}
T *svc(T *task) {
if (task) setTask(*task);
Tin *inPtr = Task.getInPtr();
Tout *outPtr = Task.getOutPtr();
Tout *reducePtr = Task.getReduceVar();
const size_t envSize = Task.getEnvNum(); //n. added environments
#if defined(FF_REPARA)
ff_node::rpr_sizeIn = ff_node::rpr_sizeOut = 0;
#endif
// if the computation is not in-place then we start from the output
if ((void*)inPtr != (void*)outPtr) {
// adjust allocator output-portions and relocate output device memory if needed
if (oldSizeOut != Task.getBytesizeOut()) {
compute_accmem(Task.getSizeOut(), acc_out);
const bool memorychange = (oldSizeOut < Task.getBytesizeOut());
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->adjustOutputBufferOffset(outPtr, (memorychange?old_outPtr:NULL), acc_out[i], Task.getSizeOut());
}
if (memorychange) {
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->relocateOutputBuffer(outPtr, reducePtr);
}
oldSizeOut = Task.getBytesizeOut();
old_outPtr = outPtr;
}
}
}
// adjust allocator input-portions and relocate input device memory if needed
if (oldBytesizeIn != Task.getBytesizeIn()) {
compute_accmem(Task.getSizeIn(), acc_in);
const bool memorychange = (oldBytesizeIn < Task.getBytesizeIn());
adjustInputBufferOffset(memorychange);
if (memorychange) {
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->relocateInputBuffer(inPtr, Task.getReuseIn(), reducePtr);
}
oldBytesizeIn = Task.getBytesizeIn();
old_inPtr = inPtr;
}
}
// in-place computation
// adjust allocator output-portions and relocate output device memory if needed
if (((void*)inPtr == (void*)outPtr) && ( oldSizeOut != Task.getBytesizeOut())) {
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->setInPlace(reducePtr);
}
}
//relocate env device memory
//TODO on-demand relocate, as for input/output memory
/* NOTE: env buffers are replicated on all devices.
* It would be nice to have replicated/partitioned policies
*/
for (size_t i = 0; i < accelerators.size(); ++i)
for(size_t k=0; k < envSize; ++k) {
char *envptr;
Task.getEnvPtr(k, envptr);
accelerators[i]->relocateEnvBuffer(envptr, Task.getReuseEnv(k), k, Task.getBytesizeEnv(k));
}
if (!isPureReduce()) //set kernel args
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->setMapKernelArgs(envSize);
//(async) copy input and environments (h2d)
for (size_t i = 0; i < accelerators.size(); ++i) {
if (Task.getCopyIn()) {
#if defined(FF_REPARA)
ff_node::rpr_sizeIn += accelerators[i]->asyncH2Dinput(Task.getInPtr()); //in
#else
accelerators[i]->asyncH2Dinput(Task.getInPtr()); //in
#endif
}
for(size_t k=0; k < envSize; ++k) {
if (Task.getCopyEnv(k)) {
char *envptr;
Task.getEnvPtr(k, envptr);
#if defined(FF_REPARA)
ff_node::rpr_sizeIn += accelerators[i]->asyncH2Denv(k, envptr);
#else
accelerators[i]->asyncH2Denv(k, envptr);
#endif
}
}
}
if (isPureReduce()) {
//init reduce
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->initReduce();
//wait for cross-accelerator h2d
waitforh2d();
//(async) device-reduce1
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecReduceKernel1();
//(async) device-reduce2
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecReduceKernel2();
waitforreduce(); //wait for cross-accelerator reduce
//host-reduce
Tout redVar = accelerators[0]->getReduceVar();
for (size_t i = 1; i < accelerators.size(); ++i)
redVar = Task.combinator(redVar, accelerators[i]->getReduceVar());
Task.writeReduceVar(redVar);
} else {
Task.resetIter();
if (isPureMap()) {
//wait for cross-accelerator h2d
waitforh2d();
//(async) exec kernel
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecMapKernel();
Task.incIter();
waitformap(); //join
} else { //iterative Map-Reduce (aka stencilReduceLoop)
//invalidate first swap
for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->swap();
bool go = true;
do {
//Task.before();
for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->swap();
//wait for cross-accelerator h2d
waitforh2d();
//(async) execute MAP kernel
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecMapKernel();
Task.incIter();
//start async-interleaved: reduce + borders sync
//init reduce
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->initReduce();
//(async) device-reduce1
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecReduceKernel1();
//(async) device-reduce2
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecReduceKernel2();
//wait for cross-accelerators reduce
waitforreduce();
//host-reduce
Tout redVar = accelerators[0]->getReduceVar();
for (size_t i = 1; i < accelerators.size(); ++i)
redVar = Task.combinator(redVar, accelerators[i]->getReduceVar());
Task.writeReduceVar(redVar);
go = Task.iterCondition_aux();
if (go) {
assert(outPtr);
//(async) read back borders (d2h)
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncD2Hborders(outPtr);
waitford2h(); //wait for cross-accelerators d2h
//(async) read borders (h2d)
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncH2Dborders(outPtr);
}
//Task.after();
} while (go);
}
//(async)read back output (d2h)
if (outPtr && Task.getCopyOut()) { // do we have to copy back the output result ?
for (size_t i = 0; i < accelerators.size(); ++i) {
#if defined(FF_REPARA)
ff_node::rpr_sizeOut += accelerators[i]->asyncD2Houtput(outPtr);
#else
accelerators[i]->asyncD2Houtput(outPtr);
#endif
}
waitford2h(); //wait for cross-accelerators d2h
}
}
// device memory cleanup phase
if (Task.getReleaseIn() && (void *)outPtr != (void *)inPtr) {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->releaseInput(inPtr);
oldBytesizeIn = 0;
old_inPtr = NULL;
}
if ( Task.getReleaseOut() ) {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->releaseOutput(outPtr);
oldSizeOut = 0;
old_outPtr = NULL;
}
for(size_t k=0; k < envSize; ++k) {
if (Task.getReleaseEnv(k)) {
char *envptr;
Task.getEnvPtr(k, envptr);
if ((void*)envptr != (void*)outPtr) {
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->releaseEnv(k,envptr);
}
}
}
// TODO: management of oldEnvPtr !!
// currently the size of the envbuffer should be always the same !
}
// per task memory cleanup phase
Task.releaseTask(task);
return (oneshot ? NULL : task);
}
protected:
virtual void adjustInputBufferOffset(const bool memorychange) {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->adjustInputBufferOffset(Task.getInPtr(),
(memorychange ? old_inPtr : NULL), acc_in[i],
Task.getSizeIn());
}
void setcode(const std::string &codestr1, const std::string &codestr2) {
int n = 0;
if (codestr1 != "") {
n = codestr1.find_first_of("|");
assert(n > 0);
kernel_name1 = codestr1.substr(0, n);
const std::string &tmpstr = codestr1.substr(n + 1);
n = tmpstr.find_first_of("|");
assert(n > 0);
// checking for double type
if (tmpstr.substr(0, n) == "double") {
kernel_code = "\n#pragma OPENCL EXTENSION cl_khr_fp64: enable\n\n"
+ tmpstr.substr(n + 1);
} else
kernel_code = "\n" + tmpstr.substr(n + 1);
}
// checking for extra code needed to compile the kernels
std::ifstream ifs(FF_OPENCL_DATATYPES_FILE);
if (ifs.is_open())
kernel_code.insert(kernel_code.begin(), std::istreambuf_iterator<char>(ifs),
std::istreambuf_iterator<char>());
if (codestr2 != "") {
n = codestr2.find("|");
assert(n > 0);
kernel_name2 += codestr2.substr(0, n);
const std::string &tmpstr = codestr2.substr(n + 1);
n = tmpstr.find("|");
assert(n > 0);
// checking for double type
if (tmpstr.substr(0, n) == "double") {
kernel_code += "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
+ tmpstr.substr(n + 1);
} else
kernel_code += tmpstr.substr(n + 1);
}
}
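// A sketch of the string format setcode() parses (illustrative; these strings
// are normally generated by the FF_OCL* macros in stencilReduceOCL_macros.hpp):
//
//   "<kernel_name>|<element_type>|<OpenCL kernel source>"
//
// e.g. a map string beginning with "kern_f|double|..." yields
// kernel_name1 == "kern_f" and prepends the cl_khr_fp64 pragma to the code.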
void setsourcecode(const std::string &source, const std::string &kernel1, const std::string &kernel2) {
if (kernel1 != "") kernel_name1 = "kern_"+kernel1;
if (kernel2 != "") kernel_name2 = "kern_"+kernel2;
kernel_code = source;
}
//assign input partition to accelerators
//acc[i] = (start, size) where:
// - start is the first element assigned to accelerator i
// - size is the number of elements assigned to accelerator i
void compute_accmem(const size_t len, std::vector<std::pair<size_t,size_t> > &acc) {
size_t start = 0, step = (len + accelerators.size() - 1) / accelerators.size();
size_t i = 0;
for (; i < accelerators.size() - 1; ++i) {
acc[i]=std::make_pair(start, step);
start += step;
}
acc[i]=std::make_pair(start, len-start);
}
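// Worked example (illustrative): len = 10 over 3 accelerators:
//   step = (10 + 3 - 1) / 3 = 4  ->  acc = { (0,4), (4,4), (8,2) }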
void waitforh2d() {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->waitforh2d();
}
void waitford2h() {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->waitford2h();
}
void waitforreduce() {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->waitforreduce();
}
void waitformap() {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->waitformap();
}
TOCL Task;
const bool oneshot;
bool saveBinary, reuseBinary;
std::vector<accelerator_t*> accelerators;
std::vector<std::pair<size_t, size_t> > acc_in;
std::vector<std::pair<size_t, size_t> > acc_out;
std::vector<cl_device_id> devices;
int stencil_width_half;
//size_t preferred_dev;
size_t offset_dev;
std::string kernel_code;
std::string kernel_name1;
std::string kernel_name2;
size_t forced_cpu;
size_t forced_gpu;
size_t forced_other;
Tin *old_inPtr;
Tout *old_outPtr;
size_t oldBytesizeIn, oldSizeOut, oldSizeReduce;
};
/*!
* \class ff_mapOCL_1D
* \ingroup high_level_patterns
*
* \brief The OpenCL-based Map pattern in 1 dimension
*
* This class is defined in \ref stencilReduceOCL.hpp
*
*/
template<typename T, typename TOCL = T>
class ff_mapOCL_1D: public ff_stencilReduceLoopOCL_1D<T, TOCL> {
public:
ff_mapOCL_1D(std::string mapf, ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(mapf, "", 0, alloc, NACCELERATORS, 0) {
}
ff_mapOCL_1D(const std::string &kernels_source, const std::string &mapf_name,
ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(kernels_source, mapf_name, "", 0, alloc, NACCELERATORS, 0) {
}
ff_mapOCL_1D(const T &task, std::string mapf,
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, mapf, "", 0, alloc, NACCELERATORS, 0) {
}
ff_mapOCL_1D(const T &task, const std::string &kernels_source, const std::string &mapf_name,
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, kernels_source, mapf_name, "", 0, alloc, NACCELERATORS, 0) {
}
bool isPureMap() const { return true; }
};
/*!
* \class ff_reduceOCL_1D
*
* \ingroup high_level_patterns
*
* \brief The OpenCL-based Reduce pattern in 1 dimension
*
* This class is defined in \ref stencilReduceOCL.hpp
*
*/
template<typename T, typename TOCL = T>
class ff_reduceOCL_1D: public ff_stencilReduceLoopOCL_1D<T, TOCL> {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
ff_reduceOCL_1D(std::string reducef, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>("", reducef, identityVal, alloc, NACCELERATORS, 0) {
}
ff_reduceOCL_1D(const std::string &kernels_source, const std::string &reducef_name, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(kernels_source, "", reducef_name, identityVal, alloc, NACCELERATORS, 0) {
}
ff_reduceOCL_1D(const T &task, std::string reducef, const Tout identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, "", reducef, identityVal, alloc, NACCELERATORS, 0) {
}
ff_reduceOCL_1D(const T &task, const std::string &kernels_source,const std::string &reducef_name,
const Tout identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, kernels_source, "", reducef_name, identityVal, alloc, NACCELERATORS, 0) {
}
bool isPureReduce() const { return true; }
};
/*!
* \class ff_mapReduceOCL_1D
* \ingroup high_level_patterns
*
* \brief The OpenCL-based MapReduce pattern in 1 dimension
*
* The mapReduce skeleton using OpenCL.
*
* This class is defined in \ref stencilReduceOCL.hpp
*
*/
template<typename T, typename TOCL = T>
class ff_mapReduceOCL_1D: public ff_stencilReduceLoopOCL_1D<T, TOCL> {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
ff_mapReduceOCL_1D(std::string mapf, std::string reducef, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(mapf, reducef, identityVal, alloc, NACCELERATORS, 0) {
}
ff_mapReduceOCL_1D(const std::string &kernels_code, const std::string &mapf_name,
const std::string &reducef_name, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(kernels_code, mapf_name, reducef_name, identityVal, alloc, NACCELERATORS, 0) {
}
ff_mapReduceOCL_1D(const T &task, std::string mapf, std::string reducef,
const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, mapf, reducef, identityVal, alloc, NACCELERATORS, 0) {
}
ff_mapReduceOCL_1D(const T &task, const std::string &kernels_code, const std::string &mapf_name,
const std::string &reducef_name, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, kernels_code, mapf_name, reducef_name, identityVal, alloc, NACCELERATORS, 0) {
}
};
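/*
 * Sketch: reusing the (assumed) elemental map and combinator strings from
 * the two examples above, a fused map+reduce is a single pattern instance:
 *
 *   ff_mapReduceOCL_1D<oclTask> oclMR(t, mapf, reducef, 0.0f);
 *   oclMR.run_and_wait_end();
 *
 * which keeps the intermediate mapped array on the device between the
 * two phases.
 */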
/*** 2D ***/
/**
 * a task to be executed by a 2D stencilReduceLoop node.
 * This class represents a computation to be performed on a
 * logical 2D matrix, stored in host memory as a 1D row-major array.
 */
template<typename TaskT_, typename Tin_, typename Tout_ = Tin_>
class baseOCLTask_2D: public baseOCLTask<TaskT_, Tin_, Tout_> {
public:
/**
* set the number of rows of the logical 2D input.
* To be called from setTask.
*
* @param h is the number of rows
*/
void setHeight(size_t h) {
height = h;
}
/**
* set the number of columns of the logical 2D input.
* To be called from setTask.
*
* @param w is the number of columns
*/
void setWidth(size_t w) {
width = w;
}
//runtime getter functions
size_t getHeight() const { return height;}
size_t getWidth() const {return width;}
protected:
size_t height, width;
};
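/*
 * Sketch of a 2D task (illustrative only; setInPtr/setOutPtr usage as in
 * the 1D examples above): setTask publishes the host pointers and also
 * declares the logical matrix shape via setHeight/setWidth.
 *
 *   struct oclTask2D: public baseOCLTask_2D<oclTask2D, float> {
 *       void setTask(oclTask2D *t) {
 *           setInPtr(t->M, t->rows * t->cols);   // row-major 1D storage
 *           setOutPtr(t->M, t->rows * t->cols);
 *           setHeight(t->rows);
 *           setWidth(t->cols);
 *       }
 *       float *M = nullptr; size_t rows = 0, cols = 0;
 *   };
 */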
/**
* a virtual OpenCL accelerator for 2D map kernels working on
* logical 2D matrices stored as row-major arrays.
*/
template<typename T, typename TOCL = T>
class ff_oclAccelerator_2D : public ff_oclAccelerator<T, TOCL> {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
ff_oclAccelerator_2D(ff_oclallocator *alloc, const size_t halo_width_, const Tout &identityVal, const bool from_source=false) :
ff_oclAccelerator<T,TOCL>(alloc, halo_width_, identityVal, from_source) {
heightInput_global = 0;
widthInput_global = 0;
}
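// The argument order below defines the parameter list the generated 2D map
// kernel is expected to take: input buffer, output buffer, global input
// height, global input width, local output length, input offset, left halo,
// followed by one buffer per environment entry.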
void setMapKernelArgs(const size_t envSize) {
cl_uint idx = 0;
//set iteration-dynamic MAP kernel args (init)
cl_int status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_mem), &this->inputBuffer);
checkResult(status, "setKernelArg input");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_mem), &this->outputBuffer);
checkResult(status, "setKernelArg output");
//set iteration-invariant MAP kernel args
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &heightInput_global);
checkResult(status, "setKernelArg global input height");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &widthInput_global);
checkResult(status, "setKernelArg global input width");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &this->lenOutput);
checkResult(status, "setKernelArg local input length");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &this->offset1_in);
checkResult(status, "setKernelArg offset");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &this->halo_out_left);
checkResult(status, "setKernelArg halo");
for(size_t k=0; k < envSize; ++k) {
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_mem), &(this->envBuffer[k].first));
checkResult(status, "setKernelArg env");
}
}
/**
* set the global number of rows of the logical 2D input.
*/
void setHeight(size_t h) {
heightInput_global = h;
}
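/**
 * set the global number of columns of the logical 2D input.
 */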
void setWidth(size_t w) {
widthInput_global = w;
}
private:
size_t heightInput_global, widthInput_global;
};
/**
* a stencilReduceLoop for executing 2D OCL tasks.
*/
template<typename T, typename TOCL = T>
class ff_stencilReduceLoopOCL_2D: public ff_stencilReduceLoopOCL_1D<T,TOCL,ff_oclAccelerator_2D<T,TOCL> > {
private:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
typedef ff_oclAccelerator_2D<T,TOCL> accelerator_t;
typedef ff::ff_stencilReduceLoopOCL_1D<T,TOCL,accelerator_t> base_srl_t;
public:
// build the program from the mapf and reducef functions
ff_stencilReduceLoopOCL_2D(const std::string &mapf, //OpenCL elemental function
const std::string &reducef = std::string(""), //OpenCL combinator function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1,
const int stencil_width_half_ = 1,
const int stencil_height_half_ = 1) :
//init srl 1D with no halo (1D halo depends on 2D width)
base_srl_t(mapf, reducef, identityVal, allocator, NACCELERATORS, 0),
stencil_width_half(stencil_width_half_), stencil_height_half(stencil_height_half_) {}
// build the program from a source-code file;
// it first attempts to load a cached binary file (kernels_source in this case is the path to the binary file);
// if that file is not available, it creates the program from source and stores the binary for future use
// with the extension ".bin"
ff_stencilReduceLoopOCL_2D(const std::string &kernels_source, // OpenCL source code path
const std::string &mapf_name, // name of the map function
const std::string &reducef_name, // name of the reduce function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1,
const int stencil_width_half_ = 1,
const int stencil_height_half_ = 1) :
//init srl 1D with no halo (1D halo depends on 2D width)
base_srl_t(kernels_source, mapf_name, reducef_name, identityVal, allocator, NACCELERATORS, 0),
stencil_width_half(stencil_width_half_), stencil_height_half(stencil_height_half_) {}
// the task is provided in the constructor -- one shot computation
ff_stencilReduceLoopOCL_2D(const T &task,
const std::string &mapf,
const std::string &reducef = std::string(""),
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1,
const int stencil_width_half_ = 1,
const int stencil_height_half_ = 1) :
//init srl 1D with no halo (1D halo depends on 2D width)
base_srl_t(task, mapf, reducef, identityVal, allocator, NACCELERATORS, 0),
stencil_width_half(stencil_width_half_),stencil_height_half(stencil_height_half_) {}
// the task is provided in the constructor -- one shot computation
ff_stencilReduceLoopOCL_2D(const T &task,
const std::string &kernels_source, // OpenCL source code path
const std::string &mapf_name, // name of the map kernel function
const std::string &reducef_name, // name of the reduce kernel function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1,
const int stencil_width_half_ = 1,
const int stencil_height_half_ = 1) :
//init srl 1D with no halo (1D halo depends on 2D width)
base_srl_t(task, kernels_source, mapf_name, reducef_name, identityVal, allocator, NACCELERATORS, 0),
stencil_width_half(stencil_width_half_), stencil_height_half(stencil_height_half_) {}
protected:
void adjustInputBufferOffset(const bool memorychange) {
for (size_t i = 0; i < this->accelerators.size(); ++i) {
this->accelerators[i]->setWidth(this->Task.getWidth());
this->accelerators[i]->setHeight(this->Task.getHeight());
this->accelerators[i]->setHaloHalf(halo_half(this->Task.getWidth()));
}
//accelerator uses above values to compute actual sizing
base_srl_t::adjustInputBufferOffset(memorychange);
}
private:
size_t halo_half(const size_t width) const {
return stencil_height_half * width + stencil_width_half;
}
/*
 * 2D stencil can access elements in an H x W rectangle around each element, where:
 *   H = 2 * stencil_height_half + 1
 *   W = 2 * stencil_width_half + 1
 * In the row-major layout the farthest such neighbour lies
 * stencil_height_half * width + stencil_width_half positions away, which is
 * exactly the 1D halo returned by halo_half above (e.g. a 3x3 stencil on a
 * matrix of width W needs a halo of W + 1 elements on each side).
 */
const int stencil_width_half;
const int stencil_height_half;
};
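/*
 * Usage sketch (illustrative only; mapf2d and reducef are hypothetical
 * kernel strings): a 3x3 stencil sweep uses
 * stencil_width_half = stencil_height_half = 1, i.e. each element may read
 * its eight surrounding neighbours:
 *
 *   ff_stencilReduceLoopOCL_2D<oclTask2D> srl(t2d, mapf2d, reducef, 0.0f,
 *                                             nullptr,  // default allocator
 *                                             1,        // one accelerator
 *                                             1, 1);    // stencil halves
 *   srl.run_and_wait_end();
 *
 * Further iterations of the loop are controlled by the task's
 * iterCondition() (the baseOCLTask default returns false, i.e. one sweep).
 */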
/*!
 * \class ff_mapOCL_2D
 * \ingroup high_level_patterns
 *
 * \brief The OpenCL-based Map pattern in 2 dimensions
 *
 * This class is defined in \ref stencilReduceOCL.hpp
 *
 */
template<typename T, typename TOCL = T>
class ff_mapOCL_2D: public ff_stencilReduceLoopOCL_2D<T, TOCL> {
public:
ff_mapOCL_2D(std::string mapf, ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_2D<T, TOCL>(mapf, "", 0, alloc, NACCELERATORS) {
}
ff_mapOCL_2D(const std::string &kernels_source, const std::string &mapf_name,
ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_2D<T, TOCL>(kernels_source, mapf_name, "", 0, alloc, NACCELERATORS) {
}
ff_mapOCL_2D(const T &task, std::string mapf,
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_2D<T, TOCL>(task, mapf, "", 0, alloc, NACCELERATORS) {
}
ff_mapOCL_2D(const T &task, const std::string &kernels_source, const std::string &mapf_name,
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_2D<T, TOCL>(task, kernels_source, mapf_name, "", 0, alloc, NACCELERATORS) {
}
bool isPureMap() const { return true; }
};
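/*
 * Sketch: a pure 2D map reuses the 2D task above; no stencil halo is
 * involved, so only the map function is supplied:
 *
 *   ff_mapOCL_2D<oclTask2D> oclMap2(t2d, mapf2d);
 *   oclMap2.run_and_wait_end();
 */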
}// namespace ff
#endif // FF_OPENCL
#endif /* FF_STENCILREDUCE_OCL_HPP */