/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /*! * \file stencilReduceOCL.hpp * \ingroup high_level_patterns * * \brief StencilReduceLoop data-parallel pattern and derived data-parallel patterns * */ /* *************************************************************************** * * FastFlow is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License version 3 as * published by the Free Software Foundation. * Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3 * or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT) * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public * License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * **************************************************************************** */ /* * Authors: * Maurizio Drocco * Massimo Torquati * Marco Aldinucci * */ #ifndef FF_STENCILREDUCE_OCL_HPP #define FF_STENCILREDUCE_OCL_HPP #ifdef FF_OPENCL #include #include #include #include #include #include #include #include #include namespace ff { enum reduceMode { REDUCE_INPUT, REDUCE_OUTPUT }; /** * a task to be executed by a 1D stencilReduceLoop node. */ template class baseOCLTask { public: typedef TaskT_ TaskT; typedef Tin_ Tin; typedef Tout_ Tout; baseOCLTask(): inPtr(NULL),outPtr(NULL),reduceVar(NULL), size_in(0),size_out(0),iter(0), tuple_in(std::make_tuple(true,false,false)), tuple_out(std::make_tuple(true,false,false)) { } virtual ~baseOCLTask() { } // user must override this method using: // - setInPtr for setting the host-pointer to the input array // - setOutPtr for setting the host-pointer to the output array // - setEnvPtr for adding to env-list the host-pointer to a read-only env // - other methods from classes derived from baseOCLTask // NOTE: order of setEnvPtr calls matters! TODO refine interface? virtual void setTask(TaskT *t) = 0; /* --- the user may overrider these methods --- */ // called at the end of the stencil-reduce loop. It may be used to // perform per-task host memory cleanup (i.e. releasing the host memory // previously allocated in the setTask function) or to execute a // post-elaboration phase virtual void releaseTask(TaskT *t) {} // computes the loop iteration condition virtual bool iterCondition(const Tout&, size_t) { return false; } // host reduce function virtual Tout combinator(const Tout&, const Tout&) { return Tout(); } // step functions virtual void incIter() { ++iter; } virtual size_t getIter() const { return iter; } virtual void resetIter(const size_t val=0) { iter = val; } /* -------------------------------------------- */ void resetTask() { envPtr.resize(0); copyEnv.resize(0); } /** * set the host-pointer to the input array. * * @param _inPtr the host-pointer * @param sizeIn the number of elements in the input array * @param copy TODO * @param reuse TODO * @param release TODO */ void setInPtr(Tin* _inPtr, size_t sizeIn, const CopyFlags copy =CopyFlags::COPY, const ReuseFlags reuse =ReuseFlags::DONTREUSE, const ReleaseFlags release=ReleaseFlags::DONTRELEASE) { inPtr = _inPtr; size_in = sizeIn; tuple_in = std::make_tuple(copy==CopyFlags::COPY, reuse==ReuseFlags::REUSE, release==ReleaseFlags::RELEASE); } /** * set the host-pointer to the input array. * * @see setInPtr() */ void setInPtr(Tin* _inPtr, size_t sizeIn, const MemoryFlags &flags) { inPtr = _inPtr; size_in = sizeIn; tuple_in = std::make_tuple(flags.copy==CopyFlags::COPY, flags.reuse==ReuseFlags::REUSE, flags.release==ReleaseFlags::RELEASE); } /** * set the host-pointer to the output array. * * @see setInPtr() */ void setOutPtr(Tout* _outPtr, size_t sizeOut, const CopyFlags copyback =CopyFlags::COPY, const ReuseFlags reuse =ReuseFlags::DONTREUSE, const ReleaseFlags release =ReleaseFlags::DONTRELEASE) { outPtr = _outPtr; size_out = sizeOut; tuple_out = std::make_tuple(copyback==CopyFlags::COPY, reuse==ReuseFlags::REUSE, release==ReleaseFlags::RELEASE); } /** * set the host-pointer to the output array. * * @see setInPtr() */ void setOutPtr(Tout* _outPtr, size_t sizeOut, const MemoryFlags &flags ) { outPtr = _outPtr; size_out = sizeOut; tuple_out = std::make_tuple(flags.copy==CopyFlags::COPY, flags.reuse==ReuseFlags::REUSE, flags.release==ReleaseFlags::RELEASE); } /** * add to env-list the host-pointer to a read-only env. * * @see setInPtr() */ template void setEnvPtr(const ptrT* _envPtr, size_t size, const CopyFlags copy =CopyFlags::COPY, const ReuseFlags reuse =ReuseFlags::DONTREUSE, const ReleaseFlags release=ReleaseFlags::DONTRELEASE) { assert(envPtr.size() == copyEnv.size()); envPtr.push_back(std::make_pair((void*)_envPtr,size*sizeof(ptrT))); copyEnv.push_back(std::make_tuple(sizeof(ptrT), copy==CopyFlags::COPY, reuse==ReuseFlags::REUSE, release==ReleaseFlags::RELEASE)); } /** * add to env-list the host-pointer to a read-only env. * * @see setInPtr() */ template void setEnvPtr(const ptrT* _envPtr, size_t size, const MemoryFlags &flags) { assert(envPtr.size() == copyEnv.size()); envPtr.push_back(std::make_pair((void*)_envPtr,size*sizeof(ptrT))); copyEnv.push_back(std::make_tuple(sizeof(ptrT), flags.copy==CopyFlags::COPY, flags.reuse==ReuseFlags::REUSE, flags.release==ReleaseFlags::RELEASE)); } Tin * getInPtr() const { return inPtr; } Tout * getOutPtr() const { return outPtr; } template void getEnvPtr(const size_t idx, ptrT *& ptr) const { assert(idx < envPtr.size()); ptr = reinterpret_cast(envPtr[idx].first); } size_t getEnvNum() const { assert(envPtr.size() == copyEnv.size()); return envPtr.size(); } bool getCopyEnv(const size_t idx) const { assert(idx < copyEnv.size()); return std::get<1>(copyEnv[idx]); } bool getReuseEnv(const size_t idx) const { assert(idx < copyEnv.size()); return std::get<2>(copyEnv[idx]); } bool getReleaseEnv(const size_t idx) const { assert(idx < copyEnv.size()); return std::get<3>(copyEnv[idx]); } bool getCopyIn() const { return std::get<0>(tuple_in); } bool getReuseIn() const { return std::get<1>(tuple_in); } bool getReleaseIn() const { return std::get<2>(tuple_in); } bool getCopyOut() const { return std::get<0>(tuple_out); } bool getReuseOut() const { return std::get<1>(tuple_out); } bool getReleaseOut() const { return std::get<2>(tuple_out); } size_t getSizeIn() const { return size_in; } size_t getSizeOut() const { return (size_out==0)?size_in:size_out; } size_t getSizeEnv(const size_t idx) const { assert(idx < copyEnv.size()); return std::get<0>(copyEnv[idx]); } size_t getBytesizeIn() const { return getSizeIn() * sizeof(Tin); } size_t getBytesizeOut() const { return getSizeOut() * sizeof(Tout); } size_t getBytesizeEnv(const size_t idx) const { assert(idx < envPtr.size()); return envPtr[idx].second; } void setReduceVar(const Tout *r) { reduceVar = (Tout*)r; } Tout *getReduceVar() const { return reduceVar; } void writeReduceVar(const Tout &r) { *reduceVar = r; } void setIdentityVal(const Tout &x) { identityVal = x;} Tout getIdentityVal() const { return identityVal; } bool iterCondition_aux() { return iterCondition(*reduceVar, iter); } protected: Tin *inPtr; Tout *outPtr; Tout *reduceVar, identityVal; size_t size_in, size_out, iter; std::tuple tuple_in; std::tuple tuple_out; std::vector > envPtr; // pointer and byte-size std::vector > copyEnv; // size and flags }; /** * a virtual OpenCL accelerator.for 1D kernels */ template class ff_oclAccelerator { public: typedef typename TOCL::Tin Tin; typedef typename TOCL::Tout Tout; ff_oclAccelerator(ff_oclallocator *alloc, const size_t width_, const Tout &identityVal, const bool from_source=false) : from_source(from_source), my_own_allocator(false), allocator(alloc), halo_half(width_), identityVal(identityVal), events_h2d(16), deviceId(NULL) { wgsize_map_static = wgsize_reduce_static = 0; wgsize_map_max = wgsize_reduce_max = 0; inputBuffer = outputBuffer = reduceBuffer = NULL; sizeInput = sizeInput_padded = 0; lenInput = offset1_in = halo_in_left = halo_in_right = lenInput_global = 0; sizeOutput = sizeOutput_padded = 0; lenOutput = offset1_out = halo_out_left = halo_out_right = lenOutput_global = 0; nevents_h2d = nevents_map = 0; event_d2h = event_map = event_reduce1 = event_reduce2 = NULL; wgsize_map = nthreads_map = 0; wgsize_reduce = nthreads_reduce = nwg_reduce = wg_red_mem = 0; reduceVar = identityVal; kernel_map = kernel_reduce = kernel_init = NULL; context = NULL; program = NULL; cmd_queue = NULL; reduce_mode = REDUCE_OUTPUT; if (!allocator) { my_own_allocator = true; allocator = new ff_oclallocator; assert(allocator); } } virtual ~ff_oclAccelerator() { if (my_own_allocator) { allocator->releaseAllBuffers(context); delete allocator; allocator = NULL; my_own_allocator = false; } } int init(cl_device_id dId, reduceMode m, const std::string &kernel_code, const std::string &kernel_name1, const std::string &kernel_name2, const bool save_binary, const bool reuse_binary) { #ifdef FF_OPENCL_LOG fprintf(stderr, "initializing virtual accelerator @%p mapped to device:\n", this); std::cerr << ff::clEnvironment::instance()->getDeviceInfo(dId) << std::endl; #endif reduce_mode = m; //set OCL objects deviceId = dId; const oclParameter *param = clEnvironment::instance()->getParameter(deviceId); assert(param); context = param->context; cmd_queue = param->commandQueue; //build OCL kernels cl_int status = buildKernels(kernel_code, kernel_name1,kernel_name2, from_source, save_binary, reuse_binary); checkResult(status, "build kernels"); //compute static heuristics for kernel sizing setSizingHeuristics(); return status == CL_SUCCESS; } void releaseAll() { if (deviceId) { svc_releaseOclObjects(); deviceId = NULL; }} void releaseInput(const Tin *inPtr) { if (allocator->releaseBuffer(inPtr, context, inputBuffer) != CL_SUCCESS) checkResult(CL_INVALID_MEM_OBJECT, "releaseInput"); inputBuffer = NULL; } void releaseOutput(const Tout *outPtr) { if (allocator->releaseBuffer(outPtr, context, outputBuffer) != CL_SUCCESS) checkResult(CL_INVALID_MEM_OBJECT, "releaseOutput"); outputBuffer = NULL; } void releaseEnv(size_t idx, const void *envPtr) { if (allocator->releaseBuffer(envPtr, context, envBuffer[idx].first) != CL_SUCCESS) checkResult(CL_INVALID_MEM_OBJECT, "releaseEnv"); envBuffer[idx].first = NULL, envBuffer[idx].second = 0; } void swapBuffers() { cl_mem tmp = inputBuffer; inputBuffer = outputBuffer; outputBuffer = tmp; } void setSizingHeuristics() { cl_int status; //get device-dependent max wg size size_t max_device_wgsize; status = clGetDeviceInfo(deviceId,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(max_device_wgsize),&max_device_wgsize, NULL); checkResult(status, "clGetDeviceInfo (map)"); if(kernel_map) { //map kernel //get kernel-dependent max wg size size_t max_kernel_wgsize; status = clGetKernelWorkGroupInfo(kernel_map, deviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_kernel_wgsize, 0); checkResult(status, "GetKernelWorkGroupInfo (map)"); wgsize_map_max = std::min(max_device_wgsize,max_kernel_wgsize); //get size of the atomic scheduling unit (analogous to CUDA wrap size) //typical values are 16 or 32 size_t wg_multiple; status = clGetKernelWorkGroupInfo(kernel_map, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wg_multiple, 0); wgsize_map_static = std::max(64, wg_multiple * 4); //64 or 128 wgsize_map_static = std::min(wgsize_map_static,wgsize_map_max); } if(kernel_reduce) { //reduce kernel //get kernel-dependent max wg size size_t max_kernel_wgsize; status = clGetKernelWorkGroupInfo(kernel_reduce, deviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_kernel_wgsize, 0); checkResult(status, "GetKernelWorkGroupInfo (reduce)"); wgsize_reduce_max = std::min(max_device_wgsize,max_kernel_wgsize); //get size of the atomic scheduling unit (analogous to CUDA wrap size) //typical values are 16 or 32 size_t wg_multiple; status = clGetKernelWorkGroupInfo(kernel_reduce, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wg_multiple, 0); wgsize_reduce_static = std::max(64, wg_multiple * 4); //64 or 128 wgsize_reduce_static = std::min(wgsize_reduce_static,wgsize_reduce_max); } #ifdef FF_OPENCL_LOG std::cerr << "[virtual accelerator @"< &P, size_t len_global) { offset1_in = P.first; lenInput = P.second; lenInput_global = len_global; halo_in_left = (std::min)(halo_half, offset1_in); halo_in_right = (std::min)(halo_half, lenInput_global - lenInput - offset1_in); sizeInput = lenInput * sizeof(Tin); sizeInput_padded = sizeInput + (halo_in_left + halo_in_right) * sizeof(Tin); if (oldPtr != NULL) allocator->updateKey(oldPtr, newPtr, context); } void relocateInputBuffer(const Tin *inPtr, const bool reuseIn, const Tout *reducePtr) { cl_int status; if (reuseIn) { inputBuffer = allocator->createBufferUnique(inPtr, context, CL_MEM_READ_WRITE, sizeInput_padded, &status); checkResult(status, "CreateBuffer(Unique) input"); } else { if (inputBuffer) allocator->releaseBuffer(inPtr, context, inputBuffer); //allocate input-size + pre/post-windows inputBuffer = allocator->createBuffer(inPtr, context, CL_MEM_READ_WRITE, sizeInput_padded, &status); checkResult(status, "CreateBuffer input"); } //set workgroup size and nthreads for map // MA patch - map not defined => workgroup_size_map //size_t wgsize = wgsize_map_static==0?wgsize_reduce_static:wgsize_map_static; if(kernel_map) { // if the output buffer is bigger than input buffer then we use the size of the // output buffer to compute the number of threads if (lenInput < lenOutput) { wgsize_map = std::min(lenOutput, wgsize_map_static); nthreads_map = wgsize_map * ((lenOutput + wgsize_map - 1) / wgsize_map); //round up } else { wgsize_map = std::min(lenInput, wgsize_map_static); nthreads_map = wgsize_map * ((lenInput + wgsize_map - 1) / wgsize_map); //round up } #ifdef FF_OPENCL_LOG std::cerr << "[virtual accelerator @"< &P, size_t len_global) { offset1_out = P.first; lenOutput = P.second; lenOutput_global = len_global; halo_out_left = (std::min)(halo_half, offset1_out); halo_out_right = (std::min)(halo_half, lenOutput_global - lenOutput - offset1_out); sizeOutput = lenOutput * sizeof(Tout); sizeOutput_padded = sizeOutput + (halo_out_left + halo_out_right) * sizeof(Tout); if (oldPtr != NULL) allocator->updateKey(oldPtr, newPtr, context); } void relocateOutputBuffer(const Tout *outPtr, const Tout *reducePtr) { cl_int status; if (outputBuffer) allocator->releaseBuffer(outPtr, context, outputBuffer); outputBuffer = allocator->createBuffer(outPtr, context, CL_MEM_READ_WRITE, sizeOutput_padded,&status); checkResult(status, "CreateBuffer output"); //set workgroup size and nthreads for reduce if (kernel_reduce && reduce_mode == REDUCE_OUTPUT) { resetReduce(lenOutput, sizeof(Tout), (void *)reducePtr); } } void relocateEnvBuffer(const void *envptr, const bool reuseEnv, const size_t idx, const size_t envbytesize) { cl_int status = CL_SUCCESS; if (idx >= envBuffer.size()) { cl_mem envb; if (reuseEnv) envb = allocator->createBufferUnique(envptr, context, CL_MEM_READ_WRITE, envbytesize, &status); else envb = allocator->createBuffer(envptr, context, CL_MEM_READ_WRITE, envbytesize, &status); if (checkResult(status, "CreateBuffer envBuffer")) envBuffer.push_back(std::make_pair(envb,envbytesize)); } else { if (reuseEnv) { envBuffer[idx].first = allocator->createBufferUnique(envptr, context, CL_MEM_READ_WRITE, envbytesize, &status); if (checkResult(status, "CreateBuffer envBuffer")) envBuffer[idx].second = envbytesize; } else { if (envBuffer[idx].second < envbytesize) { if (envBuffer[idx].first) allocator->releaseBuffer(envptr, context, envBuffer[idx].first); envBuffer[idx].first = allocator->createBuffer(envptr, context, CL_MEM_READ_WRITE, envbytesize, &status); if (checkResult(status, "CreateBuffer envBuffer")) envBuffer[idx].second = envbytesize; } } } } void setInPlace(Tout *reducePtr) { outputBuffer = inputBuffer; lenOutput = lenInput; lenOutput_global = lenInput_global; halo_out_left = halo_in_left; halo_out_right = halo_in_right; offset1_out = offset1_in; sizeOutput = sizeInput; sizeOutput_padded = sizeInput_padded; //set workgroup size and nthreads for reduce if (kernel_reduce && reduce_mode == REDUCE_OUTPUT) { resetReduce(lenOutput, sizeof(Tout), reducePtr); } } void swap() { cl_mem tmp = inputBuffer; inputBuffer = outputBuffer; outputBuffer = tmp; //set iteration-dynamic MAP kernel args cl_int status = clSetKernelArg(kernel_map, 0, sizeof(cl_mem), &inputBuffer); checkResult(status, "setKernelArg input"); status = clSetKernelArg(kernel_map, 1, sizeof(cl_mem), &outputBuffer); checkResult(status, "setKernelArg output"); } virtual void setMapKernelArgs(const size_t envSize) { cl_uint idx = 0; //set iteration-dynamic MAP kernel args (init) cl_int status = clSetKernelArg(kernel_map, idx++, sizeof(cl_mem), &inputBuffer); checkResult(status, "setKernelArg input"); status = clSetKernelArg(kernel_map, idx++, sizeof(cl_mem), &outputBuffer); checkResult(status, "setKernelArg output"); //set iteration-invariant MAP kernel args status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &lenInput_global); checkResult(status, "setKernelArg global input length"); status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &lenOutput); checkResult(status, "setKernelArg local input length"); status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &offset1_in); checkResult(status, "setKernelArg offset"); status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &halo_out_left); // CHECK !!! checkResult(status, "setKernelArg pad"); for(size_t k=0; k < envSize; ++k) { status = clSetKernelArg(kernel_map, idx++, sizeof(cl_mem), &envBuffer[k].first); checkResult(status, "setKernelArg env"); } } size_t asyncH2Dinput(Tin *p) { if (nevents_h2d >= events_h2d.size()) events_h2d.reserve(nevents_h2d); p += offset1_in - halo_in_left; cl_int status = clEnqueueWriteBuffer(cmd_queue, inputBuffer, CL_FALSE, 0, sizeInput_padded, p, 0, NULL, &events_h2d[nevents_h2d++]); checkResult(status, "copying Task to device input-buffer"); return sizeInput_padded; } size_t asyncH2Denv(const size_t idx, char *p) { if (nevents_h2d >= events_h2d.size()) events_h2d.reserve(nevents_h2d); cl_int status = clEnqueueWriteBuffer(cmd_queue, envBuffer[idx].first, CL_FALSE, 0, envBuffer[idx].second, p, 0, NULL, &events_h2d[nevents_h2d++]); checkResult(status, "copying Task to device env-buffer"); return envBuffer[idx].second; } size_t asyncH2Dborders(Tout *p) { if (halo_out_left) { cl_int status = clEnqueueWriteBuffer(cmd_queue, outputBuffer, CL_FALSE, 0, halo_out_left * sizeof(Tout), p + offset1_out - halo_out_left, 0, NULL, &events_h2d[nevents_h2d++]); checkResult(status, "copying left border to device"); return halo_out_left * sizeof(Tout); } if (halo_out_right) { cl_int status = clEnqueueWriteBuffer(cmd_queue, outputBuffer, CL_FALSE, (halo_out_left + lenOutput) * sizeof(Tout), halo_out_right * sizeof(Tout), // NOTE: in a loop Tin == Tout !! p + offset1_out + lenOutput, 0, NULL, &events_h2d[nevents_h2d++]); checkResult(status, "copying right border to device"); } return halo_out_left * sizeof(Tout); } size_t asyncD2Houtput(Tout *p) { cl_int status = clEnqueueReadBuffer(cmd_queue, outputBuffer, CL_FALSE, halo_out_left * sizeof(Tout), sizeOutput, p + offset1_out, 0, NULL, &event_d2h); checkResult(status, "copying output back from device"); return sizeOutput; } size_t asyncD2Hborders(Tout *p) { cl_int status = clEnqueueReadBuffer(cmd_queue, outputBuffer, CL_FALSE, halo_out_left * sizeof(Tout), halo_half * sizeof(Tout), p + offset1_out, 0, NULL, &events_h2d[0]); checkResult(status, "copying border1 back from device"); ++nevents_h2d; status = clEnqueueReadBuffer(cmd_queue, outputBuffer, CL_FALSE, (halo_out_left + lenOutput - halo_half) * sizeof(Tout), halo_half * sizeof(Tout), p + offset1_out + lenOutput - halo_half, 0, NULL, &event_d2h); checkResult(status, "copying border2 back from device"); return halo_half * sizeof(Tout); } //void initReduce(const Tout &initReduceVal, reduceMode m = REDUCE_OUTPUT) { void initReduce() { //set kernel args for reduce1 int idx = 0; cl_mem tmp = (reduce_mode == REDUCE_OUTPUT) ? outputBuffer : inputBuffer; cl_uint len = (reduce_mode == REDUCE_OUTPUT) ? lenOutput : lenInput; cl_int status = clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &tmp); status |= clSetKernelArg(kernel_reduce, idx++, sizeof(uint), &halo_in_left); status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &reduceBuffer); status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint), (void *) &len); status |= clSetKernelArg(kernel_reduce, idx++, wg_red_mem, NULL); status |= clSetKernelArg(kernel_reduce, idx++, sizeof(Tout), (void *) &identityVal); checkResult(status, "setKernelArg reduce-1"); } void asyncExecMapKernel() { //execute MAP kernel cl_int status = clEnqueueNDRangeKernel(cmd_queue, kernel_map, 1, NULL, &nthreads_map, &wgsize_map, 0, NULL, &event_map); checkResult(status, "executing map kernel"); //std::cerr << "Exec map WI " << globalThreadsMap << " localThreadsMap " << localThreadsMap << "\n"; ++nevents_map; } void asyncExecReduceKernel1() { //std::cerr << "Exec reduce1 WI " << globalThreadsReduce << " localThreadsReduce " << localThreadsReduce << "\n"; cl_int status = clEnqueueNDRangeKernel(cmd_queue, kernel_reduce, 1, NULL, &nthreads_reduce, &wgsize_reduce, nevents_map, (nevents_map==0)?NULL:&event_map, &event_reduce1); checkResult(status, "exec kernel reduce-1"); nevents_map = 0; } void asyncExecReduceKernel2() { cl_uint zeropad = 0; int idx = 0; cl_int status = clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &reduceBuffer); status |= clSetKernelArg(kernel_reduce, idx++, sizeof(uint), &zeropad); status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &reduceBuffer); status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint),(void*) &nwg_reduce); status |= clSetKernelArg(kernel_reduce, idx++, wg_red_mem, NULL); status |= clSetKernelArg(kernel_reduce, idx++, sizeof(Tout), (void *) &identityVal); checkResult(status, "setKernelArg reduce-2"); //std::cerr << "Exec reduce2 WI " << globalThreadsReduce << " localThreadsReduce " << localThreadsReduce << "\n"; status = clEnqueueNDRangeKernel(cmd_queue, kernel_reduce, 1, NULL, &wgsize_reduce, &wgsize_reduce, 1, &event_reduce1, &event_reduce2); checkResult(status, "exec kernel reduce-2"); } Tout getReduceVar() { cl_int status = clEnqueueReadBuffer(cmd_queue, reduceBuffer, CL_TRUE, 0, sizeof(Tout), &reduceVar, 0, NULL, NULL); checkResult(status, "d2h reduceVar"); return reduceVar; } void waitforh2d() { if (nevents_h2d>0) { cl_int status = clWaitForEvents(nevents_h2d, events_h2d.data()); checkResult(status, "h2d wait for"); nevents_h2d = 0; } } void waitford2h() { cl_int status = clWaitForEvents(1, &event_d2h); checkResult(status, "d2h wait for"); } void waitforreduce() { cl_int status = clWaitForEvents(1, &event_reduce2); checkResult(status, "wait for reduce"); } void waitformap() { cl_int status = clWaitForEvents(nevents_map, &event_map); nevents_map = 0; checkResult(status, "wait for map"); } /** * dynamically sets half-size of the 1D halo */ void setHaloHalf(const size_t h) { halo_half = h; } private: int buildProgram(cl_device_id dId) { cl_int status = clBuildProgram(program, 1, &dId, /*"-cl-fast-relaxed-math"*/NULL, NULL,NULL); checkResult(status, "building program"); // DEBUGGING CODE for checking OCL compilation errors if (status != CL_SUCCESS) { printf("\nFail to build the program\n"); size_t len; clGetProgramBuildInfo(program, dId, CL_PROGRAM_BUILD_LOG, 0, NULL, &len); printf("LOG len %ld\n", len); char *buffer = (char*) calloc(len, sizeof(char)); assert(buffer); clGetProgramBuildInfo(program, dId, CL_PROGRAM_BUILD_LOG, len * sizeof(char), buffer, NULL); printf("LOG: %s\n\n", buffer); return -1; } return 0; } int buildKernelCode(const std::string &kc, cl_device_id dId) { cl_int status; size_t sourceSize = kc.length(); const char* code = kc.c_str(); #ifdef FF_OPENCL_LOG printf("/* ------------------------------------------------------- */\n"); printf("buildKernelCode:\n%s\n", code); printf("/* ------------------------------------------------------- */\n"); #endif program = clCreateProgramWithSource(context, 1, &code, &sourceSize, &status); if (!program) { checkResult(status, "creating program with source"); return -1; } return buildProgram(dId); } // create the program with the binary file or from the source code int createProgram(const std::string &filepath, cl_device_id dId, const bool save_binary, const bool reuse_binary) { cl_int status, binaryStatus; bool binary = false; const std::string binpath = filepath + ".bin"; std::ifstream ifs; if (reuse_binary) { ifs.open(binpath, std::ios::binary ); if (!ifs.is_open()) { // try with filepath ifs.open(filepath, std::ios::binary); if (!ifs.is_open()) { error("createProgram: cannot open %s (nor %s)\n", filepath.c_str(), binpath.c_str()); return -1; } } else binary = true; } else { ifs.open(filepath, std::ios::binary); if (!ifs.is_open()) { error("createProgram: cannot open source file %s\n", filepath.c_str()); return -1; } } std::vector buf((std::istreambuf_iterator(ifs)), (std::istreambuf_iterator())); ifs.close(); size_t bufsize = buf.size(); const char *bufptr = buf.data(); status = CL_INVALID_BINARY; if (binary) { program = clCreateProgramWithBinary(context, 1, &dId, &bufsize, reinterpret_cast(&bufptr), &binaryStatus, &status); } if (status != CL_SUCCESS) { // maybe is not a binary file program = clCreateProgramWithSource(context, 1,&bufptr, &bufsize, &status); if (!program) { checkResult(status, "creating program with source"); return -1; } if (buildProgram(dId)<0) return -1; if (save_binary) { // TODO: the logical deviceId has to be attached to the file name ! size_t programBinarySize; status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * 1, &programBinarySize, NULL); checkResult(status, "createProgram clGetProgramInfo (binary size)"); std::vector binbuf(programBinarySize); const char *binbufptr = binbuf.data(); status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char*) * 1, &binbufptr, NULL); checkResult(status, "createProgram clGetProgramInfo (binary data)"); std::ofstream ofs(binpath, std::ios::out | std::ios::binary); ofs.write(binbuf.data(), binbuf.size()); ofs.close(); } return 0; } return buildProgram(dId); } cl_int buildKernels(const std::string &kernel_code, const std::string &kernel_name1, const std::string &kernel_name2, const bool from_source, const bool save_binary, const bool reuse_binary) { cl_int status_ = CL_SUCCESS; if (!from_source) { //compile kernel on device if (buildKernelCode(kernel_code, deviceId)<0) return -1; } else { // kernel_code is the path to the (binary?) source if (createProgram(kernel_code, deviceId, save_binary, reuse_binary)<0) return -1; } //create kernel objects if (kernel_name1 != "") { //map kernel cl_int status; kernel_map = clCreateKernel(program, kernel_name1.c_str(), &status); checkResult(status, "CreateKernel (map)"); status_ |= status; } if (kernel_name2 != "") { //reduce kernel cl_int status; kernel_reduce = clCreateKernel(program, kernel_name2.c_str(), &status); checkResult(status, "CreateKernel (reduce)"); status_ |= status; } return status_; } void svc_releaseOclObjects() { if (kernel_map) clReleaseKernel(kernel_map); if (kernel_reduce) clReleaseKernel(kernel_reduce); clReleaseProgram(program); // if (inputBuffer) clReleaseMemObject(inputBuffer); // for(size_t i=0; i < envBuffer.size(); ++i) // clReleaseMemObject(envBuffer[i].first); // if (outputBuffer && outputBuffer != inputBuffer) // clReleaseMemObject(outputBuffer); // if (reduceBuffer) // clReleaseMemObject(reduceBuffer); allocator->releaseAllBuffers(context); } void resetReduce(size_t lenReduceInput, size_t elem_size, const void *reducePtr) { // 64 and 256 are the max number of blocks and threads we want to use //getBlocksAndThreads(lenInput, 64, 256, nwg_reduce, wgsize_reduce); //nthreads_reduce = nwg_reduce * wgsize_reduce; nthreads_reduce = lenReduceInput; if(!isPowerOf2(nthreads_reduce)) nthreads_reduce = nextPowerOf2(nthreads_reduce); wgsize_reduce = std::min(nthreads_reduce, wgsize_reduce_static); nwg_reduce = nthreads_reduce / wgsize_reduce; //compute size of per-workgroup working memory wg_red_mem = (wgsize_reduce * elem_size) + (wgsize_reduce <= 32) * (wgsize_reduce * elem_size); //compute size of global reduce working memory size_t global_red_mem = nwg_reduce * elem_size; //allocate global memory for storing intermediate per-workgroup reduce results cl_int status; if (reduceBuffer) allocator->releaseBuffer(reducePtr, context, reduceBuffer); reduceBuffer = allocator->createBuffer(reducePtr, context, CL_MEM_READ_WRITE, global_red_mem, &status); checkResult(status, "CreateBuffer reduce"); #ifdef FF_OPENCL_LOG std::cerr << "[virtual accelerator @"< > envBuffer; /* * each accelerator works on the following subset of the input array: * [left-border][input-portion][right-border] * input portion is accessed RW, border are accessed read-only */ size_t sizeInput; //byte-size of the input-portion size_t sizeInput_padded; //byte-size of the input-portion plus left and right borders size_t lenInput; //n. elements in the input-portion size_t offset1_in; //left-offset (begin input-portion wrt to begin input) size_t halo_in_left; //n. elements in the left-halo size_t halo_in_right; //n. elements in the right-halo size_t lenInput_global; //n. elements in the input size_t sizeOutput, sizeOutput_padded; size_t lenOutput, offset1_out, halo_out_left, halo_out_right, lenOutput_global; //static input-independent estimation of workgroup sizing size_t wgsize_map_static, wgsize_reduce_static; //static input-independent upper bounds for workgroup sizing size_t wgsize_map_max, wgsize_reduce_max; //input-dependent workgroup sizing size_t wgsize_map, wgsize_reduce; //input-dependent number of threads size_t nthreads_map, nthreads_reduce; //number of workgroups executing first on-device reduce size_t nwg_reduce; //reduce workgroup-local memory size_t wg_red_mem; //OCL events std::vector events_h2d; size_t nevents_h2d, nevents_map; cl_event event_d2h, event_map, event_reduce1, event_reduce2; //switch for the input of the reduce reduceMode reduce_mode; //the OCL Id the accelerator is mapped to cl_device_id deviceId; }; /*! * \class ff_stencilReduceLoopOCL_1D * \ingroup high_level_patterns * * \brief The OpenCL-based StencilReduceLoop pattern in 1 dimension * * This class is defined in \ref stencilReduceOCL.hpp */ template > class ff_stencilReduceLoopOCL_1D: public ff_oclNode_t { public: typedef typename TOCL::Tin Tin; typedef typename TOCL::Tout Tout; // build the program from the mapf and reducef functions ff_stencilReduceLoopOCL_1D(const std::string &mapf, //OpenCL elemental function const std::string &reducef = std::string(""), //OpenCL combinator function const Tout &identityVal = Tout(), ff_oclallocator *allocator = nullptr, const size_t NACCELERATORS = 1, const int width = 1) : oneshot(false), saveBinary(false), reuseBinary(false), accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS), stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL), oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) { setcode(mapf, reducef); for(size_t i = 0; i< NACCELERATORS; ++i) accelerators[i]= new accelerator_t(allocator, width,identityVal); #ifdef FF_OPENCL_LOG fprintf(stderr,"[ff_stencilReduceLoopOCL_1D node @%p]\n",this); fprintf(stderr,"map-kernel code:\n%s\n", mapf.c_str()); fprintf(stderr,"reduce-kernel code:\n%s\n", reducef.c_str()); #endif } // build the program from source code file, // first attempts to load a cached binary file (kernels_source in this case is the path to the binary file) // ff that file is not available, then it creates the program from source and store the binary for future use // with the extention ".bin" ff_stencilReduceLoopOCL_1D(const std::string &kernels_source, // OpenCL source code path const std::string &mapf_name, // name of the map function const std::string &reducef_name, // name of the reduce function const Tout &identityVal = Tout(), ff_oclallocator *allocator = nullptr, const size_t NACCELERATORS = 1, const int width = 1) : oneshot(false), saveBinary(false), reuseBinary(false), accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS), stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL), oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) { setsourcecode(kernels_source, mapf_name, reducef_name); for(size_t i = 0; i< NACCELERATORS; ++i) accelerators[i]= new accelerator_t(allocator, width, identityVal, true); } // the task is provided in the constructor -- one shot computation ff_stencilReduceLoopOCL_1D(const T &task, const std::string &mapf, const std::string &reducef = std::string(""), const Tout &identityVal = Tout(), ff_oclallocator *allocator = nullptr, const size_t NACCELERATORS = 1, const int width = 1) : oneshot(true), saveBinary(false), reuseBinary(false), accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS), stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL), oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) { ff_node::skipfirstpop(true); setcode(mapf, reducef); setTask(const_cast(task)); for(size_t i = 0; i< NACCELERATORS; ++i) accelerators[i]= new accelerator_t(allocator, width,identityVal); #ifdef FF_OPENCL_LOG fprintf(stderr,"[ff_stencilReduceLoopOCL_1D node @%p]\n",this); fprintf(stderr,"map-kernel code:\n%s\n", mapf.c_str()); fprintf(stderr,"reduce-kernel code:\n%s\n", reducef.c_str()); #endif } // the task is provided in the constructor -- one shot computation ff_stencilReduceLoopOCL_1D(const T &task, const std::string &kernels_source, // OpenCL source code path const std::string &mapf_name, // name of the map kernel function const std::string &reducef_name, // name of the reduce kernel function const Tout &identityVal = Tout(), ff_oclallocator *allocator = nullptr, const size_t NACCELERATORS = 1, const int width = 1) : oneshot(true), saveBinary(false), reuseBinary(false), accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS), stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL), oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) { setsourcecode(kernels_source, mapf_name, reducef_name); setTask(const_cast(task)); for(size_t i = 0; i< NACCELERATORS; ++i) accelerators[i]= new accelerator_t(allocator, width, identityVal, true); } virtual ~ff_stencilReduceLoopOCL_1D() { for(size_t i = 0; i< accelerators.size(); ++i) if (accelerators[i]) delete accelerators[i]; } // used to set tasks when in onshot mode void setTask(T &task) { Task.resetTask(); Task.setTask(&task); } /** * explicitly set the OpenCL devices to be used * * @param dev is the vector of devices (OpenCL Ids) to be used */ void setDevices(std::vector &dev) { // if (dev.size() > accelerators.size()) { // error("ff_stencilReduceLoopOCL_1D::setDevices: Too many devices provided, please increase the number of logical accelerators\n"); // return -1; // } devices = dev; // return 0; } // force execution on the CPU void pickCPU () { ff_oclNode_t::setDeviceType(CL_DEVICE_TYPE_CPU); } // force execution on the GPU - as many as requested by the co-allocation strategy void pickGPU (size_t offset=0 /* referred to global list of devices */) { offset_dev=offset; //TODO check numbering ff_oclNode_t::setDeviceType(CL_DEVICE_TYPE_GPU); } // after the compilation and building phases, the OpenCL program will be saved as binary file // this action takes effect only if the compilation is made with source file (i.e. not using macroes) void saveBinaryFile() { saveBinary = true; } // tells the run-time to re-use the binary file available void reuseBinaryFile() { reuseBinary = true; } virtual int run(bool = false) { return ff_node::run(); } virtual int wait() { return ff_node::wait(); } virtual int run_and_wait_end() { if (run() < 0) return -1; if (wait() < 0) return -1; return 0; } virtual int run_then_freeze() { if (ff_node::isfrozen()) { ff_node::thaw(true); return 0; } return ff_node::freeze_and_run(); } virtual int wait_freezing() { return ff_node::wait_freezing(); } const T* getTask() const { return &Task; } unsigned int getIter() { return Task.getIter(); } Tout *getReduceVar() { assert(oneshot); return Task.getReduceVar(); } /** * Performs a static allocation of OpenCL devices. * Priority is given to GPU devices, falling back to CPU devices * if needed (e.g. no GPU devices). * Currently it does not mix GPUs with CPU */ int nodeInit() { if (ff_oclNode_t::oclId < 0) { //check if already initialized ff_oclNode_t::oclId = clEnvironment::instance()->getOCLID(); if (devices.size() == 0) { // the user didn't set any specific device switch (ff_oclNode_t::getDeviceType()) { case CL_DEVICE_TYPE_ALL: case CL_DEVICE_TYPE_GPU: { // Retrive multiple logical GPU devices (non-exclusive mode) std::vector logdev = clEnvironment::instance()->coAllocateGPUDeviceRR(accelerators.size(), (offset_dev == 0) ? -1 : offset_dev); if (logdev.size() == 0) { //could not fulfill allocation request if(ff_oclNode_t::getDeviceType() == CL_DEVICE_TYPE_GPU) { error("not enough GPUs found !\n"); return -1; } //if user did not require require GPU devices, fallback to CPU } else { //convert retrieved logical devices into opencl Ids devices.clear(); for (size_t i = 0; i < logdev.size(); ++i) devices.push_back(clEnvironment::instance()->getDevice(logdev[i])); for (size_t i = 0; i < devices.size(); ++i) if (accelerators[i]->init(devices[i], getReduceMode(), kernel_code, kernel_name1, kernel_name2, saveBinary, reuseBinary) < 0) return -1; break; } break; } case CL_DEVICE_TYPE_CPU: { if (accelerators.size() > 1) { error( "Multiple (>1) virtual accelerators on CPU are requested. Not yet implemented.\n"); return -1; } else { //Retrieve the CPU device devices.clear(); devices.push_back(clEnvironment::instance()->getDevice( //convert to OpenCL Id clEnvironment::instance()->getCPUDevice())); //retrieve logical device if (accelerators[0]->init(devices[0], getReduceMode(), kernel_code, kernel_name1, kernel_name2, saveBinary, reuseBinary) < 0) return -1; } } break; default: { error( "stencilReduceOCL::Other device. Not yet implemented.\n"); return -1; } } //end switch on ff_oclNode_t::getDeviceType() } else { //user requested specific OpenCL devices if (devices.size() > accelerators.size()) { error( "stencilReduceOCL::nodeInit: Too many devices requested, increase the number of accelerators!\n"); return -1; } // NOTE: the number of devices requested can be lower than the number of accelerators. // TODO must be managed for (size_t i = 0; i < devices.size(); ++i) accelerators[i]->init(devices[i], getReduceMode(), kernel_code, kernel_name1, kernel_name2, saveBinary, reuseBinary); } } // for (size_t i = 0; i < devices.size(); ++i) // std::cerr << "Using " << clEnvironment::instance()->getDeviceInfo(devices[i]) << std::endl; return 0; } void nodeEnd() {} #if defined(FF_REPARA) /** * Returns input data size */ size_t rpr_get_sizeIn() const { return ff_node::rpr_sizeIn; } /** * Returns output data size */ size_t rpr_get_sizeOut() const { return ff_node::rpr_sizeOut; } #endif protected: virtual bool isPureMap() const { return false; } virtual bool isPureReduce() const { return false; } reduceMode getReduceMode() { return isPureReduce() ? REDUCE_INPUT : REDUCE_OUTPUT; } virtual int svc_init() { return nodeInit(); } #if 0 virtual int svc_init() { if (ff_oclNode_t::oclId < 0) { ff_oclNode_t::oclId = clEnvironment::instance()->getOCLID(); switch(ff_oclNode_t::getDeviceType()) { case CL_DEVICE_TYPE_ALL: fprintf(stderr,"STATUS: requested ALL\n"); case CL_DEVICE_TYPE_GPU: {// One or more GPUs // Not exclusive // Retrive logical devices std::vector logdev = clEnvironment::instance()->coAllocateGPUDeviceRR(accelerators.size()); // Convert into opencl Ids devices.clear(); for (size_t i = 0; i < logdev.size(); ++i) devices.push_back(clEnvironment::instance()->getDevice(logdev[i])); if (devices.size() == 0) { error("stencilReduceOCL::svc_init:not enough GPUs found !\n"); return -1; } else { // Ok for (size_t i = 0; i < devices.size(); ++i) accelerators[i]->init(devices[i], kernel_code, kernel_name1,kernel_name2); break; } } case CL_DEVICE_TYPE_CPU: { if (accelerators.size()>1) { error ("Multiple (>1) virtual accelerators on CPU are requested. Not yet implemented.\n"); return -1; } else { // Ok devices.clear(); devices.push_back(clEnvironment::instance()->getDevice(clEnvironment::instance()->getCPUDevice())); accelerators[0]->init(devices[0], kernel_code, kernel_name1,kernel_name2); } } break; default: { error("stencilReduceOCL::Other device. Not yet implemented.\n"); } break; } } return 0; } #endif virtual void svc_end() { if (!ff::ff_node::isfrozen()) nodeEnd(); } T *svc(T *task) { if (task) setTask(*task); Tin *inPtr = Task.getInPtr(); Tout *outPtr = Task.getOutPtr(); Tout *reducePtr = Task.getReduceVar(); const size_t envSize = Task.getEnvNum(); //n. added environments #if defined(FF_REPARA) ff_node::rpr_sizeIn = ff_node::rpr_sizeOut = 0; #endif // if the computation is not in-place then we start from the output if ((void*)inPtr != (void*)outPtr) { // adjust allocator output-portions and relocate output device memory if needed if (oldSizeOut != Task.getBytesizeOut()) { compute_accmem(Task.getSizeOut(), acc_out); const bool memorychange = (oldSizeOut < Task.getBytesizeOut()); for (size_t i = 0; i < accelerators.size(); ++i) { accelerators[i]->adjustOutputBufferOffset(outPtr, (memorychange?old_outPtr:NULL), acc_out[i], Task.getSizeOut()); } if (memorychange) { for (size_t i = 0; i < accelerators.size(); ++i) { accelerators[i]->relocateOutputBuffer(outPtr, reducePtr); } oldSizeOut = Task.getBytesizeOut(); old_outPtr = outPtr; } } } // adjust allocator input-portions and relocate input device memory if needed if (oldBytesizeIn != Task.getBytesizeIn()) { compute_accmem(Task.getSizeIn(), acc_in); const bool memorychange = (oldBytesizeIn < Task.getBytesizeIn()); adjustInputBufferOffset(memorychange); if (memorychange) { for (size_t i = 0; i < accelerators.size(); ++i) { accelerators[i]->relocateInputBuffer(inPtr, Task.getReuseIn(), reducePtr); } oldBytesizeIn = Task.getBytesizeIn(); old_inPtr = inPtr; } } // in-place computation // adjust allocator output-portions and relocate output device memory if needed if (((void*)inPtr == (void*)outPtr) && ( oldSizeOut != Task.getBytesizeOut())) { for (size_t i = 0; i < accelerators.size(); ++i) { accelerators[i]->setInPlace(reducePtr); } } //relocate env device memory //TODO on-demand relocate, as for input/output memory /* NOTE: env buffer are replicated on all devices. * It would be nice to have replicated/partitioned polices */ for (size_t i = 0; i < accelerators.size(); ++i) for(size_t k=0; k < envSize; ++k) { char *envptr; Task.getEnvPtr(k, envptr); accelerators[i]->relocateEnvBuffer(envptr, Task.getReuseEnv(k), k, Task.getBytesizeEnv(k)); } if (!isPureReduce()) //set kernel args for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->setMapKernelArgs(envSize); //(async) copy input and environments (h2d) for (size_t i = 0; i < accelerators.size(); ++i) { if (Task.getCopyIn()) { #if defined(FF_REPARA) ff_node::rpr_sizeIn += accelerators[i]->asyncH2Dinput(Task.getInPtr()); //in #else accelerators[i]->asyncH2Dinput(Task.getInPtr()); //in #endif } for(size_t k=0; k < envSize; ++k) { if (Task.getCopyEnv(k)) { char *envptr; Task.getEnvPtr(k, envptr); #if defined(FF_REPARA) ff_node::rpr_sizeIn += accelerators[i]->asyncH2Denv(k, envptr); #else accelerators[i]->asyncH2Denv(k, envptr); #endif } } } if (isPureReduce()) { //init reduce for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->initReduce(); //wait for cross-accelerator h2d waitforh2d(); //(async) device-reduce1 for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->asyncExecReduceKernel1(); //(async) device-reduce2 for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->asyncExecReduceKernel2(); waitforreduce(); //wait for cross-accelerator reduce //host-reduce Tout redVar = accelerators[0]->getReduceVar(); for (size_t i = 1; i < accelerators.size(); ++i) redVar = Task.combinator(redVar, accelerators[i]->getReduceVar()); Task.writeReduceVar(redVar); } else { Task.resetIter(); if (isPureMap()) { //wait for cross-accelerator h2d waitforh2d(); //(async) exec kernel for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->asyncExecMapKernel(); Task.incIter(); waitformap(); //join } else { //iterative Map-Reduce (aka stencilReduceLoop) //invalidate first swap for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->swap(); bool go = true; do { //Task.before(); for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->swap(); //wait for cross-accelerator h2d waitforh2d(); //(async) execute MAP kernel for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->asyncExecMapKernel(); Task.incIter(); //start async-interleaved: reduce + borders sync //init reduce for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->initReduce(); //(async) device-reduce1 for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->asyncExecReduceKernel1(); //(async) device-reduce2 for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->asyncExecReduceKernel2(); //wait for cross-accelerators reduce waitforreduce(); //host-reduce Tout redVar = accelerators[0]->getReduceVar(); for (size_t i = 1; i < accelerators.size(); ++i) redVar = Task.combinator(redVar, accelerators[i]->getReduceVar()); Task.writeReduceVar(redVar); go = Task.iterCondition_aux(); if (go) { assert(outPtr); //(async) read back borders (d2h) for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->asyncD2Hborders(outPtr); waitford2h(); //wait for cross-accelerators d2h //(async) read borders (h2d) for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->asyncH2Dborders(outPtr); } //Task.after(); } while (go); } //(async)read back output (d2h) if (outPtr && Task.getCopyOut()) { // do we have to copy back the output result ? for (size_t i = 0; i < accelerators.size(); ++i) { #if defined(FF_REPARA) ff_node::rpr_sizeOut += accelerators[i]->asyncD2Houtput(outPtr); #else accelerators[i]->asyncD2Houtput(outPtr); #endif } waitford2h(); //wait for cross-accelerators d2h } } // device memory cleanup phase if (Task.getReleaseIn() && (void *)outPtr != (void *)inPtr) { for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->releaseInput(inPtr); oldBytesizeIn = 0; old_inPtr = NULL; } if ( Task.getReleaseOut() ) { for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->releaseOutput(outPtr); oldSizeOut = 0; old_outPtr = NULL; } for(size_t k=0; k < envSize; ++k) { if (Task.getReleaseEnv(k)) { char *envptr; Task.getEnvPtr(k, envptr); if ((void*)envptr != (void*)outPtr) { for (size_t i = 0; i < accelerators.size(); ++i) { accelerators[i]->releaseEnv(k,envptr); } } } // TODO: management of oldEnvPtr !! // currently the size of the envbuffer should be always the same ! } // per task memory cleanup phase Task.releaseTask(task); return (oneshot ? NULL : task); } protected: virtual void adjustInputBufferOffset(const bool memorychange) { for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->adjustInputBufferOffset(Task.getInPtr(), (memorychange ? old_inPtr : NULL), acc_in[i], Task.getSizeIn()); } void setcode(const std::string &codestr1, const std::string &codestr2) { int n = 0; if (codestr1 != "") { n = codestr1.find_first_of("|"); assert(n > 0); kernel_name1 = codestr1.substr(0, n); const std::string &tmpstr = codestr1.substr(n + 1); n = tmpstr.find_first_of("|"); assert(n > 0); // checking for double type if (tmpstr.substr(0, n) == "double") { kernel_code = "\n#pragma OPENCL EXTENSION cl_khr_fp64: enable\n\n" + tmpstr.substr(n + 1); } else kernel_code = "\n" + tmpstr.substr(n + 1); } // checking for extra code needed to compile the kernels std::ifstream ifs(FF_OPENCL_DATATYPES_FILE); if (ifs.is_open()) kernel_code.insert(kernel_code.begin(), std::istreambuf_iterator(ifs), std::istreambuf_iterator()); if (codestr2 != "") { n = codestr2.find("|"); assert(n > 0); kernel_name2 += codestr2.substr(0, n); const std::string &tmpstr = codestr2.substr(n + 1); n = tmpstr.find("|"); assert(n > 0); // checking for double type if (tmpstr.substr(0, n) == "double") { kernel_code += "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n" + tmpstr.substr(n + 1); } else kernel_code += tmpstr.substr(n + 1); } } void setsourcecode(const std::string &source, const std::string &kernel1, const std::string &kernel2) { if (kernel1 != "") kernel_name1 = "kern_"+kernel1; if (kernel2 != "") kernel_name2 = "kern_"+kernel2; kernel_code = source; } //assign input partition to accelerators //acc[i] = (start, size) where: // - start is the first element assigned to accelerator i // - size is the number of elements assigned to accelerator i void compute_accmem(const size_t len, std::vector > &acc) { size_t start = 0, step = (len + accelerators.size() - 1) / accelerators.size(); size_t i = 0; for (; i < accelerators.size() - 1; ++i) { acc[i]=std::make_pair(start, step); start += step; } acc[i]=std::make_pair(start, len-start); } void waitforh2d() { for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->waitforh2d(); } void waitford2h() { for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->waitford2h(); } void waitforreduce() { for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->waitforreduce(); } void waitformap() { for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->waitformap(); } TOCL Task; const bool oneshot; bool saveBinary, reuseBinary; std::vector accelerators; std::vector > acc_in; std::vector > acc_out; std::vector devices; int stencil_width_half; //size_t preferred_dev; size_t offset_dev; std::string kernel_code; std::string kernel_name1; std::string kernel_name2; size_t forced_cpu; size_t forced_gpu; size_t forced_other; Tin *old_inPtr; Tout *old_outPtr; size_t oldBytesizeIn, oldSizeOut, oldSizeReduce; }; /*! * \class ff_mapOCL_1D * \ingroup high_level_patterns * * \brief The OpenCL-based Map pattern in 1 dimension * * This class is defined in \ref stencilReduceOCL.hpp * */ template class ff_mapOCL_1D: public ff_stencilReduceLoopOCL_1D { public: ff_mapOCL_1D(std::string mapf, ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(mapf, "", 0, alloc, NACCELERATORS, 0) { } ff_mapOCL_1D(const std::string &kernels_source, const std::string &mapf_name, ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(kernels_source, mapf_name, "", 0, alloc, NACCELERATORS, 0) { } ff_mapOCL_1D(const T &task, std::string mapf, ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(task, mapf, "", 0, alloc, NACCELERATORS, 0) { } ff_mapOCL_1D(const T &task, const std::string &kernels_source, const std::string &mapf_name, ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(task, kernels_source, mapf_name, "", 0, alloc, NACCELERATORS, 0) { } bool isPureMap() const { return true; } }; /*! * \class ff_reduceOCL_1D * * \ingroup high_level_patterns * * \brief The OpenCL-based Reduce pattern in 1 dimension * * This class is defined in \ref stencilReduceOCL.hpp * */ template class ff_reduceOCL_1D: public ff_stencilReduceLoopOCL_1D { public: typedef typename TOCL::Tin Tin; typedef typename TOCL::Tout Tout; ff_reduceOCL_1D(std::string reducef, const Tout &identityVal = Tout(), ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D("", reducef, identityVal, alloc, NACCELERATORS, 0) { } ff_reduceOCL_1D(const std::string &kernels_source, const std::string &reducef_name, const Tout &identityVal = Tout(), ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(kernels_source, "", reducef_name, identityVal, alloc, NACCELERATORS, 0) { } ff_reduceOCL_1D(const T &task, std::string reducef, const Tout identityVal = Tout(), ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(task, "", reducef, identityVal, alloc, NACCELERATORS, 0) { } ff_reduceOCL_1D(const T &task, const std::string &kernels_source,const std::string &reducef_name, const Tout identityVal = Tout(), ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(task, kernels_source, "", reducef_name, identityVal, alloc, NACCELERATORS, 0) { } bool isPureReduce() const { return true; } }; /* * \class f_mapReduceOCL_1D * \ingroup high_level_patterns * * \brief The mapReduce skeleton. * * The mapReuce skeleton using OpenCL * * This class is defined in \ref map.hpp * */ template class ff_mapReduceOCL_1D: public ff_stencilReduceLoopOCL_1D { public: typedef typename TOCL::Tin Tin; typedef typename TOCL::Tout Tout; ff_mapReduceOCL_1D(std::string mapf, std::string reducef, const Tout &identityVal = Tout(), ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(mapf, reducef, identityVal, alloc, NACCELERATORS, 0) { } ff_mapReduceOCL_1D(const std::string &kernels_code, const std::string &mapf_name, const std::string &reducef_name, const Tout &identityVal = Tout(), ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(kernels_code, mapf_name, reducef_name, identityVal, alloc, NACCELERATORS, 0) { } ff_mapReduceOCL_1D(const T &task, std::string mapf, std::string reducef, const Tout &identityVal = Tout(), ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(task, mapf, reducef, identityVal, alloc, NACCELERATORS, 0) { } ff_mapReduceOCL_1D(const T &task, const std::string &kernels_code, const std::string &mapf_name, const std::string &reducef_name, const Tout &identityVal = Tout(), ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_1D(task, kernels_code, mapf_name, reducef_name, identityVal, alloc, NACCELERATORS, 0) { } }; /*** 2D ***/ /** * a task to be executed by a 2D stencilReduceLoop node. * This class represent a computation to be performed on a * logical 2D matrix, stored in host memory as 1D row-major array. */ template class baseOCLTask_2D: public baseOCLTask { public: /** * set the number of rows of the logical 2D input. * To be called from setTask. * * @param h is the number of rows */ void setHeight(size_t h) { height = h; } /** * set the number of columns of the logical 2D input. * To be called from setTask. * * @param h is the number of columns */ void setWidth(size_t w) { width = w; } //runtime getter functions size_t getHeight() const { return height;} size_t getWidth() const {return width;} protected: size_t height, width; }; /** * a virtual OpenCL accelerator for 2D map kernels working on * logical 2D matrices stored as row-major arrays. */ template class ff_oclAccelerator_2D : public ff_oclAccelerator { public: typedef typename TOCL::Tin Tin; typedef typename TOCL::Tout Tout; ff_oclAccelerator_2D(ff_oclallocator *alloc, const size_t halo_width_, const Tout &identityVal, const bool from_source=false) : ff_oclAccelerator(alloc, halo_width_, identityVal, from_source) { heightInput_global = 0; widthInput_global = 0; } void setMapKernelArgs(const size_t envSize) { cl_uint idx = 0; //set iteration-dynamic MAP kernel args (init) cl_int status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_mem), &this->inputBuffer); checkResult(status, "setKernelArg input"); status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_mem), &this->outputBuffer); checkResult(status, "setKernelArg output"); //set iteration-invariant MAP kernel args status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &heightInput_global); checkResult(status, "setKernelArg global input height"); status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &widthInput_global); checkResult(status, "setKernelArg global input width"); status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &this->lenOutput); checkResult(status, "setKernelArg local input length"); status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &this->offset1_in); checkResult(status, "setKernelArg offset"); status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &this->halo_out_left); checkResult(status, "setKernelArg halo"); for(size_t k=0; k < envSize; ++k) { status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_mem), &(this->envBuffer[k].first)); checkResult(status, "setKernelArg env"); } } /** * set the global number of rows of the logical 2D input. */ void setHeight(size_t h) { heightInput_global = h; } void setWidth(size_t w) { widthInput_global = w; } private: size_t heightInput_global, widthInput_global; }; /** * a stencilReduceLoop for executing 2D OCL tasks. */ template class ff_stencilReduceLoopOCL_2D: public ff_stencilReduceLoopOCL_1D > { private: typedef typename TOCL::Tin Tin; typedef typename TOCL::Tout Tout; typedef ff_oclAccelerator_2D accelerator_t; typedef ff::ff_stencilReduceLoopOCL_1D base_srl_t; public: // build the program from the mapf and reducef functions ff_stencilReduceLoopOCL_2D(const std::string &mapf, //OpenCL elemental function const std::string &reducef = std::string(""), //OpenCL combinator function const Tout &identityVal = Tout(), ff_oclallocator *allocator = nullptr, const size_t NACCELERATORS = 1, const int stencil_width_half_ = 1, const int stencil_height_half_ = 1) : //init srl 1D with no halo (1D halo depends on 2D width) base_srl_t(mapf, reducef, identityVal, allocator, NACCELERATORS, 0), stencil_width_half(stencil_width_half_), stencil_height_half(stencil_height_half_) {} // build the program from source code file, // first attempts to load a cached binary file (kernels_source in this case is the path to the binary file) // ff that file is not available, then it creates the program from source and store the binary for future use // with the extention ".bin" ff_stencilReduceLoopOCL_2D(const std::string &kernels_source, // OpenCL source code path const std::string &mapf_name, // name of the map function const std::string &reducef_name, // name of the reduce function const Tout &identityVal = Tout(), ff_oclallocator *allocator = nullptr, const size_t NACCELERATORS = 1, const int stencil_width_half_ = 1, const int stencil_height_half_ = 1) : //init srl 1D with no halo (1D halo depends on 2D width) base_srl_t(kernels_source, mapf_name, reducef_name, identityVal, allocator, NACCELERATORS, 0), stencil_width_half(stencil_width_half_), stencil_height_half(stencil_height_half_) {} // the task is provided in the constructor -- one shot computation ff_stencilReduceLoopOCL_2D(const T &task, const std::string &mapf, const std::string &reducef = std::string(""), const Tout &identityVal = Tout(), ff_oclallocator *allocator = nullptr, const size_t NACCELERATORS = 1, const int stencil_width_half_ = 1, const int stencil_height_half_ = 1) : //init srl 1D with no halo (1D halo depends on 2D width) base_srl_t(task, mapf, reducef, identityVal, allocator, NACCELERATORS, 0), stencil_width_half(stencil_width_half_),stencil_height_half(stencil_height_half_) {} // the task is provided in the constructor -- one shot computation ff_stencilReduceLoopOCL_2D(const T &task, const std::string &kernels_source, // OpenCL source code path const std::string &mapf_name, // name of the map kernel function const std::string &reducef_name, // name of the reduce kernel function const Tout &identityVal = Tout(), ff_oclallocator *allocator = nullptr, const size_t NACCELERATORS = 1, const int stencil_width_half_ = 1, const int stencil_height_half_ = 1) : //init srl 1D with no halo (1D halo depends on 2D width) base_srl_t(task, kernels_source, mapf_name, reducef_name, identityVal, allocator, NACCELERATORS, 0), stencil_width_half(stencil_width_half_), stencil_height_half(stencil_height_half_) {} protected: void adjustInputBufferOffset(const bool memorychange) { for (size_t i = 0; i < this->accelerators.size(); ++i) { this->accelerators[i]->setWidth(this->Task.getWidth()); this->accelerators[i]->setHeight(this->Task.getHeight()); this->accelerators[i]->setHaloHalf(halo_half(this->Task.getWidth())); } //accelerator uses above values to compute actual sizing base_srl_t::adjustInputBufferOffset(memorychange); } private: const size_t halo_half(const size_t width) { return stencil_height_half * width + stencil_width_half; } /* * 2D stencil can access elements in H x W rectangle around each element, where: * H = 2 * stencil_height_half + 1 * W = 2 * stencil_width_half + 1 */ const int stencil_width_half; const int stencil_height_half; }; /*! * \class ff_mapOCL_1D * \ingroup high_level_patterns * * \brief The OpenCL-based Map pattern in 1 dimension * * This class is defined in \ref stencilReduceOCL.hpp * */ template class ff_mapOCL_2D: public ff_stencilReduceLoopOCL_2D { public: ff_mapOCL_2D(std::string mapf, ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_2D(mapf, "", 0, alloc, NACCELERATORS) { } ff_mapOCL_2D(const std::string &kernels_source, const std::string &mapf_name, ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_2D(kernels_source, mapf_name, "", 0, alloc, NACCELERATORS) { } ff_mapOCL_2D(const T &task, std::string mapf, ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_2D(task, mapf, "", 0, alloc, NACCELERATORS) { } ff_mapOCL_2D(const T &task, const std::string &kernels_source, const std::string &mapf_name, ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) : ff_stencilReduceLoopOCL_2D(task, kernels_source, mapf_name, "", 0, alloc, NACCELERATORS) { } bool isPureMap() const { return true; } }; }// namespace ff #endif // FF_OCL #endif /* FF_MAP_OCL_HPP */