/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*!
* \file stencilReduceOCL.hpp
* \ingroup high_level_patterns
*
* \brief StencilReduceLoop data-parallel pattern and derived data-parallel patterns
*
*/
/* ***************************************************************************
*
* FastFlow is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License version 3 as
* published by the Free Software Foundation.
* Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
* or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
****************************************************************************
*/
/*
* Authors:
* Maurizio Drocco
* Massimo Torquati
* Marco Aldinucci
*
*/
#ifndef FF_STENCILREDUCE_OCL_HPP
#define FF_STENCILREDUCE_OCL_HPP
#ifdef FF_OPENCL
#include <string>
#include <fstream>
#include <tuple>
#include <algorithm>
#include <ff/bitflags.hpp>
#include <ff/oclnode.hpp>
#include <ff/node.hpp>
#include <ff/oclallocator.hpp>
#include <ff/stencilReduceOCL_macros.hpp>
namespace ff {
enum reduceMode { REDUCE_INPUT, REDUCE_OUTPUT };
/**
* a task to be executed by a 1D stencilReduceLoop node.
*/
template<typename TaskT_, typename Tin_, typename Tout_ = Tin_>
class baseOCLTask {
public:
typedef TaskT_ TaskT;
typedef Tin_ Tin;
typedef Tout_ Tout;
baseOCLTask(): inPtr(NULL),outPtr(NULL),reduceVar(NULL),
size_in(0),size_out(0),iter(0),
tuple_in(std::make_tuple(true,false,false)),
tuple_out(std::make_tuple(true,false,false)) { }
virtual ~baseOCLTask() { }
// user must override this method using:
// - setInPtr for setting the host-pointer to the input array
// - setOutPtr for setting the host-pointer to the output array
// - setEnvPtr for adding to env-list the host-pointer to a read-only env
// - other methods from classes derived from baseOCLTask
// NOTE: order of setEnvPtr calls matters! TODO refine interface?
virtual void setTask(TaskT *t) = 0;
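// A minimal sketch of a setTask override (illustrative only; 'myTask' and
// its fields are hypothetical):
//
//   struct oclTask: public baseOCLTask<myTask, float> {
//       void setTask(myTask *t) {
//           setInPtr(t->A, t->size);        // host input array
//           setOutPtr(t->B, t->size);       // host output array
//           setEnvPtr(t->coeff, t->ncoeff); // read-only environment
//       }
//   };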
/* --- the user may override these methods --- */
// called at the end of the stencil-reduce loop. It may be used to
// perform per-task host memory cleanup (i.e. releasing the host memory
// previously allocated in the setTask function) or to execute a
// post-elaboration phase
virtual void releaseTask(TaskT *t) {}
// computes the loop iteration condition
virtual bool iterCondition(const Tout&, size_t) { return false; }
// host reduce function
virtual Tout combinator(const Tout&, const Tout&) { return Tout(); }
// step functions
virtual void incIter() { ++iter; }
virtual size_t getIter() const { return iter; }
virtual void resetIter(const size_t val=0) { iter = val; }
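// A sketch of the loop-control overrides for an iterative stencilReduceLoop
// (illustrative; assumes a floating-point sum-reduction with a residual test):
//
//   Tout combinator(const Tout &a, const Tout &b) { return a + b; }
//   bool iterCondition(const Tout &red, size_t i) { return red > 1e-6 && i < 100; }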
/* -------------------------------------------- */
void resetTask() {
envPtr.resize(0);
copyEnv.resize(0);
}
/**
* set the host-pointer to the input array.
*
* @param _inPtr the host-pointer
* @param sizeIn the number of elements in the input array
* @param copy whether the input array must be copied to device memory
* @param reuse whether a device buffer previously bound to the same host pointer can be reused
* @param release whether the device buffer must be released at the end of the computation
*/
void setInPtr(Tin* _inPtr, size_t sizeIn,
const CopyFlags copy =CopyFlags::COPY,
const ReuseFlags reuse =ReuseFlags::DONTREUSE,
const ReleaseFlags release=ReleaseFlags::DONTRELEASE) {
inPtr = _inPtr; size_in = sizeIn;
tuple_in = std::make_tuple(copy==CopyFlags::COPY,
reuse==ReuseFlags::REUSE,
release==ReleaseFlags::RELEASE);
}
/**
* set the host-pointer to the input array.
*
* @see setInPtr()
*/
void setInPtr(Tin* _inPtr, size_t sizeIn, const MemoryFlags &flags) {
inPtr = _inPtr; size_in = sizeIn;
tuple_in = std::make_tuple(flags.copy==CopyFlags::COPY,
flags.reuse==ReuseFlags::REUSE,
flags.release==ReleaseFlags::RELEASE);
}
/**
* set the host-pointer to the output array.
*
* @see setInPtr()
*/
void setOutPtr(Tout* _outPtr, size_t sizeOut,
const CopyFlags copyback =CopyFlags::COPY,
const ReuseFlags reuse =ReuseFlags::DONTREUSE,
const ReleaseFlags release =ReleaseFlags::DONTRELEASE) {
outPtr = _outPtr; size_out = sizeOut;
tuple_out = std::make_tuple(copyback==CopyFlags::COPY,
reuse==ReuseFlags::REUSE,
release==ReleaseFlags::RELEASE);
}
/**
* set the host-pointer to the output array.
*
* @see setInPtr()
*/
void setOutPtr(Tout* _outPtr, size_t sizeOut, const MemoryFlags &flags ) {
outPtr = _outPtr; size_out = sizeOut;
tuple_out = std::make_tuple(flags.copy==CopyFlags::COPY,
flags.reuse==ReuseFlags::REUSE,
flags.release==ReleaseFlags::RELEASE);
}
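// A sketch of the MemoryFlags variant (illustrative; assumes the DONTCOPY
// enumerator from ff/bitflags.hpp): keep the device buffer alive across
// tasks and skip the host-to-device copy:
//
//   MemoryFlags mf;
//   mf.copy    = CopyFlags::DONTCOPY;
//   mf.reuse   = ReuseFlags::REUSE;
//   mf.release = ReleaseFlags::DONTRELEASE;
//   task.setInPtr(ptr, n, mf);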
/**
* add to env-list the host-pointer to a read-only env.
*
* @see setInPtr()
*/
template<typename ptrT>
void setEnvPtr(const ptrT* _envPtr, size_t size,
const CopyFlags copy =CopyFlags::COPY,
const ReuseFlags reuse =ReuseFlags::DONTREUSE,
const ReleaseFlags release=ReleaseFlags::DONTRELEASE) {
assert(envPtr.size() == copyEnv.size());
envPtr.push_back(std::make_pair((void*)_envPtr,size*sizeof(ptrT)));
copyEnv.push_back(std::make_tuple(sizeof(ptrT),
copy==CopyFlags::COPY,
reuse==ReuseFlags::REUSE,
release==ReleaseFlags::RELEASE));
}
/**
* add to env-list the host-pointer to a read-only env.
*
* @see setInPtr()
*/
template<typename ptrT>
void setEnvPtr(const ptrT* _envPtr, size_t size, const MemoryFlags &flags) {
assert(envPtr.size() == copyEnv.size());
envPtr.push_back(std::make_pair((void*)_envPtr,size*sizeof(ptrT)));
copyEnv.push_back(std::make_tuple(sizeof(ptrT),
flags.copy==CopyFlags::COPY,
flags.reuse==ReuseFlags::REUSE,
flags.release==ReleaseFlags::RELEASE));
}
Tin * getInPtr() const { return inPtr; }
Tout * getOutPtr() const { return outPtr; }
template<typename ptrT>
void getEnvPtr(const size_t idx, ptrT *& ptr) const {
assert(idx < envPtr.size());
ptr = reinterpret_cast<ptrT*>(envPtr[idx].first);
}
size_t getEnvNum() const {
assert(envPtr.size() == copyEnv.size());
return envPtr.size();
}
bool getCopyEnv(const size_t idx) const {
assert(idx < copyEnv.size());
return std::get<1>(copyEnv[idx]);
}
bool getReuseEnv(const size_t idx) const {
assert(idx < copyEnv.size());
return std::get<2>(copyEnv[idx]);
}
bool getReleaseEnv(const size_t idx) const {
assert(idx < copyEnv.size());
return std::get<3>(copyEnv[idx]);
}
bool getCopyIn() const { return std::get<0>(tuple_in); }
bool getReuseIn() const { return std::get<1>(tuple_in); }
bool getReleaseIn() const { return std::get<2>(tuple_in); }
bool getCopyOut() const { return std::get<0>(tuple_out); }
bool getReuseOut() const { return std::get<1>(tuple_out); }
bool getReleaseOut() const { return std::get<2>(tuple_out); }
size_t getSizeIn() const { return size_in; }
size_t getSizeOut() const { return (size_out==0)?size_in:size_out; }
size_t getSizeEnv(const size_t idx) const {
assert(idx < copyEnv.size());
return std::get<0>(copyEnv[idx]);
}
size_t getBytesizeIn() const { return getSizeIn() * sizeof(Tin); }
size_t getBytesizeOut() const { return getSizeOut() * sizeof(Tout); }
size_t getBytesizeEnv(const size_t idx) const {
assert(idx < envPtr.size());
return envPtr[idx].second;
}
void setReduceVar(const Tout *r) { reduceVar = (Tout*)r; }
Tout *getReduceVar() const { return reduceVar; }
void writeReduceVar(const Tout &r) { *reduceVar = r; }
void setIdentityVal(const Tout &x) { identityVal = x;}
Tout getIdentityVal() const { return identityVal; }
bool iterCondition_aux() {
return iterCondition(*reduceVar, iter);
}
protected:
Tin *inPtr;
Tout *outPtr;
Tout *reduceVar, identityVal;
size_t size_in, size_out, iter;
std::tuple<bool,bool,bool> tuple_in;
std::tuple<bool,bool,bool> tuple_out;
std::vector<std::pair<void*,size_t> > envPtr; // pointer and byte-size
std::vector<std::tuple<size_t,bool,bool,bool> > copyEnv; // size and flags
};
/**
* a virtual OpenCL accelerator for 1D kernels.
*/
template<typename T, typename TOCL = T>
class ff_oclAccelerator {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
ff_oclAccelerator(ff_oclallocator *alloc, const size_t width_, const Tout &identityVal, const bool from_source=false) :
from_source(from_source), my_own_allocator(false), allocator(alloc), halo_half(width_), identityVal(identityVal), events_h2d(16), deviceId(NULL) {
wgsize_map_static = wgsize_reduce_static = 0;
wgsize_map_max = wgsize_reduce_max = 0;
inputBuffer = outputBuffer = reduceBuffer = NULL;
sizeInput = sizeInput_padded = 0;
lenInput = offset1_in = halo_in_left = halo_in_right = lenInput_global = 0;
sizeOutput = sizeOutput_padded = 0;
lenOutput = offset1_out = halo_out_left = halo_out_right = lenOutput_global = 0;
nevents_h2d = nevents_map = 0;
event_d2h = event_map = event_reduce1 = event_reduce2 = NULL;
wgsize_map = nthreads_map = 0;
wgsize_reduce = nthreads_reduce = nwg_reduce = wg_red_mem = 0;
reduceVar = identityVal;
kernel_map = kernel_reduce = kernel_init = NULL;
context = NULL;
program = NULL;
cmd_queue = NULL;
reduce_mode = REDUCE_OUTPUT;
if (!allocator) {
my_own_allocator = true;
allocator = new ff_oclallocator;
assert(allocator);
}
}
virtual ~ff_oclAccelerator() {
if (my_own_allocator) {
allocator->releaseAllBuffers(context);
delete allocator;
allocator = NULL;
my_own_allocator = false;
}
}
int init(cl_device_id dId, reduceMode m, const std::string &kernel_code, const std::string &kernel_name1,
const std::string &kernel_name2, const bool save_binary, const bool reuse_binary) {
#ifdef FF_OPENCL_LOG
fprintf(stderr, "initializing virtual accelerator @%p mapped to device:\n", this);
std::cerr << ff::clEnvironment::instance()->getDeviceInfo(dId) << std::endl;
#endif
reduce_mode = m;
//set OCL objects
deviceId = dId;
const oclParameter *param = clEnvironment::instance()->getParameter(deviceId);
assert(param);
context = param->context;
cmd_queue = param->commandQueue;
//build OCL kernels
cl_int status = buildKernels(kernel_code, kernel_name1,kernel_name2, from_source, save_binary, reuse_binary);
checkResult(status, "build kernels");
//compute static heuristics for kernel sizing
setSizingHeuristics();
return status == CL_SUCCESS;
}
void releaseAll() { if (deviceId) { svc_releaseOclObjects(); deviceId = NULL; }}
void releaseInput(const Tin *inPtr) {
if (allocator->releaseBuffer(inPtr, context, inputBuffer) != CL_SUCCESS)
checkResult(CL_INVALID_MEM_OBJECT, "releaseInput");
inputBuffer = NULL;
}
void releaseOutput(const Tout *outPtr) {
if (allocator->releaseBuffer(outPtr, context, outputBuffer) != CL_SUCCESS)
checkResult(CL_INVALID_MEM_OBJECT, "releaseOutput");
outputBuffer = NULL;
}
void releaseEnv(size_t idx, const void *envPtr) {
if (allocator->releaseBuffer(envPtr, context, envBuffer[idx].first) != CL_SUCCESS)
checkResult(CL_INVALID_MEM_OBJECT, "releaseEnv");
envBuffer[idx].first = NULL, envBuffer[idx].second = 0;
}
void swapBuffers() {
cl_mem tmp = inputBuffer;
inputBuffer = outputBuffer;
outputBuffer = tmp;
}
void setSizingHeuristics() {
cl_int status;
//get device-dependent max wg size
size_t max_device_wgsize;
status = clGetDeviceInfo(deviceId,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(max_device_wgsize),&max_device_wgsize, NULL);
checkResult(status, "clGetDeviceInfo (map)");
if(kernel_map) { //map kernel
//get kernel-dependent max wg size
size_t max_kernel_wgsize;
status = clGetKernelWorkGroupInfo(kernel_map, deviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_kernel_wgsize, 0);
checkResult(status, "GetKernelWorkGroupInfo (map)");
wgsize_map_max = std::min<size_t>(max_device_wgsize,max_kernel_wgsize);
//get size of the atomic scheduling unit (analogous to CUDA warp size)
//typical values are 16 or 32
size_t wg_multiple;
status = clGetKernelWorkGroupInfo(kernel_map, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wg_multiple, 0);
wgsize_map_static = std::max<size_t>(64, wg_multiple * 4); //64 or 128
wgsize_map_static = std::min<size_t>(wgsize_map_static,wgsize_map_max);
}
if(kernel_reduce) { //reduce kernel
//get kernel-dependent max wg size
size_t max_kernel_wgsize;
status = clGetKernelWorkGroupInfo(kernel_reduce, deviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_kernel_wgsize, 0);
checkResult(status, "GetKernelWorkGroupInfo (reduce)");
wgsize_reduce_max = std::min<size_t>(max_device_wgsize,max_kernel_wgsize);
//get size of the atomic scheduling unit (analogous to CUDA warp size)
//typical values are 16 or 32
size_t wg_multiple;
status = clGetKernelWorkGroupInfo(kernel_reduce, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wg_multiple, 0);
wgsize_reduce_static = std::max<size_t>(64, wg_multiple * 4); //64 or 128
wgsize_reduce_static = std::min<size_t>(wgsize_reduce_static,wgsize_reduce_max);
}
#ifdef FF_OPENCL_LOG
std::cerr << "[virtual accelerator @"<<this<<"]\n";
std::cerr << "+ static heuristics for kernel sizing parameters:\n";
std::cerr << "- MAP workgroup-size = " <<wgsize_map_static<< "\n";
std::cerr << "- MAP max workgroup-size = " <<wgsize_map_max<< "\n";
std::cerr << "- RED workgroup-size = " <<wgsize_reduce_static<< " \n";
std::cerr << "- RED max workgroup-size = " <<wgsize_reduce_max<< " \n";
#endif
}
//see comments for members
void adjustInputBufferOffset(const Tin *newPtr, const Tin *oldPtr, std::pair<size_t, size_t> &P, size_t len_global) {
offset1_in = P.first;
lenInput = P.second;
lenInput_global = len_global;
halo_in_left = (std::min)(halo_half, offset1_in);
halo_in_right = (std::min)(halo_half, lenInput_global - lenInput - offset1_in);
sizeInput = lenInput * sizeof(Tin);
sizeInput_padded = sizeInput + (halo_in_left + halo_in_right) * sizeof(Tin);
if (oldPtr != NULL) allocator->updateKey(oldPtr, newPtr, context);
}
void relocateInputBuffer(const Tin *inPtr, const bool reuseIn, const Tout *reducePtr) {
cl_int status;
if (reuseIn) {
inputBuffer = allocator->createBufferUnique(inPtr, context,
CL_MEM_READ_WRITE, sizeInput_padded, &status);
checkResult(status, "CreateBuffer(Unique) input");
} else {
if (inputBuffer) allocator->releaseBuffer(inPtr, context, inputBuffer);
//allocate input-size + pre/post-windows
inputBuffer = allocator->createBuffer(inPtr, context,
CL_MEM_READ_WRITE, sizeInput_padded, &status);
checkResult(status, "CreateBuffer input");
}
//set workgroup size and nthreads for map
// MA patch - map not defined => workgroup_size_map
//size_t wgsize = wgsize_map_static==0?wgsize_reduce_static:wgsize_map_static;
if(kernel_map) {
// if the output buffer is bigger than input buffer then we use the size of the
// output buffer to compute the number of threads
if (lenInput < lenOutput) {
wgsize_map = std::min<size_t>(lenOutput, wgsize_map_static);
nthreads_map = wgsize_map * ((lenOutput + wgsize_map - 1) / wgsize_map); //round up
}
else {
wgsize_map = std::min<size_t>(lenInput, wgsize_map_static);
nthreads_map = wgsize_map * ((lenInput + wgsize_map - 1) / wgsize_map); //round up
}
#ifdef FF_OPENCL_LOG
std::cerr << "[virtual accelerator @"<<this<<"]\n";
std::cerr << "+ computed MAP kernel sizing parameters:\n";
std::cerr << "- MAP workgroup-size = " << wgsize_map << "\n";
std::cerr << "- MAP n. threads = " << nthreads_map << " \n";
#endif
}
//set workgroup size and nthreads for reduce
if (kernel_reduce && reduce_mode == REDUCE_INPUT) {
resetReduce(lenInput, sizeof(Tin), (void *)reducePtr);
}
}
void adjustOutputBufferOffset(const Tout *newPtr, const Tout *oldPtr, std::pair<size_t, size_t> &P, size_t len_global) {
offset1_out = P.first;
lenOutput = P.second;
lenOutput_global = len_global;
halo_out_left = (std::min)(halo_half, offset1_out);
halo_out_right = (std::min)(halo_half, lenOutput_global - lenOutput - offset1_out);
sizeOutput = lenOutput * sizeof(Tout);
sizeOutput_padded = sizeOutput + (halo_out_left + halo_out_right) * sizeof(Tout);
if (oldPtr != NULL) allocator->updateKey(oldPtr, newPtr, context);
}
void relocateOutputBuffer(const Tout *outPtr, const Tout *reducePtr) {
cl_int status;
if (outputBuffer) allocator->releaseBuffer(outPtr, context, outputBuffer);
outputBuffer = allocator->createBuffer(outPtr, context,
CL_MEM_READ_WRITE, sizeOutput_padded,&status);
checkResult(status, "CreateBuffer output");
//set workgroup size and nthreads for reduce
if (kernel_reduce && reduce_mode == REDUCE_OUTPUT) {
resetReduce(lenOutput, sizeof(Tout), (void *)reducePtr);
}
}
void relocateEnvBuffer(const void *envptr, const bool reuseEnv, const size_t idx, const size_t envbytesize) {
cl_int status = CL_SUCCESS;
if (idx >= envBuffer.size()) {
cl_mem envb;
if (reuseEnv)
envb = allocator->createBufferUnique(envptr, context,
CL_MEM_READ_WRITE, envbytesize, &status);
else
envb = allocator->createBuffer(envptr, context,
CL_MEM_READ_WRITE, envbytesize, &status);
if (checkResult(status, "CreateBuffer envBuffer"))
envBuffer.push_back(std::make_pair(envb,envbytesize));
} else {
if (reuseEnv) {
envBuffer[idx].first = allocator->createBufferUnique(envptr, context,
CL_MEM_READ_WRITE, envbytesize, &status);
if (checkResult(status, "CreateBuffer envBuffer"))
envBuffer[idx].second = envbytesize;
} else {
if (envBuffer[idx].second < envbytesize) {
if (envBuffer[idx].first) allocator->releaseBuffer(envptr, context, envBuffer[idx].first);
envBuffer[idx].first = allocator->createBuffer(envptr, context,
CL_MEM_READ_WRITE, envbytesize, &status);
if (checkResult(status, "CreateBuffer envBuffer"))
envBuffer[idx].second = envbytesize;
}
}
}
}
void setInPlace(Tout *reducePtr) {
outputBuffer = inputBuffer;
lenOutput = lenInput;
lenOutput_global = lenInput_global;
halo_out_left = halo_in_left;
halo_out_right = halo_in_right;
offset1_out = offset1_in;
sizeOutput = sizeInput;
sizeOutput_padded = sizeInput_padded;
//set workgroup size and nthreads for reduce
if (kernel_reduce && reduce_mode == REDUCE_OUTPUT) {
resetReduce(lenOutput, sizeof(Tout), reducePtr);
}
}
void swap() {
cl_mem tmp = inputBuffer;
inputBuffer = outputBuffer;
outputBuffer = tmp;
//set iteration-dynamic MAP kernel args
cl_int status = clSetKernelArg(kernel_map, 0, sizeof(cl_mem), &inputBuffer);
checkResult(status, "setKernelArg input");
status = clSetKernelArg(kernel_map, 1, sizeof(cl_mem), &outputBuffer);
checkResult(status, "setKernelArg output");
}
virtual void setMapKernelArgs(const size_t envSize) {
cl_uint idx = 0;
//set iteration-dynamic MAP kernel args (init)
cl_int status = clSetKernelArg(kernel_map, idx++, sizeof(cl_mem), &inputBuffer);
checkResult(status, "setKernelArg input");
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_mem), &outputBuffer);
checkResult(status, "setKernelArg output");
//set iteration-invariant MAP kernel args
//(the kernel expects cl_uint args, while the host-side variables are size_t)
cl_uint len_global = (cl_uint)lenInput_global;
cl_uint len_local  = (cl_uint)lenOutput;
cl_uint offset     = (cl_uint)offset1_in;
cl_uint halo       = (cl_uint)halo_out_left;
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &len_global);
checkResult(status, "setKernelArg global input length");
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &len_local);
checkResult(status, "setKernelArg local input length");
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &offset);
checkResult(status, "setKernelArg offset");
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_uint), (void *) &halo); // CHECK !!!
checkResult(status, "setKernelArg pad");
for(size_t k=0; k < envSize; ++k) {
status = clSetKernelArg(kernel_map, idx++, sizeof(cl_mem), &envBuffer[k].first);
checkResult(status, "setKernelArg env");
}
}
size_t asyncH2Dinput(Tin *p) {
if (nevents_h2d >= events_h2d.size()) events_h2d.resize(nevents_h2d + 1); //resize (not reserve): operator[] below needs a valid element
p += offset1_in - halo_in_left;
cl_int status = clEnqueueWriteBuffer(cmd_queue, inputBuffer, CL_FALSE, 0,
sizeInput_padded, p, 0, NULL, &events_h2d[nevents_h2d++]);
checkResult(status, "copying Task to device input-buffer");
return sizeInput_padded;
}
size_t asyncH2Denv(const size_t idx, char *p) {
if (nevents_h2d >= events_h2d.size()) events_h2d.resize(nevents_h2d + 1); //resize (not reserve): operator[] below needs a valid element
cl_int status = clEnqueueWriteBuffer(cmd_queue, envBuffer[idx].first, CL_FALSE, 0,
envBuffer[idx].second, p, 0, NULL, &events_h2d[nevents_h2d++]);
checkResult(status, "copying Task to device env-buffer");
return envBuffer[idx].second;
}
size_t asyncH2Dborders(Tout *p) {
size_t bytes = 0;
if (halo_out_left) {
cl_int status = clEnqueueWriteBuffer(cmd_queue, outputBuffer, CL_FALSE, 0,
halo_out_left * sizeof(Tout), p + offset1_out - halo_out_left, 0, NULL,
&events_h2d[nevents_h2d++]);
checkResult(status, "copying left border to device");
bytes += halo_out_left * sizeof(Tout);
}
if (halo_out_right) {
cl_int status = clEnqueueWriteBuffer(cmd_queue, outputBuffer, CL_FALSE,
(halo_out_left + lenOutput) * sizeof(Tout), halo_out_right * sizeof(Tout), // NOTE: in a loop Tin == Tout !!
p + offset1_out + lenOutput, 0, NULL, &events_h2d[nevents_h2d++]);
checkResult(status, "copying right border to device");
bytes += halo_out_right * sizeof(Tout);
}
return bytes; //both borders must be synced: no early return after the left one
}
size_t asyncD2Houtput(Tout *p) {
cl_int status = clEnqueueReadBuffer(cmd_queue, outputBuffer, CL_FALSE,
halo_out_left * sizeof(Tout), sizeOutput, p + offset1_out, 0, NULL, &event_d2h);
checkResult(status, "copying output back from device");
return sizeOutput;
}
size_t asyncD2Hborders(Tout *p) {
cl_int status = clEnqueueReadBuffer(cmd_queue, outputBuffer, CL_FALSE,
halo_out_left * sizeof(Tout), halo_half * sizeof(Tout), p + offset1_out, 0, NULL,
&events_h2d[0]);
checkResult(status, "copying border1 back from device");
++nevents_h2d;
status = clEnqueueReadBuffer(cmd_queue, outputBuffer, CL_FALSE,
(halo_out_left + lenOutput - halo_half) * sizeof(Tout), halo_half * sizeof(Tout),
p + offset1_out + lenOutput - halo_half, 0, NULL, &event_d2h);
checkResult(status, "copying border2 back from device");
return halo_half * sizeof(Tout);
}
//void initReduce(const Tout &initReduceVal, reduceMode m = REDUCE_OUTPUT) {
void initReduce() {
//set kernel args for reduce1
int idx = 0;
cl_mem tmp = (reduce_mode == REDUCE_OUTPUT) ? outputBuffer : inputBuffer;
cl_uint len  = (reduce_mode == REDUCE_OUTPUT) ? lenOutput : lenInput;
cl_uint halo = (cl_uint)halo_in_left; //the kernel expects a cl_uint, halo_in_left is a size_t
cl_int status = clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &tmp);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint), &halo);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &reduceBuffer);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint), (void *) &len);
status |= clSetKernelArg(kernel_reduce, idx++, wg_red_mem, NULL);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(Tout), (void *) &identityVal);
checkResult(status, "setKernelArg reduce-1");
}
void asyncExecMapKernel() {
//execute MAP kernel
cl_int status = clEnqueueNDRangeKernel(cmd_queue, kernel_map, 1, NULL,
&nthreads_map, &wgsize_map, 0, NULL, &event_map);
checkResult(status, "executing map kernel");
//std::cerr << "Exec map WI " << globalThreadsMap << " localThreadsMap " << localThreadsMap << "\n";
++nevents_map;
}
void asyncExecReduceKernel1() {
//std::cerr << "Exec reduce1 WI " << globalThreadsReduce << " localThreadsReduce " << localThreadsReduce << "\n";
cl_int status = clEnqueueNDRangeKernel(cmd_queue, kernel_reduce, 1, NULL,
&nthreads_reduce, &wgsize_reduce, nevents_map,
(nevents_map==0)?NULL:&event_map,
&event_reduce1);
checkResult(status, "exec kernel reduce-1");
nevents_map = 0;
}
void asyncExecReduceKernel2() {
cl_uint zeropad = 0;
cl_uint nwg = (cl_uint)nwg_reduce; //the kernel expects a cl_uint, nwg_reduce is a size_t
int idx = 0;
cl_int status = clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &reduceBuffer);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint), &zeropad);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_mem), &reduceBuffer);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(cl_uint), (void*) &nwg);
status |= clSetKernelArg(kernel_reduce, idx++, wg_red_mem, NULL);
status |= clSetKernelArg(kernel_reduce, idx++, sizeof(Tout), (void *) &identityVal);
checkResult(status, "setKernelArg reduce-2");
//std::cerr << "Exec reduce2 WI " << globalThreadsReduce << " localThreadsReduce " << localThreadsReduce << "\n";
status = clEnqueueNDRangeKernel(cmd_queue, kernel_reduce, 1, NULL,
&wgsize_reduce, &wgsize_reduce, 1, &event_reduce1,
&event_reduce2);
checkResult(status, "exec kernel reduce-2");
}
Tout getReduceVar() {
cl_int status = clEnqueueReadBuffer(cmd_queue, reduceBuffer, CL_TRUE, 0,
sizeof(Tout), &reduceVar, 0, NULL, NULL);
checkResult(status, "d2h reduceVar");
return reduceVar;
}
void waitforh2d() {
if (nevents_h2d>0) {
cl_int status = clWaitForEvents(nevents_h2d, events_h2d.data());
checkResult(status, "h2d wait for");
nevents_h2d = 0;
}
}
void waitford2h() {
cl_int status = clWaitForEvents(1, &event_d2h);
checkResult(status, "d2h wait for");
}
void waitforreduce() {
cl_int status = clWaitForEvents(1, &event_reduce2);
checkResult(status, "wait for reduce");
}
void waitformap() {
cl_int status = clWaitForEvents(nevents_map, &event_map);
nevents_map = 0;
checkResult(status, "wait for map");
}
/**
* dynamically sets half-size of the 1D halo
*/
void setHaloHalf(const size_t h) {
halo_half = h;
}
private:
int buildProgram(cl_device_id dId) {
cl_int status = clBuildProgram(program, 1, &dId, /*"-cl-fast-relaxed-math"*/NULL, NULL,NULL);
checkResult(status, "building program");
// DEBUGGING CODE for checking OCL compilation errors
if (status != CL_SUCCESS) {
printf("\nFail to build the program\n");
size_t len;
clGetProgramBuildInfo(program, dId, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
printf("LOG len %ld\n", len);
char *buffer = (char*) calloc(len, sizeof(char));
assert(buffer);
clGetProgramBuildInfo(program, dId, CL_PROGRAM_BUILD_LOG, len * sizeof(char),
buffer, NULL);
printf("LOG: %s\n\n", buffer);
return -1;
}
return 0;
}
int buildKernelCode(const std::string &kc, cl_device_id dId) {
cl_int status;
size_t sourceSize = kc.length();
const char* code = kc.c_str();
#ifdef FF_OPENCL_LOG
printf("/* ------------------------------------------------------- */\n");
printf("buildKernelCode:\n%s\n", code);
printf("/* ------------------------------------------------------- */\n");
#endif
program = clCreateProgramWithSource(context, 1, &code, &sourceSize, &status);
if (!program) {
checkResult(status, "creating program with source");
return -1;
}
return buildProgram(dId);
}
// create the program with the binary file or from the source code
int createProgram(const std::string &filepath, cl_device_id dId, const bool save_binary, const bool reuse_binary) {
cl_int status, binaryStatus;
bool binary = false;
const std::string binpath = filepath + ".bin";
std::ifstream ifs;
if (reuse_binary) {
ifs.open(binpath, std::ios::binary );
if (!ifs.is_open()) { // try with filepath
ifs.open(filepath, std::ios::binary);
if (!ifs.is_open()) {
error("createProgram: cannot open %s (nor %s)\n", filepath.c_str(), binpath.c_str());
return -1;
}
} else binary = true;
} else {
ifs.open(filepath, std::ios::binary);
if (!ifs.is_open()) {
error("createProgram: cannot open source file %s\n", filepath.c_str());
return -1;
}
}
std::vector<char> buf((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
ifs.close();
size_t bufsize = buf.size();
const char *bufptr = buf.data();
status = CL_INVALID_BINARY;
if (binary) {
program = clCreateProgramWithBinary(context, 1, &dId, &bufsize,
reinterpret_cast<const unsigned char **>(&bufptr),
&binaryStatus, &status);
}
if (status != CL_SUCCESS) { // maybe it is not a binary file
program = clCreateProgramWithSource(context, 1,&bufptr, &bufsize, &status);
if (!program) {
checkResult(status, "creating program with source");
return -1;
}
if (buildProgram(dId)<0) return -1;
if (save_binary) { // TODO: the logical deviceId has to be attached to the file name !
size_t programBinarySize;
status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
sizeof(size_t) * 1,
&programBinarySize, NULL);
checkResult(status, "createProgram clGetProgramInfo (binary size)");
std::vector<char> binbuf(programBinarySize);
const char *binbufptr = binbuf.data();
status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char*) * 1,
&binbufptr, NULL);
checkResult(status, "createProgram clGetProgramInfo (binary data)");
std::ofstream ofs(binpath, std::ios::out | std::ios::binary);
ofs.write(binbuf.data(), binbuf.size());
ofs.close();
}
return 0;
}
return buildProgram(dId);
}
cl_int buildKernels(const std::string &kernel_code,
const std::string &kernel_name1, const std::string &kernel_name2,
const bool from_source, const bool save_binary, const bool reuse_binary) {
cl_int status_ = CL_SUCCESS;
if (!from_source) { //compile kernel on device
if (buildKernelCode(kernel_code, deviceId)<0) return -1;
} else { // kernel_code is the path to the (binary?) source
if (createProgram(kernel_code, deviceId, save_binary, reuse_binary)<0) return -1;
}
//create kernel objects
if (kernel_name1 != "") { //map kernel
cl_int status;
kernel_map = clCreateKernel(program, kernel_name1.c_str(), &status);
checkResult(status, "CreateKernel (map)");
status_ |= status;
}
if (kernel_name2 != "") { //reduce kernel
cl_int status;
kernel_reduce = clCreateKernel(program, kernel_name2.c_str(), &status);
checkResult(status, "CreateKernel (reduce)");
status_ |= status;
}
return status_;
}
void svc_releaseOclObjects() {
if (kernel_map) clReleaseKernel(kernel_map);
if (kernel_reduce) clReleaseKernel(kernel_reduce);
clReleaseProgram(program);
// if (inputBuffer) clReleaseMemObject(inputBuffer);
// for(size_t i=0; i < envBuffer.size(); ++i)
// clReleaseMemObject(envBuffer[i].first);
// if (outputBuffer && outputBuffer != inputBuffer)
// clReleaseMemObject(outputBuffer);
// if (reduceBuffer)
// clReleaseMemObject(reduceBuffer);
allocator->releaseAllBuffers(context);
}
void resetReduce(size_t lenReduceInput, size_t elem_size, const void *reducePtr) {
// 64 and 256 are the max number of blocks and threads we want to use
//getBlocksAndThreads(lenInput, 64, 256, nwg_reduce, wgsize_reduce);
//nthreads_reduce = nwg_reduce * wgsize_reduce;
nthreads_reduce = lenReduceInput;
if(!isPowerOf2(nthreads_reduce))
nthreads_reduce = nextPowerOf2(nthreads_reduce);
wgsize_reduce = std::min<size_t>(nthreads_reduce, wgsize_reduce_static);
nwg_reduce = nthreads_reduce / wgsize_reduce;
//compute size of per-workgroup working memory
wg_red_mem = (wgsize_reduce * elem_size)
+ (wgsize_reduce <= 32) * (wgsize_reduce * elem_size);
//compute size of global reduce working memory
size_t global_red_mem = nwg_reduce * elem_size;
//allocate global memory for storing intermediate per-workgroup reduce results
cl_int status;
if (reduceBuffer)
allocator->releaseBuffer(reducePtr, context, reduceBuffer);
reduceBuffer = allocator->createBuffer(reducePtr, context,
CL_MEM_READ_WRITE, global_red_mem, &status);
checkResult(status, "CreateBuffer reduce");
#ifdef FF_OPENCL_LOG
std::cerr << "[virtual accelerator @"<<this<<"]\n";
std::cerr << "+ computed REDUCE kernel sizing parameters:\n";
std::cerr << "- REDUCE workgroup-size = " <<wgsize_reduce<< " \n";
std::cerr << "- REDUCE n. threads = " <<nthreads_reduce<< " \n";
std::cerr << "- REDUCE n. workgroups = " <<nwg_reduce<< " \n";
std::cerr << "- REDUCE per-wg memory = " <<wg_red_mem<< " \n";
#endif
}
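// Worked example (illustrative): lenReduceInput = 1000, elem_size = 4,
// wgsize_reduce_static = 128:
//   nthreads_reduce = nextPowerOf2(1000) = 1024
//   wgsize_reduce   = min(1024, 128)     = 128
//   nwg_reduce      = 1024 / 128         = 8
//   wg_red_mem      = 128 * 4            = 512 bytes (no doubling since wgsize > 32)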
/*!
* Computes the number of threads and blocks to use for the reduction kernel.
*/
inline void getBlocksAndThreads(const size_t size, const size_t maxBlocks,
const size_t maxThreads, size_t & blocks, size_t &threads) {
const size_t half = (size + 1) / 2;
threads =
(size < maxThreads * 2) ?
(isPowerOf2(half) ? nextPowerOf2(half + 1) : nextPowerOf2(half)) :
maxThreads;
blocks = (size + (threads * 2 - 1)) / (threads * 2);
blocks = std::min(maxBlocks, blocks);
}
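// Worked example (illustrative): size = 1000, maxBlocks = 64, maxThreads = 256:
//   size >= 2*maxThreads, so threads = 256
//   blocks = min(64, (1000 + 511) / 512) = 2
// i.e. each thread combines two elements in the first reduction pass.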
protected:
cl_context context;
cl_program program;
cl_command_queue cmd_queue;
cl_mem reduceBuffer;
cl_kernel kernel_map, kernel_reduce, kernel_init;
protected:
const bool from_source;
bool my_own_allocator;
ff_oclallocator *allocator;
size_t halo_half; //half-size of the 1D halo
const Tout identityVal;
Tout reduceVar;
cl_mem inputBuffer, outputBuffer;
std::vector<std::pair<cl_mem, size_t> > envBuffer;
/*
* each accelerator works on the following subset of the input array:
* [left-border][input-portion][right-border]
* the input portion is accessed RW, the borders are accessed read-only
*/
size_t sizeInput; //byte-size of the input-portion
size_t sizeInput_padded; //byte-size of the input-portion plus left and right borders
size_t lenInput; //n. elements in the input-portion
size_t offset1_in; //left-offset (begin input-portion wrt to begin input)
size_t halo_in_left; //n. elements in the left-halo
size_t halo_in_right; //n. elements in the right-halo
size_t lenInput_global; //n. elements in the input
size_t sizeOutput, sizeOutput_padded;
size_t lenOutput, offset1_out, halo_out_left, halo_out_right, lenOutput_global;
//static input-independent estimation of workgroup sizing
size_t wgsize_map_static, wgsize_reduce_static;
//static input-independent upper bounds for workgroup sizing
size_t wgsize_map_max, wgsize_reduce_max;
//input-dependent workgroup sizing
size_t wgsize_map, wgsize_reduce;
//input-dependent number of threads
size_t nthreads_map, nthreads_reduce;
//number of workgroups executing first on-device reduce
size_t nwg_reduce;
//reduce workgroup-local memory
size_t wg_red_mem;
//OCL events
std::vector<cl_event> events_h2d;
size_t nevents_h2d, nevents_map;
cl_event event_d2h, event_map, event_reduce1, event_reduce2;
//switch for the input of the reduce
reduceMode reduce_mode;
//the OCL Id the accelerator is mapped to
cl_device_id deviceId;
};
/*!
* \class ff_stencilReduceLoopOCL_1D
* \ingroup high_level_patterns
*
* \brief The OpenCL-based StencilReduceLoop pattern in 1 dimension
*
* This class is defined in \ref stencilReduceOCL.hpp
*/
template<typename T, typename TOCL = T, typename accelerator_t = ff_oclAccelerator<T, TOCL> >
class ff_stencilReduceLoopOCL_1D: public ff_oclNode_t<T> {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
// build the program from the mapf and reducef functions
ff_stencilReduceLoopOCL_1D(const std::string &mapf, //OpenCL elemental function
const std::string &reducef = std::string(""), //OpenCL combinator function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1, const int width = 1) :
oneshot(false), saveBinary(false), reuseBinary(false),
accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS),
stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL),
oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) {
setcode(mapf, reducef);
for(size_t i = 0; i< NACCELERATORS; ++i)
accelerators[i]= new accelerator_t(allocator, width,identityVal);
#ifdef FF_OPENCL_LOG
fprintf(stderr,"[ff_stencilReduceLoopOCL_1D node @%p]\n",this);
fprintf(stderr,"map-kernel code:\n%s\n", mapf.c_str());
fprintf(stderr,"reduce-kernel code:\n%s\n", reducef.c_str());
#endif
}
// build the program from a source code file:
// first it attempts to load a cached binary file (kernels_source in this case is the path to the binary file);
// if that file is not available, it creates the program from source and stores the binary
// for future use with the extension ".bin"
ff_stencilReduceLoopOCL_1D(const std::string &kernels_source, // OpenCL source code path
const std::string &mapf_name, // name of the map function
const std::string &reducef_name, // name of the reduce function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1, const int width = 1) :
oneshot(false), saveBinary(false), reuseBinary(false),
accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS),
stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL),
oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) {
setsourcecode(kernels_source, mapf_name, reducef_name);
for(size_t i = 0; i< NACCELERATORS; ++i)
accelerators[i]= new accelerator_t(allocator, width, identityVal, true);
}
// the task is provided in the constructor -- one shot computation
ff_stencilReduceLoopOCL_1D(const T &task,
const std::string &mapf,
const std::string &reducef = std::string(""),
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1, const int width = 1) :
oneshot(true), saveBinary(false), reuseBinary(false),
accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS),
stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL),
oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) {
ff_node::skipfirstpop(true);
setcode(mapf, reducef);
setTask(const_cast<T&>(task));
for(size_t i = 0; i< NACCELERATORS; ++i)
accelerators[i]= new accelerator_t(allocator, width,identityVal);
#ifdef FF_OPENCL_LOG
fprintf(stderr,"[ff_stencilReduceLoopOCL_1D node @%p]\n",this);
fprintf(stderr,"map-kernel code:\n%s\n", mapf.c_str());
fprintf(stderr,"reduce-kernel code:\n%s\n", reducef.c_str());
#endif
}
// the task is provided in the constructor -- one shot computation
ff_stencilReduceLoopOCL_1D(const T &task,
const std::string &kernels_source, // OpenCL source code path
const std::string &mapf_name, // name of the map kernel function
const std::string &reducef_name, // name of the reduce kernel function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1, const int width = 1) :
oneshot(true), saveBinary(false), reuseBinary(false),
accelerators(NACCELERATORS), acc_in(NACCELERATORS), acc_out(NACCELERATORS),
stencil_width_half(width), offset_dev(0), old_inPtr(NULL), old_outPtr(NULL),
oldBytesizeIn(0), oldSizeOut(0), oldSizeReduce(0) {
setsourcecode(kernels_source, mapf_name, reducef_name);
setTask(const_cast<T&>(task));
for(size_t i = 0; i< NACCELERATORS; ++i)
accelerators[i]= new accelerator_t(allocator, width, identityVal, true);
}
virtual ~ff_stencilReduceLoopOCL_1D() {
for(size_t i = 0; i< accelerators.size(); ++i)
if (accelerators[i]) delete accelerators[i];
}
// used to set the task when in oneshot mode
void setTask(T &task) {
Task.resetTask();
Task.setTask(&task);
}
/**
* explicitly set the OpenCL devices to be used
*
* @param dev is the vector of devices (OpenCL Ids) to be used
*/
void setDevices(std::vector<cl_device_id> &dev) {
// if (dev.size() > accelerators.size()) {
// error("ff_stencilReduceLoopOCL_1D::setDevices: Too many devices provided, please increase the number of logical accelerators\n");
// return -1;
// }
devices = dev;
// return 0;
}
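// A sketch of explicit device selection (illustrative; assumes at least two
// logical devices and as many accelerators; 'node' is a hypothetical instance):
//
//   std::vector<cl_device_id> devs;
//   devs.push_back(clEnvironment::instance()->getDevice(0));
//   devs.push_back(clEnvironment::instance()->getDevice(1));
//   node.setDevices(devs);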
// force execution on the CPU
void pickCPU () {
ff_oclNode_t<T>::setDeviceType(CL_DEVICE_TYPE_CPU);
}
// force execution on the GPU - as many as requested by the co-allocation strategy
void pickGPU (size_t offset=0 /* referred to global list of devices */) {
offset_dev=offset; //TODO check numbering
ff_oclNode_t<T>::setDeviceType(CL_DEVICE_TYPE_GPU);
}
// after the compilation and building phases, the OpenCL program will be saved as a binary file;
// this action takes effect only if the program is compiled from a source file (i.e. not built from macros)
void saveBinaryFile() { saveBinary = true; }
// tells the run-time to re-use the binary file available
void reuseBinaryFile() { reuseBinary = true; }
virtual int run(bool = false) {
return ff_node::run();
}
virtual int wait() {
return ff_node::wait();
}
virtual int run_and_wait_end() {
if (run() < 0)
return -1;
if (wait() < 0)
return -1;
return 0;
}
virtual int run_then_freeze() {
if (ff_node::isfrozen()) {
ff_node::thaw(true);
return 0;
}
return ff_node::freeze_and_run();
}
virtual int wait_freezing() {
return ff_node::wait_freezing();
}
const T* getTask() const {
return &Task;
}
unsigned int getIter() {
return Task.getIter();
}
Tout *getReduceVar() {
assert(oneshot);
return Task.getReduceVar();
}
/**
* Performs a static allocation of OpenCL devices.
* Priority is given to GPU devices, falling back to CPU devices
* if needed (e.g. no GPU devices are available).
* Currently it does not mix GPU and CPU devices.
*/
int nodeInit() {
if (ff_oclNode_t<T>::oclId < 0) { //check if already initialized
ff_oclNode_t<T>::oclId = clEnvironment::instance()->getOCLID();
if (devices.size() == 0) { // the user didn't set any specific device
switch (ff_oclNode_t<T>::getDeviceType()) {
case CL_DEVICE_TYPE_ALL:
case CL_DEVICE_TYPE_GPU: {
// Retrieve multiple logical GPU devices (non-exclusive mode)
std::vector<ssize_t> logdev =
clEnvironment::instance()->coAllocateGPUDeviceRR(accelerators.size(),
(offset_dev == 0) ? -1 : offset_dev);
if (logdev.size() == 0) {
//could not fulfill allocation request
if(ff_oclNode_t<T>::getDeviceType() == CL_DEVICE_TYPE_GPU) {
error("not enough GPUs found !\n");
return -1;
}
//if the user did not explicitly require GPU devices, fall back to CPU
} else {
//convert retrieved logical devices into opencl Ids
devices.clear();
for (size_t i = 0; i < logdev.size(); ++i)
devices.push_back(clEnvironment::instance()->getDevice(logdev[i]));
for (size_t i = 0; i < devices.size(); ++i)
if (accelerators[i]->init(devices[i], getReduceMode(), kernel_code,
kernel_name1, kernel_name2, saveBinary, reuseBinary) < 0)
return -1;
break;
}
break;
}
case CL_DEVICE_TYPE_CPU: {
if (accelerators.size() > 1) {
error(
"Multiple (>1) virtual accelerators on CPU are requested. Not yet implemented.\n");
return -1;
} else {
//Retrieve the CPU device
devices.clear();
devices.push_back(clEnvironment::instance()->getDevice( //convert to OpenCL Id
clEnvironment::instance()->getCPUDevice())); //retrieve logical device
if (accelerators[0]->init(devices[0], getReduceMode(), kernel_code,
kernel_name1, kernel_name2, saveBinary, reuseBinary) < 0)
return -1;
}
}
break;
default: {
error(
"stencilReduceOCL::Other device. Not yet implemented.\n");
return -1;
}
} //end switch on ff_oclNode_t<T>::getDeviceType()
} else {
//user requested specific OpenCL devices
if (devices.size() > accelerators.size()) {
error(
"stencilReduceOCL::nodeInit: Too many devices requested, increase the number of accelerators!\n");
return -1;
}
// NOTE: the number of devices requested can be lower than the number of accelerators.
// TODO must be managed
for (size_t i = 0; i < devices.size(); ++i)
accelerators[i]->init(devices[i], getReduceMode(), kernel_code, kernel_name1,
kernel_name2, saveBinary, reuseBinary);
}
}
// for (size_t i = 0; i < devices.size(); ++i)
// std::cerr << "Using " << clEnvironment::instance()->getDeviceInfo(devices[i]) << std::endl;
return 0;
}
void nodeEnd() {}
#if defined(FF_REPARA)
/**
* Returns input data size
*/
size_t rpr_get_sizeIn() const { return ff_node::rpr_sizeIn; }
/**
* Returns output data size
*/
size_t rpr_get_sizeOut() const { return ff_node::rpr_sizeOut; }
#endif
protected:
virtual bool isPureMap() const { return false; }
virtual bool isPureReduce() const { return false; }
reduceMode getReduceMode() {
return isPureReduce() ? REDUCE_INPUT : REDUCE_OUTPUT;
}
virtual int svc_init() { return nodeInit(); }
#if 0
virtual int svc_init() {
if (ff_oclNode_t<T>::oclId < 0) {
ff_oclNode_t<T>::oclId = clEnvironment::instance()->getOCLID();
switch(ff_oclNode_t<T>::getDeviceType()) {
case CL_DEVICE_TYPE_ALL:
fprintf(stderr,"STATUS: requested ALL\n");
case CL_DEVICE_TYPE_GPU: {// One or more GPUs
// Not exclusive
// Retrive logical devices
std::vector<ssize_t> logdev = clEnvironment::instance()->coAllocateGPUDeviceRR(accelerators.size());
// Convert into opencl Ids
devices.clear();
for (size_t i = 0; i < logdev.size(); ++i)
devices.push_back(clEnvironment::instance()->getDevice(logdev[i]));
if (devices.size() == 0) {
error("stencilReduceOCL::svc_init:not enough GPUs found !\n");
return -1;
} else {
// Ok
for (size_t i = 0; i < devices.size(); ++i)
accelerators[i]->init(devices[i], kernel_code, kernel_name1,kernel_name2);
break;
}
}
case CL_DEVICE_TYPE_CPU: {
if (accelerators.size()>1) {
error ("Multiple (>1) virtual accelerators on CPU are requested. Not yet implemented.\n");
return -1;
} else {
// Ok
devices.clear();
devices.push_back(clEnvironment::instance()->getDevice(clEnvironment::instance()->getCPUDevice()));
accelerators[0]->init(devices[0], kernel_code, kernel_name1,kernel_name2);
}
} break;
default: {
error("stencilReduceOCL::Other device. Not yet implemented.\n");
} break;
}
}
return 0;
}
#endif
virtual void svc_end() {
if (!ff::ff_node::isfrozen()) nodeEnd();
}
T *svc(T *task) {
if (task) setTask(*task);
Tin *inPtr = Task.getInPtr();
Tout *outPtr = Task.getOutPtr();
Tout *reducePtr = Task.getReduceVar();
const size_t envSize = Task.getEnvNum(); //n. added environments
#if defined(FF_REPARA)
ff_node::rpr_sizeIn = ff_node::rpr_sizeOut = 0;
#endif
// if the computation is not in-place then we start from the output
if ((void*)inPtr != (void*)outPtr) {
// adjust allocator output-portions and relocate output device memory if needed
if (oldSizeOut != Task.getBytesizeOut()) {
compute_accmem(Task.getSizeOut(), acc_out);
const bool memorychange = (oldSizeOut < Task.getBytesizeOut());
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->adjustOutputBufferOffset(outPtr, (memorychange?old_outPtr:NULL), acc_out[i], Task.getSizeOut());
}
if (memorychange) {
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->relocateOutputBuffer(outPtr, reducePtr);
}
oldSizeOut = Task.getBytesizeOut();
old_outPtr = outPtr;
}
}
}
// adjust allocator input-portions and relocate input device memory if needed
if (oldBytesizeIn != Task.getBytesizeIn()) {
compute_accmem(Task.getSizeIn(), acc_in);
const bool memorychange = (oldBytesizeIn < Task.getBytesizeIn());
adjustInputBufferOffset(memorychange);
if (memorychange) {
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->relocateInputBuffer(inPtr, Task.getReuseIn(), reducePtr);
}
oldBytesizeIn = Task.getBytesizeIn();
old_inPtr = inPtr;
}
}
// in-place computation
// adjust allocator output-portions and relocate output device memory if needed
if (((void*)inPtr == (void*)outPtr) && ( oldSizeOut != Task.getBytesizeOut())) {
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->setInPlace(reducePtr);
}
}
//relocate env device memory
//TODO on-demand relocate, as for input/output memory
/* NOTE: env buffers are replicated on all devices.
* It would be nice to have replicated/partitioned policies
*/
for (size_t i = 0; i < accelerators.size(); ++i)
for(size_t k=0; k < envSize; ++k) {
char *envptr;
Task.getEnvPtr(k, envptr);
accelerators[i]->relocateEnvBuffer(envptr, Task.getReuseEnv(k), k, Task.getBytesizeEnv(k));
}
if (!isPureReduce()) //set kernel args
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->setMapKernelArgs(envSize);
//(async) copy input and environments (h2d)
for (size_t i = 0; i < accelerators.size(); ++i) {
if (Task.getCopyIn()) {
#if defined(FF_REPARA)
ff_node::rpr_sizeIn += accelerators[i]->asyncH2Dinput(Task.getInPtr()); //in
#else
accelerators[i]->asyncH2Dinput(Task.getInPtr()); //in
#endif
}
for(size_t k=0; k < envSize; ++k) {
if (Task.getCopyEnv(k)) {
char *envptr;
Task.getEnvPtr(k, envptr);
#if defined(FF_REPARA)
ff_node::rpr_sizeIn += accelerators[i]->asyncH2Denv(k, envptr);
#else
accelerators[i]->asyncH2Denv(k, envptr);
#endif
}
}
}
if (isPureReduce()) {
//init reduce
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->initReduce();
//wait for cross-accelerator h2d
waitforh2d();
//(async) device-reduce1
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecReduceKernel1();
//(async) device-reduce2
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecReduceKernel2();
waitforreduce(); //wait for cross-accelerator reduce
//host-reduce
Tout redVar = accelerators[0]->getReduceVar();
for (size_t i = 1; i < accelerators.size(); ++i)
redVar = Task.combinator(redVar, accelerators[i]->getReduceVar());
Task.writeReduceVar(redVar);
} else {
Task.resetIter();
if (isPureMap()) {
//wait for cross-accelerator h2d
waitforh2d();
//(async) exec kernel
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecMapKernel();
Task.incIter();
waitformap(); //join
} else { //iterative Map-Reduce (aka stencilReduceLoop)
//invalidate first swap
for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->swap();
bool go = true;
do {
//Task.before();
for (size_t i = 0; i < accelerators.size(); ++i) accelerators[i]->swap();
//wait for cross-accelerator h2d
waitforh2d();
//(async) execute MAP kernel
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecMapKernel();
Task.incIter();
//start async-interleaved: reduce + borders sync
//init reduce
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->initReduce();
//(async) device-reduce1
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecReduceKernel1();
//(async) device-reduce2
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncExecReduceKernel2();
//wait for cross-accelerators reduce
waitforreduce();
//host-reduce
Tout redVar = accelerators[0]->getReduceVar();
for (size_t i = 1; i < accelerators.size(); ++i)
redVar = Task.combinator(redVar, accelerators[i]->getReduceVar());
Task.writeReduceVar(redVar);
go = Task.iterCondition_aux();
if (go) {
assert(outPtr);
//(async) read back borders (d2h)
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncD2Hborders(outPtr);
waitford2h(); //wait for cross-accelerators d2h
//(async) read borders (h2d)
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->asyncH2Dborders(outPtr);
}
//Task.after();
} while (go);
}
//(async)read back output (d2h)
if (outPtr && Task.getCopyOut()) { // do we have to copy back the output result ?
for (size_t i = 0; i < accelerators.size(); ++i) {
#if defined(FF_REPARA)
ff_node::rpr_sizeOut += accelerators[i]->asyncD2Houtput(outPtr);
#else
accelerators[i]->asyncD2Houtput(outPtr);
#endif
}
waitford2h(); //wait for cross-accelerators d2h
}
}
// device memory cleanup phase
if (Task.getReleaseIn() && (void *)outPtr != (void *)inPtr) {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->releaseInput(inPtr);
oldBytesizeIn = 0;
old_inPtr = NULL;
}
if ( Task.getReleaseOut() ) {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->releaseOutput(outPtr);
oldSizeOut = 0;
old_outPtr = NULL;
}
for(size_t k=0; k < envSize; ++k) {
if (Task.getReleaseEnv(k)) {
char *envptr;
Task.getEnvPtr(k, envptr);
if ((void*)envptr != (void*)outPtr) {
for (size_t i = 0; i < accelerators.size(); ++i) {
accelerators[i]->releaseEnv(k,envptr);
}
}
}
// TODO: management of oldEnvPtr !!
// currently the size of the envbuffer should be always the same !
}
// per task memory cleanup phase
Task.releaseTask(task);
return (oneshot ? NULL : task);
}
protected:
virtual void adjustInputBufferOffset(const bool memorychange) {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->adjustInputBufferOffset(Task.getInPtr(),
(memorychange ? old_inPtr : NULL), acc_in[i],
Task.getSizeIn());
}
void setcode(const std::string &codestr1, const std::string &codestr2) {
int n = 0;
if (codestr1 != "") {
n = codestr1.find_first_of("|");
assert(n > 0);
kernel_name1 = codestr1.substr(0, n);
const std::string &tmpstr = codestr1.substr(n + 1);
n = tmpstr.find_first_of("|");
assert(n > 0);
// checking for double type
if (tmpstr.substr(0, n) == "double") {
kernel_code = "\n#pragma OPENCL EXTENSION cl_khr_fp64: enable\n\n"
+ tmpstr.substr(n + 1);
} else
kernel_code = "\n" + tmpstr.substr(n + 1);
}
// checking for extra code needed to compile the kernels
std::ifstream ifs(FF_OPENCL_DATATYPES_FILE);
if (ifs.is_open())
kernel_code.insert(kernel_code.begin(), std::istreambuf_iterator<char>(ifs),
std::istreambuf_iterator<char>());
if (codestr2 != "") {
n = codestr2.find("|");
assert(n > 0);
kernel_name2 += codestr2.substr(0, n);
const std::string &tmpstr = codestr2.substr(n + 1);
n = tmpstr.find("|");
assert(n > 0);
// checking for double type
if (tmpstr.substr(0, n) == "double") {
kernel_code += "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
+ tmpstr.substr(n + 1);
} else
kernel_code += tmpstr.substr(n + 1);
}
}
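// A sketch of the string format setcode() parses (illustrative; these strings
// are normally generated by the FF_OCL* macros in stencilReduceOCL_macros.hpp):
//
//   "<kernel_name>|<element_type>|<OpenCL kernel source>"
//
// e.g. a map string beginning with "kern_f|double|..." yields
// kernel_name1 == "kern_f" and prepends the cl_khr_fp64 pragma to the code.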
void setsourcecode(const std::string &source, const std::string &kernel1, const std::string &kernel2) {
if (kernel1 != "") kernel_name1 = "kern_"+kernel1;
if (kernel2 != "") kernel_name2 = "kern_"+kernel2;
kernel_code = source;
}
//assign input partition to accelerators
//acc[i] = (start, size) where:
// - start is the first element assigned to accelerator i
// - size is the number of elements assigned to accelerator i
void compute_accmem(const size_t len, std::vector<std::pair<size_t,size_t> > &acc) {
size_t start = 0, step = (len + accelerators.size() - 1) / accelerators.size();
size_t i = 0;
for (; i < accelerators.size() - 1; ++i) {
acc[i]=std::make_pair(start, step);
start += step;
}
acc[i]=std::make_pair(start, len-start);
}
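// Worked example (illustrative): len = 10 over 3 accelerators:
//   step = (10 + 3 - 1) / 3 = 4  ->  acc = { (0,4), (4,4), (8,2) }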
void waitforh2d() {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->waitforh2d();
}
void waitford2h() {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->waitford2h();
}
void waitforreduce() {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->waitforreduce();
}
void waitformap() {
for (size_t i = 0; i < accelerators.size(); ++i)
accelerators[i]->waitformap();
}
TOCL Task;
const bool oneshot;
bool saveBinary, reuseBinary;
std::vector<accelerator_t*> accelerators;
std::vector<std::pair<size_t, size_t> > acc_in;
std::vector<std::pair<size_t, size_t> > acc_out;
std::vector<cl_device_id> devices;
int stencil_width_half;
//size_t preferred_dev;
size_t offset_dev;
std::string kernel_code;
std::string kernel_name1;
std::string kernel_name2;
size_t forced_cpu;
size_t forced_gpu;
size_t forced_other;
Tin *old_inPtr;
Tout *old_outPtr;
size_t oldBytesizeIn, oldSizeOut, oldSizeReduce;
};
/*!
* \class ff_mapOCL_1D
* \ingroup high_level_patterns
*
* \brief The OpenCL-based Map pattern in 1 dimension
*
* This class is defined in \ref stencilReduceOCL.hpp
*
*/
template<typename T, typename TOCL = T>
class ff_mapOCL_1D: public ff_stencilReduceLoopOCL_1D<T, TOCL> {
public:
ff_mapOCL_1D(std::string mapf, ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(mapf, "", 0, alloc, NACCELERATORS, 0) {
}
ff_mapOCL_1D(const std::string &kernels_source, const std::string &mapf_name,
ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(kernels_source, mapf_name, "", 0, alloc, NACCELERATORS, 0) {
}
ff_mapOCL_1D(const T &task, std::string mapf,
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, mapf, "", 0, alloc, NACCELERATORS, 0) {
}
ff_mapOCL_1D(const T &task, const std::string &kernels_source, const std::string &mapf_name,
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, kernels_source, mapf_name, "", 0, alloc, NACCELERATORS, 0) {
}
bool isPureMap() const { return true; }
};
/*!
* \class ff_reduceOCL_1D
*
* \ingroup high_level_patterns
*
* \brief The OpenCL-based Reduce pattern in 1 dimension
*
* This class is defined in \ref stencilReduceOCL.hpp
*
*/
template<typename T, typename TOCL = T>
class ff_reduceOCL_1D: public ff_stencilReduceLoopOCL_1D<T, TOCL> {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
ff_reduceOCL_1D(std::string reducef, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>("", reducef, identityVal, alloc, NACCELERATORS, 0) {
}
ff_reduceOCL_1D(const std::string &kernels_source, const std::string &reducef_name, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(kernels_source, "", reducef_name, identityVal, alloc, NACCELERATORS, 0) {
}
ff_reduceOCL_1D(const T &task, std::string reducef, const Tout identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, "", reducef, identityVal, alloc, NACCELERATORS, 0) {
}
ff_reduceOCL_1D(const T &task, const std::string &kernels_source,const std::string &reducef_name,
const Tout identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, kernels_source, "", reducef_name, identityVal, alloc, NACCELERATORS, 0) {
}
bool isPureReduce() const { return true; }
};
/*!
* \class ff_mapReduceOCL_1D
* \ingroup high_level_patterns
*
* \brief The OpenCL-based MapReduce pattern in 1 dimension
*
* The mapReduce skeleton using OpenCL.
*
* This class is defined in \ref stencilReduceOCL.hpp
*
*/
template<typename T, typename TOCL = T>
class ff_mapReduceOCL_1D: public ff_stencilReduceLoopOCL_1D<T, TOCL> {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
ff_mapReduceOCL_1D(std::string mapf, std::string reducef, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(mapf, reducef, identityVal, alloc, NACCELERATORS, 0) {
}
ff_mapReduceOCL_1D(const std::string &kernels_code, const std::string &mapf_name,
const std::string &reducef_name, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(kernels_code, mapf_name, reducef_name, identityVal, alloc, NACCELERATORS, 0) {
}
ff_mapReduceOCL_1D(const T &task, std::string mapf, std::string reducef,
const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, mapf, reducef, identityVal, alloc, NACCELERATORS, 0) {
}
ff_mapReduceOCL_1D(const T &task, const std::string &kernels_code, const std::string &mapf_name,
const std::string &reducef_name, const Tout &identityVal = Tout(),
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_1D<T, TOCL>(task, kernels_code, mapf_name, reducef_name, identityVal, alloc, NACCELERATORS, 0) {
}
};
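/*
 * Sketch: reusing the (assumed) elemental map and combinator strings from
 * the two examples above, a fused map+reduce is a single pattern instance:
 *
 *   ff_mapReduceOCL_1D<oclTask> oclMR(t, mapf, reducef, 0.0f);
 *   oclMR.run_and_wait_end();
 *
 * which keeps the intermediate mapped array on the device between the
 * two phases.
 */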
/*** 2D ***/
/**
 * a task to be executed by a 2D stencilReduceLoop node.
 * This class represents a computation to be performed on a
 * logical 2D matrix, stored in host memory as a 1D row-major array.
 */
template<typename TaskT_, typename Tin_, typename Tout_ = Tin_>
class baseOCLTask_2D: public baseOCLTask<TaskT_, Tin_, Tout_> {
public:
/**
* set the number of rows of the logical 2D input.
* To be called from setTask.
*
* @param h is the number of rows
*/
void setHeight(size_t h) {
height = h;
}
/**
* set the number of columns of the logical 2D input.
* To be called from setTask.
*
* @param w is the number of columns
*/
void setWidth(size_t w) {
width = w;
}
//runtime getter functions
size_t getHeight() const { return height;}
size_t getWidth() const {return width;}
protected:
size_t height, width;
};
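/*
 * Sketch of a 2D task (illustrative only; setInPtr/setOutPtr usage as in
 * the 1D examples above): setTask publishes the host pointers and also
 * declares the logical matrix shape via setHeight/setWidth.
 *
 *   struct oclTask2D: public baseOCLTask_2D<oclTask2D, float> {
 *       void setTask(oclTask2D *t) {
 *           setInPtr(t->M, t->rows * t->cols);   // row-major 1D storage
 *           setOutPtr(t->M, t->rows * t->cols);
 *           setHeight(t->rows);
 *           setWidth(t->cols);
 *       }
 *       float *M = nullptr; size_t rows = 0, cols = 0;
 *   };
 */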
/**
* a virtual OpenCL accelerator for 2D map kernels working on
* logical 2D matrices stored as row-major arrays.
*/
template<typename T, typename TOCL = T>
class ff_oclAccelerator_2D : public ff_oclAccelerator<T, TOCL> {
public:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
ff_oclAccelerator_2D(ff_oclallocator *alloc, const size_t halo_width_, const Tout &identityVal, const bool from_source=false) :
ff_oclAccelerator<T,TOCL>(alloc, halo_width_, identityVal, from_source) {
heightInput_global = 0;
widthInput_global = 0;
}
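// The argument order below defines the parameter list the generated 2D map
// kernel is expected to take: input buffer, output buffer, global input
// height, global input width, local output length, input offset, left halo,
// followed by one buffer per environment entry.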
void setMapKernelArgs(const size_t envSize) {
cl_uint idx = 0;
//set iteration-dynamic MAP kernel args (init)
cl_int status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_mem), &this->inputBuffer);
checkResult(status, "setKernelArg input");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_mem), &this->outputBuffer);
checkResult(status, "setKernelArg output");
//set iteration-invariant MAP kernel args
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &heightInput_global);
checkResult(status, "setKernelArg global input height");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &widthInput_global);
checkResult(status, "setKernelArg global input width");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &this->lenOutput);
checkResult(status, "setKernelArg local input length");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &this->offset1_in);
checkResult(status, "setKernelArg offset");
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_uint), (void *) &this->halo_out_left);
checkResult(status, "setKernelArg halo");
for(size_t k=0; k < envSize; ++k) {
status = clSetKernelArg(this->kernel_map, idx++, sizeof(cl_mem), &(this->envBuffer[k].first));
checkResult(status, "setKernelArg env");
}
}
/**
* set the global number of rows of the logical 2D input.
*/
void setHeight(size_t h) {
heightInput_global = h;
}
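/**
 * set the global number of columns of the logical 2D input.
 */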
void setWidth(size_t w) {
widthInput_global = w;
}
private:
size_t heightInput_global, widthInput_global;
};
/**
* a stencilReduceLoop for executing 2D OCL tasks.
*/
template<typename T, typename TOCL = T>
class ff_stencilReduceLoopOCL_2D: public ff_stencilReduceLoopOCL_1D<T,TOCL,ff_oclAccelerator_2D<T,TOCL> > {
private:
typedef typename TOCL::Tin Tin;
typedef typename TOCL::Tout Tout;
typedef ff_oclAccelerator_2D<T,TOCL> accelerator_t;
typedef ff::ff_stencilReduceLoopOCL_1D<T,TOCL,accelerator_t> base_srl_t;
public:
// build the program from the mapf and reducef functions
ff_stencilReduceLoopOCL_2D(const std::string &mapf, //OpenCL elemental function
const std::string &reducef = std::string(""), //OpenCL combinator function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1,
const int stencil_width_half_ = 1,
const int stencil_height_half_ = 1) :
//init srl 1D with no halo (1D halo depends on 2D width)
base_srl_t(mapf, reducef, identityVal, allocator, NACCELERATORS, 0),
stencil_width_half(stencil_width_half_), stencil_height_half(stencil_height_half_) {}
// build the program from a source-code file;
// it first attempts to load a cached binary file (kernels_source in this case is the path to the binary file);
// if that file is not available, it creates the program from source and stores the binary for future use
// with the extension ".bin"
ff_stencilReduceLoopOCL_2D(const std::string &kernels_source, // OpenCL source code path
const std::string &mapf_name, // name of the map function
const std::string &reducef_name, // name of the reduce function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1,
const int stencil_width_half_ = 1,
const int stencil_height_half_ = 1) :
//init srl 1D with no halo (1D halo depends on 2D width)
base_srl_t(kernels_source, mapf_name, reducef_name, identityVal, allocator, NACCELERATORS, 0),
stencil_width_half(stencil_width_half_), stencil_height_half(stencil_height_half_) {}
// the task is provided in the constructor -- one shot computation
ff_stencilReduceLoopOCL_2D(const T &task,
const std::string &mapf,
const std::string &reducef = std::string(""),
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1,
const int stencil_width_half_ = 1,
const int stencil_height_half_ = 1) :
//init srl 1D with no halo (1D halo depends on 2D width)
base_srl_t(task, mapf, reducef, identityVal, allocator, NACCELERATORS, 0),
stencil_width_half(stencil_width_half_),stencil_height_half(stencil_height_half_) {}
// the task is provided in the constructor -- one shot computation
ff_stencilReduceLoopOCL_2D(const T &task,
const std::string &kernels_source, // OpenCL source code path
const std::string &mapf_name, // name of the map kernel function
const std::string &reducef_name, // name of the reduce kernel function
const Tout &identityVal = Tout(),
ff_oclallocator *allocator = nullptr,
const size_t NACCELERATORS = 1,
const int stencil_width_half_ = 1,
const int stencil_height_half_ = 1) :
//init srl 1D with no halo (1D halo depends on 2D width)
base_srl_t(task, kernels_source, mapf_name, reducef_name, identityVal, allocator, NACCELERATORS, 0),
stencil_width_half(stencil_width_half_), stencil_height_half(stencil_height_half_) {}
protected:
void adjustInputBufferOffset(const bool memorychange) {
for (size_t i = 0; i < this->accelerators.size(); ++i) {
this->accelerators[i]->setWidth(this->Task.getWidth());
this->accelerators[i]->setHeight(this->Task.getHeight());
this->accelerators[i]->setHaloHalf(halo_half(this->Task.getWidth()));
}
//accelerator uses above values to compute actual sizing
base_srl_t::adjustInputBufferOffset(memorychange);
}
private:
size_t halo_half(const size_t width) const {
return stencil_height_half * width + stencil_width_half;
}
/*
 * 2D stencil can access elements in an H x W rectangle around each element, where:
 *   H = 2 * stencil_height_half + 1
 *   W = 2 * stencil_width_half + 1
 * In the row-major layout the farthest such neighbour lies
 * stencil_height_half * width + stencil_width_half positions away, which is
 * exactly the 1D halo returned by halo_half above (e.g. a 3x3 stencil on a
 * matrix of width W needs a halo of W + 1 elements on each side).
 */
const int stencil_width_half;
const int stencil_height_half;
};
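/*
 * Usage sketch (illustrative only; mapf2d and reducef are hypothetical
 * kernel strings): a 3x3 stencil sweep uses
 * stencil_width_half = stencil_height_half = 1, i.e. each element may read
 * its eight surrounding neighbours:
 *
 *   ff_stencilReduceLoopOCL_2D<oclTask2D> srl(t2d, mapf2d, reducef, 0.0f,
 *                                             nullptr,  // default allocator
 *                                             1,        // one accelerator
 *                                             1, 1);    // stencil halves
 *   srl.run_and_wait_end();
 *
 * Further iterations of the loop are controlled by the task's
 * iterCondition() (the baseOCLTask default returns false, i.e. one sweep).
 */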
/*!
 * \class ff_mapOCL_2D
 * \ingroup high_level_patterns
 *
 * \brief The OpenCL-based Map pattern in 2 dimensions
 *
 * This class is defined in \ref stencilReduceOCL.hpp
 *
 */
template<typename T, typename TOCL = T>
class ff_mapOCL_2D: public ff_stencilReduceLoopOCL_2D<T, TOCL> {
public:
ff_mapOCL_2D(std::string mapf, ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_2D<T, TOCL>(mapf, "", 0, alloc, NACCELERATORS) {
}
ff_mapOCL_2D(const std::string &kernels_source, const std::string &mapf_name,
ff_oclallocator *alloc=nullptr, const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_2D<T, TOCL>(kernels_source, mapf_name, "", 0, alloc, NACCELERATORS) {
}
ff_mapOCL_2D(const T &task, std::string mapf,
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_2D<T, TOCL>(task, mapf, "", 0, alloc, NACCELERATORS) {
}
ff_mapOCL_2D(const T &task, const std::string &kernels_source, const std::string &mapf_name,
ff_oclallocator *alloc=nullptr,
const size_t NACCELERATORS = 1) :
ff_stencilReduceLoopOCL_2D<T, TOCL>(task, kernels_source, mapf_name, "", 0, alloc, NACCELERATORS) {
}
bool isPureMap() const { return true; }
};
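/*
 * Sketch: a pure 2D map reuses the 2D task above; no stencil halo is
 * involved, so only the map function is supplied:
 *
 *   ff_mapOCL_2D<oclTask2D> oclMap2(t2d, mapf2d);
 *   oclMap2.run_and_wait_end();
 */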
}// namespace ff
#endif // FF_OPENCL
#endif /* FF_STENCILREDUCE_OCL_HPP */