/* -*- Mode: C++; tab-width: 2; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /*! * \file tpcnode.hpp * \ingroup building_blocks * * \brief FastFlow Thread Pool Composer (TPC) interface node * * This class bridges multicore with FPGAs using the TPC library. * */ /* *************************************************************************** * * FastFlow is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License version 3 as * published by the Free Software Foundation. * Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3 * or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT) * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public * License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * **************************************************************************** */ /* Author: Massimo Torquati torquati@di.unipi.it * - September 2015 first version * */ #ifndef FF_TPCNODE_HPP #define FF_TPCNODE_HPP #include #include #include #include #include #include #include using namespace rpr; using namespace tpc; namespace ff{ namespace internal { /** * iovector-like data structure plus memory flags */ struct Arg_t { Arg_t(): ptr(NULL),size(0), byvalue(false), copy(false),reuse(false),release(false) {} Arg_t(void *ptr, size_t size): ptr(ptr),size(size), byvalue(true), copy(true),reuse(false),release(false) {} Arg_t(void *ptr, size_t size, bool copy, bool reuse, bool release): ptr(ptr),size(size), byvalue(false), copy(copy),reuse(reuse),release(release) {} void *ptr; size_t size; const bool byvalue; const bool copy,reuse,release; }; } // namespace internal /** * Interface of the task to be executed by a TPC-based node. */ template class baseTPCTask { template friend class ff_tpcNode_t; public: /** * Default constructor. */ baseTPCTask():tpc_kernel_id(-1), retvar{nullptr,0} {} /** * Destructor. */ virtual ~baseTPCTask() { } /** * Provides to the run-time the task to be computed. * This method must be overridden. * * @param t input task * */ virtual void setTask(TaskT_in *t) = 0; /** * Releases the input task just computed. * This method is called at the of the task computation. * It may be used to perform per-task host memory cleanup * (i.e. releasing the host memory previously allocated in the setTask * function) or to execute a post-elaboration phase * * @param t input task * @return the output task */ virtual TaskT_out *releaseTask(TaskT_in *t) { return t; } /** * Sets the host-pointer to the input parameter. * The order of calls determin the order of parametes. * * @param inPtr the host-pointer * @param size the number of elements in the input array (bytesize=size*sizeof(ptrT)) * @param copy if CopyFlags::COPY is set the data will be copied into the device * @param reuse if BitFlags::REUSE is set then the run-time looks for a previously * allocated device handle associated with the inPtr host pointer * @param release if BitFlags::RELEASE is set the device memory will be released * at the end of the kernel execution */ template void setInPtr(const ptrT* inPtr, size_t size, const CopyFlags copy =CopyFlags::COPY, const ReuseFlags reuse =ReuseFlags::DONTREUSE, const ReleaseFlags release=ReleaseFlags::DONTRELEASE) { internal::Arg_t arg(const_cast(inPtr),size*sizeof(ptrT), copy==CopyFlags::COPY, reuse==ReuseFlags::REUSE, release==ReleaseFlags::RELEASE); tpcInput.push_back(arg); } /** * Sets the host-pointer to the input parameter. * The order of calls determin the order of parametes. */ template void setInPtr(const ptrT* inPtr, size_t size, const MemoryFlags &flags) { internal::Arg_t arg(const_cast(inPtr),size*sizeof(ptrT), flags.copy==CopyFlags::COPY, flags.reuse==ReuseFlags::REUSE, flags.release==ReleaseFlags::RELEASE); tpcInput.push_back(arg); } /** * Sets the host-pointer to the input parameter that is passed by-value. * * @param inPtr the host-pointer */ template void setInVal(const ptrT* inPtr) { internal::Arg_t arg(const_cast(inPtr), sizeof(ptrT)); tpcInput.push_back(arg); } /** * Sets the host-pointer to the output parameter * * @see setInPtr() */ template void setOutPtr(const ptrT* _outPtr, size_t size, const CopyFlags copyback =CopyFlags::COPY, const ReuseFlags reuse =ReuseFlags::DONTREUSE, const ReleaseFlags release =ReleaseFlags::DONTRELEASE) { internal::Arg_t arg(const_cast(_outPtr),size*sizeof(ptrT), copyback==CopyFlags::COPY, reuse==ReuseFlags::REUSE, release==ReleaseFlags::RELEASE); tpcOutput.push_back(arg); } /** * Sets the host-pointer to the output parameter * * @see setInPtr() */ template void setOutPtr(const ptrT* _outPtr, size_t size, const MemoryFlags &flags) { internal::Arg_t arg(const_cast(_outPtr),size*sizeof(ptrT), flags.copy==CopyFlags::COPY, flags.reuse==ReuseFlags::REUSE, flags.release==ReleaseFlags::RELEASE); tpcOutput.push_back(arg); } /** * Sets the kernel id * * @param id function id */ void setFunctionId(const tpc_func_id_t id) { tpc_kernel_id = id; } /** * Should be used only if the kernel executed on the * device is a function. It allows to get back the return value. * * @param r host variable where the return value is copied */ template void setReturnVar(TresT *const r) { retvar.first = r; retvar.second= sizeof(TresT); } protected: const std::vector &getInputArgs() const { return tpcInput; } const std::vector &getOutputArgs() const { return tpcOutput;} void * getReturnVar(size_t &size) const { size=retvar.second; return retvar.first; } const tpc_func_id_t getKernelId() const { return tpc_kernel_id;} void resetTask() { tpcInput.resize(0); tpcOutput.resize(0); retvar.first = nullptr; retvar.second= 0; } protected: tpc_func_id_t tpc_kernel_id; std::vector tpcInput; std::vector tpcOutput; std::pair retvar; }; /*! * \class ff_tpcNode * \ingroup buiding_blocks * * \brief TPC specialisation of the ff_node class * * Implements the node that is serving as TPC device. * * */ template class ff_tpcNode_t : public ff_node { struct handle_t { void *ptr = nullptr; size_t size = 0; tpc_handle_t handle= 0; }; public: typedef IN_t in_type; typedef OUT_t out_type; /** * Constructor used for stream-based computation. * It builds the TPC node for the device. * * @param alloc the allocator to use for allocating * device memory. By default it is used an internal * private allocator. * */ ff_tpcNode_t(ff_tpcallocator *alloc = nullptr): tpcId(-1),deviceId(-1),my_own_allocator(false),oneshot(false),oneshotTask(nullptr),allocator(alloc) { if (allocator == nullptr) { my_own_allocator = true; allocator = new ff_tpcallocator; assert(allocator); } // TODO: management of multiple TPC devices }; /** * Constructor used for the "oneshot" mode, i.e. non * stream-based computations. * * It builds the TPC node for the device. * */ ff_tpcNode_t(const IN_t &task, ff_tpcallocator *alloc = nullptr): tpcId(-1),deviceId(-1), my_own_allocator(false), oneshot(true),oneshotTask(&task),allocator(alloc) { ff_node::skipfirstpop(true); Task.setTask(const_cast(oneshotTask)); if (allocator == nullptr) { my_own_allocator = true; allocator = new ff_tpcallocator; assert(allocator); } // TODO: management of multiple TPC devices }; /** * Destructor * */ virtual ~ff_tpcNode_t() { if (my_own_allocator && allocator != nullptr) { allocator->releaseAllBuffers(dev_ctx); delete allocator; allocator = nullptr; } } /** * Starts the execution of the device node. * A thread executing the object instance is spawned. * * @return 0 success, -1 otherwise */ virtual int run(bool = false) { return ff_node::run(); } /** * Waits for termination of the thread associated * to the object. * * @return 0 success, -1 otherwise */ virtual int wait() { return ff_node::wait(); } virtual int run_and_wait_end() { if (nodeInit()<0) return -1; svc(nullptr); return 0; } /** * Starts the execution of the device node. * A thread executing the object instance is spawned if * no previous thread was associated to the object. * The thread will be put to sleep at the end of * the kernel execution. * * @return 0 success, -1 otherwise */ virtual int run_then_freeze() { if (ff_node::isfrozen()) { ff_node::thaw(true); return 0; } return ff_node::freeze_and_run(); } /** * Waits for the termination of the kernel * execution. The associated thread is put * to sleep and not destroyed. * * @return 0 success, -1 otherwise */ virtual int wait_freezing() { return ff_node::wait_freezing(); } /** * Allows to set a new task when in "oneshot" mode. * * @param task task to be computed */ void setTask(IN_t &task) { Task.resetTask(); Task.setTask(&task); } // returns the internal task const TPC_t* getTask() const { return &Task; } // returns the kind of node virtual fftype getFFType() const { return TPC_WORKER; } // TODO currently only one devices can be used void setDeviceId(tpc_dev_id_t id) { deviceId = id; } tpc_dev_id_t getDeviceId() const { return deviceId; } int getTPCID() const { return tpcId; } #if defined(FF_REPARA) /** * Returns input data size */ size_t rpr_get_sizeIn() const { return rpr_sizeIn; } /** * Returns output data size */ size_t rpr_get_sizeOut() const { return rpr_sizeOut; } #endif protected: int nodeInit() { if (tpcId<0) { tpcId = tpcEnvironment::instance()->getTPCID(); if (deviceId == (tpc_dev_id_t)-1) { // the user didn't set any specific device dev_ctx = tpcEnvironment::instance()->getTPCDevice(); } else abort(); // TODO; if (dev_ctx == NULL) return -1; } return 0; } int svc_init() { return nodeInit(); } // NOTE: this implementation supposes that the number of arguments does not // change between two different tasks ! Only the size does change. void *svc(void *task) { if (task) { Task.resetTask(); Task.setTask((IN_t*)task); } #if defined(FF_REPARA) rpr_sizeIn = rpr_sizeOut = 0; #endif const tpc_func_id_t f_id = Task.getKernelId(); if (tpc_device_func_instance_count(dev_ctx, f_id) == 0) { error("No instances for function with id %d\n", (int)f_id); return GO_ON; } tpc_res_t res; tpc_job_id_t j_id = tpc_device_acquire_job_id(dev_ctx, f_id, TPC_DEVICE_ACQUIRE_JOB_ID_BLOCKING); const std::vector &inV = Task.getInputArgs(); const std::vector &outV = Task.getOutputArgs(); size_t ret_val_size=0; void *ret_val = Task.getReturnVar(ret_val_size); if (inHandles.size() == 0) inHandles.resize(inV.size()); if (outHandles.size() == 0) outHandles.resize(outV.size()); uint32_t arg_idx = 0; for(size_t i=0;ireleaseBuffer(inHandles[i].ptr, dev_ctx, inHandles[i].handle); } if (inV[i].reuse) { handle = allocator->createBufferUnique(inV[i].ptr, dev_ctx, TPC_DEVICE_ALLOC_FLAGS_NONE, inV[i].size); } else { handle = allocator->createBuffer(inV[i].ptr, dev_ctx, TPC_DEVICE_ALLOC_FLAGS_NONE, inV[i].size); } if (!handle) { error("ff_tpcNode::svc unable to allocate memory on the TPC device (IN)"); inHandles[i].size = 0, inHandles[i].ptr = nullptr, inHandles[i].handle = 0; return (oneshot?NULL:GO_ON); } inHandles[i].size = inV[i].size, inHandles[i].ptr = inV[i].ptr, inHandles[i].handle = handle; } if (inV[i].copy) { res = tpc_device_copy_to(dev_ctx, inV[i].ptr, handle, inV[i].size, TPC_DEVICE_COPY_BLOCKING); if (res != TPC_SUCCESS) { error("ff_tpcNode::svc unable to copy data into the device\n"); return (oneshot?NULL:GO_ON); } #if defined(FF_REPARA) rpr_sizeIn += inV[i].size; #endif } res = tpc_device_job_set_arg(dev_ctx, j_id, arg_idx, sizeof(handle), &handle); } else { // argument passed by-value res = tpc_device_job_set_arg(dev_ctx, j_id, arg_idx, inV[i].size, inV[i].ptr); } if (res != TPC_SUCCESS) { error("ff_tpcNode::svc unable to set argument for the device (IN)\n"); return (oneshot?NULL:GO_ON); } ++arg_idx; } for(size_t i=0;i::iterator it; it = std::find_if(inHandles.begin(), inHandles.end(), [ptr](const handle_t &arg) { return arg.ptr == ptr; } ); tpc_handle_t handle; if (it == inHandles.end()) { // not found handle = outHandles[i].handle; // previous handle if (outHandles[i].size < outV[i].size) { if (outHandles[i].handle) { // not first time assert(!outV[i].reuse); allocator->releaseBuffer(outHandles[i].ptr, dev_ctx, outHandles[i].handle); } if (outV[i].reuse) { handle = allocator->createBufferUnique(ptr, dev_ctx, TPC_DEVICE_ALLOC_FLAGS_NONE, outV[i].size); } else { handle = allocator->createBuffer(ptr, dev_ctx, TPC_DEVICE_ALLOC_FLAGS_NONE, outV[i].size); } if (!handle) { error("ff_tpcNode::svc unable to allocate memory on the TPC device (OUT)"); outHandles[i].size = 0, outHandles[i].ptr = nullptr, outHandles[i].handle = 0; return (oneshot?NULL:GO_ON); } outHandles[i].size = outV[i].size, outHandles[i].ptr = ptr, outHandles[i].handle = handle; } } else handle= (*it).handle; // in/out parameter res = tpc_device_job_set_arg(dev_ctx, j_id, arg_idx, sizeof(handle), &handle); if (res != TPC_SUCCESS) { error("ff_tpcNode::svc unable to set argument for the device (OUT)\n"); return (oneshot?NULL:GO_ON); } ++arg_idx; } res = tpc_device_job_launch(dev_ctx, j_id, TPC_DEVICE_JOB_LAUNCH_BLOCKING); if (res != TPC_SUCCESS) { error("ff_tpcNode::svc error launching kernel on TPC device\n"); return (oneshot?NULL:GO_ON); } // reset the arg_idx to the initial value for the out parameters arg_idx=inV.size(); // TODO: async data transfer for(size_t i=0;ireleaseBuffer(outHandles[i].ptr, dev_ctx, handle); outHandles[i].size = 0, outHandles[i].ptr = nullptr, outHandles[i].handle = 0; } ++arg_idx; } // now check if we have to get back the return value (if the kernel is a function) if (ret_val && ret_val_size>0) { res = tpc_device_job_get_return(dev_ctx, j_id, ret_val_size, ret_val); if (res != TPC_SUCCESS) { error("ff_tpcNode::svc error getting back the return value of the kernel function\n"); return (oneshot?NULL:GO_ON); } } // By default the buffers are not released ! for(size_t i=0;ireleaseBuffer(inHandles[i].ptr, dev_ctx, inHandles[i].handle); inHandles[i].size = 0, inHandles[i].ptr = nullptr, inHandles[i].handle = 0; } } // release job id tpc_device_release_job_id(dev_ctx, j_id); // per task host memory cleanup phase OUT_t * task_out; if (task) task_out = Task.releaseTask((IN_t*)task); else task_out = Task.releaseTask(const_cast(oneshotTask)); return (oneshot ? NULL : task_out); } protected: int tpcId; tpc_dev_id_t deviceId; tpc_dev_ctx_t *dev_ctx; TPC_t Task; bool my_own_allocator; const bool oneshot; const IN_t *const oneshotTask; ff_tpcallocator *allocator; std::vector inHandles; std::vector outHandles; }; } #endif /* FF_TPCNODE_HPP */