mesytec-mnode/external/taskflow-3.8.0/3rd-party/ff/distributed/loader/dff_run.cpp
2025-01-04 01:25:05 +01:00

339 lines
10 KiB
C++

/* ***************************************************************************
*
* FastFlow is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License version 3 as
* published by the Free Software Foundation.
* Starting from version 3.0.1 FastFlow is dual licensed under the GNU LGPLv3
* or MIT License (https://github.com/ParaGroup/WindFlow/blob/vers3.x/LICENSE.MIT)
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
****************************************************************************
*/
/* Author:
* Nicolo' Tonci
*/
#include <iostream>
#include <fstream>
#include <chrono>
#include <thread>
#include <unistd.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/param.h>
#include <fcntl.h>
#include <cereal/cereal.hpp>
#include <cereal/archives/json.hpp>
#include <cereal/types/string.hpp>
#include <cereal/types/vector.hpp>
#include <filesystem>
// Short alias for the standard filesystem library (used for absolute()/exists()).
namespace n_fs = std::filesystem;
// Fallback for platforms whose system headers do not define HOST_NAME_MAX.
#ifndef HOST_NAME_MAX
#define HOST_NAME_MAX 255
#endif
// Protocol used to launch the groups. Enumerators start at 1, so a
// zero-initialised Proto matches neither of them and tests as "not chosen yet".
enum Proto {TCP = 1 , MPI};
// Selected protocol: forced with -p, otherwise taken from the configuration file.
Proto usedProtocol;
// True when -V was given: the output of every group is shown.
bool seeAll = false;
// Names of the groups whose output is shown (comma separated argument of -v).
std::vector<std::string> viewGroups;
// Name of the local machine, filled by gethostname() in main and compared
// against each group's host in G::isRemote().
char hostname[HOST_NAME_MAX];
// Absolute path of the JSON configuration file given with -f.
std::string configFile("");
// Absolute path of the program to launch followed by its command-line arguments.
std::string executable;
/**
 * Returns the current wall-clock time, as reported by gettimeofday(),
 * expressed in microseconds.
 */
static inline unsigned long getusec() {
    struct timeval now;
    gettimeofday(&now, NULL);
    // seconds scaled to microseconds plus the sub-second part
    const double microseconds = now.tv_sec*1e6 + now.tv_usec;
    return (unsigned long)microseconds;
}
/**
 * Tells whether the output of the group called gName has to be shown.
 *
 * Returns true when all groups are visible (seeAll) or when gName is one of
 * the names stored in viewGroups; false otherwise.
 */
bool toBePrinted(std::string gName){
    if (seeAll)
        return true;
    for (const std::string& shown : viewGroups)
        if (shown == gName)
            return true;
    return false;
}
/**
 * Splits s into the tokens separated by delim.
 *
 * Empty tokens between two consecutive delimiters (and before a leading
 * delimiter) are kept; a trailing delimiter does not produce a final empty
 * token, and an empty input yields an empty vector.
 */
std::vector<std::string> split (const std::string &s, char delim) {
    std::vector<std::string> tokens;
    std::string::size_type start = 0;
    while (start < s.size()) {
        const std::string::size_type stop = s.find(delim, start);
        if (stop == std::string::npos) {
            // last token: everything up to the end of the string
            tokens.push_back(s.substr(start));
            break;
        }
        tokens.push_back(s.substr(start, stop - start));
        start = stop + 1;
    }
    return tokens;
}
struct G {
std::string name, host, preCmd;
int fd = 0;
FILE* file = nullptr;
template <class Archive>
void load( Archive & ar ){
ar(cereal::make_nvp("name", name));
try {
std::string endpoint;
ar(cereal::make_nvp("endpoint", endpoint)); std::vector endp(split(endpoint, ':'));
host = endp[0]; //port = std::stoi(endp[1]);
} catch (cereal::Exception&) {
host = "127.0.0.1"; // set the host to localhost if not found in config file!
ar.setNextName(nullptr);
}
try {
ar(cereal::make_nvp("preCmd", preCmd));
} catch (cereal::Exception&) {
ar.setNextName(nullptr);
}
}
void run(){
char b[1024]; // ssh -t // trovare MAX ARGV
sprintf(b, " %s %s %s %s %s --DFF_Config=%s --DFF_GName=%s %s 2>&1 %s", (isRemote() ? "ssh -T " : ""), (isRemote() ? host.c_str() : ""), (isRemote() ? "'" : ""), this->preCmd.c_str(), executable.c_str(), configFile.c_str(), this->name.c_str(), toBePrinted(this->name) ? "" : "> /dev/null", (isRemote() ? "'" : ""));
std::cout << "Executing the following command: " << b << std::endl;
file = popen(b, "r");
fd = fileno(file);
if (fd == -1) {
printf("Failed to run command\n" );
exit(1);
}
int flags = fcntl(fd, F_GETFL, 0);
flags |= O_NONBLOCK;
fcntl(fd, F_SETFL, flags);
}
bool isRemote(){return !(!host.compare("127.0.0.1") || !host.compare("localhost") || !host.compare(hostname));}
};
bool allTerminated(std::vector<G>& groups){
for (G& g: groups)
if (g.file != nullptr)
return false;
return true;
}
/**
 * Prints the command-line synopsis and the list of supported options on
 * standard output. progname is the name shown in the USAGE line.
 */
static inline void usage(char* progname) {
    static const char* const optionLines[] = {
        "\t -v <g1>,...,<g2> \t Prints the output of the specified groups\n",
        "\t -V \t Print the output of all groups\n",
        "\t -p \"TCP|MPI\" \t Force communication protocol\n"
    };

    std::cout << "\nUSAGE: " << progname << " [Options] -f <configFile> <cmd> \n";
    std::cout << "Options: \n";
    for (const char* line : optionLines)
        std::cout << line;
    std::cout << "\n";
}
/**
 * Writes the rank file later handed to mpirun through --rankfile: one line
 * "rank <i>=<host> slot=0" per group, in the order of parsedGroups.
 *
 * The file is created as /tmp/dffRankfile<pid>. Returns its name; the caller
 * has to remove the file after use. Terminates the loader with an error
 * message if the file cannot be created or written, instead of letting mpirun
 * fail later on a missing or truncated rank file.
 */
std::string generateRankFile(std::vector<G>& parsedGroups){
    std::string name = "/tmp/dffRankfile" + std::to_string(getpid());

    std::ofstream tmpFile(name, std::ofstream::out);
    if (!tmpFile) {
        std::cerr << "ERROR: Unable to create the rank file " << name << "\n";
        exit(EXIT_FAILURE);
    }

    for(size_t i = 0; i < parsedGroups.size(); i++)
        tmpFile << "rank " << i << "=" << parsedGroups[i].host << " slot=0\n";

    // close() flushes the buffered lines; a failure here means the file is incomplete
    tmpFile.close();
    if (!tmpFile) {
        std::cerr << "ERROR: Unable to write the rank file " << name << "\n";
        std::remove(name.c_str());
        exit(EXIT_FAILURE);
    }

    // return the name of the temporary file just created; remember to remove it after the usage
    return name;
}
int main(int argc, char** argv) {
if (argc == 1 ||
strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-help") == 0 || strcmp(argv[1], "-h") == 0){
usage(argv[0]);
exit(EXIT_SUCCESS);
}
// get the hostname
if (gethostname(hostname, HOST_NAME_MAX) != 0) {
perror("gethostname");
exit(EXIT_FAILURE);
}
int optind=0;
for(int i=1;i<argc;++i) {
if (argv[i][0]=='-') {
switch(argv[i][1]) {
case 'p' : {
if (argv[i+1] == NULL) {
std::cerr << "-p require a protocol\n";
usage(argv[0]);
exit(EXIT_FAILURE);
}
std::string forcedProtocol = std::string(argv[++i]);
if (forcedProtocol == "MPI") usedProtocol = Proto::MPI;
else if (forcedProtocol == "TCP") usedProtocol = Proto::TCP;
else {
std::cerr << "-p require a valid protocol (TCP or MPI)\n";
exit(EXIT_FAILURE);
}
} break;
case 'f': {
if (argv[i+1] == NULL) {
std::cerr << "-f requires a file name\n";
usage(argv[0]);
exit(EXIT_FAILURE);
}
configFile = n_fs::absolute(n_fs::path(argv[++i])).string();
} break;
case 'V': {
seeAll=true;
} break;
case 'v': {
if (argv[i+1] == NULL) {
std::cerr << "-v requires at list one argument\n";
usage(argv[0]);
exit(EXIT_FAILURE);
}
viewGroups = split(argv[i+1], ',');
i+=viewGroups.size();
} break;
}
} else { optind=i; break;}
}
if (configFile == "") {
std::cerr << "ERROR: Missing config file for the loader\n";
usage(argv[0]);
exit(EXIT_FAILURE);
}
executable = n_fs::absolute(n_fs::path(argv[optind])).string();
if (!n_fs::exists(executable)) {
std::cerr << "ERROR: Unable to find the executable file (we found as executable \'" << argv[optind] << "\')\n";
exit(EXIT_FAILURE);
}
executable += " ";
for (int index = optind+1 ; index < argc; index++) {
executable += std::string(argv[index]) + " ";
}
std::ifstream is(configFile);
if (!is){
std::cerr << "Unable to open configuration file for the program!" << std::endl;
return -1;
}
std::vector<G> parsedGroups;
try {
cereal::JSONInputArchive ar(is);
// get the protocol to be used from the configuration file if it was not forced by the command line
if (!usedProtocol)
try {
std::string tmpProtocol;
ar(cereal::make_nvp("protocol", tmpProtocol));
if (tmpProtocol == "MPI")
usedProtocol = Proto::MPI;
else
usedProtocol = Proto::TCP;
} catch (cereal::Exception&) {
ar.setNextName(nullptr);
// if the protocol is not specified we assume TCP
usedProtocol = Proto::TCP;
}
// parse all the groups in the configuration file
ar(cereal::make_nvp("groups", parsedGroups));
} catch (const cereal::Exception& e){
std::cerr << "Error parsing the JSON config file. Check syntax and structure of the file and retry!" << std::endl;
exit(EXIT_FAILURE);
}
#ifdef DEBUG
for(auto& g : parsedGroups)
std::cout << "Group: " << g.name << " on host " << g.host << std::endl;
#endif
if (usedProtocol == Proto::TCP){
auto Tstart = getusec();
for (G& g : parsedGroups)
g.run();
while(!allTerminated(parsedGroups)){
for(G& g : parsedGroups){
if (g.file != nullptr){
char buff[1024] = { 0 };
ssize_t result = read(g.fd, buff, sizeof(buff));
if (result == -1){
if (errno == EAGAIN)
continue;
int code = pclose(g.file);
if (WEXITSTATUS(code) != 0)
std::cout << "[" << g.name << "][ERR] Report an return code: " << WEXITSTATUS(code) << std::endl;
g.file = nullptr;
} else if (result > 0){
std::cout << buff;
} else {
int code = pclose(g.file);
if (WEXITSTATUS(code) != 0)
std::cout << "[" << g.name << "][ERR] Report an return code: " << WEXITSTATUS(code) << std::endl;
g.file = nullptr;
}
}
}
std::this_thread::sleep_for(std::chrono::milliseconds(15));
}
std::cout << "Elapsed time: " << (getusec()-(Tstart))/1000 << " ms" << std::endl;
}
if (usedProtocol == Proto::MPI){
std::string rankFile = generateRankFile(parsedGroups);
std::cout << "RankFile: " << rankFile << std::endl;
// invoke mpirun using the just created rankfile
char command[350];
sprintf(command, "mpirun -np %lu --rankfile %s %s --DFF_Config=%s", parsedGroups.size(), rankFile.c_str(), executable.c_str(), configFile.c_str());
std::cout << "mpicommand: " << command << "\n";
FILE *fp;
char buff[1024];
fp = popen(command, "r");
if (fp == NULL) {
printf("Failed to run command\n" );
exit(1);
}
/* Read the output a line at a time - output it. */
while (fgets(buff, sizeof(buff), fp) != NULL) {
std::cout << buff;
}
pclose(fp);
std::remove(rankFile.c_str());
}
return 0;
}