1 #include "otsdaq/ARTDAQOnlineMonitor/ARTDAQOnlineMonitorSupervisor.h"
3 #include "otsdaq/TablePlugins/ARTDAQTableBase/ARTDAQTableBase.h"
5 #include "artdaq-core/Utilities/ExceptionHandler.hh"
6 #include "artdaq-core/Utilities/TimeUtils.hh"
10 #include <boost/filesystem.hpp>
11 #include <boost/thread.hpp>
15 #define FAKE_CONFIG_NAME "ots_config"
// Constructor.
// Forwards the XDAQ application stub to CoreSupervisorBase and initializes
// partition_ from the "partition" supervisor property (second argument 0 --
// presumably the fallback when the property is unset; confirm against
// getSupervisorProperty).
// Emits "Constructor." on entry and "Constructed." once construction is done.
// NOTE(review): this listing skips lines between the two log statements, so
// any setup performed there is not visible here.
18 ots::ARTDAQOnlineMonitorSupervisor::ARTDAQOnlineMonitorSupervisor(
19 xdaq::ApplicationStub* stub)
20 : CoreSupervisorBase(stub), partition_(getSupervisorProperty(
"partition", 0))
22 __SUP_COUT__ <<
"Constructor." << __E__;
26 __SUP_COUT__ <<
"Constructed." << __E__;
// Destructor.
// The statements visible here only emit the "Destructor." and "Destructed."
// log lines.
// NOTE(review): the listing skips a line between the two log statements; any
// cleanup call made there is not visible here.
30 ots::ARTDAQOnlineMonitorSupervisor::~ARTDAQOnlineMonitorSupervisor(
void)
32 __SUP_COUT__ <<
"Destructor." << __E__;
34 __SUP_COUT__ <<
"Destructed." << __E__;
// destroy(): brackets teardown with "Destroying..." / "Destroyed." log lines.
// NOTE(review): the line between the two log statements is not present in this
// listing, so what (if anything) is released there cannot be stated here.
39 void ots::ARTDAQOnlineMonitorSupervisor::destroy(
void)
41 __SUP_COUT__ <<
"Destroying..." << __E__;
43 __SUP_COUT__ <<
"Destroyed." << __E__;
// init(): brackets initialization with "Initializing..." / "Initialized." log
// lines.
// NOTE(review): the line between the two log statements is not present in this
// listing, so the actual initialization work is not visible here.
47 void ots::ARTDAQOnlineMonitorSupervisor::init(
void)
49 __SUP_COUT__ <<
"Initializing..." << __E__;
51 __SUP_COUT__ <<
"Initialized." << __E__;
// NOTE(review): the first signature line of this definition is missing from
// this listing. The log and error strings below indicate it is the state
// machine's Initializing transition handler -- confirm against the header.
// The toolbox::Event::Reference parameter is unnamed, i.e. unused by the body.
56 toolbox::Event::Reference )
// Bracket the transition work with "Initializing..." / "Initialized." logs.
59 __SUP_COUT__ <<
"Initializing..." << __E__;
61 __SUP_COUT__ <<
"Initialized." << __E__;
// std::runtime_error handler: builds a message that includes e.what().
63 catch(
const std::runtime_error& e)
65 __SS__ <<
"Error was caught while Initializing: " << e.what() << __E__;
// Generic-failure path (presumably inside a catch-all handler whose line is
// not shown): builds a generic message and hands it to
// artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
70 __SS__ <<
"Unknown error was caught while Initializing. Please checked the logs."
72 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// Configuring transition handler.
// Visible steps:
//   1. CoreSupervisorBase::configureInit().
//   2. Read the "MonitorID" node of the supervisor table node into om_rank_.
//   3. Create a directory named after the supervisor node's value under
//      $OTSDAQ_LOG_ROOT.
//   4. Flatten the Monitor FHiCL for this supervisor and symlink the flat file
//      to a "<node value>.fcl" path.
// On std::runtime_error the message is logged, stored on the state machine and
// a toolbox::fsm::exception::Exception is thrown.
// The toolbox::Event::Reference parameter is unnamed, i.e. unused by the body.
77 void ots::ARTDAQOnlineMonitorSupervisor::transitionConfiguring(
78 toolbox::Event::Reference )
80 CoreSupervisorBase::configureInit();
84 ConfigurationTree theSupervisorNode = getSupervisorTableNode();
// MonitorID is stored as the rank later exported through ARTDAQ_RANK in RunArt.
85 om_rank_ = theSupervisorNode.getNode(
"MonitorID").getValue<
int>();
87 __SUP_COUT__ <<
"Building configuration directory" << __E__;
// Error-code sink; presumably passed to the boost::filesystem calls so that
// failures are ignored instead of thrown (the argument lines are not shown).
89 boost::system::error_code ignored;
93 boost::filesystem::create_directory(
94 std::string(__ENV__(
"OTSDAQ_LOG_ROOT")) +
"/" + theSupervisorNode.getValue(),
100 ARTDAQTableBase::flattenFHICL(ARTDAQTableBase::ARTDAQAppType::Monitor,
101 theSupervisorNode.getValue());
// Link the flattened FHiCL file to the ".fcl" path built below.
// NOTE(review): the return value of symlink() is not checked in the visible
// lines.
103 symlink(ARTDAQTableBase::getFlatFHICLFilename(
104 ARTDAQTableBase::ARTDAQAppType::Monitor, theSupervisorNode.getValue())
107 theSupervisorNode.getValue() +
".fcl")
// Tail of an assignment whose left-hand side is not shown; presumably this is
// config_file_name_, which the Starting/Resuming handlers pass to
// StartArtProcess -- confirm.
110 theSupervisorNode.getValue() +
".fcl";
// std::runtime_error handler: log, record on the state machine, then throw an
// FSM exception tagged with this transition's name.
112 catch(
const std::runtime_error& e)
114 __SS__ <<
"Error configuring the ARTDAQOnlineMonitorSupervisor! \n"
115 << e.what() << __E__;
116 __SUP_COUT_ERR__ << ss.str();
120 theStateMachine_.setErrorMessage(ss.str());
121 throw toolbox::fsm::exception::Exception(
124 "ARTDAQOnlineMonitorSupervisor::transitionConfiguring" ,
// Starting transition handler: launches the art process with the stored
// config_file_name_ via StartArtProcess(), bracketed by "Starting..." /
// "Started." log lines.
// The toolbox::Event::Reference parameter is unnamed, i.e. unused by the body.
132 void ots::ARTDAQOnlineMonitorSupervisor::transitionStarting(
133 toolbox::Event::Reference )
136 __SUP_COUT__ <<
"Starting..." << __E__;
138 StartArtProcess(config_file_name_);
140 __SUP_COUT__ <<
"Started." << __E__;
// std::runtime_error handler: builds a message that includes e.what().
142 catch(
const std::runtime_error& e)
144 __SS__ <<
"Error was caught while Starting: " << e.what() << __E__;
// Generic-failure path (presumably inside a catch-all handler whose line is
// not shown): builds a generic message and hands it to
// artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
149 __SS__ <<
"Unknown error was caught while Starting. Please checked the logs."
151 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// Stopping transition handler: terminates the art process via
// ShutdownArtProcess(), bracketed by "Stopping..." / "Stopped." log lines.
// The toolbox::Event::Reference parameter is unnamed, i.e. unused by the body.
156 void ots::ARTDAQOnlineMonitorSupervisor::transitionStopping(
157 toolbox::Event::Reference )
160 __SUP_COUT__ <<
"Stopping..." << __E__;
162 ShutdownArtProcess();
164 __SUP_COUT__ <<
"Stopped." << __E__;
// std::runtime_error handler: builds a message that includes e.what().
166 catch(
const std::runtime_error& e)
168 __SS__ <<
"Error was caught while Stopping: " << e.what() << __E__;
// Generic-failure path (presumably inside a catch-all handler whose line is
// not shown): builds a generic message and hands it to
// artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
173 __SS__ <<
"Unknown error was caught while Stopping. Please checked the logs."
175 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// Pausing transition handler: pausing is implemented by shutting the art
// process down (ShutdownArtProcess()), bracketed by "Pausing..." / "Paused."
// log lines. The Resuming handler starts a new art process again.
// The toolbox::Event::Reference parameter is unnamed, i.e. unused by the body.
180 void ots::ARTDAQOnlineMonitorSupervisor::transitionPausing(
181 toolbox::Event::Reference )
184 __SUP_COUT__ <<
"Pausing..." << __E__;
186 ShutdownArtProcess();
188 __SUP_COUT__ <<
"Paused." << __E__;
// std::runtime_error handler: builds a message that includes e.what().
190 catch(
const std::runtime_error& e)
192 __SS__ <<
"Error was caught while Pausing: " << e.what() << __E__;
// Generic-failure path (presumably inside a catch-all handler whose line is
// not shown): builds a generic message and hands it to
// artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
197 __SS__ <<
"Unknown error was caught while Pausing. Please checked the logs." << __E__;
198 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// Resuming transition handler: relaunches the art process with the stored
// config_file_name_ via StartArtProcess(), bracketed by "Resuming..." /
// "Resumed." log lines. Counterpart of the Pausing handler, which shuts the
// process down.
// The toolbox::Event::Reference parameter is unnamed, i.e. unused by the body.
203 void ots::ARTDAQOnlineMonitorSupervisor::transitionResuming(
204 toolbox::Event::Reference )
207 __SUP_COUT__ <<
"Resuming..." << __E__;
209 StartArtProcess(config_file_name_);
211 __SUP_COUT__ <<
"Resumed." << __E__;
// std::runtime_error handler: builds a message that includes e.what().
213 catch(
const std::runtime_error& e)
215 __SS__ <<
"Error was caught while Resuming: " << e.what() << __E__;
// Generic-failure path (presumably inside a catch-all handler whose line is
// not shown): builds a generic message and hands it to
// artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
220 __SS__ <<
"Unknown error was caught while Resuming. Please checked the logs."
222 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// NOTE(review): the first signature line of this definition is missing from
// this listing. The log strings and the transitionName constant below indicate
// it is the state machine's Halting transition handler -- confirm against the
// header.
// The toolbox::Event::Reference parameter is unnamed, i.e. unused by the body.
228 toolbox::Event::Reference )
// Terminate the art process, bracketed by "Halting..." / "Halted." logs.
231 __SUP_COUT__ <<
"Halting..." << __E__;
233 ShutdownArtProcess();
235 __SUP_COUT__ <<
"Halted." << __E__;
// std::runtime_error handler.
237 catch(
const std::runtime_error& e)
239 const std::string transitionName =
"Halting";
// If the state machine's provenance state is FAILED or HALTED, the error is
// only reported at info level.
// NOTE(review): the message always prints FAILED_STATE_NAME, even when the
// condition matched on HALTED_STATE_NAME.
241 if(theStateMachine_.getProvenanceStateName() ==
242 RunControlStateMachine::FAILED_STATE_NAME ||
243 theStateMachine_.getProvenanceStateName() ==
244 RunControlStateMachine::HALTED_STATE_NAME)
246 __SUP_COUT_INFO__ <<
"Error was caught while halting (but ignoring because "
247 "previous state was '"
248 << RunControlStateMachine::FAILED_STATE_NAME
249 <<
"'): " << e.what() << __E__;
// Reporting path (presumably the else branch; that line is not shown): build
// the message, log it as an error, store it on the state machine and throw an
// FSM exception.
// NOTE(review): the origin string here says "...SupervisorBase::transition"
// while the generic-failure path below says "...Supervisor::transition".
253 __SUP_SS__ <<
"Error was caught while " << transitionName <<
": " << e.what()
255 __SUP_COUT_ERR__ <<
"\n" << ss.str();
256 theStateMachine_.setErrorMessage(ss.str());
257 throw toolbox::fsm::exception::Exception(
260 "ots::ARTDAQOnlineMonitorSupervisorBase::transition" +
// Generic-failure path (presumably inside a catch-all handler whose line is
// not shown); mirrors the std::runtime_error handling above.
269 const std::string transitionName =
"Halting";
271 if(theStateMachine_.getProvenanceStateName() ==
272 RunControlStateMachine::FAILED_STATE_NAME ||
273 theStateMachine_.getProvenanceStateName() ==
274 RunControlStateMachine::HALTED_STATE_NAME)
276 __SUP_COUT_INFO__ <<
"Unknown error was caught while halting (but ignoring "
277 "because previous state was '"
278 << RunControlStateMachine::FAILED_STATE_NAME <<
"')." << __E__;
// Reporting path: log, store on the state machine, pass the message to
// artdaq::ExceptionHandler (no rethrow) and then throw an FSM exception.
282 __SUP_SS__ <<
"Unknown error was caught while " << transitionName
283 <<
". Please checked the logs." << __E__;
284 __SUP_COUT_ERR__ <<
"\n" << ss.str();
285 theStateMachine_.setErrorMessage(ss.str());
287 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
289 throw toolbox::fsm::exception::Exception(
292 "ots::ARTDAQOnlineMonitorSupervisor::transition" + transitionName ,
// Error-state entry handler: shuts the art process down via
// ShutdownArtProcess(), bracketed by "Entering error recovery state" /
// "EnteringError DONE." log lines.
// The toolbox::Event::Reference parameter is unnamed, i.e. unused by the body.
300 void ots::ARTDAQOnlineMonitorSupervisor::enteringError(
301 toolbox::Event::Reference )
303 __SUP_COUT__ <<
"Entering error recovery state" << __E__;
305 ShutdownArtProcess();
307 __SUP_COUT__ <<
"EnteringError DONE." << __E__;
// RunArt: launches the "art" executable with "-c <config_file>", waits for it
// to exit, reports how it exited, and repeats for as long as restart_art_ is
// true (see the trailing while(restart_art_)).
//
// config_file: path handed to art after the "-c" flag.
// pid_out:     shared atomic PID slot. It is not referenced in the lines
//              visible in this listing; presumably it receives the child's PID
//              so StartArtProcess/ShutdownArtProcess can observe it -- confirm.
//
// NOTE(review): many lines (fork, braces, preprocessor guards, case labels)
// are missing from this listing; comments below describe only what is visible.
313 void ots::ARTDAQOnlineMonitorSupervisor::RunArt(
314 const std::string& config_file,
const std::shared_ptr<std::atomic<pid_t>>& pid_out)
// Timestamp used below to compute how long this art instance lived.
318 auto start_time = std::chrono::steady_clock::now();
319 TLOG(TLVL_INFO) <<
"Starting art process with config file " << config_file;
// Mutable heap copy of the config path, needed because the argv vector holds
// char* elements. The value assigned to the final element is cut off in this
// listing; presumably the terminating '\0', as done for debugArg below.
// NOTE(review): raw new[] -- no matching release is visible in this listing.
323 char* filename =
new char[config_file.length() + 1];
324 memcpy(filename, config_file.c_str(), config_file.length());
325 filename[config_file.length()] =
// Extra "--config-out=<app_name>_art.out" argument, copied into a mutable,
// explicitly NUL-terminated buffer.
329 std::string debugArgS =
"--config-out=" + app_name +
"_art.out";
330 char* debugArg =
new char[debugArgS.length() + 1];
331 memcpy(debugArg, debugArgS.c_str(), debugArgS.length());
332 debugArg[debugArgS.length()] =
'\0';
// Two definitions of args appear; presumably they sit in alternative
// preprocessor branches (with / without the debug argument) whose guard lines
// are not shown -- confirm.
334 std::vector<char*> args{
const_cast<char*
>(
"art"),
335 const_cast<char*
>(
"-c"),
340 std::vector<char*> args{
341 const_cast<char*
>(
"art"),
342 const_cast<char*
>(
"-c"),
// Export ARTDAQ_PARTITION_NUMBER (from partition_), overwriting any existing
// value. A failure is logged at error level.
352 std::string envVarKey =
"ARTDAQ_PARTITION_NUMBER";
353 std::string envVarValue = std::to_string(partition_);
354 if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
356 TLOG(TLVL_ERROR) <<
"Error setting environment variable \"" << envVarKey
357 <<
"\" in the environment of a child art process. "
358 <<
"This may result in incorrect TCP port number "
359 <<
"assignments or other issues, and data may "
360 <<
"not flow through the system correctly.";
// Export ARTDAQ_APPLICATION_NAME (the supervisor table node's value).
// NOTE(review): a failure here is only logged at debug level, unlike the
// partition variable above.
362 envVarKey =
"ARTDAQ_APPLICATION_NAME";
363 envVarValue = getSupervisorTableNode().getValue();
364 if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
366 TLOG(TLVL_DEBUG) <<
"Error setting environment variable \"" << envVarKey
367 <<
"\" in the environment of a child art process. ";
// Export ARTDAQ_RANK (om_rank_, read from "MonitorID" during configuring).
// A failure is likewise only logged at debug level.
369 envVarKey =
"ARTDAQ_RANK";
370 envVarValue = std::to_string(om_rank_);
371 if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
373 TLOG(TLVL_DEBUG) <<
"Error setting environment variable \"" << envVarKey
374 <<
"\" in the environment of a child art process. ";
// Run "art" (looked up via PATH by execvp) with the argument vector built
// above.
377 execvp(
"art", &args[0]);
385 TLOG(TLVL_INFO) <<
"PID of new art process is " << pid;
// Block until the process identified by pid has exited; exit details are
// delivered in status.
387 auto sts = waitid(P_PID, pid, &status, WEXITED);
388 TLOG(TLVL_INFO) <<
"Removing PID " << pid <<
" from process list";
// waitid failure report (the condition line guarding it is not shown).
391 TLOG(TLVL_WARNING) <<
"Error occurred in waitid for art process " << pid
392 <<
": " << errno <<
" (" << strerror(errno) <<
").";
// Clean exit: the child exited on its own with status 0.
394 else if(status.si_code == CLD_EXITED && status.si_status == 0)
396 TLOG(TLVL_INFO) <<
"art process " << pid <<
" exited normally, "
397 << (restart_art_ ?
"restarting" :
"not restarting");
// Abnormal exit: an art instance that lived for less than
// minimum_art_lifetime_s_ disables further restarts.
401 auto art_lifetime = artdaq::TimeUtils::GetElapsedTime(start_time);
402 if(art_lifetime < minimum_art_lifetime_s_)
404 restart_art_ =
false;
// Human-readable description of the exit, selected by status.si_code (the
// case labels are not shown in this listing).
407 auto exit_type =
"exited with status code";
408 switch(status.si_code)
412 exit_type =
"was killed with signal";
// Severity is WARNING when a restart will follow, ERROR otherwise.
419 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
420 <<
"art process " << pid <<
" " << exit_type <<
" " << status.si_status
421 << (status.si_code == CLD_DUMPED ?
" (core dumped)" :
"")
422 <<
" after running for " << std::setprecision(2) << std::fixed
423 << art_lifetime <<
" seconds, "
424 << (restart_art_ ?
"restarting" :
"not restarting");
// Repeat the launch/wait cycle for as long as restarting remains enabled.
426 }
while(restart_art_);
// StartArtProcess: spawns a thread running RunArt(config_file, art_pid_) and
// waits for art_pid_ to become positive, reporting a problem if that has not
// happened within the 5 s limit named in the messages below.
//
// config_file: path of the art configuration file; copied into the thread's
//              lambda.
431 void ots::ARTDAQOnlineMonitorSupervisor::StartArtProcess(
const std::string& config_file)
// Function-local static mutex: only one caller at a time proceeds past here.
433 static std::mutex start_art_mutex;
434 std::unique_lock<std::mutex> lk(start_art_mutex);
// Reset the restart flag from should_restart_art_ (ShutdownArtProcess and
// RunArt may have cleared restart_art_).
436 restart_art_ = should_restart_art_;
437 auto startTime = std::chrono::steady_clock::now();
// Fresh PID slot, initialized to -1 ("no process yet" for the checks below).
439 art_pid_ = std::shared_ptr<std::atomic<pid_t>>(
new std::atomic<pid_t>(-1));
// The lambda captures this and a copy of config_file.
// NOTE(review): whether the thread is detached or joined is not visible in
// this listing.
440 boost::thread thread([
this, config_file] { RunArt(config_file, art_pid_); });
// Wait while no positive PID has been published and fewer than 5 time units
// (seconds, per the messages below) have elapsed. The loop body and the
// timeout condition that follows are not shown in this listing.
443 while((*art_pid_ <= 0) && (artdaq::TimeUtils::GetElapsedTime(startTime) < 5))
450 <<
"art process has not started after 5s. Check art configuration!"
451 <<
" (pid=" << *art_pid_ <<
")";
452 __SS__ <<
"art process has not started after 5s. Check art configuration!"
453 <<
" (pid=" << *art_pid_ <<
")" << __E__;
// Success path: report how long startup took.
// NOTE(review): std::setw(4) << std::fixed applies to the string that follows;
// std::setprecision may have been intended -- confirm.
457 TLOG(TLVL_INFO) << std::setw(4) << std::fixed <<
"art initialization took "
458 << artdaq::TimeUtils::GetElapsedTime(startTime) <<
" seconds.";
// ShutdownArtProcess: stops the art process tracked by art_pid_, escalating
// through the stages visible below:
//   1. wait up to graceful_wait_ms for it to exit on its own,
//   2. send SIGQUIT and wait up to gentle_wait_ms,
//   3. send SIGINT and wait up to int_wait_ms,
//   4. send SIGKILL.
// NOTE(review): the bodies of the wait loops are not shown in this listing.
// The loop bounds equal the millisecond budgets, so each iteration presumably
// sleeps about 1 ms and re-runs check_pid -- confirm.
463 void ots::ARTDAQOnlineMonitorSupervisor::ShutdownArtProcess()
// Clear the restart flag first so RunArt's while(restart_art_) loop does not
// relaunch art once it exits.
465 restart_art_ =
false;
// Helper that inspects art_pid_: handles the "no PID recorded" case (null
// pointer or non-positive value) and probes the process with kill(pid, 0),
// which performs error checking without delivering a signal. The branch
// bodies and return values are not shown in this listing.
469 auto check_pid = [&]() {
470 if(art_pid_ ==
nullptr || *art_pid_ <= 0)
474 else if(kill(*art_pid_, 0) < 0)
483 TLOG(14) <<
"art process already exited, nothing to do.";
// Reference point for the "exited after ... ms" reports below.
488 auto shutdown_start = std::chrono::steady_clock::now();
// Per-stage wait budgets in milliseconds: 10 s, 2 s and 1 s.
490 int graceful_wait_ms = 1000 * 10;
491 int gentle_wait_ms = 1000 * 2;
492 int int_wait_ms = 1000;
// Stage 1: give art the chance to exit without any signal.
494 TLOG(TLVL_TRACE) <<
"Waiting up to " << graceful_wait_ms
495 <<
" ms for art process to exit gracefully";
496 for(
int ii = 0; ii < graceful_wait_ms; ++ii)
502 TLOG(TLVL_INFO) <<
"art process exited after "
503 << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
// Stage 2: SIGQUIT, then wait again.
511 TLOG(TLVL_TRACE) <<
"Gently informing art process that it is time to shut down";
513 TLOG(TLVL_TRACE) <<
"Sending SIGQUIT to pid " << *art_pid_;
514 kill(*art_pid_, SIGQUIT);
517 TLOG(TLVL_TRACE) <<
"Waiting up to " << gentle_wait_ms
518 <<
" ms for art process to exit from SIGQUIT";
519 for(
int ii = 0; ii < gentle_wait_ms; ++ii)
525 TLOG(TLVL_INFO) <<
"art process exited after "
526 << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
// Stage 3: SIGINT, then wait again.
534 TLOG(TLVL_TRACE) <<
"Insisting that the art process shut down";
535 kill(*art_pid_, SIGINT);
538 TLOG(TLVL_TRACE) <<
"Waiting up to " << int_wait_ms
539 <<
" ms for art process to exit from SIGINT";
540 for(
int ii = 0; ii < int_wait_ms; ++ii)
546 TLOG(TLVL_INFO) <<
"art process exited after "
547 << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
// Stage 4: last resort, SIGKILL.
554 TLOG(TLVL_TRACE) <<
"Killing art process with extreme prejudice";
558 kill(*art_pid_, SIGKILL);
562 TLOG(TLVL_INFO) <<
"art process exited after "
563 << artdaq::TimeUtils::GetElapsedTimeMilliseconds(shutdown_start)
virtual void transitionHalting(toolbox::Event::Reference event) override
virtual void transitionInitializing(toolbox::Event::Reference event) override
static const std::string ARTDAQ_FCL_PATH
Tree-path rule is, if the last link in the path is a group link with a specified group ID,...
static void outputOnlineMonitorFHICL(const ConfigurationTree &onlineMonitorNode)
void INIT_MF(const char *name)