1 #include "otsdaq/ARTDAQOnlineMonitor/ARTDAQOnlineMonitorSupervisor.h"
3 #include "otsdaq/TablePlugins/ARTDAQTableBase/ARTDAQTableBase.h"
5 #include "artdaq-core/Utilities/ExceptionHandler.hh"
6 #include "artdaq-core/Utilities/TimeUtils.hh"
10 #include <boost/filesystem.hpp>
11 #include <boost/thread.hpp>
// Directory name appended to ARTDAQTableBase::ARTDAQ_FCL_PATH by transitionConfiguring.
// That directory is created there and holds the <supervisor name>.fcl symlink whose path is
// stored in config_file_name_ and later handed to StartArtProcess.
15 #define FAKE_CONFIG_NAME "ots_config"
// Constructor.
// Forwards the XDAQ application stub to CoreSupervisorBase and initializes partition_ from the
// supervisor property named partition. The literal 0 is the second argument of
// getSupervisorProperty -- presumably the fallback value when the property is absent; confirm
// against that helper.
// The visible body only logs on entry and on exit. Any statements between the two log lines
// are not shown in this excerpt.
18 ots::ARTDAQOnlineMonitorSupervisor::ARTDAQOnlineMonitorSupervisor(
19 xdaq::ApplicationStub* stub)
20 : CoreSupervisorBase(stub), partition_(getSupervisorProperty(
"partition", 0))
22 __SUP_COUT__ <<
"Constructor." << __E__;
26 __SUP_COUT__ <<
"Constructed." << __E__;
// Destructor.
// The visible body logs on entry and on exit. Whatever cleanup runs between the two log lines
// (if any) is not shown in this excerpt.
30 ots::ARTDAQOnlineMonitorSupervisor::~ARTDAQOnlineMonitorSupervisor(
void)
32 __SUP_COUT__ <<
"Destructor." << __E__;
34 __SUP_COUT__ <<
"Destructed." << __E__;
// destroy
// Takes no arguments and returns nothing. The visible body only emits a log line before and
// after the teardown section; the teardown statements themselves are not shown in this excerpt.
39 void ots::ARTDAQOnlineMonitorSupervisor::destroy(
void)
41 __SUP_COUT__ <<
"Destroying..." << __E__;
43 __SUP_COUT__ <<
"Destroyed." << __E__;
// init
// Takes no arguments and returns nothing. The visible body only emits a log line before and
// after the initialization section; no other statements are shown in this excerpt.
47 void ots::ARTDAQOnlineMonitorSupervisor::init(
void)
49 __SUP_COUT__ <<
"Initializing..." << __E__;
51 __SUP_COUT__ <<
"Initialized." << __E__;
// State machine transition handler taking an unnamed toolbox::Event::Reference.
// NOTE(review): the line carrying the function name is missing from this excerpt; the log
// text suggests this is the Initializing transition -- confirm against the header.
// Normal path: logs entry and exit.
// std::runtime_error path: formats a message that includes e.what(). What is done with the
// message afterwards is not visible here.
// Remaining error path (its catch clause is not visible): formats a generic message and hands
// it to artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
56 toolbox::Event::Reference )
59 __SUP_COUT__ <<
"Initializing..." << __E__;
61 __SUP_COUT__ <<
"Initialized." << __E__;
63 catch(
const std::runtime_error& e)
65 __SS__ <<
"Error was caught while Initializing: " << e.what() << __E__;
70 __SS__ <<
"Unknown error was caught while Initializing. Please checked the logs."
72 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// transitionConfiguring
// State machine transition handler taking an unnamed toolbox::Event::Reference.
// Visible steps, in order:
//   1. read the table group name and key out of the current state machine message,
//   2. load that table group through theConfigurationManager_,
//   3. read MonitorID from the supervisor table node into om_rank_,
//   4. create the output directories, flatten the Monitor FHICL, symlink it and remember
//      the symlink path in config_file_name_.
// Each visible error handler logs the message, stores it with
// theStateMachine_.setErrorMessage and throws toolbox::fsm::exception::Exception.
// NOTE(review): the try blocks, braces and several call arguments are missing from this
// excerpt, so the exact scope of each handler cannot be confirmed from here.
77 void ots::ARTDAQOnlineMonitorSupervisor::transitionConfiguring(
78 toolbox::Event::Reference )
// Pair of (group name, group key), both taken from the current SOAP message parameters.
80 std::pair<std::string , TableGroupKey> theGroup(
81 SOAPUtilities::translate(theStateMachine_.getCurrentMessage())
83 .getValue(
"ConfigurationTableGroupName"),
84 TableGroupKey(SOAPUtilities::translate(theStateMachine_.getCurrentMessage())
86 .getValue(
"ConfigurationTableGroupKey")));
88 __SUP_COUT__ <<
"Configuration table group name: " << theGroup.first
89 <<
" key: " << theGroup.second << std::endl;
// Load the table group. Only the ALL_TYPES load-type argument is visible; the other
// arguments are not shown in this excerpt.
94 theConfigurationManager_->loadTableGroup(
107 ConfigurationManager::LoadGroupType::ALL_TYPES,
// Load failure reported as std::runtime_error: log, record on the state machine, rethrow as
// an FSM exception.
110 catch(
const std::runtime_error& e)
112 __SS__ <<
"Error loading table group '" << theGroup.first <<
"("
113 << theGroup.second <<
")! \n"
114 << e.what() << __E__;
115 __SUP_COUT_ERR__ << ss.str();
119 theStateMachine_.setErrorMessage(ss.str());
120 throw toolbox::fsm::exception::Exception(
123 "ARTDAQOnlineMonitorSupervisor::transitionConfiguring" ,
// Same handling for any other load failure (the catch clause itself is not visible).
130 __SS__ <<
"Unknown error loading table group '" << theGroup.first <<
"("
131 << theGroup.second <<
")!" << __E__;
132 __SUP_COUT_ERR__ << ss.str();
136 theStateMachine_.setErrorMessage(ss.str());
137 throw toolbox::fsm::exception::Exception(
140 "ARTDAQOnlineMonitorSupervisor::transitionConfiguring" ,
// MonitorID from the supervisor table node is kept in om_rank_; RunArt exports it as
// ARTDAQ_RANK.
148 ConfigurationTree theSupervisorNode = getSupervisorTableNode();
149 om_rank_ = theSupervisorNode.getNode(
"MonitorID").getValue<
int>();
151 __SUP_COUT__ <<
"Building configuration directory" << __E__;
// error_code named ignored: presumably passed to create_directory so that a failure (for
// example an already existing directory) is not reported -- the argument is not visible here.
153 boost::system::error_code ignored;
// Directory at OTSDAQ_LOG_ROOT/<supervisor node value>.
157 boost::filesystem::create_directory(
158 std::string(__ENV__(
"OTSDAQ_LOG_ROOT")) +
"/" + theSupervisorNode.getValue(),
// Directory ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME, mode 0755. The return value of mkdir is
// discarded.
160 mkdir((ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME).c_str(), 0755);
// Produce the flattened FHICL for this Monitor application.
164 ARTDAQTableBase::flattenFHICL(ARTDAQTableBase::ARTDAQAppType::Monitor,
165 theSupervisorNode.getValue());
// Symlink: first argument (link target) is the flat FHICL file name, second argument (link
// path) is ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME/<supervisor node value>.fcl.
// The tail of each argument expression is not shown in this excerpt.
167 symlink(ARTDAQTableBase::getFlatFHICLFilename(
168 ARTDAQTableBase::ARTDAQAppType::Monitor, theSupervisorNode.getValue())
170 (ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME +
"/" +
171 theSupervisorNode.getValue() +
".fcl")
// Remember the symlink path; transitionStarting and transitionResuming pass it to
// StartArtProcess.
173 config_file_name_ = ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME +
"/" +
174 theSupervisorNode.getValue() +
".fcl";
// Failure while building the configuration: log, record on the state machine, rethrow as an
// FSM exception.
176 catch(
const std::runtime_error& e)
178 __SS__ <<
"Error configuring the ARTDAQOnlineMonitorSupervisor! \n"
179 << e.what() << __E__;
180 __SUP_COUT_ERR__ << ss.str();
184 theStateMachine_.setErrorMessage(ss.str());
185 throw toolbox::fsm::exception::Exception(
188 "ARTDAQOnlineMonitorSupervisor::transitionConfiguring" ,
// transitionStarting
// State machine transition handler taking an unnamed toolbox::Event::Reference.
// Launches the art process through StartArtProcess, using the config file path stored in
// config_file_name_ by transitionConfiguring, and logs before and after.
// std::runtime_error path: formats a message that includes e.what(). What is done with the
// message afterwards is not visible here.
// Remaining error path (its catch clause is not visible): formats a generic message and hands
// it to artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
196 void ots::ARTDAQOnlineMonitorSupervisor::transitionStarting(
197 toolbox::Event::Reference )
200 __SUP_COUT__ <<
"Starting..." << __E__;
202 StartArtProcess(config_file_name_);
204 __SUP_COUT__ <<
"Started." << __E__;
206 catch(
const std::runtime_error& e)
208 __SS__ <<
"Error was caught while Starting: " << e.what() << __E__;
213 __SS__ <<
"Unknown error was caught while Starting. Please checked the logs."
215 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// transitionStopping
// State machine transition handler taking an unnamed toolbox::Event::Reference.
// Stops the art process through ShutdownArtProcess and logs before and after.
// std::runtime_error path: formats a message that includes e.what(). What is done with the
// message afterwards is not visible here.
// Remaining error path (its catch clause is not visible): formats a generic message and hands
// it to artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
220 void ots::ARTDAQOnlineMonitorSupervisor::transitionStopping(
221 toolbox::Event::Reference )
224 __SUP_COUT__ <<
"Stopping..." << __E__;
226 ShutdownArtProcess();
228 __SUP_COUT__ <<
"Stopped." << __E__;
230 catch(
const std::runtime_error& e)
232 __SS__ <<
"Error was caught while Stopping: " << e.what() << __E__;
237 __SS__ <<
"Unknown error was caught while Stopping. Please checked the logs."
239 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// transitionPausing
// State machine transition handler taking an unnamed toolbox::Event::Reference.
// Pausing makes the same ShutdownArtProcess call as transitionStopping; transitionResuming
// makes the matching StartArtProcess call. Logs before and after.
// std::runtime_error path: formats a message that includes e.what(). What is done with the
// message afterwards is not visible here.
// Remaining error path (its catch clause is not visible): formats a generic message and hands
// it to artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
244 void ots::ARTDAQOnlineMonitorSupervisor::transitionPausing(
245 toolbox::Event::Reference )
248 __SUP_COUT__ <<
"Pausing..." << __E__;
250 ShutdownArtProcess();
252 __SUP_COUT__ <<
"Paused." << __E__;
254 catch(
const std::runtime_error& e)
256 __SS__ <<
"Error was caught while Pausing: " << e.what() << __E__;
261 __SS__ <<
"Unknown error was caught while Pausing. Please checked the logs." << __E__;
262 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// transitionResuming
// State machine transition handler taking an unnamed toolbox::Event::Reference.
// Launches the art process again through StartArtProcess with the stored config_file_name_
// (the same call transitionStarting makes) and logs before and after.
// std::runtime_error path: formats a message that includes e.what(). What is done with the
// message afterwards is not visible here.
// Remaining error path (its catch clause is not visible): formats a generic message and hands
// it to artdaq::ExceptionHandler with ExceptionHandlerRethrow::no.
267 void ots::ARTDAQOnlineMonitorSupervisor::transitionResuming(
268 toolbox::Event::Reference )
271 __SUP_COUT__ <<
"Resuming..." << __E__;
273 StartArtProcess(config_file_name_);
275 __SUP_COUT__ <<
"Resumed." << __E__;
277 catch(
const std::runtime_error& e)
279 __SS__ <<
"Error was caught while Resuming: " << e.what() << __E__;
284 __SS__ <<
"Unknown error was caught while Resuming. Please checked the logs."
286 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
// State machine transition handler taking an unnamed toolbox::Event::Reference.
// NOTE(review): the line carrying the function name is missing from this excerpt; the log
// text and the transitionName constant indicate the Halting transition.
// Normal path: stops the art process through ShutdownArtProcess and logs before and after.
// Error paths: when the state the machine came from (getProvenanceStateName) is the FAILED or
// the HALTED state, the error is only logged at info level. The remaining statements log at
// error level, store the message on the state machine and throw
// toolbox::fsm::exception::Exception -- presumably inside an else branch; the else keyword is
// not visible here.
292 toolbox::Event::Reference )
295 __SUP_COUT__ <<
"Halting..." << __E__;
297 ShutdownArtProcess();
299 __SUP_COUT__ <<
"Halted." << __E__;
301 catch(
const std::runtime_error& e)
303 const std::string transitionName =
"Halting";
305 if(theStateMachine_.getProvenanceStateName() ==
306 RunControlStateMachine::FAILED_STATE_NAME ||
307 theStateMachine_.getProvenanceStateName() ==
308 RunControlStateMachine::HALTED_STATE_NAME)
// NOTE(review): the message always prints FAILED_STATE_NAME, also when the condition matched
// on HALTED_STATE_NAME.
310 __SUP_COUT_INFO__ <<
"Error was caught while halting (but ignoring because "
311 "previous state was '"
312 << RunControlStateMachine::FAILED_STATE_NAME
313 <<
"'): " << e.what() << __E__;
317 __SUP_SS__ <<
"Error was caught while " << transitionName <<
": " << e.what()
319 __SUP_COUT_ERR__ <<
"\n" << ss.str();
320 theStateMachine_.setErrorMessage(ss.str());
321 throw toolbox::fsm::exception::Exception(
// NOTE(review): this origin string names ARTDAQOnlineMonitorSupervisorBase, while the
// handler further down uses ARTDAQOnlineMonitorSupervisor -- confirm which one is intended.
324 "ots::ARTDAQOnlineMonitorSupervisorBase::transition" +
// Handler for any other error (its catch clause is not visible); same policy as above.
333 const std::string transitionName =
"Halting";
335 if(theStateMachine_.getProvenanceStateName() ==
336 RunControlStateMachine::FAILED_STATE_NAME ||
337 theStateMachine_.getProvenanceStateName() ==
338 RunControlStateMachine::HALTED_STATE_NAME)
340 __SUP_COUT_INFO__ <<
"Unknown error was caught while halting (but ignoring "
341 "because previous state was '"
342 << RunControlStateMachine::FAILED_STATE_NAME <<
"')." << __E__;
346 __SUP_SS__ <<
"Unknown error was caught while " << transitionName
347 <<
". Please checked the logs." << __E__;
348 __SUP_COUT_ERR__ <<
"\n" << ss.str();
349 theStateMachine_.setErrorMessage(ss.str());
// Hand the message to the artdaq exception handler without rethrowing, then raise the FSM
// exception explicitly.
351 artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
353 throw toolbox::fsm::exception::Exception(
356 "ots::ARTDAQOnlineMonitorSupervisor::transition" + transitionName ,
// enteringError
// Handler taking an unnamed toolbox::Event::Reference. Stops the art process through
// ShutdownArtProcess and logs before and after. No error handling is visible in this excerpt.
364 void ots::ARTDAQOnlineMonitorSupervisor::enteringError(
365 toolbox::Event::Reference )
367 __SUP_COUT__ <<
"Entering error recovery state" << __E__;
369 ShutdownArtProcess();
371 __SUP_COUT__ <<
"EnteringError DONE." << __E__;
// RunArt
// Runs the art executable with the given configuration file, waits for it to exit and
// repeats while restart_art_ is true (see the while condition at the end of the block).
//   config_file  path passed to art; copied into a NUL-terminated char buffer for the
//                argument vector.
//   pid_out      shared atomic pid. NOTE(review): it is not used in any line visible in this
//                excerpt; presumably it receives the child pid -- confirm in the full source.
// NOTE(review): the fork call, the do keyword, the case labels of the switch and any
// delete[] of the buffers are not present in this excerpt.
377 void ots::ARTDAQOnlineMonitorSupervisor::RunArt(
378 const std::string& config_file,
const std::shared_ptr<std::atomic<pid_t>>& pid_out)
// Start timestamp; used further down to measure how long the art process lived.
382 auto start_time = std::chrono::steady_clock::now();
383 TLOG(TLVL_INFO) <<
"Starting art process with config file " << config_file;
// Writable copy of the config file name (execvp takes char* arguments).
// NOTE(review): raw new[] -- no matching delete[] is visible in this excerpt.
387 char* filename =
new char[config_file.length() + 1];
388 memcpy(filename, config_file.c_str(), config_file.length());
389 filename[config_file.length()] =
// Writable, NUL-terminated copy of the --config-out option. app_name is not defined in any
// visible line.
393 std::string debugArgS =
"--config-out=" + app_name +
"_art.out";
394 char* debugArg =
new char[debugArgS.length() + 1];
395 memcpy(debugArg, debugArgS.c_str(), debugArgS.length());
396 debugArg[debugArgS.length()] =
'\0';
// Two argument vectors are declared; presumably they live in alternative branches whose
// condition is not visible here.
398 std::vector<char*> args{
const_cast<char*
>(
"art"),
399 const_cast<char*
>(
"-c"),
404 std::vector<char*> args{
405 const_cast<char*
>(
"art"),
406 const_cast<char*
>(
"-c"),
// Environment for the art process: partition number, application name and rank.
// setenv is called with overwrite = 1; a failure is only logged.
416 std::string envVarKey =
"ARTDAQ_PARTITION_NUMBER";
417 std::string envVarValue = std::to_string(partition_);
418 if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
420 TLOG(TLVL_ERROR) <<
"Error setting environment variable \"" << envVarKey
421 <<
"\" in the environment of a child art process. "
422 <<
"This may result in incorrect TCP port number "
423 <<
"assignments or other issues, and data may "
424 <<
"not flow through the system correctly.";
426 envVarKey =
"ARTDAQ_APPLICATION_NAME";
427 envVarValue = getSupervisorTableNode().getValue();
428 if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
// NOTE(review): this failure and the next one are logged at TLVL_DEBUG, while the first one
// uses TLVL_ERROR.
430 TLOG(TLVL_DEBUG) <<
"Error setting environment variable \"" << envVarKey
431 <<
"\" in the environment of a child art process. ";
433 envVarKey =
"ARTDAQ_RANK";
434 envVarValue = std::to_string(om_rank_);
435 if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
437 TLOG(TLVL_DEBUG) <<
"Error setting environment variable \"" << envVarKey
438 <<
"\" in the environment of a child art process. ";
// Replace the current process image with art, searched on PATH.
441 execvp(
"art", &args[0]);
// From here on pid identifies the art process: wait until it has exited (WEXITED).
449 TLOG(TLVL_INFO) <<
"PID of new art process is " << pid;
451 auto sts = waitid(P_PID, pid, &status, WEXITED);
452 TLOG(TLVL_INFO) <<
"Removing PID " << pid <<
" from process list";
// waitid failure branch (its condition is not visible; presumably sts < 0).
455 TLOG(TLVL_WARNING) <<
"Error occurred in waitid for art process " << pid
456 <<
": " << errno <<
" (" << strerror(errno) <<
").";
// Clean exit: exit code 0.
458 else if(status.si_code == CLD_EXITED && status.si_status == 0)
460 TLOG(TLVL_INFO) <<
"art process " << pid <<
" exited normally, "
461 << (restart_art_ ?
"restarting" :
"not restarting");
// Abnormal exit: an art process that lived for less than minimum_art_lifetime_s_ is not
// restarted.
465 auto art_lifetime = artdaq::TimeUtils::GetElapsedTime(start_time);
466 if(art_lifetime < minimum_art_lifetime_s_)
468 restart_art_ =
false;
// Wording for the log line, selected by si_code (the case labels are not visible).
471 auto exit_type =
"exited with status code";
472 switch(status.si_code)
476 exit_type =
"was killed with signal";
// Warning when a restart follows, error when it does not.
483 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
484 <<
"art process " << pid <<
" " << exit_type <<
" " << status.si_status
485 << (status.si_code == CLD_DUMPED ?
" (core dumped)" :
"")
486 <<
" after running for " << std::setprecision(2) << std::fixed
487 << art_lifetime <<
" seconds, "
488 << (restart_art_ ?
"restarting" :
"not restarting");
// Loop again while a restart is requested. ShutdownArtProcess clears restart_art_.
490 }
while(restart_art_);
// StartArtProcess
// Starts RunArt on a separate boost::thread and waits for the art process to report a pid.
//   config_file  path of the configuration file; copied into the thread lambda.
// NOTE(review): the body of the wait loop, the timeout condition and any detach or join of
// the thread are not present in this excerpt.
495 void ots::ARTDAQOnlineMonitorSupervisor::StartArtProcess(
const std::string& config_file)
// Function-local static mutex held for the rest of the call, so concurrent callers are
// serialized.
497 static std::mutex start_art_mutex;
498 std::unique_lock<std::mutex> lk(start_art_mutex);
// Re-arm the restart flag from the configured policy and take the start timestamp.
500 restart_art_ = should_restart_art_;
501 auto startTime = std::chrono::steady_clock::now();
// Fresh shared pid holder, initialized to -1 (no process yet).
503 art_pid_ = std::shared_ptr<std::atomic<pid_t>>(
new std::atomic<pid_t>(-1));
// The lambda captures this and a copy of config_file.
504 boost::thread thread([
this, config_file] { RunArt(config_file, art_pid_); });
// Wait until the pid becomes positive or the elapsed time reaches 5 (seconds, going by the
// message text below).
507 while((*art_pid_ <= 0) && (artdaq::TimeUtils::GetElapsedTime(startTime) < 5))
// Timeout reporting: a log stream (its opening is not visible) and an __SS__ message with the
// same text.
514 <<
"art process has not started after 5s. Check art configuration!"
515 <<
" (pid=" << *art_pid_ <<
")";
516 __SS__ <<
"art process has not started after 5s. Check art configuration!"
517 <<
" (pid=" << *art_pid_ <<
")" << __E__;
// NOTE(review): std::setw(4) with std::fixed sets a field width, not a precision --
// presumably std::setprecision was intended; confirm.
521 TLOG(TLVL_INFO) << std::setw(4) << std::fixed <<
"art initialization took "
522 << artdaq::TimeUtils::GetElapsedTime(startTime) <<
" seconds.";
// ShutdownArtProcess
// Stops the art process in escalating stages:
//   1. wait up to graceful_wait_ms for it to exit on its own,
//   2. send SIGQUIT and wait up to gentle_wait_ms,
//   3. send SIGINT and wait up to int_wait_ms,
//   4. send SIGKILL.
// After each wait stage there is a log line reporting the elapsed milliseconds; presumably it
// is followed by an early return when the process is gone (the return is not visible).
// NOTE(review): the return statements of the lambda, the bodies of the wait loops and the
// tail of the last statement are not present in this excerpt.
527 void ots::ARTDAQOnlineMonitorSupervisor::ShutdownArtProcess()
// Clear the restart flag first so the loop in RunArt does not relaunch art.
529 restart_art_ =
false;
// Helper lambda that inspects art_pid_: no pid holder or a non-positive pid, or a kill with
// signal 0 that fails (signal 0 delivers nothing and only probes the process).
// The value returned in each case is not visible here.
533 auto check_pid = [&]() {
534 if(art_pid_ ==
nullptr || *art_pid_ <= 0)
538 else if(kill(*art_pid_, 0) < 0)
547 TLOG(14) <<
"art process already exited, nothing to do.";
552 auto shutdown_start = std::chrono::steady_clock::now();
// Stage budgets in milliseconds: 10000, 2000 and 1000.
554 int graceful_wait_ms = 1000 * 10;
555 int gentle_wait_ms = 1000 * 2;
556 int int_wait_ms = 1000;
// Stage 1. The loop bound counts milliseconds; presumably each iteration sleeps about 1 ms
// and polls check_pid (the loop body is not visible).
558 TLOG(TLVL_TRACE) <<
"Waiting up to " << graceful_wait_ms
559 <<
" ms for art process to exit gracefully";
560 for(
int ii = 0; ii < graceful_wait_ms; ++ii)
566 TLOG(TLVL_INFO) <<
"art process exited after "
567 << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
// Stage 2: SIGQUIT.
575 TLOG(TLVL_TRACE) <<
"Gently informing art process that it is time to shut down";
577 TLOG(TLVL_TRACE) <<
"Sending SIGQUIT to pid " << *art_pid_;
578 kill(*art_pid_, SIGQUIT);
581 TLOG(TLVL_TRACE) <<
"Waiting up to " << gentle_wait_ms
582 <<
" ms for art process to exit from SIGQUIT";
583 for(
int ii = 0; ii < gentle_wait_ms; ++ii)
589 TLOG(TLVL_INFO) <<
"art process exited after "
590 << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
// Stage 3: SIGINT.
598 TLOG(TLVL_TRACE) <<
"Insisting that the art process shut down";
599 kill(*art_pid_, SIGINT);
602 TLOG(TLVL_TRACE) <<
"Waiting up to " << int_wait_ms
603 <<
" ms for art process to exit from SIGINT";
604 for(
int ii = 0; ii < int_wait_ms; ++ii)
610 TLOG(TLVL_INFO) <<
"art process exited after "
611 << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
// Stage 4: SIGKILL, which the process cannot catch or ignore.
618 TLOG(TLVL_TRACE) <<
"Killing art process with extreme prejudice";
622 kill(*art_pid_, SIGKILL);
626 TLOG(TLVL_INFO) <<
"art process exited after "
627 << artdaq::TimeUtils::GetElapsedTimeMilliseconds(shutdown_start)
virtual void transitionHalting(toolbox::Event::Reference event) override
virtual void transitionInitializing(toolbox::Event::Reference event) override
static void outputOnlineMonitorFHICL(const ConfigurationTree &onlineMonitorNode)
void INIT_MF(const char *name)