// otsdaq  3.03.00
// ARTDAQOnlineMonitorSupervisor.cc
1 #include "otsdaq/ARTDAQOnlineMonitor/ARTDAQOnlineMonitorSupervisor.h"
2 
3 #include "otsdaq/TablePlugins/ARTDAQTableBase/ARTDAQTableBase.h"
4 
5 #include "artdaq-core/Utilities/ExceptionHandler.hh"
6 #include "artdaq-core/Utilities/TimeUtils.hh"
7 
8 #include <signal.h>
9 #include <sys/wait.h>
10 #include <boost/filesystem.hpp>
11 #include <boost/thread.hpp>
12 
13 XDAQ_INSTANTIATOR_IMPL(ots::ARTDAQOnlineMonitorSupervisor)
14 
15 #define FAKE_CONFIG_NAME "ots_config"
16 
17 //==============================================================================
18 ots::ARTDAQOnlineMonitorSupervisor::ARTDAQOnlineMonitorSupervisor(
19  xdaq::ApplicationStub* stub)
20  : CoreSupervisorBase(stub), partition_(getSupervisorProperty("partition", 0))
21 {
22  __SUP_COUT__ << "Constructor." << __E__;
23 
24  INIT_MF("." /*directory used is USER_DATA/LOG/.*/);
25 
26  __SUP_COUT__ << "Constructed." << __E__;
27 } // end constructor()
28 
29 //==============================================================================
30 ots::ARTDAQOnlineMonitorSupervisor::~ARTDAQOnlineMonitorSupervisor(void)
31 {
32  __SUP_COUT__ << "Destructor." << __E__;
33  destroy();
34  __SUP_COUT__ << "Destructed." << __E__;
35 } // end destructor()
36 
37 //==============================================================================
38 
39 void ots::ARTDAQOnlineMonitorSupervisor::destroy(void)
40 {
41  __SUP_COUT__ << "Destroying..." << __E__;
42 
43  __SUP_COUT__ << "Destroyed." << __E__;
44 } // end destroy()
45 
46 //==============================================================================
47 void ots::ARTDAQOnlineMonitorSupervisor::init(void)
48 {
49  __SUP_COUT__ << "Initializing..." << __E__;
50 
51  __SUP_COUT__ << "Initialized." << __E__;
52 } // end init()
53 
54 //==============================================================================
56  toolbox::Event::Reference /*event*/)
57 try
58 {
59  __SUP_COUT__ << "Initializing..." << __E__;
60  init();
61  __SUP_COUT__ << "Initialized." << __E__;
62 } // end transitionInitializing()
63 catch(const std::runtime_error& e)
64 {
65  __SS__ << "Error was caught while Initializing: " << e.what() << __E__;
66  __SS_THROW__;
67 }
68 catch(...)
69 {
70  __SS__ << "Unknown error was caught while Initializing. Please checked the logs."
71  << __E__;
72  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
73  __SS_THROW__;
74 } // end transitionInitializing() error handling
75 
76 //==============================================================================
77 void ots::ARTDAQOnlineMonitorSupervisor::transitionConfiguring(
78  toolbox::Event::Reference /*e*/)
79 {
80  std::pair<std::string /*group name*/, TableGroupKey> theGroup(
81  SOAPUtilities::translate(theStateMachine_.getCurrentMessage())
82  .getParameters()
83  .getValue("ConfigurationTableGroupName"),
84  TableGroupKey(SOAPUtilities::translate(theStateMachine_.getCurrentMessage())
85  .getParameters()
86  .getValue("ConfigurationTableGroupKey")));
87 
88  __SUP_COUT__ << "Configuration table group name: " << theGroup.first
89  << " key: " << theGroup.second << std::endl;
90 
91  try
92  {
93  //disable version tracking to accept untracked versions to be selected by the FSM transition source
94  theConfigurationManager_->loadTableGroup(
95  theGroup.first,
96  theGroup.second,
97  true /*doActivate*/,
98  0,
99  0,
100  0,
101  0,
102  0,
103  0,
104  false,
105  0,
106  0,
107  ConfigurationManager::LoadGroupType::ALL_TYPES,
108  true /*ignoreVersionTracking*/);
109  }
110  catch(const std::runtime_error& e)
111  {
112  __SS__ << "Error loading table group '" << theGroup.first << "("
113  << theGroup.second << ")! \n"
114  << e.what() << __E__;
115  __SUP_COUT_ERR__ << ss.str();
116  // ExceptionHandler(ExceptionHandlerRethrow::no, ss.str());
117 
118  //__SS_THROW_ONLY__;
119  theStateMachine_.setErrorMessage(ss.str());
120  throw toolbox::fsm::exception::Exception(
121  "Transition Error" /*name*/,
122  ss.str() /* message*/,
123  "ARTDAQOnlineMonitorSupervisor::transitionConfiguring" /*module*/,
124  __LINE__ /*line*/,
125  __FUNCTION__ /*function*/
126  );
127  }
128  catch(...)
129  {
130  __SS__ << "Unknown error loading table group '" << theGroup.first << "("
131  << theGroup.second << ")!" << __E__;
132  __SUP_COUT_ERR__ << ss.str();
133  // ExceptionHandler(ExceptionHandlerRethrow::no, ss.str());
134 
135  //__SS_THROW_ONLY__;
136  theStateMachine_.setErrorMessage(ss.str());
137  throw toolbox::fsm::exception::Exception(
138  "Transition Error" /*name*/,
139  ss.str() /* message*/,
140  "ARTDAQOnlineMonitorSupervisor::transitionConfiguring" /*module*/,
141  __LINE__ /*line*/,
142  __FUNCTION__ /*function*/
143  );
144  }
145 
146  try
147  {
148  ConfigurationTree theSupervisorNode = getSupervisorTableNode();
149  om_rank_ = theSupervisorNode.getNode("MonitorID").getValue<int>();
150 
151  __SUP_COUT__ << "Building configuration directory" << __E__;
152 
153  boost::system::error_code ignored;
154  // boost::filesystem::remove_all(ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME, ignored);
155 
156  // Make directory for art process logfiles
157  boost::filesystem::create_directory(
158  std::string(__ENV__("OTSDAQ_LOG_ROOT")) + "/" + theSupervisorNode.getValue(),
159  ignored);
160  mkdir((ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME).c_str(), 0755);
161 
162  // Generate Online Monitor FHICL
164  ARTDAQTableBase::flattenFHICL(ARTDAQTableBase::ARTDAQAppType::Monitor,
165  theSupervisorNode.getValue());
166 
167  symlink(ARTDAQTableBase::getFlatFHICLFilename(
168  ARTDAQTableBase::ARTDAQAppType::Monitor, theSupervisorNode.getValue())
169  .c_str(),
170  (ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME + "/" +
171  theSupervisorNode.getValue() + ".fcl")
172  .c_str());
173  config_file_name_ = ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME + "/" +
174  theSupervisorNode.getValue() + ".fcl";
175  }
176  catch(const std::runtime_error& e)
177  {
178  __SS__ << "Error configuring the ARTDAQOnlineMonitorSupervisor! \n"
179  << e.what() << __E__;
180  __SUP_COUT_ERR__ << ss.str();
181  // ExceptionHandler(ExceptionHandlerRethrow::no, ss.str());
182 
183  //__SS_THROW_ONLY__;
184  theStateMachine_.setErrorMessage(ss.str());
185  throw toolbox::fsm::exception::Exception(
186  "Transition Error" /*name*/,
187  ss.str() /* message*/,
188  "ARTDAQOnlineMonitorSupervisor::transitionConfiguring" /*module*/,
189  __LINE__ /*line*/,
190  __FUNCTION__ /*function*/
191  );
192  }
193 } //end transitionConfiguring()
194 
195 //==============================================================================
196 void ots::ARTDAQOnlineMonitorSupervisor::transitionStarting(
197  toolbox::Event::Reference /*event*/)
198 try
199 {
200  __SUP_COUT__ << "Starting..." << __E__;
201 
202  StartArtProcess(config_file_name_);
203 
204  __SUP_COUT__ << "Started." << __E__;
205 } // end transitionStarting()
206 catch(const std::runtime_error& e)
207 {
208  __SS__ << "Error was caught while Starting: " << e.what() << __E__;
209  __SS_THROW__;
210 }
211 catch(...)
212 {
213  __SS__ << "Unknown error was caught while Starting. Please checked the logs."
214  << __E__;
215  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
216  __SS_THROW__;
217 } // end transitionStarting() error handling
218 
219 //==============================================================================
220 void ots::ARTDAQOnlineMonitorSupervisor::transitionStopping(
221  toolbox::Event::Reference /*event*/)
222 try
223 {
224  __SUP_COUT__ << "Stopping..." << __E__;
225 
226  ShutdownArtProcess();
227 
228  __SUP_COUT__ << "Stopped." << __E__;
229 } // end transitionStopping()
230 catch(const std::runtime_error& e)
231 {
232  __SS__ << "Error was caught while Stopping: " << e.what() << __E__;
233  __SS_THROW__;
234 }
235 catch(...)
236 {
237  __SS__ << "Unknown error was caught while Stopping. Please checked the logs."
238  << __E__;
239  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
240  __SS_THROW__;
241 } // end transitionStopping() error handling
242 
243 //==============================================================================
244 void ots::ARTDAQOnlineMonitorSupervisor::transitionPausing(
245  toolbox::Event::Reference /*event*/)
246 try
247 {
248  __SUP_COUT__ << "Pausing..." << __E__;
249 
250  ShutdownArtProcess();
251 
252  __SUP_COUT__ << "Paused." << __E__;
253 } // end transitionPausing()
254 catch(const std::runtime_error& e)
255 {
256  __SS__ << "Error was caught while Pausing: " << e.what() << __E__;
257  __SS_THROW__;
258 }
259 catch(...)
260 {
261  __SS__ << "Unknown error was caught while Pausing. Please checked the logs." << __E__;
262  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
263  __SS_THROW__;
264 } // end transitionPausing() error handling
265 
266 //==============================================================================
267 void ots::ARTDAQOnlineMonitorSupervisor::transitionResuming(
268  toolbox::Event::Reference /*event*/)
269 try
270 {
271  __SUP_COUT__ << "Resuming..." << __E__;
272 
273  StartArtProcess(config_file_name_);
274 
275  __SUP_COUT__ << "Resumed." << __E__;
276 } // end transitionResuming()
277 catch(const std::runtime_error& e)
278 {
279  __SS__ << "Error was caught while Resuming: " << e.what() << __E__;
280  __SS_THROW__;
281 }
282 catch(...)
283 {
284  __SS__ << "Unknown error was caught while Resuming. Please checked the logs."
285  << __E__;
286  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
287  __SS_THROW__;
288 } // end transitionResuming() error handling
289 
290 //==============================================================================
292  toolbox::Event::Reference /*event*/)
293 try
294 {
295  __SUP_COUT__ << "Halting..." << __E__;
296 
297  ShutdownArtProcess();
298 
299  __SUP_COUT__ << "Halted." << __E__;
300 } // end transitionHalting()
301 catch(const std::runtime_error& e)
302 {
303  const std::string transitionName = "Halting";
304  // if halting from Failed state, then ignore errors
305  if(theStateMachine_.getProvenanceStateName() ==
306  RunControlStateMachine::FAILED_STATE_NAME ||
307  theStateMachine_.getProvenanceStateName() ==
308  RunControlStateMachine::HALTED_STATE_NAME)
309  {
310  __SUP_COUT_INFO__ << "Error was caught while halting (but ignoring because "
311  "previous state was '"
312  << RunControlStateMachine::FAILED_STATE_NAME
313  << "'): " << e.what() << __E__;
314  }
315  else // if not previously in Failed state, then fail
316  {
317  __SUP_SS__ << "Error was caught while " << transitionName << ": " << e.what()
318  << __E__;
319  __SUP_COUT_ERR__ << "\n" << ss.str();
320  theStateMachine_.setErrorMessage(ss.str());
321  throw toolbox::fsm::exception::Exception(
322  "Transition Error" /*name*/,
323  ss.str() /* message*/,
324  "ots::ARTDAQOnlineMonitorSupervisorBase::transition" +
325  transitionName /*module*/,
326  __LINE__ /*line*/,
327  __FUNCTION__ /*function*/
328  );
329  }
330 } // end transitionHalting() std::runtime_error exception handling
331 catch(...)
332 {
333  const std::string transitionName = "Halting";
334  // if halting from Failed state, then ignore errors
335  if(theStateMachine_.getProvenanceStateName() ==
336  RunControlStateMachine::FAILED_STATE_NAME ||
337  theStateMachine_.getProvenanceStateName() ==
338  RunControlStateMachine::HALTED_STATE_NAME)
339  {
340  __SUP_COUT_INFO__ << "Unknown error was caught while halting (but ignoring "
341  "because previous state was '"
342  << RunControlStateMachine::FAILED_STATE_NAME << "')." << __E__;
343  }
344  else // if not previously in Failed state, then fail
345  {
346  __SUP_SS__ << "Unknown error was caught while " << transitionName
347  << ". Please checked the logs." << __E__;
348  __SUP_COUT_ERR__ << "\n" << ss.str();
349  theStateMachine_.setErrorMessage(ss.str());
350 
351  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
352 
353  throw toolbox::fsm::exception::Exception(
354  "Transition Error" /*name*/,
355  ss.str() /* message*/,
356  "ots::ARTDAQOnlineMonitorSupervisor::transition" + transitionName /*module*/,
357  __LINE__ /*line*/,
358  __FUNCTION__ /*function*/
359  );
360  }
361 } // end transitionHalting() exception handling
362 
363 //==============================================================================
364 void ots::ARTDAQOnlineMonitorSupervisor::enteringError(
365  toolbox::Event::Reference /*event*/)
366 {
367  __SUP_COUT__ << "Entering error recovery state" << __E__;
368 
369  ShutdownArtProcess();
370 
371  __SUP_COUT__ << "EnteringError DONE." << __E__;
372 
373 } // end enteringError()
374 
375 //==============================================================================
376 
377 void ots::ARTDAQOnlineMonitorSupervisor::RunArt(
378  const std::string& config_file, const std::shared_ptr<std::atomic<pid_t>>& pid_out)
379 {
380  do
381  {
382  auto start_time = std::chrono::steady_clock::now();
383  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file;
384 
385  pid_t pid = 0;
386 
387  char* filename = new char[config_file.length() + 1];
388  memcpy(filename, config_file.c_str(), config_file.length());
389  filename[config_file.length()] =
390  '\0'; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
391 
392 #ifdef DEBUG_ART
393  std::string debugArgS = "--config-out=" + app_name + "_art.out";
394  char* debugArg = new char[debugArgS.length() + 1];
395  memcpy(debugArg, debugArgS.c_str(), debugArgS.length());
396  debugArg[debugArgS.length()] = '\0';
397 
398  std::vector<char*> args{const_cast<char*>("art"),
399  const_cast<char*>("-c"),
400  filename,
401  debugArg,
402  NULL}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
403 #else
404  std::vector<char*> args{
405  const_cast<char*>("art"),
406  const_cast<char*>("-c"),
407  filename,
408  nullptr}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
409 #endif
410 
411  pid = fork();
412  if(pid == 0)
413  { /* child */
414 
415  // Do any child environment setup here
416  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
417  std::string envVarValue = std::to_string(partition_);
418  if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
419  {
420  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
421  << "\" in the environment of a child art process. "
422  << "This may result in incorrect TCP port number "
423  << "assignments or other issues, and data may "
424  << "not flow through the system correctly.";
425  }
426  envVarKey = "ARTDAQ_APPLICATION_NAME";
427  envVarValue = getSupervisorTableNode().getValue();
428  if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
429  {
430  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
431  << "\" in the environment of a child art process. ";
432  }
433  envVarKey = "ARTDAQ_RANK";
434  envVarValue = std::to_string(om_rank_);
435  if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
436  {
437  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
438  << "\" in the environment of a child art process. ";
439  }
440 
441  execvp("art", &args[0]);
442  delete[] filename;
443  exit(1);
444  }
445  delete[] filename;
446 
447  *pid_out = pid;
448 
449  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
450  siginfo_t status;
451  auto sts = waitid(P_PID, pid, &status, WEXITED);
452  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
453  if(sts < 0)
454  {
455  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid
456  << ": " << errno << " (" << strerror(errno) << ").";
457  }
458  else if(status.si_code == CLD_EXITED && status.si_status == 0)
459  {
460  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, "
461  << (restart_art_ ? "restarting" : "not restarting");
462  }
463  else
464  {
465  auto art_lifetime = artdaq::TimeUtils::GetElapsedTime(start_time);
466  if(art_lifetime < minimum_art_lifetime_s_)
467  {
468  restart_art_ = false;
469  }
470 
471  auto exit_type = "exited with status code";
472  switch(status.si_code)
473  {
474  case CLD_DUMPED:
475  case CLD_KILLED:
476  exit_type = "was killed with signal";
477  break;
478  case CLD_EXITED:
479  default:
480  break;
481  }
482 
483  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
484  << "art process " << pid << " " << exit_type << " " << status.si_status
485  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
486  << " after running for " << std::setprecision(2) << std::fixed
487  << art_lifetime << " seconds, "
488  << (restart_art_ ? "restarting" : "not restarting");
489  }
490  } while(restart_art_);
491 }
492 
493 //==============================================================================
494 
495 void ots::ARTDAQOnlineMonitorSupervisor::StartArtProcess(const std::string& config_file)
496 {
497  static std::mutex start_art_mutex;
498  std::unique_lock<std::mutex> lk(start_art_mutex);
499  // TraceLock lk(start_art_mutex, 15, "StartArtLock");
500  restart_art_ = should_restart_art_;
501  auto startTime = std::chrono::steady_clock::now();
502 
503  art_pid_ = std::shared_ptr<std::atomic<pid_t>>(new std::atomic<pid_t>(-1));
504  boost::thread thread([this, config_file] { RunArt(config_file, art_pid_); });
505  thread.detach();
506 
507  while((*art_pid_ <= 0) && (artdaq::TimeUtils::GetElapsedTime(startTime) < 5))
508  {
509  usleep(10000);
510  }
511  if(*art_pid_ <= 0)
512  {
513  TLOG(TLVL_WARNING)
514  << "art process has not started after 5s. Check art configuration!"
515  << " (pid=" << *art_pid_ << ")";
516  __SS__ << "art process has not started after 5s. Check art configuration!"
517  << " (pid=" << *art_pid_ << ")" << __E__;
518  __SUP_SS_THROW__;
519  }
520 
521  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
522  << artdaq::TimeUtils::GetElapsedTime(startTime) << " seconds.";
523 }
524 
525 //==============================================================================
526 
527 void ots::ARTDAQOnlineMonitorSupervisor::ShutdownArtProcess()
528 {
529  restart_art_ = false;
530  // current_art_config_file_ = nullptr;
531  // current_art_pset_ = fhicl::ParameterSet();
532 
533  auto check_pid = [&]() {
534  if(art_pid_ == nullptr || *art_pid_ <= 0)
535  {
536  return false;
537  }
538  else if(kill(*art_pid_, 0) < 0)
539  {
540  return false;
541  }
542  return true;
543  };
544 
545  if(!check_pid())
546  {
547  TLOG(14) << "art process already exited, nothing to do.";
548  usleep(1000);
549  return;
550  }
551 
552  auto shutdown_start = std::chrono::steady_clock::now();
553 
554  int graceful_wait_ms = 1000 * 10;
555  int gentle_wait_ms = 1000 * 2;
556  int int_wait_ms = 1000;
557 
558  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms
559  << " ms for art process to exit gracefully";
560  for(int ii = 0; ii < graceful_wait_ms; ++ii)
561  {
562  usleep(1000);
563 
564  if(!check_pid())
565  {
566  TLOG(TLVL_INFO) << "art process exited after "
567  << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
568  shutdown_start)
569  << " ms.";
570  return;
571  }
572  }
573 
574  {
575  TLOG(TLVL_TRACE) << "Gently informing art process that it is time to shut down";
576 
577  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << *art_pid_;
578  kill(*art_pid_, SIGQUIT);
579  }
580 
581  TLOG(TLVL_TRACE) << "Waiting up to " << gentle_wait_ms
582  << " ms for art process to exit from SIGQUIT";
583  for(int ii = 0; ii < gentle_wait_ms; ++ii)
584  {
585  usleep(1000);
586 
587  if(!check_pid())
588  {
589  TLOG(TLVL_INFO) << "art process exited after "
590  << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
591  shutdown_start)
592  << " ms (SIGQUIT).";
593  return;
594  }
595  }
596 
597  {
598  TLOG(TLVL_TRACE) << "Insisting that the art process shut down";
599  kill(*art_pid_, SIGINT);
600  }
601 
602  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms
603  << " ms for art process to exit from SIGINT";
604  for(int ii = 0; ii < int_wait_ms; ++ii)
605  {
606  usleep(1000);
607 
608  if(!check_pid())
609  {
610  TLOG(TLVL_INFO) << "art process exited after "
611  << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
612  shutdown_start)
613  << " ms (SIGINT).";
614  return;
615  }
616  }
617 
618  TLOG(TLVL_TRACE) << "Killing art process with extreme prejudice";
619  while(check_pid())
620  {
621  {
622  kill(*art_pid_, SIGKILL);
623  usleep(1000);
624  }
625  }
626  TLOG(TLVL_INFO) << "art process exited after "
627  << artdaq::TimeUtils::GetElapsedTimeMilliseconds(shutdown_start)
628  << " ms (SIGKILL).";
629 }
630 
631 //==============================================================================
// Declarations referenced by this file (from the listing's cross-reference tooltips):
//   virtual void transitionHalting(toolbox::Event::Reference event) override
//   virtual void transitionInitializing(toolbox::Event::Reference event) override
//   static void outputOnlineMonitorFHICL(const ConfigurationTree &onlineMonitorNode)
//   void INIT_MF(const char *name)