otsdaq  3.06.00
ARTDAQOnlineMonitorSupervisor.cc
1 #include "otsdaq/ARTDAQOnlineMonitor/ARTDAQOnlineMonitorSupervisor.h"
2 
3 #include "otsdaq/TablePlugins/ARTDAQTableBase/ARTDAQTableBase.h"
4 
5 #include "artdaq-core/Utilities/ExceptionHandler.hh"
6 #include "artdaq-core/Utilities/TimeUtils.hh"
7 
8 #include <signal.h>
9 #include <sys/wait.h>
10 #include <boost/filesystem.hpp>
11 #include <boost/thread.hpp>
12 
13 XDAQ_INSTANTIATOR_IMPL(ots::ARTDAQOnlineMonitorSupervisor)
14 
15 #define FAKE_CONFIG_NAME "ots_config"
16 
17 //==============================================================================
18 ots::ARTDAQOnlineMonitorSupervisor::ARTDAQOnlineMonitorSupervisor(
19  xdaq::ApplicationStub* stub)
20  : CoreSupervisorBase(stub), partition_(getSupervisorProperty("partition", 0))
21 {
22  __SUP_COUT__ << "Constructor." << __E__;
23 
24  INIT_MF("." /*directory used is USER_DATA/LOG/.*/);
25 
26  __SUP_COUT__ << "Constructed." << __E__;
27 } // end constructor()
28 
29 //==============================================================================
30 ots::ARTDAQOnlineMonitorSupervisor::~ARTDAQOnlineMonitorSupervisor(void)
31 {
32  __SUP_COUT__ << "Destructor." << __E__;
33  destroy();
34  __SUP_COUT__ << "Destructed." << __E__;
35 } // end destructor()
36 
37 //==============================================================================
38 
39 void ots::ARTDAQOnlineMonitorSupervisor::destroy(void)
40 {
41  __SUP_COUT__ << "Destroying..." << __E__;
42 
43  __SUP_COUT__ << "Destroyed." << __E__;
44 } // end destroy()
45 
46 //==============================================================================
47 void ots::ARTDAQOnlineMonitorSupervisor::init(void)
48 {
49  __SUP_COUT__ << "Initializing..." << __E__;
50 
51  __SUP_COUT__ << "Initialized." << __E__;
52 } // end init()
53 
54 //==============================================================================
56  toolbox::Event::Reference /*event*/)
57 try
58 {
59  __SUP_COUT__ << "Initializing..." << __E__;
60  init();
61  __SUP_COUT__ << "Initialized." << __E__;
62 } // end transitionInitializing()
63 catch(const std::runtime_error& e)
64 {
65  __SS__ << "Error was caught while Initializing: " << e.what() << __E__;
66  __SS_THROW__;
67 }
68 catch(...)
69 {
70  __SS__ << "Unknown error was caught while Initializing. Please checked the logs."
71  << __E__;
72  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
73  __SS_THROW__;
74 } // end transitionInitializing() error handling
75 
76 //==============================================================================
77 void ots::ARTDAQOnlineMonitorSupervisor::transitionConfiguring(
78  toolbox::Event::Reference /*e*/)
79 {
80  CoreSupervisorBase::configureInit();
81 
82  try
83  {
84  ConfigurationTree theSupervisorNode = getSupervisorTableNode();
85  om_rank_ = theSupervisorNode.getNode("MonitorID").getValue<int>();
86 
87  __SUP_COUT__ << "Building configuration directory" << __E__;
88 
89  boost::system::error_code ignored;
90  // boost::filesystem::remove_all(ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME, ignored);
91 
92  // Make directory for art process logfiles
93  boost::filesystem::create_directory(
94  std::string(__ENV__("OTSDAQ_LOG_ROOT")) + "/" + theSupervisorNode.getValue(),
95  ignored);
96  mkdir((ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME).c_str(), 0755);
97 
98  // Generate Online Monitor FHICL
100  ARTDAQTableBase::flattenFHICL(ARTDAQTableBase::ARTDAQAppType::Monitor,
101  theSupervisorNode.getValue());
102 
103  symlink(ARTDAQTableBase::getFlatFHICLFilename(
104  ARTDAQTableBase::ARTDAQAppType::Monitor, theSupervisorNode.getValue())
105  .c_str(),
106  (ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME + "/" +
107  theSupervisorNode.getValue() + ".fcl")
108  .c_str());
109  config_file_name_ = ARTDAQTableBase::ARTDAQ_FCL_PATH + FAKE_CONFIG_NAME + "/" +
110  theSupervisorNode.getValue() + ".fcl";
111  }
112  catch(const std::runtime_error& e)
113  {
114  __SS__ << "Error configuring the ARTDAQOnlineMonitorSupervisor! \n"
115  << e.what() << __E__;
116  __SUP_COUT_ERR__ << ss.str();
117  // ExceptionHandler(ExceptionHandlerRethrow::no, ss.str());
118 
119  //__SS_THROW_ONLY__;
120  theStateMachine_.setErrorMessage(ss.str());
121  throw toolbox::fsm::exception::Exception(
122  "Transition Error" /*name*/,
123  ss.str() /* message*/,
124  "ARTDAQOnlineMonitorSupervisor::transitionConfiguring" /*module*/,
125  __LINE__ /*line*/,
126  __FUNCTION__ /*function*/
127  );
128  }
129 } //end transitionConfiguring()
130 
131 //==============================================================================
132 void ots::ARTDAQOnlineMonitorSupervisor::transitionStarting(
133  toolbox::Event::Reference /*event*/)
134 try
135 {
136  __SUP_COUT__ << "Starting..." << __E__;
137 
138  StartArtProcess(config_file_name_);
139 
140  __SUP_COUT__ << "Started." << __E__;
141 } // end transitionStarting()
142 catch(const std::runtime_error& e)
143 {
144  __SS__ << "Error was caught while Starting: " << e.what() << __E__;
145  __SS_THROW__;
146 }
147 catch(...)
148 {
149  __SS__ << "Unknown error was caught while Starting. Please checked the logs."
150  << __E__;
151  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
152  __SS_THROW__;
153 } // end transitionStarting() error handling
154 
155 //==============================================================================
156 void ots::ARTDAQOnlineMonitorSupervisor::transitionStopping(
157  toolbox::Event::Reference /*event*/)
158 try
159 {
160  __SUP_COUT__ << "Stopping..." << __E__;
161 
162  ShutdownArtProcess();
163 
164  __SUP_COUT__ << "Stopped." << __E__;
165 } // end transitionStopping()
166 catch(const std::runtime_error& e)
167 {
168  __SS__ << "Error was caught while Stopping: " << e.what() << __E__;
169  __SS_THROW__;
170 }
171 catch(...)
172 {
173  __SS__ << "Unknown error was caught while Stopping. Please checked the logs."
174  << __E__;
175  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
176  __SS_THROW__;
177 } // end transitionStopping() error handling
178 
179 //==============================================================================
180 void ots::ARTDAQOnlineMonitorSupervisor::transitionPausing(
181  toolbox::Event::Reference /*event*/)
182 try
183 {
184  __SUP_COUT__ << "Pausing..." << __E__;
185 
186  ShutdownArtProcess();
187 
188  __SUP_COUT__ << "Paused." << __E__;
189 } // end transitionPausing()
190 catch(const std::runtime_error& e)
191 {
192  __SS__ << "Error was caught while Pausing: " << e.what() << __E__;
193  __SS_THROW__;
194 }
195 catch(...)
196 {
197  __SS__ << "Unknown error was caught while Pausing. Please checked the logs." << __E__;
198  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
199  __SS_THROW__;
200 } // end transitionPausing() error handling
201 
202 //==============================================================================
203 void ots::ARTDAQOnlineMonitorSupervisor::transitionResuming(
204  toolbox::Event::Reference /*event*/)
205 try
206 {
207  __SUP_COUT__ << "Resuming..." << __E__;
208 
209  StartArtProcess(config_file_name_);
210 
211  __SUP_COUT__ << "Resumed." << __E__;
212 } // end transitionResuming()
213 catch(const std::runtime_error& e)
214 {
215  __SS__ << "Error was caught while Resuming: " << e.what() << __E__;
216  __SS_THROW__;
217 }
218 catch(...)
219 {
220  __SS__ << "Unknown error was caught while Resuming. Please checked the logs."
221  << __E__;
222  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
223  __SS_THROW__;
224 } // end transitionResuming() error handling
225 
226 //==============================================================================
228  toolbox::Event::Reference /*event*/)
229 try
230 {
231  __SUP_COUT__ << "Halting..." << __E__;
232 
233  ShutdownArtProcess();
234 
235  __SUP_COUT__ << "Halted." << __E__;
236 } // end transitionHalting()
237 catch(const std::runtime_error& e)
238 {
239  const std::string transitionName = "Halting";
240  // if halting from Failed state, then ignore errors
241  if(theStateMachine_.getProvenanceStateName() ==
242  RunControlStateMachine::FAILED_STATE_NAME ||
243  theStateMachine_.getProvenanceStateName() ==
244  RunControlStateMachine::HALTED_STATE_NAME)
245  {
246  __SUP_COUT_INFO__ << "Error was caught while halting (but ignoring because "
247  "previous state was '"
248  << RunControlStateMachine::FAILED_STATE_NAME
249  << "'): " << e.what() << __E__;
250  }
251  else // if not previously in Failed state, then fail
252  {
253  __SUP_SS__ << "Error was caught while " << transitionName << ": " << e.what()
254  << __E__;
255  __SUP_COUT_ERR__ << "\n" << ss.str();
256  theStateMachine_.setErrorMessage(ss.str());
257  throw toolbox::fsm::exception::Exception(
258  "Transition Error" /*name*/,
259  ss.str() /* message*/,
260  "ots::ARTDAQOnlineMonitorSupervisorBase::transition" +
261  transitionName /*module*/,
262  __LINE__ /*line*/,
263  __FUNCTION__ /*function*/
264  );
265  }
266 } // end transitionHalting() std::runtime_error exception handling
267 catch(...)
268 {
269  const std::string transitionName = "Halting";
270  // if halting from Failed state, then ignore errors
271  if(theStateMachine_.getProvenanceStateName() ==
272  RunControlStateMachine::FAILED_STATE_NAME ||
273  theStateMachine_.getProvenanceStateName() ==
274  RunControlStateMachine::HALTED_STATE_NAME)
275  {
276  __SUP_COUT_INFO__ << "Unknown error was caught while halting (but ignoring "
277  "because previous state was '"
278  << RunControlStateMachine::FAILED_STATE_NAME << "')." << __E__;
279  }
280  else // if not previously in Failed state, then fail
281  {
282  __SUP_SS__ << "Unknown error was caught while " << transitionName
283  << ". Please checked the logs." << __E__;
284  __SUP_COUT_ERR__ << "\n" << ss.str();
285  theStateMachine_.setErrorMessage(ss.str());
286 
287  artdaq::ExceptionHandler(artdaq::ExceptionHandlerRethrow::no, ss.str());
288 
289  throw toolbox::fsm::exception::Exception(
290  "Transition Error" /*name*/,
291  ss.str() /* message*/,
292  "ots::ARTDAQOnlineMonitorSupervisor::transition" + transitionName /*module*/,
293  __LINE__ /*line*/,
294  __FUNCTION__ /*function*/
295  );
296  }
297 } // end transitionHalting() exception handling
298 
299 //==============================================================================
300 void ots::ARTDAQOnlineMonitorSupervisor::enteringError(
301  toolbox::Event::Reference /*event*/)
302 {
303  __SUP_COUT__ << "Entering error recovery state" << __E__;
304 
305  ShutdownArtProcess();
306 
307  __SUP_COUT__ << "EnteringError DONE." << __E__;
308 
309 } // end enteringError()
310 
311 //==============================================================================
312 
313 void ots::ARTDAQOnlineMonitorSupervisor::RunArt(
314  const std::string& config_file, const std::shared_ptr<std::atomic<pid_t>>& pid_out)
315 {
316  do
317  {
318  auto start_time = std::chrono::steady_clock::now();
319  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file;
320 
321  pid_t pid = 0;
322 
323  char* filename = new char[config_file.length() + 1];
324  memcpy(filename, config_file.c_str(), config_file.length());
325  filename[config_file.length()] =
326  '\0'; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
327 
328 #ifdef DEBUG_ART
329  std::string debugArgS = "--config-out=" + app_name + "_art.out";
330  char* debugArg = new char[debugArgS.length() + 1];
331  memcpy(debugArg, debugArgS.c_str(), debugArgS.length());
332  debugArg[debugArgS.length()] = '\0';
333 
334  std::vector<char*> args{const_cast<char*>("art"),
335  const_cast<char*>("-c"),
336  filename,
337  debugArg,
338  NULL}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
339 #else
340  std::vector<char*> args{
341  const_cast<char*>("art"),
342  const_cast<char*>("-c"),
343  filename,
344  nullptr}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
345 #endif
346 
347  pid = fork();
348  if(pid == 0)
349  { /* child */
350 
351  // Do any child environment setup here
352  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
353  std::string envVarValue = std::to_string(partition_);
354  if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
355  {
356  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
357  << "\" in the environment of a child art process. "
358  << "This may result in incorrect TCP port number "
359  << "assignments or other issues, and data may "
360  << "not flow through the system correctly.";
361  }
362  envVarKey = "ARTDAQ_APPLICATION_NAME";
363  envVarValue = getSupervisorTableNode().getValue();
364  if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
365  {
366  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
367  << "\" in the environment of a child art process. ";
368  }
369  envVarKey = "ARTDAQ_RANK";
370  envVarValue = std::to_string(om_rank_);
371  if(setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
372  {
373  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
374  << "\" in the environment of a child art process. ";
375  }
376 
377  execvp("art", &args[0]);
378  delete[] filename;
379  exit(1);
380  }
381  delete[] filename;
382 
383  *pid_out = pid;
384 
385  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
386  siginfo_t status;
387  auto sts = waitid(P_PID, pid, &status, WEXITED);
388  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
389  if(sts < 0)
390  {
391  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid
392  << ": " << errno << " (" << strerror(errno) << ").";
393  }
394  else if(status.si_code == CLD_EXITED && status.si_status == 0)
395  {
396  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, "
397  << (restart_art_ ? "restarting" : "not restarting");
398  }
399  else
400  {
401  auto art_lifetime = artdaq::TimeUtils::GetElapsedTime(start_time);
402  if(art_lifetime < minimum_art_lifetime_s_)
403  {
404  restart_art_ = false;
405  }
406 
407  auto exit_type = "exited with status code";
408  switch(status.si_code)
409  {
410  case CLD_DUMPED:
411  case CLD_KILLED:
412  exit_type = "was killed with signal";
413  break;
414  case CLD_EXITED:
415  default:
416  break;
417  }
418 
419  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
420  << "art process " << pid << " " << exit_type << " " << status.si_status
421  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
422  << " after running for " << std::setprecision(2) << std::fixed
423  << art_lifetime << " seconds, "
424  << (restart_art_ ? "restarting" : "not restarting");
425  }
426  } while(restart_art_);
427 }
428 
429 //==============================================================================
430 
431 void ots::ARTDAQOnlineMonitorSupervisor::StartArtProcess(const std::string& config_file)
432 {
433  static std::mutex start_art_mutex;
434  std::unique_lock<std::mutex> lk(start_art_mutex);
435  // TraceLock lk(start_art_mutex, 15, "StartArtLock");
436  restart_art_ = should_restart_art_;
437  auto startTime = std::chrono::steady_clock::now();
438 
439  art_pid_ = std::shared_ptr<std::atomic<pid_t>>(new std::atomic<pid_t>(-1));
440  boost::thread thread([this, config_file] { RunArt(config_file, art_pid_); });
441  thread.detach();
442 
443  while((*art_pid_ <= 0) && (artdaq::TimeUtils::GetElapsedTime(startTime) < 5))
444  {
445  usleep(10000);
446  }
447  if(*art_pid_ <= 0)
448  {
449  TLOG(TLVL_WARNING)
450  << "art process has not started after 5s. Check art configuration!"
451  << " (pid=" << *art_pid_ << ")";
452  __SS__ << "art process has not started after 5s. Check art configuration!"
453  << " (pid=" << *art_pid_ << ")" << __E__;
454  __SUP_SS_THROW__;
455  }
456 
457  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
458  << artdaq::TimeUtils::GetElapsedTime(startTime) << " seconds.";
459 }
460 
461 //==============================================================================
462 
463 void ots::ARTDAQOnlineMonitorSupervisor::ShutdownArtProcess()
464 {
465  restart_art_ = false;
466  // current_art_config_file_ = nullptr;
467  // current_art_pset_ = fhicl::ParameterSet();
468 
469  auto check_pid = [&]() {
470  if(art_pid_ == nullptr || *art_pid_ <= 0)
471  {
472  return false;
473  }
474  else if(kill(*art_pid_, 0) < 0)
475  {
476  return false;
477  }
478  return true;
479  };
480 
481  if(!check_pid())
482  {
483  TLOG(14) << "art process already exited, nothing to do.";
484  usleep(1000);
485  return;
486  }
487 
488  auto shutdown_start = std::chrono::steady_clock::now();
489 
490  int graceful_wait_ms = 1000 * 10;
491  int gentle_wait_ms = 1000 * 2;
492  int int_wait_ms = 1000;
493 
494  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms
495  << " ms for art process to exit gracefully";
496  for(int ii = 0; ii < graceful_wait_ms; ++ii)
497  {
498  usleep(1000);
499 
500  if(!check_pid())
501  {
502  TLOG(TLVL_INFO) << "art process exited after "
503  << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
504  shutdown_start)
505  << " ms.";
506  return;
507  }
508  }
509 
510  {
511  TLOG(TLVL_TRACE) << "Gently informing art process that it is time to shut down";
512 
513  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << *art_pid_;
514  kill(*art_pid_, SIGQUIT);
515  }
516 
517  TLOG(TLVL_TRACE) << "Waiting up to " << gentle_wait_ms
518  << " ms for art process to exit from SIGQUIT";
519  for(int ii = 0; ii < gentle_wait_ms; ++ii)
520  {
521  usleep(1000);
522 
523  if(!check_pid())
524  {
525  TLOG(TLVL_INFO) << "art process exited after "
526  << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
527  shutdown_start)
528  << " ms (SIGQUIT).";
529  return;
530  }
531  }
532 
533  {
534  TLOG(TLVL_TRACE) << "Insisting that the art process shut down";
535  kill(*art_pid_, SIGINT);
536  }
537 
538  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms
539  << " ms for art process to exit from SIGINT";
540  for(int ii = 0; ii < int_wait_ms; ++ii)
541  {
542  usleep(1000);
543 
544  if(!check_pid())
545  {
546  TLOG(TLVL_INFO) << "art process exited after "
547  << artdaq::TimeUtils::GetElapsedTimeMilliseconds(
548  shutdown_start)
549  << " ms (SIGINT).";
550  return;
551  }
552  }
553 
554  TLOG(TLVL_TRACE) << "Killing art process with extreme prejudice";
555  while(check_pid())
556  {
557  {
558  kill(*art_pid_, SIGKILL);
559  usleep(1000);
560  }
561  }
562  TLOG(TLVL_INFO) << "art process exited after "
563  << artdaq::TimeUtils::GetElapsedTimeMilliseconds(shutdown_start)
564  << " ms (SIGKILL).";
565 }
566 
567 //==============================================================================
virtual void transitionHalting(toolbox::Event::Reference event) override
virtual void transitionInitializing(toolbox::Event::Reference event) override
static const std::string ARTDAQ_FCL_PATH
Tree-path rule is, if the last link in the path is a group link with a specified group ID,...
static void outputOnlineMonitorFHICL(const ConfigurationTree &onlineMonitorNode)
void INIT_MF(const char *name)