From 5efade26f60332f5015ae54b8a10af0012e00e4f Mon Sep 17 00:00:00 2001 From: Anar Manafov Date: Mon, 25 Jul 2022 11:00:06 +0200 Subject: [PATCH] GH-459: Fix agents fail to clean shm dds-agent: Fixed: Ignore SIGTERM while performing cleaning procedures. (GH-459) dds-slurm-plugin: Fixed: Make sure that scancel's SIGTERM is properly handled by all job steps and their scripts. (GH-459) --- ReleaseNotes.md | 2 ++ dds-agent/src/AgentConnectionManager.cpp | 8 +++++- dds-agent/src/main.cpp | 5 ++++ dds-commander/src/main.cpp | 28 +++++++++++--------- dds-protocol-lib/src/ConnectionManagerImpl.h | 2 +- dds-topology-lib/src/TopoBase.h | 2 +- etc/DDSWorker.sh.in | 2 +- plugins/dds-submit-slurm/src/job.slurm.in | 10 +++++-- 8 files changed, 41 insertions(+), 18 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 81904464..9be5086a 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -17,6 +17,7 @@ Added: Support for Apple's arm64 architecture. (GH-393) ### dds-agent Fixed: Address potential crash in the external process termination routines. Fixed: Revised handling of the slots container. +Fixed: Ignore SIGTERM while performing cleaning procedures. (GH-459) ### dds\_intercom\_lib Fixed: Stability improvements. @@ -49,6 +50,7 @@ Fixed: ssh cfg parser is passing cfg files of all plug-ins. (GH-413) Added: Support for SubmissionID (GH-411) ### dds-slurm-plugin +Fixed: Make sure that scancel's SIGTERM is properly handled by all job steps and their scripts. (GH-459) Added: Support for SubmissionID (GH-411) Added: Support of minimum number of agents to spawn. (GH-434) Modified: Replace array job submission with nodes requirement. 
(GH-430) diff --git a/dds-agent/src/AgentConnectionManager.cpp b/dds-agent/src/AgentConnectionManager.cpp index 5f3037ec..45ab0f74 100644 --- a/dds-agent/src/AgentConnectionManager.cpp +++ b/dds-agent/src/AgentConnectionManager.cpp @@ -44,8 +44,14 @@ CAgentConnectionManager::~CAgentConnectionManager() void CAgentConnectionManager::doAwaitStop() { m_signals.async_wait( - [this](boost::system::error_code /*ec*/, int /*signo*/) + [this](boost::system::error_code /*ec*/, int signo) { + // The server is stopped by cancelling all outstanding asynchronous + // operations. Once all operations have finished the io_context::run() + // call will exit. + LOG(dds::misc::info) << "Received a signal: " << signo; + LOG(dds::misc::info) << "Stopping DDS connetion manager..."; + // Stop transport engine stop(); }); diff --git a/dds-agent/src/main.cpp b/dds-agent/src/main.cpp index 498f9200..c296a7f7 100644 --- a/dds-agent/src/main.cpp +++ b/dds-agent/src/main.cpp @@ -52,6 +52,11 @@ void clean() int main(int argc, char* argv[]) { + // ignore SIGTERM + // This is mainly for the clean mode to be able to finish the clean process. + // Other agent modes will reassign sig handlers via asio anyway. 
+ std::signal(SIGTERM, SIG_IGN); + // Command line parser SOptions_t options; try diff --git a/dds-commander/src/main.cpp b/dds-commander/src/main.cpp index 31fcec42..56fde0df 100644 --- a/dds-commander/src/main.cpp +++ b/dds-commander/src/main.cpp @@ -192,22 +192,26 @@ int main(int argc, char* argv[]) jobs.push_back(protoSlurmSubmitInfo.slurm_job_id(0)); } } - const fs::path scancelPath{ bp::search_path("scancel") }; - stringstream ssCmd; - ssCmd << scancelPath.string(); - for (const auto& id : jobs) + if (!jobs.empty()) { - ssCmd << " " << id; - } + const fs::path scancelPath{ bp::search_path("scancel") }; - LOG(log_stdout) << "SLURM JOB CANCEL: " << ssCmd.str(); - string sout; - string serr; - execute(ssCmd.str(), chrono::seconds(30), &sout, &serr); - if (!serr.empty()) - LOG(log_stderr) << "SLURM JOB CANCEL: " << serr; + stringstream ssCmd; + ssCmd << scancelPath.string(); + ssCmd << " --full "; + for (const auto& id : jobs) + { + ssCmd << " " << id; + } + LOG(log_stdout) << "SLURM JOB CANCEL: " << ssCmd.str(); + string sout; + string serr; + execute(ssCmd.str(), chrono::seconds(30), &sout, &serr); + if (!serr.empty()) + LOG(log_stderr) << "SLURM JOB CANCEL: " << serr; + } // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< return EXIT_SUCCESS; diff --git a/dds-protocol-lib/src/ConnectionManagerImpl.h b/dds-protocol-lib/src/ConnectionManagerImpl.h index 158266a5..16e0fad0 100644 --- a/dds-protocol-lib/src/ConnectionManagerImpl.h +++ b/dds-protocol-lib/src/ConnectionManagerImpl.h @@ -67,7 +67,7 @@ namespace dds // operations. Once all operations have finished the io_context::run() // call will exit. 
LOG(dds::misc::info) << "Received a signal: " << signo; - LOG(dds::misc::info) << "Stopping DDS transport server"; + LOG(dds::misc::info) << "Stopping DDS transport server..."; stop(); }); diff --git a/dds-topology-lib/src/TopoBase.h b/dds-topology-lib/src/TopoBase.h index 9f485fd7..7337117b 100644 --- a/dds-topology-lib/src/TopoBase.h +++ b/dds-topology-lib/src/TopoBase.h @@ -7,10 +7,10 @@ #define __DDS__TopoBase__ // STD +#include #include #include #include -#include // BOOST #include diff --git a/etc/DDSWorker.sh.in b/etc/DDSWorker.sh.in index 7e28e928..3d565879 100755 --- a/etc/DDSWorker.sh.in +++ b/etc/DDSWorker.sh.in @@ -54,7 +54,7 @@ wait_and_kill() kill -9 $1 break fi - sleep 1 + sleep 0.3 done } #============================================================================= diff --git a/plugins/dds-submit-slurm/src/job.slurm.in b/plugins/dds-submit-slurm/src/job.slurm.in index b1febefd..4d045b59 100755 --- a/plugins/dds-submit-slurm/src/job.slurm.in +++ b/plugins/dds-submit-slurm/src/job.slurm.in @@ -12,7 +12,13 @@ #DDS_USER_OPTIONS -# execute DDS Scout -srun --no-kill --kill-on-bad-exit=0 --output=slurm-%j-%N.out /usr/bin/env bash -c 'eval JOB_WRK_DIR=%DDS_AGENT_ROOT_WRK_DIR%/${SLURM_JOB_NAME}_${SLURM_JOBID}_${SLURMD_NODENAME}; mkdir -p $JOB_WRK_DIR; cd $JOB_WRK_DIR; cp %DDS_SCOUT% $JOB_WRK_DIR/; ./DDSWorker.sh' +# ignore signals +# continue waiting for child processes by any means +trap -- '' SIGINT SIGTERM + +# execute DDS Scout +srun --no-kill --kill-on-bad-exit=0 --output=slurm-%j-%N.out /usr/bin/env bash -c 'trap '"'"'kill $PID && wait'"'"' SIGINT SIGTERM; eval JOB_WRK_DIR=%DDS_AGENT_ROOT_WRK_DIR%/${SLURM_JOB_NAME}_${SLURM_JOBID}_${SLURMD_NODENAME}; mkdir -p $JOB_WRK_DIR; cd $JOB_WRK_DIR; cp %DDS_SCOUT% $JOB_WRK_DIR/; ./DDSWorker.sh & PID=$!; wait' & + +wait exit 0