From 1c723c72f956ce408828aa3795083ee6ca499969 Mon Sep 17 00:00:00 2001
From: Jake Smith
Date: Mon, 21 Oct 2024 14:47:31 +0100
Subject: [PATCH] HPCC-32839 Fix Thor aborting race

When a Thor workunit was aborted in k8s, there was a race condition
which could cause the job to continue and fail with an unrelated
spurious/confusing error.

Signed-off-by: Jake Smith
---
 common/workunit/workunit.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/common/workunit/workunit.cpp b/common/workunit/workunit.cpp
index e9efd0e0416..96b157b9c93 100644
--- a/common/workunit/workunit.cpp
+++ b/common/workunit/workunit.cpp
@@ -3687,7 +3687,8 @@ EnumMapping priorityClasses[] = {
 
 const char * getWorkunitStateStr(WUState state)
 {
-    dbgassertex(state < WUStateSize);
+    if (state >= WUStateSize)
+        return "unknown workunit state";
     return states[state].str; // MORE - should be using getEnumText, or need to take steps to ensure values remain contiguous and in order.
 }
 
@@ -14496,11 +14497,22 @@ void executeThorGraph(const char * graphName, IConstWorkUnit &workunit, const IP
             }
         }
 
+        // NB: check for expected success state (WUStateWait). If any other state, abort.
         {
             Owned<IWorkUnit> w = &workunit.lock();
             WUState state = w->getState();
-            if (WUStateFailed == state)
-                throw makeStringException(0, "Workunit failed");
+            if (WUStateWait != state) // expected state from successful Thor run from above
+            {
+                switch (state)
+                {
+                    case WUStateAborting:
+                        throw new WorkflowException(0, "Workunit abort requested", 0, WorkflowException::ABORT, MSGAUD_user);
+                    case WUStateFailed:
+                        throw makeStringException(0, "Workunit failed");
+                    default: // must be the switch default (not a plain label) so every other unexpected state throws instead of falling through to setState
+                        throw makeStringExceptionV(0, "Workunit failed. Unexpected state: %s", getWorkunitStateStr(state));
+                }
+            }
             w->setState(WUStateRunning);
         }
 #else