From 8772c9917c63ecb2df28dbf0e4a9a4cfad8cf2ba Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Sun, 15 Oct 2023 03:33:08 +0530 Subject: [PATCH 1/3] TEZ-4110: Make Tez fail fast when DFS quota is exceeded. --- .../apache/tez/runtime/task/TezTaskRunner2.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TezTaskRunner2.java b/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TezTaskRunner2.java index 90eeeb569b..7e96594301 100644 --- a/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TezTaskRunner2.java +++ b/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TezTaskRunner2.java @@ -30,6 +30,7 @@ import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.fs.ClusterStorageCapacityExceededException; import org.apache.tez.common.Preconditions; import com.google.common.collect.Multimap; import org.apache.commons.lang.exception.ExceptionUtils; @@ -204,7 +205,7 @@ public TaskRunner2Result run() { synchronized (this) { if (isRunningState()) { trySettingEndReason(EndReason.TASK_ERROR); - registerFirstException(TaskFailureType.NON_FATAL, e, null); + registerFirstException(getTaskFailureType(e), e, null); LOG.warn("Exception from RunnerCallable", e); } } @@ -305,7 +306,7 @@ void processCallableResult(TaskRunner2CallableResult executionResult) { if (isRunningState()) { if (executionResult.error != null) { trySettingEndReason(EndReason.TASK_ERROR); - registerFirstException(TaskFailureType.NON_FATAL, executionResult.error, null); + registerFirstException(getTaskFailureType(executionResult.error), executionResult.error, null); } else { trySettingEndReason(EndReason.SUCCESS); taskComplete.set(true); @@ -584,4 +585,13 @@ private void logAborting(String abortReason) { LOG.info("Attempting to abort {} due to an invocation of {}", task.getTaskAttemptID(), abortReason); } + + private TaskFailureType getTaskFailureType(Throwable e) { + boolean hasClusterStorageCapacityExceededException = + ExceptionUtils.indexOfType(e, ClusterStorageCapacityExceededException.class) != -1; + if (hasClusterStorageCapacityExceededException) { + return TaskFailureType.FATAL; + } + return TaskFailureType.NON_FATAL; + } } \ No newline at end of file From 29b234c92423fe0be0a03e8f2d6e40a7c8909767 Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Sun, 15 Oct 2023 18:22:19 +0530 Subject: [PATCH 2/3] Add Test. --- .../tez/runtime/task/TezTaskRunner2.java | 2 +- .../tez/runtime/task/TestTaskExecution2.java | 46 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TezTaskRunner2.java b/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TezTaskRunner2.java index 7e96594301..2f1be9c000 100644 --- a/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TezTaskRunner2.java +++ b/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TezTaskRunner2.java @@ -91,7 +91,7 @@ public class TezTaskRunner2 { // TaskRunnerCallable, a failure to heartbeat, or a signalFatalError on the context. private volatile Throwable firstException; private volatile EventMetaData exceptionSourceInfo; - private volatile TaskFailureType firstTaskFailureType; + volatile TaskFailureType firstTaskFailureType; private final AtomicBoolean errorReporterToAm = new AtomicBoolean(false); private volatile boolean oobSignalErrorInProgress = false; diff --git a/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TestTaskExecution2.java b/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TestTaskExecution2.java index 810c3ac82b..5c51f898ec 100644 --- a/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TestTaskExecution2.java +++ b/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TestTaskExecution2.java @@ -38,6 +38,9 @@ import java.util.concurrent.locks.ReentrantLock; import com.google.common.annotations.VisibleForTesting; + +import org.apache.hadoop.fs.ClusterStorageCapacityExceededException; +import org.apache.hadoop.hdfs.protocol.NSQuotaExceededException; import org.apache.tez.common.Preconditions; import com.google.common.collect.HashMultimap; import com.google.common.collect.Multimap; @@ -77,6 +80,7 @@ import org.apache.tez.runtime.common.resources.ScalingAllocator; import org.apache.tez.runtime.internals.api.TaskReporterInterface; import org.apache.tez.runtime.task.TaskExecutionTestHelpers.TestProcessor; +import org.apache.tez.runtime.task.TaskRunner2Callable.TaskRunner2CallableResult; import org.junit.AfterClass; import org.junit.Before; import org.junit.Test; @@ -653,6 +657,40 @@ public void testKilledAfterComplete() throws IOException, InterruptedException, } } + @Test + public void testClusterStoragaeCapacityFatalError() throws IOException { + // Try having a ClusterStorageCapacityExceededException, which is nested within several exceptions. + TezTaskRunner2ForTest taskRunner = createTaskRunnerForTest(); + TaskRunner2CallableResult executionResult = new TaskRunner2CallableResult(new Exception( + new IllegalArgumentException(new ClusterStorageCapacityExceededException("cluster capacity blown")))); + taskRunner.processCallableResult(executionResult); + + assertEquals(TaskFailureType.FATAL, taskRunner.getFirstTaskFailureType()); + + // Try having a child class of ClusterStorageCapacityExceededException, which is nested within several exceptions. + taskRunner = createTaskRunnerForTest(); + executionResult = new TaskRunner2CallableResult( + new Exception(new IllegalArgumentException(new NSQuotaExceededException("Namespace quota blown")))); + taskRunner.processCallableResult(executionResult); + + assertEquals(TaskFailureType.FATAL, taskRunner.getFirstTaskFailureType()); + + // Try having a ClusterStorageCapacityExceededException as the first exception (non-nested) + taskRunner = createTaskRunnerForTest(); + executionResult = + new TaskRunner2CallableResult(new ClusterStorageCapacityExceededException("cluster capacity blown")); + taskRunner.processCallableResult(executionResult); + + assertEquals(TaskFailureType.FATAL, taskRunner.getFirstTaskFailureType()); + + // Try having some other exception, for that it should be NON_FATAL + taskRunner = createTaskRunnerForTest(); + executionResult = new TaskRunner2CallableResult(new Exception(new IllegalArgumentException("Generic Exception"))); + taskRunner.processCallableResult(executionResult); + + assertEquals(TaskFailureType.NON_FATAL, taskRunner.getFirstTaskFailureType()); + } + private void verifySysCounters(TezCounters tezCounters, int minTaskCounterCount, int minFsCounterCount) { Preconditions.checkArgument((minTaskCounterCount > 0 && minFsCounterCount > 0) || @@ -747,6 +785,11 @@ private TezTaskRunner2 createTaskRunner(ApplicationId appId, processorConf, false, updateSysCounters); } + private TezTaskRunner2ForTest createTaskRunnerForTest() throws IOException { + return (TezTaskRunner2ForTest) createTaskRunner(ApplicationId.newInstance(10000, 1), null, null, null, + TestProcessor.class.getName(), TestProcessor.CONF_EMPTY, true, false); + } + private TezTaskRunner2ForTest createTaskRunnerForTest(ApplicationId appId, TaskExecutionTestHelpers.TezTaskUmbilicalForTest umbilical, TaskReporter taskReporter, @@ -827,6 +870,9 @@ executionContext, memAvailable, updateSysCounters, new DefaultHadoopShim(), sharedExecutor); } + public TaskFailureType getFirstTaskFailureType() { + return firstTaskFailureType; + } @Override @VisibleForTesting From fa1b6fc80ea88c6d9cda680b366ccdb45c2e5eb1 Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Mon, 16 Oct 2023 13:18:19 +0530 Subject: [PATCH 3/3] Fix Typo. --- .../java/org/apache/tez/runtime/task/TestTaskExecution2.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TestTaskExecution2.java b/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TestTaskExecution2.java index 5c51f898ec..aeaec53124 100644 --- a/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TestTaskExecution2.java +++ b/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TestTaskExecution2.java @@ -658,7 +658,7 @@ public void testKilledAfterComplete() throws IOException, InterruptedException, } @Test - public void testClusterStoragaeCapacityFatalError() throws IOException { + public void testClusterStorageCapacityFatalError() throws IOException { // Try having a ClusterStorageCapacityExceededException, which is nested within several exceptions. TezTaskRunner2ForTest taskRunner = createTaskRunnerForTest(); TaskRunner2CallableResult executionResult = new TaskRunner2CallableResult(new Exception(