Skip to content

Commit

Permalink
Fix TonY requests yarn config "yarn.io/gpu" on non-GPU clusters #500 (#…
Browse files Browse the repository at this point in the history
…502)

* Fix TonY requests yarn config "yarn.io/gpu" on non-GPU clusters #500

Co-authored-by: Pei-Lun Liao <pliao@linkedin.com>
  • Loading branch information
oliverhu and Pei-Lun Liao authored Feb 24, 2021
1 parent f3b1e7f commit e925dcf
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 2 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ dependency-reduced-pom.xml
log/
out/
target
tony-core/bin
tony-cli/bin
tony-portal/conf/*.jks
tony-portal/conf/application.conf
tony-portal/conf/prod.conf
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
import org.apache.hadoop.yarn.util.AbstractLivelinessMonitor;
import org.apache.hadoop.yarn.util.UTCClock;


public class ApplicationMaster {
private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);

Expand Down Expand Up @@ -1077,7 +1078,7 @@ public void onContainersAllocated(List<Container> containers) {
amRMClient.removeContainerRequest(Utils.setupContainerRequestForRM(new JobContainerRequest(
"", 1, container.getResource().getMemorySize(),
container.getResource().getVirtualCores(),
(int) container.getResource().getResourceInformation(Constants.GPU_URI).getValue(),
Utils.getNumOfRequestedGPU(container),
container.getPriority().getPriority(),
getNodeLabelsExpression(container.getPriority().getPriority()),
new ArrayList<>())));
Expand Down
22 changes: 21 additions & 1 deletion tony-core/src/main/java/com/linkedin/tony/util/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;

import java.io.File;
import java.io.FileOutputStream;
Expand Down Expand Up @@ -383,6 +384,11 @@ public static Map<String, JobContainerRequest> parseContainerRequests(Configurat
TonyConfigurationKeys.DEFAULT_VCORES);
int gpus = conf.getInt(TonyConfigurationKeys.getResourceKey(jobName, Constants.GPUS),
TonyConfigurationKeys.DEFAULT_GPUS);
if (gpus > 0 && !ResourceUtils.getResourceTypeIndex().containsKey(Constants.GPU_URI)) {
LOG.warn(String.format("User requested %d GPUs for job '%s' but GPU is not available on the cluster. ",
gpus, jobName));
}

String nodeLabel = conf.get(TonyConfigurationKeys.getNodeLabelKey(jobName));

// Any task that belong to the training stage depend on prepare stage
Expand Down Expand Up @@ -410,12 +416,26 @@ public static Map<String, JobContainerRequest> parseContainerRequests(Configurat
/**
 * Builds the {@link AMRMClient.ContainerRequest} sent to the ResourceManager for a single
 * {@link JobContainerRequest}.
 *
 * <p>The capability always carries the request's memory and vcores. The GPU resource is attached
 * only when the request asks for at least one GPU, so that a GPU-less request never touches the
 * GPU resource type (which may not be configured on the cluster).
 *
 * @param request the job-level container request to translate
 * @return a container request with relaxed locality, no node/rack constraints (both passed as
 *         {@code null}), and the request's priority and node label expression
 */
public static AMRMClient.ContainerRequest setupContainerRequestForRM(JobContainerRequest request) {
  Priority priority = Priority.newInstance(request.getPriority());
  Resource capability = Resource.newInstance((int) request.getMemory(), request.getVCores());
  // Only set the GPU capability when GPUs were actually requested; an unconditional call here
  // would make the guard meaningless and set the GPU resource even for zero-GPU requests.
  if (request.getGPU() > 0) {
    Utils.setCapabilityGPU(capability, request.getGPU());
  }
  AMRMClient.ContainerRequest containerRequest = new AMRMClient.ContainerRequest(
      capability, null, null, priority, true, request.getNodeLabelsExpression());
  LOG.info("Requested container ask: " + containerRequest.toString());
  return containerRequest;
}

/**
 * Returns the GPU count recorded on the given container's resource.
 *
 * <p>If the resource type index has no entry for {@code Constants.GPU_URI}, the container's
 * resource is not queried and zero is returned.
 *
 * @param container the container whose resource is inspected
 * @return the GPU value cast to {@code int}, or 0 when the GPU resource type is not registered
 */
public static int getNumOfRequestedGPU(Container container) {
  boolean gpuTypeRegistered = ResourceUtils.getResourceTypeIndex().containsKey(Constants.GPU_URI);
  if (!gpuTypeRegistered) {
    return 0;
  }
  return (int) container.getResource().getResourceInformation(Constants.GPU_URI).getValue();
}

private static void ensureStagedTasksIntegrity(List<String> prepareStageTasks, List<String> trainingStageTasks,
Set<String> allJobTypes) {
if (prepareStageTasks.isEmpty() && !trainingStageTasks.isEmpty()) {
Expand Down
40 changes: 40 additions & 0 deletions tony-core/src/test/java/com/linkedin/tony/util/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableMap;
import com.linkedin.tony.TFConfig;
import com.linkedin.tony.TonyConfigurationKeys;
import com.linkedin.tony.tensorflow.JobContainerRequest;
Expand All @@ -24,8 +25,11 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.testng.annotations.Test;

import static com.linkedin.tony.Constants.LOGS_SUFFIX;
Expand Down Expand Up @@ -313,4 +317,40 @@ public void testLinksToBeDisplayedOnPage() {
assertEquals(linksToBeDisplayed.get("Logs"), "/" + LOGS_SUFFIX + "/" + "fakeJobId");
assertEquals(linksToBeDisplayed.get("Events"), "/" + JOBS_SUFFIX + "/" + "fakeJobId");
}

@Test
public void testGetNumOfRequestedGPUWithGPUAvailable() {
  // A mocked container backed by a real Resource with memory and vcores only.
  Container mockedContainer = mock(Container.class);
  Resource capability = Resource.newInstance(256, 32);
  when(mockedContainer.getResource()).thenReturn(capability);

  // No GPU value has been set on the resource yet, so zero is expected.
  int gpusBeforeRequest = Utils.getNumOfRequestedGPU(mockedContainer);
  assertEquals(gpusBeforeRequest, 0);

  // Record two GPUs on the same resource and read the count back.
  ResourceInformation twoGpus =
      ResourceInformation.newInstance(ResourceInformation.GPU_URI, "", 2);
  capability.setResourceInformation(ResourceInformation.GPU_URI, twoGpus);
  int gpusAfterRequest = Utils.getNumOfRequestedGPU(mockedContainer);
  assertEquals(gpusAfterRequest, 2);
}

@Test
public void testGetNumOfRequestedGPUWithGPUUnavailable() {
  // Build a resource carrying two GPUs while the GPU resource type is still registered.
  Resource capability = Resource.newInstance(256, 32);
  ResourceInformation twoGpus =
      ResourceInformation.newInstance(ResourceInformation.GPU_URI, "", 2);
  capability.setResourceInformation(ResourceInformation.GPU_URI, twoGpus);

  Container mockedContainer = mock(Container.class);
  when(mockedContainer.getResource()).thenReturn(capability);

  // Snapshot the current resource types so they can be restored afterwards.
  Map<String, ResourceInformation> originalResourceTypes = ResourceUtils.getResourceTypes();
  try {
    // Re-initialize the resource types with memory and vcores only, i.e. no GPU.
    Map<String, ResourceInformation> typesWithoutGpu = ImmutableMap.of(
        ResourceInformation.MEMORY_URI, ResourceInformation.MEMORY_MB,
        ResourceInformation.VCORES_URI, ResourceInformation.VCORES);
    ResourceUtils.initializeResourcesFromResourceInformationMap(typesWithoutGpu);

    assertEquals(Utils.getNumOfRequestedGPU(mockedContainer), 0);
  } finally {
    // Put the original resource types back.
    ResourceUtils.initializeResourcesFromResourceInformationMap(originalResourceTypes);
  }
}
}

0 comments on commit e925dcf

Please sign in to comment.