GpuDiscoverer.java
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Lists;
import org.apache.hadoop.util.Sets;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.apache.hadoop.classification.VisibleForTesting;
import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class GpuDiscoverer extends Configured {
public static final Logger LOG = LoggerFactory.getLogger(
GpuDiscoverer.class);
@VisibleForTesting
static final String DEFAULT_BINARY_NAME = "nvidia-smi";
// When executable path not set, try to search default dirs
// By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
// launched by nvidia-docker.
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
private NvidiaBinaryHelper nvidiaBinaryHelper;
private String pathOfGpuBinary = null;
private Map<String, String> environment = new HashMap<>();
private int numOfErrorExecutionSinceLastSucceed = 0;
private GpuDeviceInformation lastDiscoveredGpuInformation = null;
private List<GpuDevice> gpuDevicesFromUser;
private void validateConfOrThrowException() throws YarnException {
if (getConf() == null) {
throw new YarnException("Please initialize (call initialize) before use "
+ GpuDiscoverer.class.getSimpleName());
}
}
private String getErrorMessageOfScriptExecution(String msg) {
return getFailedToExecuteScriptMessage() +
"! Exception message: " + msg;
}
private String getErrorMessageOfScriptExecutionThresholdReached() {
return getFailedToExecuteScriptMessage() + " for " +
MAX_REPEATED_ERROR_ALLOWED + " times, " +
"skipping following executions!";
}
private String getFailedToExecuteScriptMessage() {
return "Failed to execute " +
GpuDeviceInformationParser.GPU_SCRIPT_REFERENCE +
" (" + pathOfGpuBinary + ")";
}
private String getFailedToParseErrorMessage(String msg) {
return "Failed to parse XML output of " +
GpuDeviceInformationParser.GPU_SCRIPT_REFERENCE
+ "( " + pathOfGpuBinary + ")" + msg;
}
/**
* Get GPU device information from system.
* This need to be called after initialize.
*
* Please note that this only works on *NIX platform, so external caller
* need to make sure this.
*
* @return GpuDeviceInformation
* @throws YarnException when any error happens
*/
public synchronized GpuDeviceInformation getGpuDeviceInformation()
throws YarnException {
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
String msg = getErrorMessageOfScriptExecutionThresholdReached();
LOG.error(msg);
throw new YarnException(msg);
}
try {
lastDiscoveredGpuInformation =
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
} catch (IOException e) {
numOfErrorExecutionSinceLastSucceed++;
String msg = getErrorMessageOfScriptExecution(e.getMessage());
LOG.debug(msg);
throw new YarnException(msg, e);
} catch (YarnException e) {
numOfErrorExecutionSinceLastSucceed++;
String msg = getFailedToParseErrorMessage(e.getMessage());
LOG.debug(msg, e);
throw e;
}
return lastDiscoveredGpuInformation;
}
boolean isAutoDiscoveryEnabled() {
String allowedDevicesStr = getConf().get(
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
return allowedDevicesStr.equals(
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
}
/**
* Get list of GPU devices usable by YARN.
*
* @return List of GPU devices
*/
public synchronized List<GpuDevice> getGpusUsableByYarn()
throws YarnException {
validateConfOrThrowException();
if (isAutoDiscoveryEnabled()) {
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
} else {
if (gpuDevicesFromUser == null) {
gpuDevicesFromUser = parseGpuDevicesFromUserDefinedValues();
}
return gpuDevicesFromUser;
}
}
private List<GpuDevice> parseGpuDevicesFromAutoDiscoveredGpuInfo()
throws YarnException {
if (lastDiscoveredGpuInformation == null) {
String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
+ ", however automatically discovering "
+ "GPU information failed, please check NodeManager log for more"
+ " details, as an alternative, admin can specify "
+ YarnConfiguration.NM_GPU_ALLOWED_DEVICES
+ " manually to enable GPU isolation.";
LOG.error(msg);
throw new YarnException(msg);
}
List<GpuDevice> gpuDevices = new ArrayList<>();
if (lastDiscoveredGpuInformation.getGpus() != null) {
int numberOfGpus = lastDiscoveredGpuInformation.getGpus().size();
LOG.debug("Found {} GPU devices", numberOfGpus);
for (int i = 0; i < numberOfGpus; i++) {
List<PerGpuDeviceInformation> gpuInfos =
lastDiscoveredGpuInformation.getGpus();
gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
}
}
return gpuDevices;
}
/**
* @return List of GpuDevices
* @throws YarnException when a GPU device is defined as a duplicate.
* The first duplicate GPU device will be added to the exception message.
*/
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues()
throws YarnException {
String devices = getConf().get(
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
if (devices.trim().isEmpty()) {
throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
}
List<GpuDevice> gpuDevices = Lists.newArrayList();
for (String device : devices.split(",")) {
if (device.trim().length() > 0) {
String[] splitByColon = device.trim().split(":");
if (splitByColon.length != 2) {
throwIfNecessary(GpuDeviceSpecificationException
.createWithWrongValueSpecified(device, devices), getConf());
LOG.warn("Wrong GPU specification string {}, ignored", device);
}
GpuDevice gpuDevice;
try {
gpuDevice = parseGpuDevice(splitByColon);
} catch (NumberFormatException e) {
throwIfNecessary(GpuDeviceSpecificationException
.createWithWrongValueSpecified(device, devices, e), getConf());
LOG.warn("Cannot parse GPU device numbers: {}", device);
continue;
}
if (!gpuDevices.contains(gpuDevice)) {
gpuDevices.add(gpuDevice);
} else {
throwIfNecessary(GpuDeviceSpecificationException
.createWithDuplicateValueSpecified(device, devices), getConf());
LOG.warn("CPU device is duplicated: {}", device);
}
}
}
LOG.info("Allowed GPU devices:" + gpuDevices);
return gpuDevices;
}
private GpuDevice parseGpuDevice(String[] splitByColon) {
int index = Integer.parseInt(splitByColon[0]);
int minorNumber = Integer.parseInt(splitByColon[1]);
return new GpuDevice(index, minorNumber);
}
public synchronized void initialize(Configuration config,
NvidiaBinaryHelper nvidiaHelper) throws YarnException {
setConf(config);
this.nvidiaBinaryHelper = nvidiaHelper;
if (isAutoDiscoveryEnabled()) {
numOfErrorExecutionSinceLastSucceed = 0;
lookUpAutoDiscoveryBinary(config);
// Try to discover GPU information once and print
try {
LOG.info("Trying to discover GPU information ...");
GpuDeviceInformation info = getGpuDeviceInformation();
LOG.info("Discovered GPU information: " + info.toString());
} catch (YarnException e) {
String msg =
"Failed to discover GPU information from system, exception message:"
+ e.getMessage() + " continue...";
LOG.warn(msg);
}
}
}
private void lookUpAutoDiscoveryBinary(Configuration config)
throws YarnException {
String configuredBinaryPath = config.get(
YarnConfiguration.NM_GPU_PATH_TO_EXEC, DEFAULT_BINARY_NAME);
if (configuredBinaryPath.isEmpty()) {
configuredBinaryPath = DEFAULT_BINARY_NAME;
}
File binaryPath;
File configuredBinaryFile = new File(configuredBinaryPath);
if (!configuredBinaryFile.exists()) {
binaryPath = lookupBinaryInDefaultDirs();
} else if (configuredBinaryFile.isDirectory()) {
binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
} else {
binaryPath = configuredBinaryFile;
// If path exists but file name is incorrect don't execute the file
String fileName = binaryPath.getName();
if (!DEFAULT_BINARY_NAME.equals(fileName)) {
String msg = String.format("Please check the configuration value of"
+" %s. It should point to an %s binary, which is now %s",
YarnConfiguration.NM_GPU_PATH_TO_EXEC,
DEFAULT_BINARY_NAME,
fileName);
throwIfNecessary(new YarnException(msg), config);
LOG.warn(msg);
}
}
pathOfGpuBinary = binaryPath.getAbsolutePath();
}
private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)
throws YarnException {
File binaryPath = new File(configuredBinaryFile, DEFAULT_BINARY_NAME);
if (!binaryPath.exists()) {
throw new YarnException("Failed to find GPU discovery executable, " +
"please double check "+ YarnConfiguration.NM_GPU_PATH_TO_EXEC +
" setting. The setting points to a directory but " +
"no file found in the directory with name:" + DEFAULT_BINARY_NAME);
} else {
LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
+ " under the directory, updated path-to-executable:"
+ binaryPath.getAbsolutePath());
}
return binaryPath;
}
private File lookupBinaryInDefaultDirs() throws YarnException {
final File lookedUpBinary = lookupBinaryInDefaultDirsInternal();
if (lookedUpBinary == null) {
throw new YarnException("Failed to find GPU discovery executable, " +
"please double check " + YarnConfiguration.NM_GPU_PATH_TO_EXEC +
" setting. Also tried to find the executable " +
"in the default directories: " + DEFAULT_BINARY_SEARCH_DIRS);
}
return lookedUpBinary;
}
private File lookupBinaryInDefaultDirsInternal() {
Set<String> triedBinaryPaths = Sets.newHashSet();
for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
File binaryPath = new File(dir, DEFAULT_BINARY_NAME);
if (binaryPath.exists()) {
return binaryPath;
} else {
triedBinaryPaths.add(binaryPath.getAbsolutePath());
}
}
LOG.warn("Failed to locate GPU device discovery binary, tried paths: "
+ triedBinaryPaths + "! Please double check the value of config "
+ YarnConfiguration.NM_GPU_PATH_TO_EXEC +
". Using default binary: " + DEFAULT_BINARY_NAME);
return null;
}
@VisibleForTesting
Map<String, String> getEnvironmentToRunCommand() {
return environment;
}
@VisibleForTesting
String getPathOfGpuBinary() {
return pathOfGpuBinary;
}
}