GpuResourcePlugin.java
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.DockerCommandPlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class GpuResourcePlugin implements ResourcePlugin {
private static final Logger LOG =
LoggerFactory.getLogger(GpuResourcePlugin.class);
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
private final GpuDiscoverer gpuDiscoverer;
public static final int MAX_REPEATED_ERROR_ALLOWED = 10;
private int numOfErrorExecutionSinceLastSucceed = 0;
private GpuResourceHandlerImpl gpuResourceHandler = null;
private DockerCommandPlugin dockerCommandPlugin = null;
public GpuResourcePlugin(GpuNodeResourceUpdateHandler resourceDiscoverHandler,
GpuDiscoverer gpuDiscoverer) {
this.resourceDiscoverHandler = resourceDiscoverHandler;
this.gpuDiscoverer = gpuDiscoverer;
}
@Override
public void initialize(Context context) throws YarnException {
validateExecutorConfig(context.getConf());
this.gpuDiscoverer.initialize(context.getConf(),
new NvidiaBinaryHelper());
this.dockerCommandPlugin =
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
context.getConf());
}
private void validateExecutorConfig(Configuration conf) {
Class<? extends ContainerExecutor> executorClass = conf.getClass(
YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
ContainerExecutor.class);
if (executorClass.equals(DefaultContainerExecutor.class)) {
LOG.warn("Using GPU plugin with disabled LinuxContainerExecutor" +
" is considered to be unsafe.");
}
}
@Override
public ResourceHandler createResourceHandler(
Context context, CGroupsHandler cGroupsHandler,
PrivilegedOperationExecutor privilegedOperationExecutor) {
if (gpuResourceHandler == null) {
gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
privilegedOperationExecutor, gpuDiscoverer);
}
return gpuResourceHandler;
}
@Override
public NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() {
return resourceDiscoverHandler;
}
@Override
public void cleanup() throws YarnException {
// Do nothing.
}
public DockerCommandPlugin getDockerCommandPluginInstance() {
return dockerCommandPlugin;
}
@Override
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
final GpuDeviceInformation gpuDeviceInformation;
if (gpuDiscoverer.isAutoDiscoveryEnabled()) {
//At this point the gpu plugin is already enabled
checkGpuResourceHandler();
checkErrorCount();
try{
gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
numOfErrorExecutionSinceLastSucceed = 0;
} catch (YarnException e) {
LOG.error(e.getMessage(), e);
numOfErrorExecutionSinceLastSucceed++;
throw e;
}
} else {
gpuDeviceInformation = null;
}
GpuResourceAllocator gpuResourceAllocator =
gpuResourceHandler.getGpuAllocator();
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
List<AssignedGpuDevice> assignedGpuDevices =
gpuResourceAllocator.getAssignedGpus();
return new NMGpuResourceInfo(gpuDeviceInformation, totalGpus,
assignedGpuDevices);
}
private void checkGpuResourceHandler() throws YarnException {
if(gpuResourceHandler == null) {
String errorMsg =
"Linux Container Executor is not configured for the NodeManager. "
+ "To fully enable GPU feature on the node also set "
+ YarnConfiguration.NM_CONTAINER_EXECUTOR + " properly.";
LOG.warn(errorMsg);
throw new YarnException(errorMsg);
}
}
private void checkErrorCount() throws YarnException {
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
String msg =
"Failed to execute GPU device information detection script for "
+ MAX_REPEATED_ERROR_ALLOWED
+ " times, skip following executions.";
LOG.error(msg);
throw new YarnException(msg);
}
}
@Override
public String toString() {
return GpuResourcePlugin.class.getName();
}
}