NodeHealthScriptRunner.java
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.health;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Timer;
import java.util.TimerTask;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.util.Shell.ExitCodeException;
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The class which provides functionality of checking the health of the node
* using the configured node health script and reporting back to the service
* for which the health checker has been asked to report.
*/
public class NodeHealthScriptRunner extends TimedHealthReporterService {

  private static final Logger LOG =
      LoggerFactory.getLogger(NodeHealthScriptRunner.class);

  /** Pattern used for searching in the output of the node health script. */
  private static final String ERROR_PATTERN = "ERROR";

  /** Time out error message. */
  static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG =
      "Node health script timed out";

  /** Path to the health script, exactly as read from the configuration. */
  private final String nodeHealthScript;

  /** Timeout value handed to the {@link ShellCommandExecutor}. */
  private final long scriptTimeout;

  /**
   * ShellCommandExecutor used to execute the monitoring script. It is created
   * once in the constructor and is read both by the timer task and by
   * {@link #serviceStop()}, hence it is final.
   */
  private final ShellCommandExecutor commandExecutor;

  /**
   * Creates the runner and registers its timer task with the parent service.
   *
   * @param scriptName path of the health script to execute
   * @param checkInterval interval passed on to the parent service
   * @param timeout timeout passed on to the command executor
   * @param scriptArgs extra arguments for the script, may be null
   * @param runBeforeStartup flag passed on to the parent service
   */
  private NodeHealthScriptRunner(String scriptName, long checkInterval,
      long timeout, String[] scriptArgs, boolean runBeforeStartup) {
    super(NodeHealthScriptRunner.class.getName(), checkInterval,
        runBeforeStartup);
    this.nodeHealthScript = scriptName;
    this.scriptTimeout = timeout;
    this.commandExecutor = createCommandExecutor(scriptArgs);
    setTimerTask(new NodeHealthMonitorExecutor());
  }

  /**
   * Builds a {@link NodeHealthScriptRunner} for the named script from the
   * given configuration.
   *
   * The check interval and the timeout are first looked up under their
   * per-script keys; a value of zero (also the default when the key is
   * absent) falls back to the corresponding global setting.
   *
   * @param scriptName name used to derive the per-script configuration keys
   * @param conf configuration to read the settings from
   * @return the new runner, or null if {@link #shouldRun(String, String)}
   *         rejects the configured script path
   * @throws IllegalArgumentException if the resolved check interval is
   *         negative or the resolved timeout is not positive
   */
  public static NodeHealthScriptRunner newInstance(String scriptName,
      Configuration conf) {
    String nodeHealthScriptsConfig = String.format(
        YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH_TEMPLATE, scriptName);
    String nodeHealthScript = conf.get(nodeHealthScriptsConfig);
    if (!shouldRun(scriptName, nodeHealthScript)) {
      return null;
    }

    // Determine check interval ms
    long checkIntervalMs = getLongWithFallback(conf,
        YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_INTERVAL_MS_TEMPLATE,
        scriptName,
        YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS,
        YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS);
    if (checkIntervalMs < 0) {
      throw new IllegalArgumentException("The node health-checker's " +
          "interval-ms can not be set to a negative number.");
    }

    boolean runBeforeStartup = conf.getBoolean(
        YarnConfiguration.NM_HEALTH_CHECK_RUN_BEFORE_STARTUP,
        YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_RUN_BEFORE_STARTUP);

    // Determine time out
    long scriptTimeout = getLongWithFallback(conf,
        YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE,
        scriptName,
        YarnConfiguration.NM_HEALTH_CHECK_TIMEOUT_MS,
        YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_TIMEOUT_MS);
    if (scriptTimeout <= 0) {
      throw new IllegalArgumentException("The node health-checker's " +
          "timeout can only be set to a positive number.");
    }

    // Determine script arguments
    String scriptArgsConfig = String.format(
        YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_OPTS_TEMPLATE,
        scriptName);
    String[] scriptArgs = conf.getStrings(scriptArgsConfig, new String[]{});

    return new NodeHealthScriptRunner(nodeHealthScript,
        checkIntervalMs, scriptTimeout, scriptArgs, runBeforeStartup);
  }

  /**
   * Reads a per-script long setting and, when it is zero, falls back to the
   * global setting.
   *
   * @param conf configuration to read from
   * @param keyTemplate format string producing the per-script key
   * @param scriptName script name substituted into the template
   * @param fallbackKey global key consulted when the per-script value is 0
   * @param fallbackDefault default used when the global key is not set
   * @return the per-script value if non-zero, otherwise the global value
   */
  private static long getLongWithFallback(Configuration conf,
      String keyTemplate, String scriptName, String fallbackKey,
      long fallbackDefault) {
    long value = conf.getLong(String.format(keyTemplate, scriptName), 0L);
    if (value == 0L) {
      value = conf.getLong(fallbackKey, fallbackDefault);
    }
    return value;
  }

  /**
   * Creates the executor for the command line made of the health script
   * followed by its optional arguments.
   *
   * @param scriptArgs extra arguments for the script, may be null
   * @return the executor configured with {@link #scriptTimeout}
   */
  private ShellCommandExecutor createCommandExecutor(String[] scriptArgs) {
    ArrayList<String> command = new ArrayList<String>();
    command.add(nodeHealthScript);
    if (scriptArgs != null) {
      command.addAll(Arrays.asList(scriptArgs));
    }
    return new ShellCommandExecutor(command.toArray(new String[0]), null,
        null, scriptTimeout);
  }

  /** Outcome of a single execution of the node health script. */
  private enum HealthCheckerExitStatus {
    /** The script ran and no output line starts with the error pattern. */
    SUCCESS,
    /** The executor reported that the script timed out. */
    TIMED_OUT,
    /** The script ended with an {@link ExitCodeException}. */
    FAILED_WITH_EXIT_CODE,
    /** Executing the script raised any other exception. */
    FAILED_WITH_EXCEPTION,
    /** The script ran, but its output has a line starting with ERROR. */
    FAILED
  }

  /**
   * Class which is used by the {@link Timer} class to periodically execute the
   * node health script.
   */
  private class NodeHealthMonitorExecutor extends TimerTask {

    /** Stack trace of the last exception caught in {@link #run()}. */
    private String exceptionStackTrace = "";

    /**
     * Executes the script once, classifies the outcome and reports it.
     */
    @Override
    public void run() {
      HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS;
      try {
        commandExecutor.execute();
      } catch (ExitCodeException e) {
        // ignore the exit code of the script
        status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE;
        // On Windows, we will not hit the Stream closed IOException
        // thrown by stdout buffered reader for timeout event.
        if (Shell.WINDOWS && commandExecutor.isTimedOut()) {
          status = HealthCheckerExitStatus.TIMED_OUT;
        }
      } catch (Exception e) {
        // Pass the throwable as the trailing argument so that the cause is
        // kept in the log; on the timed-out path the health report only
        // carries the fixed timeout message.
        LOG.warn("Caught exception : {}", e.getMessage(), e);
        status = commandExecutor.isTimedOut()
            ? HealthCheckerExitStatus.TIMED_OUT
            : HealthCheckerExitStatus.FAILED_WITH_EXCEPTION;
        exceptionStackTrace = StringUtils.stringifyException(e);
      } finally {
        // The output is only inspected when the script itself ran cleanly.
        if (status == HealthCheckerExitStatus.SUCCESS
            && hasErrors(commandExecutor.getOutput())) {
          status = HealthCheckerExitStatus.FAILED;
        }
        reportHealthStatus(status);
      }
    }

    /**
     * Translates the outcome of a script execution into a health report.
     *
     * The node is marked unhealthy if
     * <ol>
     * <li>The node health script times out</li>
     * <li>The node health scripts output has a line which begins
     * with ERROR</li>
     * <li>An exception other than {@link ExitCodeException} (for example an
     * {@link IOException}) is thrown while executing the script</li>
     * </ol>
     * If the script ends with an {@link ExitCodeException} the output is
     * ignored and the node is reported as healthy, as the script might have
     * a syntax error.
     *
     * @param status outcome of the last script execution
     */
    void reportHealthStatus(HealthCheckerExitStatus status) {
      switch (status) {
      case SUCCESS:
      case FAILED_WITH_EXIT_CODE:
        // see Javadoc above - we don't report bad health intentionally
        setHealthyWithoutReport();
        break;
      case TIMED_OUT:
        setUnhealthyWithReport(NODE_HEALTH_SCRIPT_TIMED_OUT_MSG);
        break;
      case FAILED_WITH_EXCEPTION:
        setUnhealthyWithReport(exceptionStackTrace);
        break;
      case FAILED:
        setUnhealthyWithReport(commandExecutor.getOutput());
        break;
      default:
        LOG.warn("Unknown HealthCheckerExitStatus - ignored.");
        break;
      }
    }

    /**
     * Method to check if the output string has line which begins with ERROR.
     *
     * @param output the output of the node health script to process, a null
     *        value is treated as having no errors
     * @return true if output string has error pattern in it.
     */
    private boolean hasErrors(String output) {
      if (output == null) {
        return false;
      }
      for (String line : output.split("\n")) {
        if (line.startsWith(ERROR_PATTERN)) {
          return true;
        }
      }
      return false;
    }
  }

  /**
   * Destroys the script process, if the executor currently has one, and then
   * stops the parent service.
   *
   * @throws Exception if stopping the parent service fails
   */
  @Override
  public void serviceStop() throws Exception {
    Process p = commandExecutor.getProcess();
    if (p != null) {
      p.destroy();
    }
    super.serviceStop();
  }

  /**
   * Method used to determine whether the {@link NodeHealthScriptRunner}
   * should be started or not.<p>
   * Returns true if following conditions are met:
   *
   * <ol>
   * <li>Path to Node health check script is not empty</li>
   * <li>Node health check script file exists</li>
   * <li>Node health check script file can be executed</li>
   * </ol>
   *
   * @param script name of the script, only used in log messages
   * @param healthScript configured path to the script file, may be null
   * @return true if node health monitoring service can be started.
   */
  static boolean shouldRun(String script, String healthScript) {
    if (healthScript == null || healthScript.trim().isEmpty()) {
      LOG.info("Missing location for the node health check script \"{}\".",
          script);
      return false;
    }
    File f = new File(healthScript);
    if (!f.exists()) {
      LOG.warn("File {} for script \"{}\" does not exist.",
          healthScript, script);
      return false;
    }
    if (!FileUtil.canExecute(f)) {
      LOG.warn("File {} for script \"{}\" can not be executed.",
          healthScript, script);
      return false;
    }
    return true;
  }
}