CGroupsResourceCalculator.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A Cgroup version 1 file-system based Resource calculator without the process tree features.
 *
 * Warning: this implementation will not work properly
 * when configured using the mapreduce.job.process-tree.class job property.
 * Theoretically the ResourceCalculatorProcessTree can be configured using the
 * mapreduce.job.process-tree.class job property, however it has a dependency on an
 * instantiated ResourceHandlerModule, which is only initialised in the NodeManager process
 * and not in the containers.
 *
 * Limitation:
 * The ResourceCalculatorProcessTree class can be configured using the
 * mapreduce.job.process-tree.class property within a MapReduce job.
 * However, it is important to note that instances of ResourceCalculatorProcessTree operate
 * within the context of a MapReduce task. This presents a limitation:
 * these instances do not have access to the ResourceHandlerModule,
 * which is only initialized within the NodeManager process
 * and not within individual containers where MapReduce tasks execute.
 * As a result, the current implementation of ResourceCalculatorProcessTree is incompatible
 * with the mapreduce.job.process-tree.class property. This incompatibility arises
 * because the ResourceHandlerModule is essential for managing and monitoring resource usage,
 * and without it, the ResourceCalculatorProcessTree cannot function as intended
 * within the confines of a MapReduce task. Therefore, any attempts to utilize this class
 * through the mapreduce.job.process-tree.class property
 * will not succeed under the current architecture.
 */
public class CGroupsResourceCalculator extends AbstractCGroupsResourceCalculator {
  private static final Logger LOG = LoggerFactory.getLogger(CGroupsResourceCalculator.class);

  /**
   * Name of the cgroup v1 CPU accounting statistics file.
   *
   * <a href="https://docs.kernel.org/admin-guide/cgroup-v1/cpuacct.html">DOC</a>
   *
   * ...
   * cpuacct.stat file lists a few statistics which further divide the CPU time obtained
   * by the cgroup into user and system times.
   * Currently the following statistics are supported:
   *  - user: Time spent by tasks of the cgroup in user mode.
   *  - system: Time spent by tasks of the cgroup in kernel mode.
   * user and system are in USER_HZ unit.
   *  ...
   *
   * <a href="https://litux.nl/mirror/kerneldevelopment/0672327201/ch10lev1sec3.html">DOC</a>
   *
   * ...
   * In kernels earlier than 2.6, changing the value of HZ resulted in user-space anomalies.
   * This happened because values were exported to user-space in units of ticks-per-second.
   * As these interfaces became permanent, applications grew to rely on a specific value of HZ.
   * Consequently, changing HZ would scale various exported values
   * by some constant without user-space knowing!
   * Uptime would read 20 hours when it was in fact two!
   *
   * To prevent such problems, the kernel needs to scale all exported jiffies values.
   * It does this by defining USER_HZ, which is the HZ value that user-space expects. On x86,
   * because HZ was historically 100, USER_HZ is 100. The macro jiffies_to_clock_t()
   * is then used to scale a tick count in terms of HZ to a tick count in terms of USER_HZ.
   * The macro used depends on whether USER_HZ and HZ are integer multiples of themselves.
   * ...
   *
   */
  private static final String CPU_STAT = "cpuacct.stat";

  /**
   * Name of the cgroup v1 memory usage file.
   *
   * <a href="https://docs.kernel.org/admin-guide/cgroup-v1/memory.html#usage-in-bytes">DOC</a>
   *
   * ...
   * For efficiency, as other kernel components, memory cgroup uses some optimization
   * to avoid unnecessary cacheline false sharing.
   * usage_in_bytes is affected by the method
   * and doesn't show "exact" value of memory (and swap) usage,
   * it's a fuzz value for efficient access. (Of course, when necessary, it's synchronized.)
   *  ...
   *
   */
  private static final String MEM_STAT = "memory.usage_in_bytes";

  /** Name of the cgroup v1 memory + swap usage file. */
  private static final String MEMSW_STAT = "memory.memsw.usage_in_bytes";

  /**
   * Creates a calculator for the given process.
   *
   * The parent class receives the pid, the two CPU statistic keys
   * ({@code cpuacct.stat#user} and {@code cpuacct.stat#system}),
   * and the names of the memory and memory + swap usage files.
   *
   * @param pid identifier of the process whose cgroup statistics are read
   */
  public CGroupsResourceCalculator(String pid) {
    super(
        pid,
        Arrays.asList(CPU_STAT + "#user", CPU_STAT + "#system"),
        MEM_STAT,
        MEMSW_STAT
    );
  }

  /**
   * Collects the statistics files that belong to the cgroup of the process.
   *
   * The CPUACCT and MEMORY controllers are looked up independently: a failure or a missing
   * controller for one of them is only logged at debug level and does not prevent the files
   * of the other one from being returned.
   *
   * @return the paths of the {@code cpuacct.stat}, {@code memory.usage_in_bytes} and
   *         {@code memory.memsw.usage_in_bytes} files for the controllers that were found;
   *         possibly empty, never null
   */
  @Override
  protected List<Path> getCGroupFilesToLoadInStats() {
    List<Path> result = new ArrayList<>();

    try {
      String cpuRelative = getCGroupRelativePath(CGroupsHandler.CGroupController.CPUACCT);
      if (cpuRelative != null) {
        File cpuDir = new File(getcGroupsHandler().getControllerPath(
            CGroupsHandler.CGroupController.CPUACCT), cpuRelative);
        result.add(Paths.get(cpuDir.getAbsolutePath(), CPU_STAT));
      }
    } catch (IOException e) {
      // Best effort: the trailing throwable is logged with its stack trace by SLF4J.
      LOG.debug("Exception while looking for CPUACCT controller for pid: {}", getPid(), e);
    }

    try {
      String memoryRelative = getCGroupRelativePath(CGroupsHandler.CGroupController.MEMORY);
      if (memoryRelative != null) {
        File memDir = new File(getcGroupsHandler().getControllerPath(
            CGroupsHandler.CGroupController.MEMORY), memoryRelative);
        String memDirPath = memDir.getAbsolutePath();
        result.add(Paths.get(memDirPath, MEM_STAT));
        result.add(Paths.get(memDirPath, MEMSW_STAT));
      }
    } catch (IOException e) {
      // Best effort: the trailing throwable is logged with its stack trace by SLF4J.
      LOG.debug("Exception while looking for MEMORY controller for pid: {}", getPid(), e);
    }

    return result;
  }

  /**
   * Finds the cgroup of the process for the given controller.
   *
   * Every line of the process' cgroup file is expected to have the
   * {@code hierarchy-ID:controller-list:cgroup-path} layout,
   * for example {@code 6:cpuacct,cpu:/yarn/container_1}.
   * Lines that do not have three fields are skipped.
   *
   * @param controller the controller to look for in the controller list of each line
   * @return the relative path the cgroups handler reports for the last element of the
   *         matching cgroup path, or null if no line lists the controller
   * @throws IOException if reading the cgroup file of the process fails
   */
  private String getCGroupRelativePath(CGroupsHandler.CGroupController controller)
      throws IOException {
    for (String line : readLinesFromCGroupFileFromProcDir()) {
      // example line: 6:cpuacct,cpu:/yarn/container_1
      // The limit keeps a ':' that is part of the cgroup path inside the third field.
      String[] parts = line.split(":", 3);
      if (parts.length < 3) {
        LOG.debug("Skipping malformed cgroup line '{}' for pid {}", line, getPid());
        continue;
      }
      // Compare whole controller names, so that e.g. "cpu" does not match "cpuacct".
      List<String> controllers = Arrays.asList(parts[1].split(","));
      if (controllers.contains(controller.getName())) {
        String cgroupPath = parts[2];
        Path fileName = new File(cgroupPath).toPath().getFileName();
        // fileName is null for a path without elements (e.g. the root cgroup "/").
        if (fileName != null) {
          return getcGroupsHandler().getRelativePathForCGroup(fileName.toString());
        }
      }
    }
    LOG.debug("No {} controller found for pid {}", controller, getPid());
    return null;
  }
}