TestNMReconnect.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.server.resourcemanager.resourcetracker;

import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.ConfigurationProvider;
import org.apache.hadoop.yarn.conf.ConfigurationProviderFactory;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.event.InlineDispatcher;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
import org.apache.hadoop.yarn.server.api.records.NodeAction;
import org.apache.hadoop.yarn.server.api.records.NodeStatus;
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
import org.apache.hadoop.yarn.server.resourcemanager.ParameterizedSchedulerTestBase;
import org.apache.hadoop.yarn.server.resourcemanager.NMLivelinessMonitor;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.resourcemanager.NodesListManager;
import org.apache.hadoop.yarn.server.resourcemanager.RMContextImpl;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.NodeEventDispatcher;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceTrackerService;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM;
import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Timeout;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

/**
 * TestNMReconnect run tests against the scheduler set by
 * {@link ParameterizedSchedulerTestBase} which is configured
 * in {@link YarnConfiguration}.
 */
public class TestNMReconnect extends ParameterizedSchedulerTestBase {
  private static final RecordFactory recordFactory = 
      RecordFactoryProvider.getRecordFactory(null);

  private List<RMNodeEvent> rmNodeEvents = new ArrayList<RMNodeEvent>();
  private Dispatcher dispatcher;
  private RMContextImpl context;

  public void initTestNMReconnect(SchedulerType type) throws IOException {
    initParameterizedSchedulerTestBase(type);
    setUp();
  }

  private class TestRMNodeEventDispatcher implements
      EventHandler<RMNodeEvent> {

    @Override
    public void handle(RMNodeEvent event) {
      rmNodeEvents.add(event);
    }

  }

  ResourceTrackerService resourceTrackerService;

  public void setUp() {
    Configuration conf = new Configuration();
    // Dispatcher that processes events inline
    dispatcher = new InlineDispatcher();

    dispatcher.register(RMNodeEventType.class,
        new TestRMNodeEventDispatcher());

    context = new RMContextImpl(dispatcher, null,
        null, null, null, null, null, null, null, null);
    dispatcher.register(SchedulerEventType.class,
        new InlineDispatcher.EmptyEventHandler());
    dispatcher.register(RMNodeEventType.class,
        new NodeEventDispatcher(context));
    NMLivelinessMonitor nmLivelinessMonitor = new NMLivelinessMonitor(
        dispatcher);
    nmLivelinessMonitor.init(conf);
    nmLivelinessMonitor.start();
    NodesListManager nodesListManager = new NodesListManager(context);
    nodesListManager.init(conf);
    RMContainerTokenSecretManager containerTokenSecretManager =
        new RMContainerTokenSecretManager(conf);
    containerTokenSecretManager.start();
    NMTokenSecretManagerInRM nmTokenSecretManager =
        new NMTokenSecretManagerInRM(conf);
    nmTokenSecretManager.start();
    resourceTrackerService = new ResourceTrackerService(context,
        nodesListManager, nmLivelinessMonitor, containerTokenSecretManager,
        nmTokenSecretManager);
    
    resourceTrackerService.init(conf);
    resourceTrackerService.start();
  }

  @AfterEach
  public void tearDown() {
    resourceTrackerService.stop();
  }

  @ParameterizedTest(name = "{0}")
  @MethodSource("getParameters")
  public void testReconnect(SchedulerType type) throws Exception {
    initTestNMReconnect(type);
    String hostname1 = "localhost1";
    Resource capability = Resources.createResource(1024);

    RegisterNodeManagerRequest request1 = recordFactory
        .newRecordInstance(RegisterNodeManagerRequest.class);
    NodeId nodeId1 = NodeId.newInstance(hostname1, 0);
    request1.setNodeId(nodeId1);
    request1.setHttpPort(0);
    request1.setResource(capability);
    resourceTrackerService.registerNodeManager(request1);

    assertEquals(RMNodeEventType.STARTED, rmNodeEvents.get(0).getType());

    rmNodeEvents.clear();
    resourceTrackerService.registerNodeManager(request1);
    assertEquals(RMNodeEventType.RECONNECTED,
        rmNodeEvents.get(0).getType());

    rmNodeEvents.clear();
    resourceTrackerService.registerNodeManager(request1);
    capability = Resources.createResource(1024, 2);
    request1.setResource(capability);
    assertEquals(RMNodeEventType.RECONNECTED,
        rmNodeEvents.get(0).getType());
  }

  @ParameterizedTest(name = "{0}")
  @MethodSource("getParameters")
  public void testCompareRMNodeAfterReconnect(SchedulerType type) throws Exception {
    initTestNMReconnect(type);
    AbstractYarnScheduler scheduler = getScheduler();
    Configuration yarnConf = new YarnConfiguration();
    ConfigurationProvider configurationProvider =
        ConfigurationProviderFactory.getConfigurationProvider(yarnConf);
    configurationProvider.init(yarnConf);
    context.setConfigurationProvider(configurationProvider);
    RMNodeLabelsManager nlm = new RMNodeLabelsManager();
    nlm.init(yarnConf);
    nlm.start();
    context.setNodeLabelManager(nlm);
    scheduler.setRMContext(context);
    scheduler.init(yarnConf);
    scheduler.start();
    dispatcher.register(SchedulerEventType.class, scheduler);

    String hostname1 = "localhost1";
    Resource capability = Resources.createResource(4096, 4);

    RegisterNodeManagerRequest request1 = recordFactory
        .newRecordInstance(RegisterNodeManagerRequest.class);
    NodeId nodeId1 = NodeId.newInstance(hostname1, 0);

    NodeStatus mockNodeStatus = createMockNodeStatus();

    request1.setNodeId(nodeId1);
    request1.setHttpPort(0);
    request1.setResource(capability);
    request1.setNodeStatus(mockNodeStatus);
    resourceTrackerService.registerNodeManager(request1);
    assertNotNull(context.getRMNodes().get(nodeId1));
    // verify Scheduler and RMContext use same RMNode reference.
    assertTrue(scheduler.getSchedulerNode(nodeId1).getRMNode() ==
        context.getRMNodes().get(nodeId1));
    assertEquals(context.getRMNodes().get(nodeId1).
        getTotalCapability(), capability);
    Resource capability1 = Resources.createResource(2048, 2);
    request1.setResource(capability1);
    resourceTrackerService.registerNodeManager(request1);
    assertNotNull(context.getRMNodes().get(nodeId1));
    // verify Scheduler and RMContext use same RMNode reference
    // after reconnect.
    assertTrue(scheduler.getSchedulerNode(nodeId1).getRMNode() ==
        context.getRMNodes().get(nodeId1));
    // verify RMNode's capability is changed.
    assertEquals(context.getRMNodes().get(nodeId1).
        getTotalCapability(), capability1);
    nlm.stop();
    scheduler.stop();
  }

  @SuppressWarnings("unchecked")
  @ParameterizedTest(name = "{0}")
  @MethodSource("getParameters")
  @Timeout(10)
  public void testDecommissioningNodeReconnect(SchedulerType type) throws Exception {
    initTestNMReconnect(type);

    MockRM rm = new MockRM();
    rm.start();
    MockNM nm1 =
        new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService());
    nm1.registerNode();
    rm.waitForState(nm1.getNodeId(), NodeState.RUNNING);

    rm.getRMContext().getDispatcher().getEventHandler().handle(
        new RMNodeEvent(nm1.getNodeId(),
            RMNodeEventType.GRACEFUL_DECOMMISSION));
    rm.waitForState(nm1.getNodeId(), NodeState.DECOMMISSIONING);

    MockNM nm2 =
        new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService());
    RegisterNodeManagerResponse response = nm2.registerNode();
    // not SHUTDOWN
    assertTrue(response.getNodeAction().equals(NodeAction.NORMAL));
    rm.stop();
  }

  @Timeout(10)
  @ParameterizedTest(name = "{0}")
  @MethodSource("getParameters")
  public void testRMNodeStatusAfterReconnect(SchedulerType type) throws Exception {
    initTestNMReconnect(type);
    // The node(127.0.0.1:1234) reconnected with RM. When it registered with
    // RM, RM set its lastNodeHeartbeatResponse's id to 0 asynchronously. But
    // the node's heartbeat come before RM succeeded setting the id to 0.
    MockRM rm = new MockRM();
    rm.start();

    MockNM nm1 =
        new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService());
    nm1.registerNode();
    int i = 0;
    while(i < 3) {
      nm1.nodeHeartbeat(true);
      rm.drainEvents();
      i++;
    }

    MockNM nm2 =
        new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService());
    nm2.registerNode();
    RMNode rmNode = rm.getRMContext().getRMNodes().get(nm2.getNodeId());
    nm2.nodeHeartbeat(true);
    rm.drainEvents();
    assertEquals(NodeState.RUNNING, rmNode.getState(), "Node is Not in Running state.");
    rm.stop();
  }
}