TestLineReader.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.util;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.io.Text;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link LineReader} when records are separated by a custom,
 * multi-byte delimiter instead of a line terminator.
 */
public class TestLineReader {

  /**
   * TEST_1: The tail of the read buffer equals the starting character/s of
   * the delimiter, but the head of the next buffer does not complete it.
   *
   * The test data is laid out such that:
   *
   * 1) The delimiter is "&lt;/entity&gt;".
   *
   * 2) The tail of the first buffer is "&lt;/", which matches the starting
   * character sequence of the delimiter.
   *
   * 3) The head of the next buffer is "id&gt;", which does NOT match the
   * remaining characters of the delimiter.
   *
   * 4) The input is prefixed with numberOfCharToFillTheBuffer 'a'
   * characters, so that the first bufferSize bytes of the input end exactly
   * at "&lt;/", i.e. at the first two characters of the delimiter.
   *
   * 5) bufferSize is taken as 64 * 1024. NOTE(review): the constructor used
   * here does not receive a buffer size, so this assumes the reader's default
   * buffer size is 64 * 1024 -- confirm against LineReader.
   *
   * Check condition: the second record must contain "&lt;/" from the first
   * buffer followed by "id&gt;" from the next buffer.
   *
   * @throws Exception if reading from or closing the reader fails
   */
  @Test
  public void testCustomDelimiter1() throws Exception {

    final String delimiter = "</entity>";

    // Ending part of the first buffer; its last two characters, "</", are
    // the first two characters of the delimiter.
    final String currentBufferTailToken = "</entity><entity><id>Gelesh</";

    // Start of the next buffer; "id>" does not continue the delimiter.
    final String nextBufferHeadToken = "id><name>Omathil</name></entity>";

    // Expected second record spans both buffers, with delimiters removed.
    final String expected =
        (currentBufferTailToken + nextBufferHeadToken).replace(delimiter, "");

    final String testPartOfInput = currentBufferTailToken + nextBufferHeadToken;

    final int bufferSize = 64 * 1024;
    final int numberOfCharToFillTheBuffer =
        bufferSize - currentBufferTailToken.length();

    // char 'a' as a filler, sized so that filler + currentBufferTailToken is
    // exactly bufferSize characters long.
    final char[] fillBuffer = new char[numberOfCharToFillTheBuffer];
    Arrays.fill(fillBuffer, 'a');
    final String fillerString = new String(fillBuffer);

    final String testData = fillerString + testPartOfInput;

    // try-with-resources keeps the reader open for every read and closes it
    // even when an assertion fails.
    try (LineReader lineReader = new LineReader(
        new ByteArrayInputStream(testData.getBytes(StandardCharsets.UTF_8)),
        delimiter.getBytes(StandardCharsets.UTF_8))) {

      final Text line = new Text();

      // First record: everything before the first delimiter, i.e. the filler.
      lineReader.readLine(line);
      assertEquals(fillerString, line.toString());

      // Second record: straddles the buffer boundary at "</" + "id>".
      lineReader.readLine(line);
      assertEquals(expected, line.toString());
    }
  }

  /**
   * TEST_2: The character/s preceding the delimiter equal the starting
   * character/s of the delimiter, and the input ends with a partial
   * delimiter prefix.
   *
   * @throws Exception if reading from or closing the reader fails
   */
  @Test
  public void testCustomDelimiter2() throws Exception {
    final String delimiter = "record";
    final StringBuilder testStringBuilder = new StringBuilder();

    testStringBuilder.append(delimiter).append("Kerala ");
    testStringBuilder.append(delimiter).append("Bangalore");
    testStringBuilder.append(delimiter).append(" North Korea");
    // Two consecutive delimiters produce an empty record between them.
    testStringBuilder.append(delimiter).append(delimiter).append("Guantanamo");

    // ~EOF with 're': the last record contains partial delimiter matches
    // ("ecord", "recor") and the input ends on the delimiter prefix "re".
    testStringBuilder.append(delimiter + "ecord" + "recor" + "core");

    final String testData = testStringBuilder.toString();

    try (LineReader lineReader = new LineReader(
        new ByteArrayInputStream(testData.getBytes(StandardCharsets.UTF_8)),
        delimiter.getBytes(StandardCharsets.UTF_8))) {

      final Text line = new Text();

      // The input starts with a delimiter, so the first record is empty.
      lineReader.readLine(line);
      assertEquals("", line.toString());
      lineReader.readLine(line);
      assertEquals("Kerala ", line.toString());

      lineReader.readLine(line);
      assertEquals("Bangalore", line.toString());

      lineReader.readLine(line);
      assertEquals(" North Korea", line.toString());

      // Empty record between the two back-to-back delimiters.
      lineReader.readLine(line);
      assertEquals("", line.toString());
      lineReader.readLine(line);
      assertEquals("Guantanamo", line.toString());

      // Partial delimiter matches must be kept as record content.
      lineReader.readLine(line);
      assertEquals("ecord" + "recor" + "core", line.toString());
    }
  }

  /**
   * TEST_3: The input "aaaabccc" is split by the delimiter "aaab". The
   * delimiter match starts one character into a run of 'a' characters, so
   * the first record is "a" and the second is "ccc".
   *
   * @throws Exception if reading from or closing the reader fails
   */
  @Test
  public void testCustomDelimiter3() throws Exception {
    final String testData = "aaaabccc";
    final String delimiter = "aaab";

    try (LineReader lineReader = new LineReader(
        new ByteArrayInputStream(testData.getBytes(StandardCharsets.UTF_8)),
        delimiter.getBytes(StandardCharsets.UTF_8))) {

      final Text line = new Text();

      lineReader.readLine(line);
      assertEquals("a", line.toString());
      lineReader.readLine(line);
      assertEquals("ccc", line.toString());
    }
  }
}