TestText.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io;

import java.io.IOException;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;
import java.util.Random;

import org.apache.hadoop.constants.ConfigConstants;
import org.apache.hadoop.thirdparty.com.google.common.primitives.Bytes;
import org.junit.jupiter.api.Test;

import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

/** Unit tests for {@link Text}. */
public class TestText {
  /** Number of randomized rounds performed by each looping test. */
  private static final int NUM_ITERATIONS = 100;

  /** Shared generator, created with the fixed seed 1, used by the string helpers. */
  private static final Random RANDOM = new Random(1);

  /** Sentinel for {@code getTestString(int)} requesting a randomly chosen length. */
  private static final int RAND_LEN = -1;
  
  /**
   * Generates a random, valid Java string. Only defined code points below
   * {@link Character#MIN_SUPPLEMENTARY_CODE_POINT} that are not surrogates
   * are used, so the result never contains an unpaired surrogate.
   *
   * @param len the number of chars to generate, or {@code RAND_LEN} to pick
   *            a random length in the range [0, 1000)
   * @return a string with exactly the requested number of chars
   * @throws Exception declared for compatibility with existing callers
   */
  private static String getTestString(int len) throws Exception {
    StringBuilder buffer = new StringBuilder();
    int length = (len == RAND_LEN) ? RANDOM.nextInt(1000) : len;
    char[] scratch = new char[2];
    while (buffer.length() < length) {
      int codePoint = RANDOM.nextInt(Character.MAX_CODE_POINT);
      boolean usable = Character.isDefined(codePoint)
          && codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT
          && !Character.isHighSurrogate((char) codePoint)
          && !Character.isLowSurrogate((char) codePoint);
      if (!usable) {
        // skip undefined, supplementary and surrogate code points
        continue;
      }
      // Append only the chars actually written; appending the whole scratch
      // array would add a stray NUL char and overshoot the requested length.
      int written = Character.toChars(codePoint, scratch, 0);
      buffer.append(scratch, 0, written);
    }
    return buffer.toString();
  }
  
  /**
   * Produces a random valid test string whose length is picked at random.
   *
   * @return the generated string
   * @throws Exception if the underlying generator fails
   */
  public static String getTestString() throws Exception {
    final String randomString = getTestString(RAND_LEN);
    return randomString;
  }
  
  /**
   * Builds a long string by repeating one random test string until the
   * result holds at least {@code Short.MAX_VALUE} plus the seed's length chars.
   *
   * @return the generated long string
   * @throws Exception if the underlying generator fails
   */
  public static String getLongString() throws Exception {
    String str = getTestString();
    // An empty seed would never grow the buffer and the loop below would
    // spin forever, so draw again until the seed is non-empty.
    while (str.isEmpty()) {
      str = getTestString();
    }
    int length = Short.MAX_VALUE + str.length();
    StringBuilder buffer = new StringBuilder();
    while (buffer.length() < length) {
      buffer.append(str);
    }
    return buffer.toString();
  }

  /**
   * Runs the generic writable check on {@code Text} instances built from one
   * long string (first round) and random strings (all later rounds).
   */
  @Test
  public void testWritable() throws Exception {
    int round = 0;
    while (round < NUM_ITERATIONS) {
      // the very first round exercises the long-string path
      final String sample = (round == 0) ? getLongString() : getTestString();
      TestWritable.testWritable(new Text(sample));
      round++;
    }
  }


  /**
   * Checks that {@code Text.encode} yields the same bytes as Java's own UTF-8
   * encoder and that {@code Text.decode} restores the original string.
   */
  @Test
  public void testCoding() throws Exception {
    String before = "Bad \t encoding \t testcase";
    Text text = new Text(before);
    String after = text.toString();
    // assertEquals reports both values on failure, unlike assertTrue(equals)
    assertEquals(before, after);

    for (int i = 0; i < NUM_ITERATIONS; i++) {
      // generate a random string; the first round uses a long one
      before = (i == 0) ? getLongString() : getTestString();

      // test string to utf8
      ByteBuffer bb = Text.encode(before);

      byte[] utf8Text = bb.array();
      byte[] utf8Java = before.getBytes(StandardCharsets.UTF_8);
      // only the first bb.limit() bytes of the backing array are compared
      assertEquals(0, WritableComparator.compareBytes(
          utf8Text, 0, bb.limit(),
          utf8Java, 0, utf8Java.length));

      // test utf8 to string
      after = Text.decode(utf8Java);
      assertEquals(before, after);
    }
  }

  /**
   * Round-trips strings through {@code Text.writeString} and
   * {@code Text.readString}, and checks that the written payload can also be
   * decoded by Java's own UTF-8 decoder.
   */
  @Test
  public void testIO() throws Exception {
    DataOutputBuffer out = new DataOutputBuffer();
    DataInputBuffer in = new DataInputBuffer();

    for (int i = 0; i < NUM_ITERATIONS; i++) {
      // generate a random string; the first round uses a long one
      String before = (i == 0) ? getLongString() : getTestString();

      // write it
      out.reset();
      Text.writeString(out, before);

      // test that it reads correctly
      in.reset(out.getData(), out.getLength());
      String after = Text.readString(in);
      assertEquals(before, after);

      // Test compatibility with Java's other decoder: skip the vint length
      // prefix and decode the remaining bytes as UTF-8
      int strLenSize = WritableUtils.getVIntSize(Text.utf8Length(before));
      String after2 = new String(out.getData(), strLenSize,
          out.getLength() - strLenSize, StandardCharsets.UTF_8);
      assertEquals(before, after2);
    }
  }
  
  /**
   * Checks the length-limited variants of {@code Text.writeString} and
   * {@code Text.readString}: both are expected to fail with an
   * {@link IOException} for the limit {@code len} and to succeed for
   * {@code len + 1}.
   *
   * @param str the string to write and read back
   * @param len a limit expected to be too small for {@code str}
   * @throws IOException if one of the calls expected to succeed fails
   */
  public void doTestLimitedIO(String str, int len) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    DataInputBuffer in = new DataInputBuffer();

    out.reset();
    try {
      Text.writeString(out, str, len);
      fail("expected writeString to fail when told to write a string " +
          "that was too long!  The string was '" + str + "'");
    } catch (IOException expected) {
      // expected: the string does not fit within the given limit
    }
    Text.writeString(out, str, len + 1);

    // test that it reads correctly
    in.reset(out.getData(), out.getLength());
    // remember the start so the stream can be rewound after the failed read
    in.mark(len);
    try {
      Text.readString(in, len);
      fail("expected readString to fail when told to read a string " +
          "that was too long!  The string was '" + str + "'");
    } catch (IOException expected) {
      // expected: the stored string exceeds the given limit
    }
    in.reset();
    String after = Text.readString(in, len + 1);
    assertEquals(str, after);
  }

  /** Runs the limited I/O check for several string / limit pairs. */
  @Test
  public void testLimitedIO() throws Exception {
    final String[] inputs = {"abcd", "foo bar baz", "1"};
    final int[] limits = {3, 10, 0};
    for (int idx = 0; idx < inputs.length; idx++) {
      doTestLimitedIO(inputs[idx], limits[idx]);
    }
  }

  /**
   * Checks that comparing serialized {@code Text} values with
   * {@code Text.Comparator} agrees with {@code Text.compareTo}, and that two
   * {@code Text} objects with the same content compare as equal both ways.
   */
  @Test
  public void testCompare() throws Exception {
    DataOutputBuffer out1 = new DataOutputBuffer();
    DataOutputBuffer out2 = new DataOutputBuffer();
    DataOutputBuffer out3 = new DataOutputBuffer();
    Text.Comparator comparator = new Text.Comparator();
    for (int i = 0; i < NUM_ITERATIONS; i++) {
      // reset output buffer
      out1.reset();
      out2.reset();
      out3.reset();

      // generate two random strings; the first round uses long ones
      String str1;
      String str2;
      if (i == 0) {
        str1 = getLongString();
        str2 = getLongString();
      } else {
        str1 = getTestString();
        str2 = getTestString();
      }

      // convert to texts; txt3 has the same content as txt1
      Text txt1 = new Text(str1);
      Text txt2 = new Text(str2);
      Text txt3 = new Text(str1);

      // serialize them
      txt1.write(out1);
      txt2.write(out2);
      txt3.write(out3);

      // compare two strings by looking at their binary formats
      int ret1 = comparator.compare(out1.getData(), 0, out1.getLength(),
                                    out2.getData(), 0, out2.getLength());
      // compare two strings
      int ret2 = txt1.compareTo(txt2);

      assertEquals(ret1, ret2);

      assertEquals(0, txt1.compareTo(txt3),
          "Equivalence of different txt objects, same content");
      // each buffer is compared using its own length
      assertEquals(0, comparator.compare(out1.getData(), 0, out1.getLength(),
          out3.getData(), 0, out3.getLength()),
          "Equivalence of data output buffers");
    }
  }

  /**
   * Checks {@code Text.find} for absent substrings and for a non-ASCII
   * character, with and without a start position.
   */
  @Test
  public void testFind() throws Exception {
    final Text haystack = new Text("abcd\u20acbdcd\u20ac");

    // substrings that do not occur
    final int posAbd = haystack.find("abd");
    assertThat(posAbd).isEqualTo(-1);
    final int posAc = haystack.find("ac");
    assertThat(posAc).isEqualTo(-1);

    // the euro sign, searched from the beginning and from position 5
    final int firstEuro = haystack.find("\u20ac");
    assertThat(firstEuro).isEqualTo(4);
    final int secondEuro = haystack.find("\u20ac", 5);
    assertThat(secondEuro).isEqualTo(11);
  }

  /**
   * Checks that {@code Text.find} only sees the current content after the
   * text has been replaced with a shorter byte array.
   */
  @Test
  public void testFindAfterUpdatingContents() throws Exception {
    Text text = new Text("abcd");
    text.set("a".getBytes(StandardCharsets.UTF_8));
    // expected value first, so failure messages read correctly
    assertEquals(1, text.getLength());
    assertEquals(0, text.find("a"));
    // "b" was part of the old content only and must no longer be found
    assertEquals(-1, text.find("b"));
  }

  /**
   * Passes the bytes of a text containing non-ASCII characters to
   * {@code Text.validateUTF8}; the test passes when no exception is thrown.
   */
  @Test
  public void testValidate() throws Exception {
    final Text sample = new Text("abcd\u20acbdcd\u20ac");
    Text.validateUTF8(sample.getBytes(), 0, sample.getLength());
  }

  /**
   * Checks the lengths reported by an empty {@code Text} and the effect of
   * {@code clear()} on a non-empty one.
   */
  @Test
  public void testClear() throws Exception {
    // Part 1: a freshly constructed, empty text
    final Text empty = new Text();
    assertEquals("", empty.toString(),
        "Actual string on an empty text object must be an empty string");
    assertEquals(0, empty.getBytes().length,
        "Underlying byte array length must be zero");
    assertEquals(0, empty.getLength(), "String's length must be zero");
    assertEquals(0, empty.getTextLength(), "String's text length must be zero");

    // Part 2: clearing a text that previously held content
    final Text filled = new Text("abcd\u20acbdcd\u20ac");
    final int lengthBeforeClear = filled.getLength();
    filled.clear();
    assertEquals("", filled.toString(),
        "String must be empty after clear()");
    final int capacityAfterClear = filled.getBytes().length;
    assertTrue(capacityAfterClear >= lengthBeforeClear,
        "Length of the byte array must not decrease after clear()");
    assertEquals(0, filled.getLength(),
        "Length of the string must be reset to 0 after clear()");
    assertEquals(0, filled.getTextLength(),
        "Text length of the string must be reset to 0 after clear()");
  }

  /**
   * Checks that {@code set(Text)} copies the content (later changes to the
   * source do not show up in the target) and that {@code append} grows the
   * text as expected.
   */
  @Test
  public void testTextText() throws CharacterCodingException {
    Text a = new Text("abc");
    Text b = new Text("a");
    b.set(a);
    assertEquals("abc", b.toString());
    assertEquals(3, a.getTextLength());
    assertEquals(3, b.getTextLength());
    // append the four bytes "defg" taken from the middle of the array
    a.append("xdefgxxx".getBytes(StandardCharsets.UTF_8), 1, 4);
    assertEquals("abc", b.toString(), "modified aliased string");
    assertEquals("abcdefg", a.toString(), "appended string incorrectly");
    assertEquals(7, a.getTextLength(), "This should reflect in the length");
    // add an extra byte so that capacity = 10 and length = 8
    a.append(new byte[]{'d'}, 0, 1);
    assertEquals(10, a.getBytes().length);
    assertEquals(8, a.copyBytes().length);
  }
  
  private class ConcurrentEncodeDecodeThread extends Thread {
    public ConcurrentEncodeDecodeThread(String name) {
      super(name);
    }

    @Override
    public void run() {
      final String name = this.getName();
      DataOutputBuffer out = new DataOutputBuffer();
      DataInputBuffer in = new DataInputBuffer();
      for (int i=0; i < 1000; ++i) {
        try {
          out.reset();
          WritableUtils.writeString(out, name);
          
          in.reset(out.getData(), out.getLength());
          String s = WritableUtils.readString(in);
          
          assertEquals(name, s, "input buffer reset contents = " + name);
        } catch (Exception ioe) {
          throw new RuntimeException(ioe);
        }
      }
    }
  }

  /**
   * Runs two encode/decode workers concurrently, waits for both of them and
   * fails if either one terminated with an uncaught exception or error.
   */
  @Test
  public void testConcurrentEncodeDecode() throws Exception {
    Thread thread1 = new ConcurrentEncodeDecodeThread("apache");
    Thread thread2 = new ConcurrentEncodeDecodeThread("hadoop");

    // One slot per worker: written by that worker's handler, read only
    // after join(), which makes the write visible to this thread.
    final Throwable[] failures = new Throwable[2];
    thread1.setUncaughtExceptionHandler((t, e) -> failures[0] = e);
    thread2.setUncaughtExceptionHandler((t, e) -> failures[1] = e);

    thread1.start();
    thread2.start();

    // wait for BOTH workers
    thread1.join();
    thread2.join();

    for (Throwable failure : failures) {
      if (failure != null) {
        fail("worker thread failed: " + failure, failure);
      }
    }
  }

  /**
   * Checks the Avro reflect schema produced for {@code Text} after marking
   * the {@code org.apache.hadoop.io} package as serializable.
   */
  @Test
  public void testAvroReflect() throws Exception {
    // Avro expects explicitly stated, trusted packages used for (de-)serialization
    final String trustedPackages = "org.apache.hadoop.io";
    System.setProperty(ConfigConstants.CONFIG_AVRO_SERIALIZABLE_PACKAGES, trustedPackages);

    final Text value = new Text("foo");
    final String expectedSchema =
        "{\"type\":\"string\",\"java-class\":\"org.apache.hadoop.io.Text\"}";
    AvroTestUtil.testReflect(value, expectedSchema);
  }
  
  /**
   * test {@code Text.charAt}: every valid position of an ASCII string must
   * yield the same character as the source string, and positions outside
   * the text must yield -1
   */
  @Test
  public void testCharAt() {
    String line = "adsawseeeeegqewgasddga";
    Text text = new Text(line);
    for (int i = 0; i < line.length(); i++) {
      // assertEquals reports the differing values on failure
      assertEquals(line.charAt(i), text.charAt(i), "testCharAt error1 !!!");
    }
    assertEquals(-1, text.charAt(-1), "testCharAt error2 !!!");
    assertEquals(-1, text.charAt(100), "testCharAt error3 !!!");
  }
  
  /**
   * test {@code Text} readFields/write operations: a length-prefixed byte
   * sequence is read into a {@code Text}, which is then written out again
   */
  @Test
  public void testReadWriteOperations() {
    String line = "adsawseeeeegqewgasddga";
    byte[] inputBytes = line.getBytes(StandardCharsets.UTF_8);
    // prefix the 22 payload bytes with their length
    inputBytes = Bytes.concat(new byte[] {(byte) 22}, inputBytes);

    DataInputBuffer in = new DataInputBuffer();
    DataOutputBuffer out = new DataOutputBuffer();
    Text text = new Text(line);
    try {
      in.reset(inputBytes, inputBytes.length);
      text.readFields(in);
    } catch (Exception ex) {
      // keep the cause so the failure is diagnosable
      fail("testReadFields error !!!", ex);
    }
    assertEquals(line, text.toString());

    try {
      text.write(out);
    } catch (Exception ex) {
      // includes IOException: a failed write must not pass silently
      fail("testReadWriteOperations error !!!", ex);
    }
    // the serialized form is the length prefix plus the payload
    assertEquals(inputBytes.length, out.getLength());
  }

  /**
   * Reads prefixes of different lengths from the same input with
   * {@code readWithKnownLength} and checks content and text length each time.
   */
  @Test
  public void testReadWithKnownLength() throws IOException {
    final byte[] encoded = "hello world".getBytes(StandardCharsets.UTF_8);
    final DataInputBuffer input = new DataInputBuffer();
    final Text target = new Text();

    // initial read, then a longer read (text must lengthen),
    // then a shorter read (text must shorten)
    final int[] lengths = {5, 7, 2};
    final String[] expected = {"hello", "hello w", "he"};
    for (int step = 0; step < lengths.length; step++) {
      input.reset(encoded, encoded.length);
      target.readWithKnownLength(input, lengths[step]);
      assertEquals(expected[step], target.toString());
      assertEquals(lengths[step], target.getTextLength());
    }
  }
  
  /**
   * test {@code Text.bytesToCodePoint(bytes) } on a six-byte buffer:
   * the call must not throw and must leave the buffer position at 6
   */
  @Test
  public void testBytesToCodePoint() {
    ByteBuffer bytes = ByteBuffer.wrap(new byte[] {-2, 45, 23, 12, 76, 89});
    try {
      Text.bytesToCodePoint(bytes);
    } catch (Exception e) {
      // covers BufferUnderflowException too; keep the cause for diagnosis
      fail("testBytesToCodePoint unexp exception", e);
    }
    assertEquals(6, bytes.position(), "testBytesToCodePoint error !!!");
  }

  /**
   * test {@code Text.bytesToCodePoint(bytes) } on a one-byte buffer holding
   * only the byte -2: a {@code BufferUnderflowException} is expected
   */
  @Test
  public void testbytesToCodePointWithInvalidUTF() {
    try {
      Text.bytesToCodePoint(ByteBuffer.wrap(new byte[] {-2}));
      fail("testbytesToCodePointWithInvalidUTF error: "
          + "expected BufferUnderflowException was not thrown !!!");
    } catch (BufferUnderflowException expected) {
      // expected outcome for this input
    } catch (Exception e) {
      fail("testbytesToCodePointWithInvalidUTF error unexp exception !!!", e);
    }
  }

  /**
   * Checks {@code Text.utf8Length} for single-character strings on both
   * sides of the one-byte / two-byte boundary.
   */
  @Test
  public void testUtf8Length() {
    // char values and the UTF-8 length expected for each of them
    final int[] charValues = {1, 127, 128, 193, 225, 254};
    final int[] expectedLengths = {1, 1, 2, 2, 2, 2};

    for (int idx = 0; idx < charValues.length; idx++) {
      final String single = String.valueOf((char) charValues[idx]);
      assertEquals(expectedLengths[idx], Text.utf8Length(single),
          "testUtf8Length" + charValues[idx] + " error !!!");
    }
  }

  /**
   * Checks the three length accessors of {@code Text} after constructing it
   * from a 100-byte array and after setting it to an empty array.
   */
  @Test
  public void testSetBytes(){
    final Text subject = new Text(new byte[100]);
    assertSetBytesLengths(subject, 100);

    subject.set(new byte[0]);
    assertSetBytesLengths(subject, 0);
  }

  /** Asserts that length, backing array size and text length all equal {@code size}. */
  private static void assertSetBytesLengths(Text subject, int size) {
    final String prefix = "testSetBytes" + size;
    assertEquals(size, subject.getLength(),
        prefix + " getLength error !");
    assertEquals(size, subject.getBytes().length,
        prefix + " getBytes.length error !");
    assertEquals(size, subject.getTextLength(),
        prefix + " getTextLength error !");
  }
}