// DictionaryEncoderBenchmarks.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.arrow.vector.dictionary;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

/**
 * JMH benchmarks for {@link DictionaryEncoder}.
 *
 * <p>The benchmark state consists of a dictionary vector holding {@link #KEY_SIZE} distinct
 * lower-case keys and a data vector holding {@link #DATA_SIZE} values drawn at random from those
 * keys. The input is generated from a fixed seed so that separate runs encode the same data.
 */
@State(Scope.Benchmark)
public class DictionaryEncoderBenchmarks {

  /** Number of values stored in the vector that gets encoded. */
  private static final int DATA_SIZE = 1000;

  /** Number of distinct keys stored in the dictionary vector. */
  private static final int KEY_SIZE = 100;

  /** Length, in characters, of every generated key. */
  private static final int KEY_LENGTH = 10;

  /** Characters that generated keys are composed of. */
  private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";

  /** Fixed seed so that the generated benchmark input is reproducible between runs. */
  private static final long RANDOM_SEED = 0L;

  /** Distinct keys; the key at index {@code i} is stored at index {@code i} of the dictionary. */
  private final List<String> keys = new ArrayList<>();

  /** Single source of randomness for input generation; re-seeded in {@link #prepare()}. */
  private Random random = new Random(RANDOM_SEED);

  private BufferAllocator allocator;

  private VarCharVector vector;

  private VarCharVector dictionaryVector;

  /** Generates the keys and populates the data and dictionary vectors. */
  @Setup
  public void prepare() {
    // Start from a known state so every invocation of the setup produces the same input.
    random = new Random(RANDOM_SEED);
    keys.clear();
    for (int i = 0; i < KEY_SIZE; i++) {
      keys.add(generateUniqueKey(KEY_LENGTH));
    }

    allocator = new RootAllocator(10 * 1024 * 1024);

    vector = new VarCharVector("vector", allocator);
    dictionaryVector = new VarCharVector("dict", allocator);

    // Values are written first and the value count is set afterwards, following the
    // populate-then-setValueCount order of the Arrow vector life cycle.
    vector.allocateNew(10240, DATA_SIZE);
    for (int i = 0; i < DATA_SIZE; i++) {
      byte[] value = keys.get(generateRandomIndex(KEY_SIZE)).getBytes(StandardCharsets.UTF_8);
      vector.setSafe(i, value, 0, value.length);
    }
    vector.setValueCount(DATA_SIZE);

    dictionaryVector.allocateNew(1024, KEY_SIZE);
    for (int i = 0; i < KEY_SIZE; i++) {
      byte[] value = keys.get(i).getBytes(StandardCharsets.UTF_8);
      dictionaryVector.setSafe(i, value, 0, value.length);
    }
    dictionaryVector.setValueCount(KEY_SIZE);
  }

  /** Releases the vectors, the generated keys and finally the allocator. */
  @TearDown
  public void tearDown() {
    vector.close();
    dictionaryVector.close();
    keys.clear();
    // The allocator is closed last, after the vectors allocated from it.
    allocator.close();
  }

  /**
   * Encodes the data vector against the dictionary with {@link DictionaryEncoder}.
   *
   * @return the value count of the encoded vector, so that the value handed back to JMH depends
   *     on the encoding result rather than being a constant
   */
  @Benchmark
  @BenchmarkMode(Mode.AverageTime)
  @OutputTimeUnit(TimeUnit.NANOSECONDS)
  public int testEncode() {
    Dictionary dictionary =
        new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
    // try-with-resources closes the encoded vector even if reading from it throws.
    try (ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
      return encoded.getValueCount();
    }
  }

  /**
   * Draws a pseudo-random index.
   *
   * @param max exclusive upper bound of the index
   * @return a value in the range {@code [0, max)}
   */
  private int generateRandomIndex(int max) {
    return random.nextInt(max);
  }

  /**
   * Generates a random lower-case key that is not yet contained in {@link #keys}.
   *
   * @param length number of characters in the key
   * @return a key of the requested length that differs from every key generated so far
   */
  private String generateUniqueKey(int length) {
    while (true) {
      StringBuilder candidate = new StringBuilder(length);
      for (int i = 0; i < length; i++) {
        candidate.append(ALPHABET.charAt(random.nextInt(ALPHABET.length())));
      }
      String key = candidate.toString();
      // Retry on the (unlikely) collision with an already generated key.
      if (!keys.contains(key)) {
        return key;
      }
    }
  }

  /**
   * Runs the benchmarks of this class in a single fork.
   *
   * @param args ignored
   * @throws RunnerException if the JMH runner fails
   */
  public static void main(String[] args) throws RunnerException {
    Options opt =
        new OptionsBuilder()
            .include(DictionaryEncoderBenchmarks.class.getSimpleName())
            .forks(1)
            .build();

    new Runner(opt).run();
  }
}