DataFrameJoinerPerformanceTest.java

package tech.tablesaw.perf;

import static org.junit.jupiter.api.Assertions.assertTimeout;
import static tech.tablesaw.joining.JoinType.FULL_OUTER;
import static tech.tablesaw.joining.JoinType.LEFT_OUTER;

import java.time.Duration;
import java.util.*;
import java.util.stream.IntStream;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import tech.tablesaw.api.IntColumn;
import tech.tablesaw.api.Table;

@Tag("Slow")
@Tag("Flaky")
public class DataFrameJoinerPerformanceTest {

  private static final long SEED = 200L;

  private static final int CUSTOMER_COUNT = 1_000;
  private static final int ORDER_COUNT = 10_000;

  private static final int CUSTOMER_FILL_COL_COUNT = 5;
  private static final int ORDER_FILL_COL_COUNT = 5;

  private static final int TIME_OUT_MILLIES = 1000;

  private static Table customers;
  private static Table orders;

  private static final Map<Integer, Integer> REGION_MAP = new HashMap<>();

  @BeforeAll
  static void setup() {
    customers = createCustomersTable(CUSTOMER_COUNT);
    orders = createOrdersTable(ORDER_COUNT, CUSTOMER_COUNT);
    addFillerColumns(customers, orders, CUSTOMER_FILL_COL_COUNT, ORDER_FILL_COL_COUNT);
  }

  private static void addFillerColumn(Table table, int numberColumnsToAdd, String prefix) {
    int[] filler = new int[table.rowCount()];
    Arrays.fill(filler, 1);
    IntColumn col = IntColumn.create("temp", filler);
    for (int i = 0; i < numberColumnsToAdd; i++) {
      table.addColumns(col.copy().setName(prefix + "_appendColumn" + i));
    }
  }

  private static Table createCustomersTable(int numberCustomers) {
    Random random = new Random(SEED);

    Table customersTable = Table.create("customers");
    IntColumn customerIds =
        IntColumn.create("customerId", IntStream.range(0, numberCustomers).toArray());
    IntColumn regions = IntColumn.create("region", numberCustomers);
    for (int i = 0; i < numberCustomers; i++) {
      int val = random.nextInt(49);
      REGION_MAP.put(customerIds.get(i), val);
      regions.set(i, val);
    }
    customersTable.addColumns(customerIds, regions);
    return customersTable;
  }

  private static Table createOrdersTable(int numberOrders, int numberCustomers) {
    Table ordersTable = Table.create("orders");
    Random random = new Random(SEED);
    IntColumn orderCustomerIds =
        IntColumn.create(
            "customerId",
            random
                .doubles()
                .limit(numberOrders)
                .mapToInt(randomDouble -> (int) Math.floor(randomDouble * numberCustomers))
                .toArray());

    IntColumn regions = IntColumn.create("region", numberOrders);
    for (int i = 0; i < numberOrders; i++) {
      int customer = orderCustomerIds.getInt(i);
      int region = REGION_MAP.get(customer);
      regions.set(i, region);
    }
    ordersTable.addColumns(orderCustomerIds, regions);
    return ordersTable;
  }

  @Test
  public void innerJoinCustomersFirst() {
    assertTimeout(
        Duration.ofMillis(TIME_OUT_MILLIES),
        () -> customers.joinOn("customerId").with(orders).allowDuplicateColumnNames(true).join());
  }

  @Test
  public void innerJoinCustomersFirst2() {
    assertTimeout(
        Duration.ofMillis(TIME_OUT_MILLIES),
        () ->
            customers
                .joinOn("customerId", "region")
                .with(orders)
                .allowDuplicateColumnNames(true)
                .join());
  }

  @Test
  public void innerJoinOrdersFirst() {
    assertTimeout(
        Duration.ofMillis(TIME_OUT_MILLIES),
        () -> orders.joinOn("customerId").with(customers).allowDuplicateColumnNames(true).join());
  }

  @Test
  public void leftOuterOrdersFirst() {
    assertTimeout(
        Duration.ofMillis(TIME_OUT_MILLIES),
        () ->
            orders
                .joinOn("customerId")
                .with(customers)
                .type(LEFT_OUTER)
                .allowDuplicateColumnNames(true)
                .join());
  }

  @Test
  public void leftOuterCustomersFirst() {
    assertTimeout(
        Duration.ofMillis(TIME_OUT_MILLIES),
        () ->
            customers
                .joinOn("customerId")
                .with(orders)
                .type(LEFT_OUTER)
                .allowDuplicateColumnNames(true)
                .join());
  }

  @Test
  public void fullOuterJoin() {
    assertTimeout(
        Duration.ofMillis(TIME_OUT_MILLIES),
        () ->
            customers
                .joinOn("customerId")
                .with(orders)
                .type(FULL_OUTER)
                .allowDuplicateColumnNames(true)
                .join());
  }

  private static void addFillerColumns(
      Table customers, Table orders, int customerFillCols, int orderFillCols) {
    addFillerColumn(customers, customerFillCols, "customer");
    addFillerColumn(orders, orderFillCols, "order");
  }
}