JsonlReader.java

package tech.tablesaw.io.jsonl;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import com.github.wnameless.json.flattener.JsonFlattener;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import tech.tablesaw.api.Table;
import tech.tablesaw.io.DataReader;
import tech.tablesaw.io.ReadOptions;
import tech.tablesaw.io.ReaderRegistry;
import tech.tablesaw.io.RuntimeIOException;
import tech.tablesaw.io.Source;
import tech.tablesaw.io.TableBuildingUtils;

/**
 * Reads JSON Lines input (a stream of consecutive JSON values) into a {@link Table}.
 *
 * <p>Each JSON value read from the source becomes one row. Rows are flattened with {@link
 * JsonFlattener} before the columns are derived, so the resulting column names are the field names
 * of the flattened objects. Rows that lack a given field get a {@code null} cell for that column.
 *
 * <p>Loading this class registers a shared instance with {@link Table#defaultReaderRegistry} for
 * the {@code jsonl} extension, a set of JSON Lines mime types, and {@link JsonlReadOptions}.
 */
public class JsonlReader implements DataReader<JsonlReadOptions> {

  private static final JsonlReader INSTANCE = new JsonlReader();
  private static final ObjectMapper mapper = new ObjectMapper();

  static {
    register(Table.defaultReaderRegistry);
  }

  /**
   * Registers the shared reader instance with the given registry for the {@code jsonl} file
   * extension, the known JSON Lines mime types, and the {@link JsonlReadOptions} options class.
   *
   * @param registry the registry to add this reader to
   */
  public static void register(ReaderRegistry registry) {
    // no standard exists yet. taken from https://murex.rocks/types/jsonl.html#more-information
    registry.registerExtension("jsonl", INSTANCE);
    registry.registerMimeType("application/json-lines", INSTANCE);
    registry.registerMimeType("application/jsonl", INSTANCE);
    registry.registerMimeType("application/jsonlines", INSTANCE);
    registry.registerMimeType("application/ldjson", INSTANCE);
    registry.registerMimeType("application/ndjson", INSTANCE);
    registry.registerMimeType("application/x-json-lines", INSTANCE);
    registry.registerMimeType("application/x-jsonl", INSTANCE);
    registry.registerMimeType("application/x-jsonlines", INSTANCE);
    registry.registerMimeType("application/x-ldjson", INSTANCE);
    registry.registerMimeType("application/x-ndjson", INSTANCE);
    registry.registerMimeType("text/json-lines", INSTANCE);
    registry.registerMimeType("text/jsonl", INSTANCE);
    registry.registerMimeType("text/jsonlines", INSTANCE);
    registry.registerMimeType("text/ldjson", INSTANCE);
    registry.registerMimeType("text/ndjson", INSTANCE);
    registry.registerMimeType("text/x-json-lines", INSTANCE);
    registry.registerMimeType("text/x-jsonl", INSTANCE);
    registry.registerMimeType("text/x-jsonlines", INSTANCE);
    registry.registerMimeType("text/x-ldjson", INSTANCE);
    registry.registerMimeType("text/x-ndjson", INSTANCE);
    registry.registerOptions(JsonlReadOptions.class, INSTANCE);
  }

  /**
   * Reads every JSON value from the source described by {@code options} and builds a table from
   * them.
   *
   * <p>The reader and the parser are closed before this method returns, whether it completes
   * normally or throws.
   *
   * @param options the read options, including the source to read from
   * @return the table built from the parsed rows
   * @throws RuntimeIOException if opening, reading, or closing the source fails with an {@link
   *     IOException}
   */
  @Override
  public Table read(JsonlReadOptions options) {
    ObjectReader stream = mapper.readerFor(JsonNode.class);
    // try-with-resources: previously neither the reader nor the parser was closed explicitly, so a
    // failure part-way through the input could leave the underlying source open.
    try (Reader reader = options.source().createReader(null);
        JsonParser parser = stream.createParser(reader)) {
      Iterator<JsonNode> iter = stream.readValues(parser);
      // convertObjects fully drains the iterator, so the table is complete before the close.
      return convertObjects(iter, options);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Flattens every row supplied by {@code iter} and converts the result into a table.
   *
   * <p>Columns appear in the order their field names are first encountered across the flattened
   * rows. Cell values are the {@code asText()} rendering of each field; a field that is absent from
   * a row yields {@code null}.
   *
   * @param iter the parsed rows, consumed completely by this method
   * @param options the read options handed on to {@link TableBuildingUtils#build}
   * @return the table built from the flattened rows
   * @throws RuntimeIOException if a row cannot be serialized or its flattened form re-parsed
   */
  private Table convertObjects(Iterator<JsonNode> iter, ReadOptions options) {
    List<JsonNode> flattenedRows = new ArrayList<>();
    // LinkedHashSet keeps first-seen order while de-duplicating names shared between rows.
    Set<String> colNames = new LinkedHashSet<>();

    while (iter.hasNext()) {
      JsonNode rowObj = iter.next();
      JsonNode flattenedRow;
      try {
        // Parse each flattened row on its own instead of concatenating all rows into one large
        // JSON array string and re-parsing it; the resulting nodes are the same.
        String flattenedJson = JsonFlattener.flatten(mapper.writeValueAsString(rowObj));
        flattenedRow = mapper.readTree(flattenedJson);
      } catch (JsonProcessingException e) {
        throw new RuntimeIOException(e);
      }
      flattenedRow.fieldNames().forEachRemaining(colNames::add);
      flattenedRows.add(flattenedRow);
    }

    List<String> columnNames = new ArrayList<>(colNames);
    List<String[]> dataRows = new ArrayList<>(flattenedRows.size());
    for (JsonNode node : flattenedRows) {
      String[] cells = new String[columnNames.size()];
      for (int i = 0; i < cells.length; i++) {
        // get() returns null when the field is absent, which is exactly the case has() rejected.
        JsonNode value = node.get(columnNames.get(i));
        cells[i] = value == null ? null : value.asText();
      }
      dataRows.add(cells);
    }

    return TableBuildingUtils.build(columnNames, dataRows, options);
  }

  /**
   * Reads the given source using default {@link JsonlReadOptions}.
   *
   * @param source the source to read from
   * @return the table built from the parsed rows
   */
  @Override
  public Table read(Source source) {
    return read(JsonlReadOptions.builder(source).build());
  }
}