DataSplitter.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

/**
 * Splits corpus data into train/dev/test sets, stratified by language.
 * <p>
 * The split is done per-language so each language is represented
 * proportionally in all three sets. Default split ratio is 80/10/10.
 * </p>
 * <p>
 * Languages are processed in the order in which they first appear in the input
 * and shuffled with a {@link Random} created from the configured seed, so the
 * same input list and seed always yield the same split.
 * </p>
 * <p>
 * The split files are written to disk as tab-delimited files
 * ({@code language\ttext}), one record per line, so a split can be
 * reloaded later with {@link #readSplitFile(Path)}.
 * </p>
 */
public class DataSplitter {

    /** File name used for the training portion of the split. */
    public static final String TRAIN_FILE = "train.txt";
    /** File name used for the development portion of the split. */
    public static final String DEV_FILE = "dev.txt";
    /** File name used for the test portion of the split. */
    public static final String TEST_FILE = "test.txt";

    private final float trainRatio;
    private final float devRatio;
    // testRatio = 1 - trainRatio - devRatio

    private final long seed;

    /**
     * Creates a splitter with an 80/10/10 train/dev/test ratio and seed 42.
     */
    public DataSplitter() {
        this(0.8f, 0.1f, 42L);
    }

    /**
     * Creates a splitter with the given ratios and random seed. Whatever is
     * left after the train and dev fractions goes to the test set.
     *
     * @param trainRatio fraction of each language assigned to train; must be {@code >= 0}
     * @param devRatio   fraction of each language assigned to dev; must be {@code >= 0}
     * @param seed       seed for the shuffle
     * @throws IllegalArgumentException if either ratio is negative or NaN, or if
     *                                  {@code trainRatio + devRatio >= 1.0}
     */
    public DataSplitter(float trainRatio, float devRatio, long seed) {
        // Negated comparisons so that NaN is rejected together with negative values;
        // either would otherwise surface later as a bad subList range or a silent
        // "everything goes to test" split.
        if (!(trainRatio >= 0.0f) || !(devRatio >= 0.0f)) {
            throw new IllegalArgumentException(
                    "trainRatio and devRatio must be >= 0: " + trainRatio + ", " + devRatio);
        }
        if (trainRatio + devRatio >= 1.0f) {
            throw new IllegalArgumentException(
                    "trainRatio + devRatio must be < 1.0: " + trainRatio + " + " + devRatio);
        }
        this.trainRatio = trainRatio;
        this.devRatio = devRatio;
        this.seed = seed;
    }

    /**
     * Split the given sentences into train/dev/test and write them to the output directory.
     * The directory is created if it does not exist.
     *
     * @param sentences all labeled sentences
     * @param outputDir directory to write train.txt, dev.txt, test.txt
     * @return a SplitResult containing the three lists
     * @throws IOException if writing fails
     */
    public SplitResult splitAndWrite(List<LabeledSentence> sentences, Path outputDir)
            throws IOException {
        SplitResult result = split(sentences);
        Files.createDirectories(outputDir);
        writeFile(outputDir.resolve(TRAIN_FILE), result.train);
        writeFile(outputDir.resolve(DEV_FILE), result.dev);
        writeFile(outputDir.resolve(TEST_FILE), result.test);
        return result;
    }

    /**
     * Split sentences into train/dev/test, stratified by language.
     * <p>
     * Each language's sentences are shuffled and cut at {@code n * trainRatio} and
     * {@code n * devRatio} (both truncated toward zero); the remainder goes to test.
     * The input list is not modified.
     * </p>
     *
     * @param sentences all labeled sentences
     * @return the three resulting lists
     */
    public SplitResult split(List<LabeledSentence> sentences) {
        // Group by language. The map is only used for lookup; the separate list
        // records groups in order of first appearance so that iteration does not
        // depend on HashMap's unspecified ordering. That matters because a single
        // RNG is shared by all languages: a different visiting order would change
        // every language's shuffle.
        Map<String, List<LabeledSentence>> byLang = new HashMap<>();
        List<List<LabeledSentence>> groups = new ArrayList<>();
        for (LabeledSentence s : sentences) {
            String lang = s.getLanguage();
            List<LabeledSentence> group = byLang.get(lang);
            if (group == null) {
                group = new ArrayList<>();
                byLang.put(lang, group);
                groups.add(group);
            }
            group.add(s);
        }

        List<LabeledSentence> train = new ArrayList<>();
        List<LabeledSentence> dev = new ArrayList<>();
        List<LabeledSentence> test = new ArrayList<>();

        Random rng = new Random(seed);

        for (List<LabeledSentence> group : groups) {
            // The group list was created above and is private to this call,
            // so it can be shuffled in place without copying.
            Collections.shuffle(group, rng);

            int n = group.size();
            int trainEnd = (int) (n * trainRatio);
            int devEnd = trainEnd + (int) (n * devRatio);

            train.addAll(group.subList(0, trainEnd));
            dev.addAll(group.subList(trainEnd, devEnd));
            test.addAll(group.subList(devEnd, n));
        }

        return new SplitResult(train, dev, test);
    }

    /**
     * Read a split file back into labeled sentences.
     * <p>
     * Each line is split at its first tab into language and text. Lines without a
     * tab, or with an empty language field, are skipped. The file is read line by
     * line rather than loaded as a whole.
     * </p>
     *
     * @param file the UTF-8 encoded split file
     * @return the sentences in file order
     * @throws IOException if reading fails
     */
    public static List<LabeledSentence> readSplitFile(Path file) throws IOException {
        List<LabeledSentence> sentences = new ArrayList<>();
        try (BufferedReader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                int tab = line.indexOf('\t');
                if (tab > 0) {
                    sentences.add(new LabeledSentence(line.substring(0, tab),
                            line.substring(tab + 1)));
                }
            }
        }
        return sentences;
    }

    /**
     * Writes one {@code language\ttext} record per line, UTF-8 encoded.
     * Line breaks inside the text are replaced by spaces so that every record
     * stays on a single line and can be read back by {@link #readSplitFile(Path)}.
     */
    private void writeFile(Path file, List<LabeledSentence> sentences) throws IOException {
        try (BufferedWriter writer = Files.newBufferedWriter(file, StandardCharsets.UTF_8)) {
            for (LabeledSentence s : sentences) {
                writer.write(s.getLanguage());
                writer.write('\t');
                writer.write(toSingleLine(s.getText()));
                writer.newLine();
            }
        }
    }

    /**
     * Replaces carriage returns and line feeds with spaces. Tabs are left alone:
     * the reader splits on the first tab only, so tabs in the text round-trip.
     */
    private static String toSingleLine(String text) {
        if (text.indexOf('\n') < 0 && text.indexOf('\r') < 0) {
            return text;
        }
        return text.replace('\n', ' ').replace('\r', ' ');
    }

    /**
     * Holder for the three lists produced by a split.
     */
    public static class SplitResult {
        /** Sentences assigned to the training set. */
        public final List<LabeledSentence> train;
        /** Sentences assigned to the development set. */
        public final List<LabeledSentence> dev;
        /** Sentences assigned to the test set. */
        public final List<LabeledSentence> test;

        SplitResult(List<LabeledSentence> train, List<LabeledSentence> dev,
                    List<LabeledSentence> test) {
            this.train = train;
            this.dev = dev;
            this.test = test;
        }
    }
}