BCP47Validator.java

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2025 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package com.itextpdf.kernel.utils.checkers;

import java.util.regex.Pattern;

/**
 * This class is a validator for IETF BCP 47 language tags (RFC 5646).
 *
 * <p>
 * The check is purely syntactic: a tag is accepted when it matches the {@code Language-Tag} ABNF production
 * from RFC 5646, section 2.1. Subtags are not looked up in any registry, and repeated variants or repeated
 * extension singletons are not detected.
 *
 * <p>
 * Matching is ASCII case-insensitive, because RFC 5646, section 2.1.1 states that tags and their subtags
 * are to be treated as case-insensitive.
 */
public final class BCP47Validator {
    // regular = grandfathered tags that happen to match the 'langtag' production.
    private static final String REGULAR =
            "(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)";
    // irregular = grandfathered tags that do not match the 'langtag' production.
    private static final String IRREGULAR = "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|" +
            "i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)";
    // grandfathered = irregular / regular
    private static final String GRANDFATHERED = "(?<grandfathered>" + IRREGULAR + "|" + REGULAR + ")";
    // privateuse = "x" 1*("-" (1*8alphanum))
    private static final String PRIVATE_USE = "(?<privateUse>x(-[A-Za-z0-9]{1,8})+)";
    // singleton = any alphanumeric character except "x"/"X", which is reserved for private use.
    private static final String SINGLETON = "[0-9A-WY-Za-wy-z]";
    // extension = singleton 1*("-" (2*8alphanum))
    private static final String EXTENSION = "(?<extension>" + SINGLETON + "(-[A-Za-z0-9]{2,8})+)";
    // variant = 5*8alphanum / (DIGIT 3alphanum)
    private static final String VARIANT = "(?<variant>[A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3})";
    // region = 2ALPHA / 3DIGIT
    private static final String REGION = "(?<region>[A-Za-z]{2}|[0-9]{3})";
    // script = 4ALPHA
    private static final String SCRIPT = "(?<script>[A-Za-z]{4})";
    // extlang = 3ALPHA *2("-" 3ALPHA)
    private static final String EXTLANG = "(?<extlang>[A-Za-z]{3}(-[A-Za-z]{3}){0,2})";
    // language = 2*3ALPHA ["-" extlang] / 4ALPHA / 5*8ALPHA
    private static final String LANGUAGE = "(?<language>([A-Za-z]{2,3}(-" + EXTLANG + ")?)|[A-Za-z]{4}|[A-Za-z]{5,8})";
    // langtag = language ["-" script] ["-" region] *("-" variant) *("-" extension) ["-" privateuse]
    private static final String LANGTAG = "(" + LANGUAGE + "(-" + SCRIPT + ")?" + "(-" + REGION + ")?"
            + "(-" + VARIANT + ")*" + "(-" + EXTENSION + ")*" + "(-" + PRIVATE_USE + ")?" + ")";
    // Java regular expressions do not allow duplicate named capturing groups,
    // so the second use of <privateUse> is renamed to <privateUse1>.
    //
    // CASE_INSENSITIVE is used without UNICODE_CASE on purpose: only US-ASCII letters are folded, so the
    // literal "x" singleton and the grandfathered tags match in any letter case while non-ASCII look-alikes
    // are still rejected.
    private static final Pattern LANGUAGE_TAG_PATTERN = Pattern.compile("^(" + GRANDFATHERED + "|" + LANGTAG + "|" +
            PRIVATE_USE.replace("privateUse", "privateUse1") + ")$", Pattern.CASE_INSENSITIVE);

    private BCP47Validator() {
        // Private constructor will prevent the instantiation of this class directly.
    }

    /**
     * Validate language tag against the syntax defined in RFC 5646.
     *
     * <p>
     * The whole string has to be a well-formed tag; letter case is not significant.
     *
     * @param languageTag language tag string, may be {@code null}
     *
     * @return {@code true} if it is a well-formed tag, {@code false} otherwise (including for {@code null})
     */
    public static boolean validate(String languageTag) {
        if (languageTag == null) {
            // A missing tag is simply not a valid tag; avoid the NullPointerException from Pattern#matcher.
            return false;
        }
        return LANGUAGE_TAG_PATTERN.matcher(languageTag).matches();
    }
}