/src/icu/icu4c/source/common/loclikely.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 1997-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  loclikely.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2010feb25
*   created by: Markus W. Scherer
*
*   Code for likely and minimized locale subtags, separated out from other .cpp files
*   that then do not depend on resource bundle code and likely-subtags data.
*/

#include <string_view>
#include <utility>

#include "unicode/bytestream.h"
#include "unicode/utypes.h"
#include "unicode/locid.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/uscript.h"
#include "bytesinkutil.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "loclikelysubtags.h"
#include "ulocimp.h"

namespace {

/**
 * Write a locale ID assembled from the supplied subtags to a sink.
 *
 * The lang, script, region, variant and trailing pointers may be nullptr.
 * If they are, their corresponding length parameters must be less than or
 * equal to 0; any subtag whose length is <= 0 is simply skipped.
 *
 * The output has the shape lang[_script][_region][_variant][trailing].
 * When a variant is present but no region is, an additional "_" is emitted
 * so that the variant still sits in the variant position ("lang__variant").
 *
 * If langLength, scriptLength or regionLength is not smaller than the
 * corresponding ULOC_*_CAPACITY constant, err is set to
 * U_ILLEGAL_ARGUMENT_ERROR and nothing is written.
 *
 * @param lang The language tag to use.
 * @param langLength The length of the language tag.
 * @param script The script tag to use.
 * @param scriptLength The length of the script tag.
 * @param region The region tag to use.
 * @param regionLength The length of the region tag.
 * @param variant The variant tag to use.
 * @param variantLength The length of the variant tag.
 * @param trailing Any trailing data to append to the new tag.
 * @param trailingLength The length of the trailing data.
 * @param sink The output sink receiving the tag string.
 * @param err A reference to a UErrorCode for error reporting. If it already
 *            indicates a failure on entry, the function does nothing.
 **/
void U_CALLCONV
createTagStringWithAlternates(
    const char* lang,
    int32_t langLength,
    const char* script,
    int32_t scriptLength,
    const char* region,
    int32_t regionLength,
    const char* variant,
    int32_t variantLength,
    const char* trailing,
    int32_t trailingLength,
    icu::ByteSink& sink,
    UErrorCode& err) {
    if (U_FAILURE(err)) {
        return;
    }

    if (langLength >= ULOC_LANG_CAPACITY ||
            scriptLength >= ULOC_SCRIPT_CAPACITY ||
            regionLength >= ULOC_COUNTRY_CAPACITY) {
        err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (langLength > 0) {
        sink.Append(lang, langLength);
    }

    if (scriptLength > 0) {
        sink.Append("_", 1);
        sink.Append(script, scriptLength);
    }

    const bool hasRegion = regionLength > 0;
    if (hasRegion) {
        sink.Append("_", 1);
        sink.Append(region, regionLength);
    }

    if (variantLength > 0) {
        if (!hasRegion) {
            /*
             * No region was written (its length may be 0 or negative), so an
             * extra separator is required to keep the variant in its slot.
             */
            sink.Append("_", 1);
        }
        sink.Append("_", 1);
        sink.Append(variant, variantLength);
    }

    if (trailingLength > 0) {
        /* The trailing data is appended verbatim, without a separator. */
        sink.Append(trailing, trailingLength);
    }
}

/**
 * Check that no separator-delimited piece of the variant string is longer
 * than 8 characters.
 *
 * @param variant The variant string; only the first variantLength characters
 *                are examined.
 * @param variantLength The number of characters to examine.
 * @return false as soon as a run of more than 8 consecutive non-separator
 *         characters (as classified by _isIDSeparator) is found, else true.
 */
bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
    constexpr int32_t kMaxRunLength = 8;
    int32_t runLength = 0;
    for (int32_t pos = 0; pos < variantLength; ++pos) {
        if (_isIDSeparator(variant[pos])) {
            // A separator starts a new piece.
            runLength = 0;
            continue;
        }
        if (++runLength > kMaxRunLength) {
            return false;
        }
    }
    return true;
}

void
_uloc_addLikelySubtags(const char* localeID,
                       icu::ByteSink& sink,
                       UErrorCode& err) {
    if (U_FAILURE(err)) {
        return;
    }

    if (localeID == nullptr) {
        err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    icu::CharString lang;
    icu::CharString script;
    icu::CharString region;
    icu::CharString variant;
    const char* trailing = nullptr;
    ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
    if (U_FAILURE(err)) {
        return;
    }

    if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
        err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (lang.length() == 4) {
        if (script.isEmpty()) {
            script = std::move(lang);
            lang.clear();
        } else {
            err = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
    } else if (lang.length() > 8) {
        err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));

    const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
    if (U_FAILURE(err)) {
        return;
    }
    // We need to keep l on the stack because lsr may point into internal
    // memory of l.
    icu::Locale l = icu::Locale::createFromName(localeID);
    if (l.isBogus()) {
        err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
    if (U_FAILURE(err)) {
        return;
    }
    const char* language = lsr.language;
    if (uprv_strcmp(language, "und") == 0) {
        language = "";
    }
    createTagStringWithAlternates(
        language,
        static_cast<int32_t>(uprv_strlen(language)),
        lsr.script,
        static_cast<int32_t>(uprv_strlen(lsr.script)),
        lsr.region,
        static_cast<int32_t>(uprv_strlen(lsr.region)),
        variant.data(),
        variant.length(),
        trailing,
        trailingLength,
        sink,
        err);
}

void
_uloc_minimizeSubtags(const char* localeID,
                      icu::ByteSink& sink,
                      bool favorScript,
                      UErrorCode& err) {
    if (U_FAILURE(err)) {
        return;
    }

    if (localeID == nullptr) {
        err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    icu::CharString lang;
    icu::CharString script;
    icu::CharString region;
    icu::CharString variant;
    const char* trailing = nullptr;
    ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
    if (U_FAILURE(err)) {
        return;
    }

    if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
        err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));

    const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
    if (U_FAILURE(err)) {
        return;
    }
    icu::LSR lsr = likelySubtags->minimizeSubtags(
        lang.toStringPiece(),
        script.toStringPiece(),
        region.toStringPiece(),
        favorScript,
        err);
    if (U_FAILURE(err)) {
        return;
    }
    const char* language = lsr.language;
    if (uprv_strcmp(language, "und") == 0) {
        language = "";
    }
    createTagStringWithAlternates(
        language,
        static_cast<int32_t>(uprv_strlen(language)),
        lsr.script,
        static_cast<int32_t>(uprv_strlen(lsr.script)),
        lsr.region,
        static_cast<int32_t>(uprv_strlen(lsr.region)),
        variant.data(),
        variant.length(),
        trailing,
        trailingLength,
        sink,
        err);
}

}  // namespace

// C API entry point: maximizes localeID into the caller-supplied buffer by
// routing the ByteSink-based ulocimp_addLikelySubtags() through
// ByteSinkUtil::viaByteSinkToTerminatedChars(), whose result is returned.
U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char* localeID,
                      char* maximizedLocaleID,
                      int32_t maximizedLocaleIDCapacity,
                      UErrorCode* status) {
    auto writeMaximized = [&](icu::ByteSink& out, UErrorCode& errorCode) {
        ulocimp_addLikelySubtags(localeID, out, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        maximizedLocaleID, maximizedLocaleIDCapacity, writeMaximized, *status);
}

// Convenience overload: runs the ByteSink-based ulocimp_addLikelySubtags()
// through ByteSinkUtil::viaByteSinkToCharString() and returns its result.
U_EXPORT icu::CharString
ulocimp_addLikelySubtags(const char* localeID,
                         UErrorCode& status) {
    auto writeMaximized = [&](icu::ByteSink& out, UErrorCode& errorCode) {
        ulocimp_addLikelySubtags(localeID, out, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToCharString(writeMaximized, status);
}

// Maximizes localeID into sink. A null localeID is replaced by
// uloc_getDefault(); the ID is canonicalized with ulocimp_canonicalize()
// before the likely subtags are added.
U_EXPORT void
ulocimp_addLikelySubtags(const char* localeID,
                         icu::ByteSink& sink,
                         UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    const char* effectiveID = (localeID != nullptr) ? localeID : uloc_getDefault();
    icu::CharString canonicalID = ulocimp_canonicalize(effectiveID, status);
    // A canonicalization failure is left in status, which makes the helper return early.
    _uloc_addLikelySubtags(canonicalID.data(), sink, status);
}

// C API entry point: minimizes localeID into the caller-supplied buffer by
// routing the ByteSink-based ulocimp_minimizeSubtags() (with favorScript set
// to false) through ByteSinkUtil::viaByteSinkToTerminatedChars(), whose
// result is returned.
U_CAPI int32_t U_EXPORT2
uloc_minimizeSubtags(const char* localeID,
                     char* minimizedLocaleID,
                     int32_t minimizedLocaleIDCapacity,
                     UErrorCode* status) {
    auto writeMinimized = [&](icu::ByteSink& out, UErrorCode& errorCode) {
        ulocimp_minimizeSubtags(localeID, out, false, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        minimizedLocaleID, minimizedLocaleIDCapacity, writeMinimized, *status);
}

// Convenience overload: runs the ByteSink-based ulocimp_minimizeSubtags()
// through ByteSinkUtil::viaByteSinkToCharString() and returns its result.
U_EXPORT icu::CharString
ulocimp_minimizeSubtags(const char* localeID,
                        bool favorScript,
                        UErrorCode& status) {
    auto writeMinimized = [&](icu::ByteSink& out, UErrorCode& errorCode) {
        ulocimp_minimizeSubtags(localeID, out, favorScript, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToCharString(writeMinimized, status);
}

// Minimizes localeID into sink. A null localeID is replaced by
// uloc_getDefault(); the ID is canonicalized with ulocimp_canonicalize()
// before the subtags are minimized.
U_EXPORT void
ulocimp_minimizeSubtags(const char* localeID,
                        icu::ByteSink& sink,
                        bool favorScript,
                        UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    const char* effectiveID = (localeID != nullptr) ? localeID : uloc_getDefault();
    icu::CharString canonicalID = ulocimp_canonicalize(effectiveID, status);
    // A canonicalization failure is left in status, which makes the helper return early.
    _uloc_minimizeSubtags(canonicalID.data(), sink, favorScript, status);
}

// Fast-path table for common languages: every language subtag is immediately
// followed by '-' when it is written left-to-right or by '+' when it is
// written right-to-left.
static const char LANG_DIR_STRING[] =
        "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";

// Reports whether the locale's script is written right-to-left.
// A null locale means the default locale. When the locale ID carries no
// script, the language is first looked up in LANG_DIR_STRING; failing that,
// the script is taken from the maximized locale. Returns false when no
// script can be determined.
// Defined in this file because it calls ulocimp_addLikelySubtags().
U_CAPI UBool U_EXPORT2
uloc_isRightToLeft(const char *locale) {
    UErrorCode errorCode = U_ZERO_ERROR;
    icu::CharString lang;
    icu::CharString script;
    const char* effectiveLocale = (locale != nullptr) ? locale : uloc_getDefault();
    ulocimp_getSubtags(
        effectiveLocale,
        &lang, &script, nullptr, nullptr, nullptr, errorCode);

    const bool scriptKnown = U_SUCCESS(errorCode) && !script.isEmpty();
    if (!scriptKnown) {
        // Fast path: the direction of some common languages is tabulated.
        if (!lang.isEmpty()) {
            const char* hit = uprv_strstr(LANG_DIR_STRING, lang.data());
            if (hit != nullptr) {
                const char direction = hit[lang.length()];
                if (direction == '-') {
                    return false;
                }
                if (direction == '+') {
                    return true;
                }
                // Any other character: lang only matched part of a longer code.
            }
        }

        // Slow path: derive the script from the likely subtags.
        errorCode = U_ZERO_ERROR;
        icu::CharString maximized = ulocimp_addLikelySubtags(locale, errorCode);
        if (U_FAILURE(errorCode)) {
            return false;
        }
        ulocimp_getSubtags(maximized.toStringPiece(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
        if (U_FAILURE(errorCode) || script.isEmpty()) {
            return false;
        }
    }

    const int32_t scriptValue = u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
    return uscript_isRightToLeft(static_cast<UScriptCode>(scriptValue));
}

U_NAMESPACE_BEGIN

// Member-function form of uloc_isRightToLeft(), applied to this locale's
// base name.
UBool
Locale::isRightToLeft() const {
    const char* baseName = getBaseName();
    return uloc_isRightToLeft(baseName);
}

U_NAMESPACE_END

namespace {
// Extracts an upper-cased two-letter region code from the value of the given
// keyword of localeID (for example a subdivision-style value).
// Returns an empty string when the keyword lookup fails, when the value does
// not have the expected shape, or when its two-letter prefix is not marked
// valid in RegionValidateMap.
icu::CharString
GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) {
    icu::CharString region;
    icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
    if (U_FAILURE(status)) {
        return region;
    }

    // In UTS35
    //   type = alphanum{3,8} (sep alphanum{3,8})* ;
    // so the subdivision already has to fit the type.
    //
    //   unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ;
    //   unicode_region_subtag = (alpha{2} | digit{3}) ;
    //   unicode_subdivision_suffix = alphanum{1,4} ;
    // No id starting with digit{3} is listed in
    // https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml
    // so this simplifies to
    //   unicode_subdivision_id = alpha{2} alphanum{1,4}
    //
    // and the code only needs to be accepted or rejected based on its
    // alpha{2} prefix and its length (3 to 6 characters).
    const int32_t len = kw.length();
    if (len < 3 || len > 6) {
        return region;
    }
    if (!uprv_isASCIILetter(kw[0]) || !uprv_isASCIILetter(kw[1])) {
        return region;
    }

    // Additional check: the prefix must be marked valid in the bitmap.
    static icu::RegionValidateMap valid;
    const char prefix[] = {kw[0], kw[1], '\0'};
    if (valid.isSet(prefix)) {
        region.append(uprv_toupper(kw[0]), status)
              .append(uprv_toupper(kw[1]), status);
    }
    return region;
}
}  // namespace

// Determines the region to use for supplemental-data lookups.
// The sources are tried in this order, stopping at the first non-empty
// result or at the first failure recorded in status:
//   1. the "rg" keyword value (via GetRegionFromKey),
//   2. the region subtag of the locale ID (a null localeID means the
//      default locale for this step),
//   3. only if inferRegion is true: the "sd" keyword value,
//   4. only if inferRegion is true: the region of the maximized locale; a
//      failure to maximize is not reported through status.
// NOTE(review): localeID is passed to GetRegionFromKey() without the
// null-to-default substitution used in step 2 -- confirm how the keyword
// lookup treats a null localeID.
U_EXPORT icu::CharString
ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
                                     UErrorCode& status) {
    if (U_FAILURE(status)) {
        return {};
    }

    icu::CharString region = GetRegionFromKey(localeID, "rg", status);
    if (U_FAILURE(status) || !region.isEmpty()) {
        return region;
    }

    // No valid rg keyword value, try for unicode_region_subtag
    region = ulocimp_getRegion(localeID == nullptr ? uloc_getDefault() : localeID, status);
    if (U_FAILURE(status) || !region.isEmpty() || !inferRegion) {
        return region;
    }

    // Second check for sd keyword value
    region = GetRegionFromKey(localeID, "sd", status);
    if (U_FAILURE(status) || !region.isEmpty()) {
        return region;
    }

    // no unicode_region_subtag but inferRegion true, try likely subtags
    UErrorCode likelyStatus = U_ZERO_ERROR;
    icu::CharString maximized = ulocimp_addLikelySubtags(localeID, likelyStatus);
    if (U_SUCCESS(likelyStatus)) {
        region = ulocimp_getRegion(maximized.toStringPiece(), status);
    }
    return region;
}

namespace {

// Bitmap of valid two-letter region codes; it is copied into
// RegionValidateMap::map by the RegionValidateMap constructor.
// A code with letters c0 c1 has index (c0-'A')*26 + (c1-'A') (see
// RegionValidateMap::value) and is tested as bit (index % 32) of word
// (index / 32) (see RegionValidateMap::isSet). The 22 words hold
// 22*32 = 704 bits, enough for the 26*26 = 676 possible codes.
//
// The following data is generated by unit test code inside
// test/intltest/regiontst.cpp from the resource data while
// the test failed.
const uint32_t gValidRegionMap[] = {
    0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580, 
    0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f, 
    0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b, 
    0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7, 
    0x0410419a, 0x00408557, 0x00004002, 0x00100001, 
    0x00400408, 0x00000001, 
};

}  // namespace
   //
U_NAMESPACE_BEGIN
// Initializes the bitmap from the generated gValidRegionMap table.
RegionValidateMap::RegionValidateMap() {
    // The copy below is sized by the destination; make sure a regenerated
    // table can neither be over-read nor leave part of map uninitialized.
    static_assert(sizeof(map) == sizeof(gValidRegionMap),
                  "gValidRegionMap must have exactly the size of RegionValidateMap::map");
    uprv_memcpy(map, gValidRegionMap, sizeof(map));
}

// Nothing to release; defined out of line in this file.
RegionValidateMap::~RegionValidateMap() = default;

// Reports whether the given region code is marked valid in the bitmap.
// @param region NUL-terminated region code.
// @return false when value() rejects the code (anything other than exactly
//         two ASCII letters); otherwise whether the code's bit is set.
bool RegionValidateMap::isSet(const char* region) const {
    const int32_t index = value(region);
    if (index < 0) {
        return false;
    }
    // Build the mask as an unsigned 32-bit value: shifting a signed long
    // would reach the sign bit for bit 31 on platforms where long is 32 bits.
    const uint32_t mask = static_cast<uint32_t>(1) << (index % 32);
    return (map[index / 32] & mask) != 0;
}

// Reports whether both objects hold byte-for-byte identical bitmaps.
bool RegionValidateMap::equals(const RegionValidateMap& that) const {
    const auto difference = uprv_memcmp(map, that.map, sizeof(map));
    return difference == 0;
}

// Maps a region code made of exactly two ASCII letters (either case) to an
// integer index; any other input yields -1.
// For a valid code the result is (first-'A')*26 + (second-'A') computed on
// the upper-cased letters, i.e. a value from 0 to 675 (= 26x26 - 1). isSet()
// uses it as a bit index into map: since 32x21 < 676 <= 32x22, the 676-bit
// bitmap needs 22 32-bit words.
int32_t RegionValidateMap::value(const char* region) const {
    const bool isTwoLetterCode = uprv_isASCIILetter(region[0]) &&
                                 uprv_isASCIILetter(region[1]) &&
                                 region[2] == '\0';
    if (!isTwoLetterCode) {
        return -1;
    }
    const int32_t first = uprv_toupper(region[0]) - 'A';
    const int32_t second = uprv_toupper(region[1]) - 'A';
    return first * 26 + second;
}

U_NAMESPACE_END

Coverage Report

Created: 2025-06-24 06:54

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		*******************************************************************************
5		*
6		* Copyright (C) 1997-2016, International Business Machines
7		* Corporation and others. All Rights Reserved.
8		*
9		*******************************************************************************
10		* file name: loclikely.cpp
11		* encoding: UTF-8
12		* tab size: 8 (not used)
13		* indentation:4
14		*
15		* created on: 2010feb25
16		* created by: Markus W. Scherer
17		*
18		* Code for likely and minimized locale subtags, separated out from other .cpp files
19		* that then do not depend on resource bundle code and likely-subtags data.
20		*/
21
22		#include <string_view>
23		#include <utility>
24
25		#include "unicode/bytestream.h"
26		#include "unicode/utypes.h"
27		#include "unicode/locid.h"
28		#include "unicode/putil.h"
29		#include "unicode/uchar.h"
30		#include "unicode/uloc.h"
31		#include "unicode/ures.h"
32		#include "unicode/uscript.h"
33		#include "bytesinkutil.h"
34		#include "charstr.h"
35		#include "cmemory.h"
36		#include "cstring.h"
37		#include "loclikelysubtags.h"
38		#include "ulocimp.h"
39
40		namespace {
41
42		/**
43		* Create a tag string from the supplied parameters. The lang, script and region
44		* parameters may be nullptr pointers. If they are, their corresponding length parameters
45		* must be less than or equal to 0.
46		*
47		* If an illegal argument is provided, the function returns the error
48		* U_ILLEGAL_ARGUMENT_ERROR.
49		*
50		* @param lang The language tag to use.
51		* @param langLength The length of the language tag.
52		* @param script The script tag to use.
53		* @param scriptLength The length of the script tag.
54		* @param region The region tag to use.
55		* @param regionLength The length of the region tag.
56		* @param variant The region tag to use.
57		* @param variantLength The length of the region tag.
58		* @param trailing Any trailing data to append to the new tag.
59		* @param trailingLength The length of the trailing data.
60		* @param sink The output sink receiving the tag string.
61		* @param err A pointer to a UErrorCode for error reporting.
62		**/
63		void U_CALLCONV
64		createTagStringWithAlternates(
65		const char* lang,
66		int32_t langLength,
67		const char* script,
68		int32_t scriptLength,
69		const char* region,
70		int32_t regionLength,
71		const char* variant,
72		int32_t variantLength,
73		const char* trailing,
74		int32_t trailingLength,
75		icu::ByteSink& sink,
76	74.7k	UErrorCode& err) {
77	74.7k	if (U_FAILURE(err)) {
78	0	return;
79	0	}
80
81	74.7k	if (langLength >= ULOC_LANG_CAPACITY \|\|
82	74.7k	scriptLength >= ULOC_SCRIPT_CAPACITY \|\|
83	74.7k	regionLength >= ULOC_COUNTRY_CAPACITY) {
84	0	err = U_ILLEGAL_ARGUMENT_ERROR;
85	0	return;
86	0	}
87
88	74.7k	if (langLength > 0) {
89	74.7k	sink.Append(lang, langLength);
90	74.7k	}
91
92	74.7k	if (scriptLength > 0) {
93	41.6k	sink.Append("_", 1);
94	41.6k	sink.Append(script, scriptLength);
95	41.6k	}
96
97	74.7k	if (regionLength > 0) {
98	47.5k	sink.Append("_", 1);
99	47.5k	sink.Append(region, regionLength);
100	47.5k	}
101
102	74.7k	if (variantLength > 0) {
103	15.2k	if (regionLength == 0) {
104		/* extra separator is required */
105	5.03k	sink.Append("_", 1);
106	5.03k	}
107	15.2k	sink.Append("_", 1);
108	15.2k	sink.Append(variant, variantLength);
109	15.2k	}
110
111	74.7k	if (trailingLength > 0) {
112		/*
113		* Copy the trailing data into the supplied buffer.
114		*/
115	20.3k	sink.Append(trailing, trailingLength);
116	20.3k	}
117	74.7k	}
118
119	77.9k	bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
120	77.9k	int32_t count = 0;
121	385k	for (int32_t i = 0; i < variantLength; i++) {
122	309k	if (_isIDSeparator(variant[i])) {
123	139k	count = 0;
124	169k	} else if (count == 8) {
125	1.64k	return false;
126	168k	} else {
127	168k	count++;
128	168k	}
129	309k	}
130	76.3k	return true;
131	77.9k	}
132
133		void
134		_uloc_addLikelySubtags(const char* localeID,
135		icu::ByteSink& sink,
136	57.4k	UErrorCode& err) {
137	57.4k	if (U_FAILURE(err)) {
138	671	return;
139	671	}
140
141	56.7k	if (localeID == nullptr) {
142	0	err = U_ILLEGAL_ARGUMENT_ERROR;
143	0	return;
144	0	}
145
146	56.7k	icu::CharString lang;
147	56.7k	icu::CharString script;
148	56.7k	icu::CharString region;
149	56.7k	icu::CharString variant;
150	56.7k	const char* trailing = nullptr;
151	56.7k	ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
152	56.7k	if (U_FAILURE(err)) {
153	56	return;
154	56	}
155
156	56.6k	if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
157	1.24k	err = U_ILLEGAL_ARGUMENT_ERROR;
158	1.24k	return;
159	1.24k	}
160
161	55.4k	if (lang.length() == 4) {
162	1.22k	if (script.isEmpty()) {
163	1.17k	script = std::move(lang);
164	1.17k	lang.clear();
165	1.17k	} else {
166	51	err = U_ILLEGAL_ARGUMENT_ERROR;
167	51	return;
168	51	}
169	54.2k	} else if (lang.length() > 8) {
170	1.51k	err = U_ILLEGAL_ARGUMENT_ERROR;
171	1.51k	return;
172	1.51k	}
173
174	53.8k	int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
175
176	53.8k	const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
177	53.8k	if (U_FAILURE(err)) {
178	0	return;
179	0	}
180		// We need to keep l on the stack because lsr may point into internal
181		// memory of l.
182	53.8k	icu::Locale l = icu::Locale::createFromName(localeID);
183	53.8k	if (l.isBogus()) {
184	4	err = U_ILLEGAL_ARGUMENT_ERROR;
185	4	return;
186	4	}
187	53.8k	icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
188	53.8k	if (U_FAILURE(err)) {
189	0	return;
190	0	}
191	53.8k	const char* language = lsr.language;
192	53.8k	if (uprv_strcmp(language, "und") == 0) {
193	38	language = "";
194	38	}
195	53.8k	createTagStringWithAlternates(
196	53.8k	language,
197	53.8k	static_cast<int32_t>(uprv_strlen(language)),
198	53.8k	lsr.script,
199	53.8k	static_cast<int32_t>(uprv_strlen(lsr.script)),
200	53.8k	lsr.region,
201	53.8k	static_cast<int32_t>(uprv_strlen(lsr.region)),
202	53.8k	variant.data(),
203	53.8k	variant.length(),
204	53.8k	trailing,
205	53.8k	trailingLength,
206	53.8k	sink,
207	53.8k	err);
208	53.8k	}
209
210		void
211		_uloc_minimizeSubtags(const char* localeID,
212		icu::ByteSink& sink,
213		bool favorScript,
214	21.4k	UErrorCode& err) {
215	21.4k	if (U_FAILURE(err)) {
216	170	return;
217	170	}
218
219	21.3k	if (localeID == nullptr) {
220	0	err = U_ILLEGAL_ARGUMENT_ERROR;
221	0	return;
222	0	}
223
224	21.3k	icu::CharString lang;
225	21.3k	icu::CharString script;
226	21.3k	icu::CharString region;
227	21.3k	icu::CharString variant;
228	21.3k	const char* trailing = nullptr;
229	21.3k	ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
230	21.3k	if (U_FAILURE(err)) {
231	7	return;
232	7	}
233
234	21.2k	if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
235	407	err = U_ILLEGAL_ARGUMENT_ERROR;
236	407	return;
237	407	}
238
239	20.8k	int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
240
241	20.8k	const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
242	20.8k	if (U_FAILURE(err)) {
243	0	return;
244	0	}
245	20.8k	icu::LSR lsr = likelySubtags->minimizeSubtags(
246	20.8k	lang.toStringPiece(),
247	20.8k	script.toStringPiece(),
248	20.8k	region.toStringPiece(),
249	20.8k	favorScript,
250	20.8k	err);
251	20.8k	if (U_FAILURE(err)) {
252	0	return;
253	0	}
254	20.8k	const char* language = lsr.language;
255	20.8k	if (uprv_strcmp(language, "und") == 0) {
256	21	language = "";
257	21	}
258	20.8k	createTagStringWithAlternates(
259	20.8k	language,
260	20.8k	static_cast<int32_t>(uprv_strlen(language)),
261	20.8k	lsr.script,
262	20.8k	static_cast<int32_t>(uprv_strlen(lsr.script)),
263	20.8k	lsr.region,
264	20.8k	static_cast<int32_t>(uprv_strlen(lsr.region)),
265	20.8k	variant.data(),
266	20.8k	variant.length(),
267	20.8k	trailing,
268	20.8k	trailingLength,
269	20.8k	sink,
270	20.8k	err);
271	20.8k	}
272
273		} // namespace
274
275		U_CAPI int32_t U_EXPORT2
276		uloc_addLikelySubtags(const char* localeID,
277		char* maximizedLocaleID,
278		int32_t maximizedLocaleIDCapacity,
279	0	UErrorCode* status) {
280	0	return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
281	0	maximizedLocaleID, maximizedLocaleIDCapacity,
282	0	[&](icu::ByteSink& sink, UErrorCode& status) {
283	0	ulocimp_addLikelySubtags(localeID, sink, status);
284	0	},
285	0	*status);
286	0	}
287
288		U_EXPORT icu::CharString
289		ulocimp_addLikelySubtags(const char* localeID,
290	57.4k	UErrorCode& status) {
291	57.4k	return icu::ByteSinkUtil::viaByteSinkToCharString(
292	57.4k	[&](icu::ByteSink& sink, UErrorCode& status) {
293	57.4k	ulocimp_addLikelySubtags(localeID, sink, status);
294	57.4k	},
295	57.4k	status);
296	57.4k	}
297
298		U_EXPORT void
299		ulocimp_addLikelySubtags(const char* localeID,
300		icu::ByteSink& sink,
301	57.4k	UErrorCode& status) {
302	57.4k	if (U_FAILURE(status)) { return; }
303	57.4k	if (localeID == nullptr) {
304	0	localeID = uloc_getDefault();
305	0	}
306	57.4k	icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
307	57.4k	_uloc_addLikelySubtags(localeBuffer.data(), sink, status);
308	57.4k	}
309
310		U_CAPI int32_t U_EXPORT2
311		uloc_minimizeSubtags(const char* localeID,
312		char* minimizedLocaleID,
313		int32_t minimizedLocaleIDCapacity,
314	0	UErrorCode* status) {
315	0	return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
316	0	minimizedLocaleID, minimizedLocaleIDCapacity,
317	0	[&](icu::ByteSink& sink, UErrorCode& status) {
318	0	ulocimp_minimizeSubtags(localeID, sink, false, status);
319	0	},
320	0	*status);
321	0	}
322
323		U_EXPORT icu::CharString
324		ulocimp_minimizeSubtags(const char* localeID,
325		bool favorScript,
326	21.4k	UErrorCode& status) {
327	21.4k	return icu::ByteSinkUtil::viaByteSinkToCharString(
328	21.4k	[&](icu::ByteSink& sink, UErrorCode& status) {
329	21.4k	ulocimp_minimizeSubtags(localeID, sink, favorScript, status);
330	21.4k	},
331	21.4k	status);
332	21.4k	}
333
334		U_EXPORT void
335		ulocimp_minimizeSubtags(const char* localeID,
336		icu::ByteSink& sink,
337		bool favorScript,
338	21.4k	UErrorCode& status) {
339	21.4k	if (U_FAILURE(status)) { return; }
340	21.4k	if (localeID == nullptr) {
341	0	localeID = uloc_getDefault();
342	0	}
343	21.4k	icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
344	21.4k	_uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status);
345	21.4k	}
346
347		// Pairs of (language subtag, + or -) for finding out fast if common languages
348		// are LTR (minus) or RTL (plus).
349		static const char LANG_DIR_STRING[] =
350		"root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
351
352		// Implemented here because this calls ulocimp_addLikelySubtags().
353		U_CAPI UBool U_EXPORT2
354	4.35k	uloc_isRightToLeft(const char *locale) {
355	4.35k	UErrorCode errorCode = U_ZERO_ERROR;
356	4.35k	icu::CharString lang;
357	4.35k	icu::CharString script;
358	4.35k	ulocimp_getSubtags(
359	4.35k	locale == nullptr ? uloc_getDefault() : locale,
360	4.35k	&lang, &script, nullptr, nullptr, nullptr, errorCode);
361	4.35k	if (U_FAILURE(errorCode) \|\| script.isEmpty()) {
362		// Fastpath: We know the likely scripts and their writing direction
363		// for some common languages.
364	4.33k	if (!lang.isEmpty()) {
365	2.93k	const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data());
366	2.93k	if (langPtr != nullptr) {
367	81	switch (langPtr[lang.length()]) {
368	11	case '-': return false;
369	1	case '+': return true;
370	69	default: break; // partial match of a longer code
371	81	}
372	81	}
373	2.93k	}
374		// Otherwise, find the likely script.
375	4.32k	errorCode = U_ZERO_ERROR;
376	4.32k	icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
377	4.32k	if (U_FAILURE(errorCode)) {
378	749	return false;
379	749	}
380	3.57k	ulocimp_getSubtags(likely.toStringPiece(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
381	3.57k	if (U_FAILURE(errorCode) \|\| script.isEmpty()) {
382	1.48k	return false;
383	1.48k	}
384	3.57k	}
385	2.10k	UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
386	2.10k	return uscript_isRightToLeft(scriptCode);
387	4.35k	}
388
389		U_NAMESPACE_BEGIN
390
391		UBool
392	0	Locale::isRightToLeft() const {
393	0	return uloc_isRightToLeft(getBaseName());
394	0	}
395
396		U_NAMESPACE_END
397
398		namespace {
399		icu::CharString
400	509k	GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) {
401	509k	icu::CharString result;
402		// First check for keyword value
403	509k	icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
404	509k	int32_t len = kw.length();
405		// In UTS35
406		// type = alphanum{3,8} (sep alphanum{3,8})* ;
407		// so we know the subdivision must fit the type already.
408		//
409		// unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ;
410		// unicode_region_subtag = (alpha{2} \| digit{3}) ;
411		// unicode_subdivision_suffix = alphanum{1,4} ;
412		// But we also know there are no id in start with digit{3} in
413		// https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml
414		// Therefore we can simplify as
415		// unicode_subdivision_id = alpha{2} alphanum{1,4}
416		//
417		// and only need to accept/reject the code based on the alpha{2} and the length.
418	509k	if (U_SUCCESS(status) && len >= 3 && len <= 6 &&
419	509k	uprv_isASCIILetter(kw[0]) && uprv_isASCIILetter(kw[1])) {
420		// Additional Check
421	273	static icu::RegionValidateMap valid;
422	273	const char region[] = {kw[0], kw[1], '\0'};
423	273	if (valid.isSet(region)) {
424	225	result.append(uprv_toupper(kw[0]), status);
425	225	result.append(uprv_toupper(kw[1]), status);
426	225	}
427	273	}
428	509k	return result;
429	509k	}
430		} // namespace
431
432		U_EXPORT icu::CharString
433		ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
434	484k	UErrorCode& status) {
435	484k	if (U_FAILURE(status)) {
436	0	return {};
437	0	}
438	484k	icu::CharString rgBuf = GetRegionFromKey(localeID, "rg", status);
439	484k	if (U_SUCCESS(status) && rgBuf.isEmpty()) {
440		// No valid rg keyword value, try for unicode_region_subtag
441	480k	rgBuf = ulocimp_getRegion(localeID == nullptr ? uloc_getDefault() : localeID, status);
442	480k	if (U_SUCCESS(status) && rgBuf.isEmpty() && inferRegion) {
443		// Second check for sd keyword value
444	24.6k	rgBuf = GetRegionFromKey(localeID, "sd", status);
445	24.6k	if (U_SUCCESS(status) && rgBuf.isEmpty()) {
446		// no unicode_region_subtag but inferRegion true, try likely subtags
447	24.4k	UErrorCode rgStatus = U_ZERO_ERROR;
448	24.4k	icu::CharString locBuf = ulocimp_addLikelySubtags(localeID, rgStatus);
449	24.4k	if (U_SUCCESS(rgStatus)) {
450	23.1k	rgBuf = ulocimp_getRegion(locBuf.toStringPiece(), status);
451	23.1k	}
452	24.4k	}
453	24.6k	}
454	480k	}
455
456	484k	return rgBuf;
457	484k	}
458
459		namespace {
460
461		// The following data is generated by unit test code inside
462		// test/intltest/regiontst.cpp from the resource data while
463		// the test failed.
464		const uint32_t gValidRegionMap[] = {
465		0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580,
466		0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f,
467		0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b,
468		0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7,
469		0x0410419a, 0x00408557, 0x00004002, 0x00100001,
470		0x00400408, 0x00000001,
471		};
472
473		} // namespace
474		//
475		U_NAMESPACE_BEGIN
476	2	RegionValidateMap::RegionValidateMap() {
477	2	uprv_memcpy(map, gValidRegionMap, sizeof(map));
478	2	}
479
480	2	RegionValidateMap::~RegionValidateMap() {
481	2	}
482
483	273	bool RegionValidateMap::isSet(const char* region) const {
484	273	int32_t index = value(region);
485	273	if (index < 0) {
486	0	return false;
487	0	}
488	273	return 0 != (map[index / 32] & (1L << (index % 32)));
489	273	}
490
491	0	bool RegionValidateMap::equals(const RegionValidateMap& that) const {
492	0	return uprv_memcmp(map, that.map, sizeof(map)) == 0;
493	0	}
494
495		// The code transform two letter a-z to a integer valued between -1, 26x26.
496		// -1 indicate the region is outside the range of two letter a-z
497		// the rest of value is between 0 and 676 (= 26x26) and used as an index
498		// the the bigmap in map. The map is an array of 22 int32_t.
499		// since 32x21 < 676/32 < 32x22 we store this 676 bits bitmap into 22 int32_t.
500	273	int32_t RegionValidateMap::value(const char* region) const {
501	273	if (uprv_isASCIILetter(region[0]) && uprv_isASCIILetter(region[1]) &&
502	273	region[2] == '\0') {
503	273	return (uprv_toupper(region[0])-'A') * 26 +
504	273	(uprv_toupper(region[1])-'A');
505	273	}
506	0	return -1;
507	273	}
508
509		U_NAMESPACE_END