Coverage Report

Created: 2025-06-13 06:34

/src/icu/icu4c/source/common/loclikely.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 1997-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  loclikely.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2010feb25
16
*   created by: Markus W. Scherer
17
*
18
*   Code for likely and minimized locale subtags, separated out from other .cpp files
19
*   that then do not depend on resource bundle code and likely-subtags data.
20
*/
21
22
#include <string_view>
23
#include <utility>
24
25
#include "unicode/bytestream.h"
26
#include "unicode/utypes.h"
27
#include "unicode/locid.h"
28
#include "unicode/putil.h"
29
#include "unicode/uchar.h"
30
#include "unicode/uloc.h"
31
#include "unicode/ures.h"
32
#include "unicode/uscript.h"
33
#include "bytesinkutil.h"
34
#include "charstr.h"
35
#include "cmemory.h"
36
#include "cstring.h"
37
#include "loclikelysubtags.h"
38
#include "ulocimp.h"
39
40
namespace {
41
42
/**
43
 * Create a tag string from the supplied parameters.  The lang, script and region
44
 * parameters may be nullptr pointers. If they are, their corresponding length parameters
45
 * must be less than or equal to 0.
46
 *
47
 * If an illegal argument is provided, the function returns the error
48
 * U_ILLEGAL_ARGUMENT_ERROR.
49
 *
50
 * @param lang The language tag to use.
51
 * @param langLength The length of the language tag.
52
 * @param script The script tag to use.
53
 * @param scriptLength The length of the script tag.
54
 * @param region The region tag to use.
55
 * @param regionLength The length of the region tag.
56
 * @param variant The region tag to use.
57
 * @param variantLength The length of the region tag.
58
 * @param trailing Any trailing data to append to the new tag.
59
 * @param trailingLength The length of the trailing data.
60
 * @param sink The output sink receiving the tag string.
61
 * @param err A pointer to a UErrorCode for error reporting.
62
 **/
63
void U_CALLCONV
64
createTagStringWithAlternates(
65
    const char* lang,
66
    int32_t langLength,
67
    const char* script,
68
    int32_t scriptLength,
69
    const char* region,
70
    int32_t regionLength,
71
    const char* variant,
72
    int32_t variantLength,
73
    const char* trailing,
74
    int32_t trailingLength,
75
    icu::ByteSink& sink,
76
0
    UErrorCode& err) {
77
0
    if (U_FAILURE(err)) {
78
0
        return;
79
0
    }
80
81
0
    if (langLength >= ULOC_LANG_CAPACITY ||
82
0
            scriptLength >= ULOC_SCRIPT_CAPACITY ||
83
0
            regionLength >= ULOC_COUNTRY_CAPACITY) {
84
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
85
0
        return;
86
0
    }
87
88
0
    if (langLength > 0) {
89
0
        sink.Append(lang, langLength);
90
0
    }
91
92
0
    if (scriptLength > 0) {
93
0
        sink.Append("_", 1);
94
0
        sink.Append(script, scriptLength);
95
0
    }
96
97
0
    if (regionLength > 0) {
98
0
        sink.Append("_", 1);
99
0
        sink.Append(region, regionLength);
100
0
    }
101
102
0
    if (variantLength > 0) {
103
0
        if (regionLength == 0) {
104
            /* extra separator is required */
105
0
            sink.Append("_", 1);
106
0
        }
107
0
        sink.Append("_", 1);
108
0
        sink.Append(variant, variantLength);
109
0
    }
110
111
0
    if (trailingLength > 0) {
112
        /*
113
         * Copy the trailing data into the supplied buffer.
114
         */
115
0
        sink.Append(trailing, trailingLength);
116
0
    }
117
0
}
118
119
0
bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
120
0
    int32_t count = 0;
121
0
    for (int32_t i = 0; i < variantLength; i++) {
122
0
        if (_isIDSeparator(variant[i])) {
123
0
            count = 0;
124
0
        } else if (count == 8) {
125
0
            return false;
126
0
        } else {
127
0
            count++;
128
0
        }
129
0
    }
130
0
    return true;
131
0
}
132
133
void
134
_uloc_addLikelySubtags(const char* localeID,
135
                       icu::ByteSink& sink,
136
0
                       UErrorCode& err) {
137
0
    if (U_FAILURE(err)) {
138
0
        return;
139
0
    }
140
141
0
    if (localeID == nullptr) {
142
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
143
0
        return;
144
0
    }
145
146
0
    icu::CharString lang;
147
0
    icu::CharString script;
148
0
    icu::CharString region;
149
0
    icu::CharString variant;
150
0
    const char* trailing = nullptr;
151
0
    ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
152
0
    if (U_FAILURE(err)) {
153
0
        return;
154
0
    }
155
156
0
    if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
157
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
158
0
        return;
159
0
    }
160
161
0
    if (lang.length() == 4) {
162
0
        if (script.isEmpty()) {
163
0
            script = std::move(lang);
164
0
            lang.clear();
165
0
        } else {
166
0
            err = U_ILLEGAL_ARGUMENT_ERROR;
167
0
            return;
168
0
        }
169
0
    } else if (lang.length() > 8) {
170
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
171
0
        return;
172
0
    }
173
174
0
    int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
175
176
0
    const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
177
0
    if (U_FAILURE(err)) {
178
0
        return;
179
0
    }
180
    // We need to keep l on the stack because lsr may point into internal
181
    // memory of l.
182
0
    icu::Locale l = icu::Locale::createFromName(localeID);
183
0
    if (l.isBogus()) {
184
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
185
0
        return;
186
0
    }
187
0
    icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
188
0
    if (U_FAILURE(err)) {
189
0
        return;
190
0
    }
191
0
    const char* language = lsr.language;
192
0
    if (uprv_strcmp(language, "und") == 0) {
193
0
        language = "";
194
0
    }
195
0
    createTagStringWithAlternates(
196
0
        language,
197
0
        static_cast<int32_t>(uprv_strlen(language)),
198
0
        lsr.script,
199
0
        static_cast<int32_t>(uprv_strlen(lsr.script)),
200
0
        lsr.region,
201
0
        static_cast<int32_t>(uprv_strlen(lsr.region)),
202
0
        variant.data(),
203
0
        variant.length(),
204
0
        trailing,
205
0
        trailingLength,
206
0
        sink,
207
0
        err);
208
0
}
209
210
void
211
_uloc_minimizeSubtags(const char* localeID,
212
                      icu::ByteSink& sink,
213
                      bool favorScript,
214
0
                      UErrorCode& err) {
215
0
    if (U_FAILURE(err)) {
216
0
        return;
217
0
    }
218
219
0
    if (localeID == nullptr) {
220
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
221
0
        return;
222
0
    }
223
224
0
    icu::CharString lang;
225
0
    icu::CharString script;
226
0
    icu::CharString region;
227
0
    icu::CharString variant;
228
0
    const char* trailing = nullptr;
229
0
    ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
230
0
    if (U_FAILURE(err)) {
231
0
        return;
232
0
    }
233
234
0
    if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
235
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
236
0
        return;
237
0
    }
238
239
0
    int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
240
241
0
    const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
242
0
    if (U_FAILURE(err)) {
243
0
        return;
244
0
    }
245
0
    icu::LSR lsr = likelySubtags->minimizeSubtags(
246
0
        lang.toStringPiece(),
247
0
        script.toStringPiece(),
248
0
        region.toStringPiece(),
249
0
        favorScript,
250
0
        err);
251
0
    if (U_FAILURE(err)) {
252
0
        return;
253
0
    }
254
0
    const char* language = lsr.language;
255
0
    if (uprv_strcmp(language, "und") == 0) {
256
0
        language = "";
257
0
    }
258
0
    createTagStringWithAlternates(
259
0
        language,
260
0
        static_cast<int32_t>(uprv_strlen(language)),
261
0
        lsr.script,
262
0
        static_cast<int32_t>(uprv_strlen(lsr.script)),
263
0
        lsr.region,
264
0
        static_cast<int32_t>(uprv_strlen(lsr.region)),
265
0
        variant.data(),
266
0
        variant.length(),
267
0
        trailing,
268
0
        trailingLength,
269
0
        sink,
270
0
        err);
271
0
}
272
273
}  // namespace
274
275
/**
 * C API wrapper: runs ulocimp_addLikelySubtags() for localeID and delivers the
 * output through ByteSinkUtil::viaByteSinkToTerminatedChars() into the
 * caller-supplied buffer.
 *
 * @param localeID The locale ID to maximize.
 * @param maximizedLocaleID The caller's output buffer.
 * @param maximizedLocaleIDCapacity The capacity of the output buffer.
 * @param status Pointer to the error code; it is dereferenced unconditionally.
 * @return The value returned by viaByteSinkToTerminatedChars().
 */
U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char* localeID,
                      char* maximizedLocaleID,
                      int32_t maximizedLocaleIDCapacity,
                      UErrorCode* status) {
    UErrorCode& outStatus = *status;
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        maximizedLocaleID,
        maximizedLocaleIDCapacity,
        [localeID](icu::ByteSink& sink, UErrorCode& errorCode) {
            ulocimp_addLikelySubtags(localeID, sink, errorCode);
        },
        outStatus);
}
287
288
/**
 * Convenience overload: runs the ByteSink overload of
 * ulocimp_addLikelySubtags() and collects its output into a CharString via
 * ByteSinkUtil::viaByteSinkToCharString().
 *
 * @param localeID The locale ID to maximize.
 * @param status The error code, passed through to the helper.
 * @return The CharString produced by viaByteSinkToCharString().
 */
U_EXPORT icu::CharString
ulocimp_addLikelySubtags(const char* localeID,
                         UErrorCode& status) {
    return icu::ByteSinkUtil::viaByteSinkToCharString(
        [localeID](icu::ByteSink& sink, UErrorCode& errorCode) {
            ulocimp_addLikelySubtags(localeID, sink, errorCode);
        },
        status);
}
297
298
/**
 * Canonicalizes localeID with ulocimp_canonicalize() and writes the result of
 * _uloc_addLikelySubtags() for the canonical ID to the sink.
 *
 * @param localeID The locale ID to maximize; nullptr selects uloc_getDefault().
 * @param sink The output sink receiving the resulting tag string.
 * @param status The error code; nothing is done if it already indicates failure.
 */
U_EXPORT void
ulocimp_addLikelySubtags(const char* localeID,
                         icu::ByteSink& sink,
                         UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    const char* const source = (localeID != nullptr) ? localeID : uloc_getDefault();
    icu::CharString canonical = ulocimp_canonicalize(source, status);
    _uloc_addLikelySubtags(canonical.data(), sink, status);
}
309
310
/**
 * C API wrapper: runs ulocimp_minimizeSubtags() for localeID with favorScript
 * set to false and delivers the output through
 * ByteSinkUtil::viaByteSinkToTerminatedChars() into the caller-supplied buffer.
 *
 * @param localeID The locale ID to minimize.
 * @param minimizedLocaleID The caller's output buffer.
 * @param minimizedLocaleIDCapacity The capacity of the output buffer.
 * @param status Pointer to the error code; it is dereferenced unconditionally.
 * @return The value returned by viaByteSinkToTerminatedChars().
 */
U_CAPI int32_t U_EXPORT2
uloc_minimizeSubtags(const char* localeID,
                     char* minimizedLocaleID,
                     int32_t minimizedLocaleIDCapacity,
                     UErrorCode* status) {
    UErrorCode& outStatus = *status;
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        minimizedLocaleID,
        minimizedLocaleIDCapacity,
        [localeID](icu::ByteSink& sink, UErrorCode& errorCode) {
            ulocimp_minimizeSubtags(localeID, sink, false, errorCode);
        },
        outStatus);
}
322
323
/**
 * Convenience overload: runs the ByteSink overload of
 * ulocimp_minimizeSubtags() and collects its output into a CharString via
 * ByteSinkUtil::viaByteSinkToCharString().
 *
 * @param localeID The locale ID to minimize.
 * @param favorScript Passed through to the ByteSink overload.
 * @param status The error code, passed through to the helper.
 * @return The CharString produced by viaByteSinkToCharString().
 */
U_EXPORT icu::CharString
ulocimp_minimizeSubtags(const char* localeID,
                        bool favorScript,
                        UErrorCode& status) {
    return icu::ByteSinkUtil::viaByteSinkToCharString(
        [localeID, favorScript](icu::ByteSink& sink, UErrorCode& errorCode) {
            ulocimp_minimizeSubtags(localeID, sink, favorScript, errorCode);
        },
        status);
}
333
334
/**
 * Canonicalizes localeID with ulocimp_canonicalize() and writes the result of
 * _uloc_minimizeSubtags() for the canonical ID to the sink.
 *
 * @param localeID The locale ID to minimize; nullptr selects uloc_getDefault().
 * @param sink The output sink receiving the resulting tag string.
 * @param favorScript Passed through to _uloc_minimizeSubtags().
 * @param status The error code; nothing is done if it already indicates failure.
 */
U_EXPORT void
ulocimp_minimizeSubtags(const char* localeID,
                        icu::ByteSink& sink,
                        bool favorScript,
                        UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    const char* const source = (localeID != nullptr) ? localeID : uloc_getDefault();
    icu::CharString canonical = ulocimp_canonicalize(source, status);
    _uloc_minimizeSubtags(canonical.data(), sink, favorScript, status);
}
346
347
// Fast lookup table of common languages and their writing direction.
// Each entry is a language subtag immediately followed by one marker
// character: '-' means left-to-right, '+' means right-to-left.
static const char LANG_DIR_STRING[] =
        "root-"
        "en-es-pt-zh-ja-ko-de-fr-it-"
        "ar+he+fa+"
        "ru-nl-pl-th-tr-";
351
352
// Implemented here because this calls ulocimp_addLikelySubtags().
353
U_CAPI UBool U_EXPORT2
354
0
uloc_isRightToLeft(const char *locale) {
355
0
    UErrorCode errorCode = U_ZERO_ERROR;
356
0
    icu::CharString lang;
357
0
    icu::CharString script;
358
0
    ulocimp_getSubtags(
359
0
        locale == nullptr ? uloc_getDefault() : locale,
360
0
        &lang, &script, nullptr, nullptr, nullptr, errorCode);
361
0
    if (U_FAILURE(errorCode) || script.isEmpty()) {
362
        // Fastpath: We know the likely scripts and their writing direction
363
        // for some common languages.
364
0
        if (!lang.isEmpty()) {
365
0
            const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data());
366
0
            if (langPtr != nullptr) {
367
0
                switch (langPtr[lang.length()]) {
368
0
                case '-': return false;
369
0
                case '+': return true;
370
0
                default: break;  // partial match of a longer code
371
0
                }
372
0
            }
373
0
        }
374
        // Otherwise, find the likely script.
375
0
        errorCode = U_ZERO_ERROR;
376
0
        icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
377
0
        if (U_FAILURE(errorCode)) {
378
0
            return false;
379
0
        }
380
0
        ulocimp_getSubtags(likely.toStringPiece(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
381
0
        if (U_FAILURE(errorCode) || script.isEmpty()) {
382
0
            return false;
383
0
        }
384
0
    }
385
0
    UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
386
0
    return uscript_isRightToLeft(scriptCode);
387
0
}
388
389
U_NAMESPACE_BEGIN
390
391
/**
 * Member-function form of uloc_isRightToLeft(), applied to this locale's
 * base name.
 */
UBool
Locale::isRightToLeft() const {
    const char* const baseName = getBaseName();
    return uloc_isRightToLeft(baseName);
}
395
396
U_NAMESPACE_END
397
398
namespace {
399
/**
 * Extracts a two-letter region code from the value of a locale keyword.
 *
 * In UTS35
 *   type = alphanum{3,8} (sep alphanum{3,8})* ;
 * so the subdivision must already fit the type.
 *
 *   unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ;
 *   unicode_region_subtag = (alpha{2} | digit{3}) ;
 *   unicode_subdivision_suffix = alphanum{1,4} ;
 *
 * No ID in
 * https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml
 * starts with digit{3}, so this simplifies to
 *   unicode_subdivision_id = alpha{2} alphanum{1,4}
 * and the value only needs to be accepted or rejected based on its first two
 * letters and its length.
 *
 * @param localeID The locale ID whose keyword is read.
 * @param key The keyword name to look up.
 * @param status The error code, passed to the helpers that are called.
 * @return The upper-cased first two letters of the keyword value if the value
 *         is 3 to 6 characters long, starts with two ASCII letters, and those
 *         letters are set in the RegionValidateMap; otherwise an empty string.
 */
icu::CharString
GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) {
    icu::CharString region;

    // First check for keyword value
    icu::CharString value = ulocimp_getKeywordValue(localeID, key, status);
    if (U_FAILURE(status)) {
        return region;
    }

    const int32_t valueLength = value.length();
    if (valueLength < 3 || valueLength > 6) {
        return region;
    }
    if (!uprv_isASCIILetter(value[0]) || !uprv_isASCIILetter(value[1])) {
        return region;
    }

    // Additional check: the two letters must be set in the validity map.
    static icu::RegionValidateMap validRegions;
    const char candidate[] = {value[0], value[1], '\0'};
    if (validRegions.isSet(candidate)) {
        region.append(uprv_toupper(value[0]), status);
        region.append(uprv_toupper(value[1]), status);
    }
    return region;
}
430
}  // namespace
431
432
/**
 * Determines the region to use for a locale, trying these sources in order
 * and stopping at the first non-empty result or the first failure:
 *   1. the "rg" keyword value,
 *   2. the region subtag of the locale ID,
 *   3. (only if inferRegion) the "sd" keyword value,
 *   4. (only if inferRegion) the region of the likely-subtags expansion.
 *
 * @param localeID The locale ID; nullptr selects uloc_getDefault() for the
 *                 region-subtag lookup.
 * @param inferRegion Whether steps 3 and 4 may be used.
 * @param status The error code; an empty string is returned immediately if it
 *               already indicates failure. A failure of the likely-subtags
 *               expansion itself is not reported here.
 * @return The region found, possibly empty.
 */
U_EXPORT icu::CharString
ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
                                     UErrorCode& status) {
    if (U_FAILURE(status)) {
        return {};
    }

    icu::CharString region = GetRegionFromKey(localeID, "rg", status);
    if (U_FAILURE(status) || !region.isEmpty()) {
        return region;
    }

    // No valid rg keyword value, try for unicode_region_subtag
    region = ulocimp_getRegion(localeID == nullptr ? uloc_getDefault() : localeID, status);
    if (U_FAILURE(status) || !region.isEmpty() || !inferRegion) {
        return region;
    }

    // Second check for sd keyword value
    region = GetRegionFromKey(localeID, "sd", status);
    if (U_FAILURE(status) || !region.isEmpty()) {
        return region;
    }

    // no unicode_region_subtag but inferRegion true, try likely subtags;
    // a failure here is kept in a separate error code.
    UErrorCode likelyStatus = U_ZERO_ERROR;
    icu::CharString maximized = ulocimp_addLikelySubtags(localeID, likelyStatus);
    if (U_SUCCESS(likelyStatus)) {
        region = ulocimp_getRegion(maximized.toStringPiece(), status);
    }
    return region;
}
458
459
namespace {
460
461
// Bitmap of two-letter region codes, stored as 22 32-bit words.
// The following data is generated by unit test code inside
// test/intltest/regiontst.cpp from the resource data while
// the test failed.
const uint32_t gValidRegionMap[] = {
    0xeedf597c, 0xdeddbdef,
    0x15943f3f, 0x0e00d580,
    0xb0095c00, 0x0015fb9f,
    0x781c068d, 0x0340400f,
    0xf42b1d00, 0xfd4f8141,
    0x25d7fffc, 0x0100084b,
    0x538f3c40, 0x40000001,
    0xfdf15100, 0x9fbb7ae7,
    0x0410419a, 0x00408557,
    0x00004002, 0x00100001,
    0x00400408, 0x00000001,
};
472
473
}  // namespace
474
   //
475
U_NAMESPACE_BEGIN
476
0
/**
 * Initializes the bitmap with a copy of the built-in gValidRegionMap table.
 */
RegionValidateMap::RegionValidateMap() {
    // The copy below reads sizeof(map) bytes from the table, so the table
    // must be exactly as large as the member array.
    static_assert(sizeof(map) == sizeof(gValidRegionMap),
                  "gValidRegionMap must have the same size as RegionValidateMap::map");
    uprv_memcpy(map, gValidRegionMap, sizeof(map));
}
479
480
0
/** Destructor; no explicit cleanup is performed. */
RegionValidateMap::~RegionValidateMap() = default;
482
483
0
bool RegionValidateMap::isSet(const char* region) const {
484
0
    int32_t index = value(region);
485
0
    if (index < 0) {
486
0
        return false;
487
0
    }
488
0
    return 0 != (map[index / 32] & (1L << (index % 32)));
489
0
}
490
491
0
bool RegionValidateMap::equals(const RegionValidateMap& that) const {
492
0
    return uprv_memcmp(map, that.map, sizeof(map)) == 0;
493
0
}
494
495
// The code transform two letter a-z to a integer valued between -1, 26x26.
496
// -1 indicate the region is outside the range of two letter a-z
497
// the rest of value is between 0 and 676 (= 26x26) and used as an index
498
// the the bigmap in map. The map is an array of 22 int32_t.
499
// since 32x21 < 676/32 < 32x22 we store this 676 bits bitmap into 22 int32_t.
500
0
int32_t RegionValidateMap::value(const char* region) const {
501
0
    if (uprv_isASCIILetter(region[0]) && uprv_isASCIILetter(region[1]) &&
502
0
        region[2] == '\0') {
503
0
        return (uprv_toupper(region[0])-'A') * 26 +
504
0
               (uprv_toupper(region[1])-'A');
505
0
    }
506
0
    return -1;
507
0
}
508
509
U_NAMESPACE_END