Coverage Report

Created: 2025-06-24 06:54

/src/icu/icu4c/source/common/loclikely.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 1997-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  loclikely.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2010feb25
16
*   created by: Markus W. Scherer
17
*
18
*   Code for likely and minimized locale subtags, separated out from other .cpp files
19
*   that then do not depend on resource bundle code and likely-subtags data.
20
*/
21
22
#include <string_view>
23
#include <utility>
24
25
#include "unicode/bytestream.h"
26
#include "unicode/utypes.h"
27
#include "unicode/locid.h"
28
#include "unicode/putil.h"
29
#include "unicode/uchar.h"
30
#include "unicode/uloc.h"
31
#include "unicode/ures.h"
32
#include "unicode/uscript.h"
33
#include "bytesinkutil.h"
34
#include "charstr.h"
35
#include "cmemory.h"
36
#include "cstring.h"
37
#include "loclikelysubtags.h"
38
#include "ulocimp.h"
39
40
namespace {
41
42
/**
43
 * Create a tag string from the supplied parameters.  The lang, script and region
44
 * parameters may be nullptr pointers. If they are, their corresponding length parameters
45
 * must be less than or equal to 0.
46
 *
47
 * If an illegal argument is provided, the function returns the error
48
 * U_ILLEGAL_ARGUMENT_ERROR.
49
 *
50
 * @param lang The language tag to use.
51
 * @param langLength The length of the language tag.
52
 * @param script The script tag to use.
53
 * @param scriptLength The length of the script tag.
54
 * @param region The region tag to use.
55
 * @param regionLength The length of the region tag.
56
 * @param variant The region tag to use.
57
 * @param variantLength The length of the region tag.
58
 * @param trailing Any trailing data to append to the new tag.
59
 * @param trailingLength The length of the trailing data.
60
 * @param sink The output sink receiving the tag string.
61
 * @param err A pointer to a UErrorCode for error reporting.
62
 **/
63
void U_CALLCONV
64
createTagStringWithAlternates(
65
    const char* lang,
66
    int32_t langLength,
67
    const char* script,
68
    int32_t scriptLength,
69
    const char* region,
70
    int32_t regionLength,
71
    const char* variant,
72
    int32_t variantLength,
73
    const char* trailing,
74
    int32_t trailingLength,
75
    icu::ByteSink& sink,
76
74.7k
    UErrorCode& err) {
77
74.7k
    if (U_FAILURE(err)) {
78
0
        return;
79
0
    }
80
81
74.7k
    if (langLength >= ULOC_LANG_CAPACITY ||
82
74.7k
            scriptLength >= ULOC_SCRIPT_CAPACITY ||
83
74.7k
            regionLength >= ULOC_COUNTRY_CAPACITY) {
84
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
85
0
        return;
86
0
    }
87
88
74.7k
    if (langLength > 0) {
89
74.7k
        sink.Append(lang, langLength);
90
74.7k
    }
91
92
74.7k
    if (scriptLength > 0) {
93
41.6k
        sink.Append("_", 1);
94
41.6k
        sink.Append(script, scriptLength);
95
41.6k
    }
96
97
74.7k
    if (regionLength > 0) {
98
47.5k
        sink.Append("_", 1);
99
47.5k
        sink.Append(region, regionLength);
100
47.5k
    }
101
102
74.7k
    if (variantLength > 0) {
103
15.2k
        if (regionLength == 0) {
104
            /* extra separator is required */
105
5.03k
            sink.Append("_", 1);
106
5.03k
        }
107
15.2k
        sink.Append("_", 1);
108
15.2k
        sink.Append(variant, variantLength);
109
15.2k
    }
110
111
74.7k
    if (trailingLength > 0) {
112
        /*
113
         * Copy the trailing data into the supplied buffer.
114
         */
115
20.3k
        sink.Append(trailing, trailingLength);
116
20.3k
    }
117
74.7k
}
118
119
77.9k
bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
120
77.9k
    int32_t count = 0;
121
385k
    for (int32_t i = 0; i < variantLength; i++) {
122
309k
        if (_isIDSeparator(variant[i])) {
123
139k
            count = 0;
124
169k
        } else if (count == 8) {
125
1.64k
            return false;
126
168k
        } else {
127
168k
            count++;
128
168k
        }
129
309k
    }
130
76.3k
    return true;
131
77.9k
}
132
133
void
134
_uloc_addLikelySubtags(const char* localeID,
135
                       icu::ByteSink& sink,
136
57.4k
                       UErrorCode& err) {
137
57.4k
    if (U_FAILURE(err)) {
138
671
        return;
139
671
    }
140
141
56.7k
    if (localeID == nullptr) {
142
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
143
0
        return;
144
0
    }
145
146
56.7k
    icu::CharString lang;
147
56.7k
    icu::CharString script;
148
56.7k
    icu::CharString region;
149
56.7k
    icu::CharString variant;
150
56.7k
    const char* trailing = nullptr;
151
56.7k
    ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
152
56.7k
    if (U_FAILURE(err)) {
153
56
        return;
154
56
    }
155
156
56.6k
    if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
157
1.24k
        err = U_ILLEGAL_ARGUMENT_ERROR;
158
1.24k
        return;
159
1.24k
    }
160
161
55.4k
    if (lang.length() == 4) {
162
1.22k
        if (script.isEmpty()) {
163
1.17k
            script = std::move(lang);
164
1.17k
            lang.clear();
165
1.17k
        } else {
166
51
            err = U_ILLEGAL_ARGUMENT_ERROR;
167
51
            return;
168
51
        }
169
54.2k
    } else if (lang.length() > 8) {
170
1.51k
        err = U_ILLEGAL_ARGUMENT_ERROR;
171
1.51k
        return;
172
1.51k
    }
173
174
53.8k
    int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
175
176
53.8k
    const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
177
53.8k
    if (U_FAILURE(err)) {
178
0
        return;
179
0
    }
180
    // We need to keep l on the stack because lsr may point into internal
181
    // memory of l.
182
53.8k
    icu::Locale l = icu::Locale::createFromName(localeID);
183
53.8k
    if (l.isBogus()) {
184
4
        err = U_ILLEGAL_ARGUMENT_ERROR;
185
4
        return;
186
4
    }
187
53.8k
    icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
188
53.8k
    if (U_FAILURE(err)) {
189
0
        return;
190
0
    }
191
53.8k
    const char* language = lsr.language;
192
53.8k
    if (uprv_strcmp(language, "und") == 0) {
193
38
        language = "";
194
38
    }
195
53.8k
    createTagStringWithAlternates(
196
53.8k
        language,
197
53.8k
        static_cast<int32_t>(uprv_strlen(language)),
198
53.8k
        lsr.script,
199
53.8k
        static_cast<int32_t>(uprv_strlen(lsr.script)),
200
53.8k
        lsr.region,
201
53.8k
        static_cast<int32_t>(uprv_strlen(lsr.region)),
202
53.8k
        variant.data(),
203
53.8k
        variant.length(),
204
53.8k
        trailing,
205
53.8k
        trailingLength,
206
53.8k
        sink,
207
53.8k
        err);
208
53.8k
}
209
210
void
211
_uloc_minimizeSubtags(const char* localeID,
212
                      icu::ByteSink& sink,
213
                      bool favorScript,
214
21.4k
                      UErrorCode& err) {
215
21.4k
    if (U_FAILURE(err)) {
216
170
        return;
217
170
    }
218
219
21.3k
    if (localeID == nullptr) {
220
0
        err = U_ILLEGAL_ARGUMENT_ERROR;
221
0
        return;
222
0
    }
223
224
21.3k
    icu::CharString lang;
225
21.3k
    icu::CharString script;
226
21.3k
    icu::CharString region;
227
21.3k
    icu::CharString variant;
228
21.3k
    const char* trailing = nullptr;
229
21.3k
    ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
230
21.3k
    if (U_FAILURE(err)) {
231
7
        return;
232
7
    }
233
234
21.2k
    if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
235
407
        err = U_ILLEGAL_ARGUMENT_ERROR;
236
407
        return;
237
407
    }
238
239
20.8k
    int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
240
241
20.8k
    const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
242
20.8k
    if (U_FAILURE(err)) {
243
0
        return;
244
0
    }
245
20.8k
    icu::LSR lsr = likelySubtags->minimizeSubtags(
246
20.8k
        lang.toStringPiece(),
247
20.8k
        script.toStringPiece(),
248
20.8k
        region.toStringPiece(),
249
20.8k
        favorScript,
250
20.8k
        err);
251
20.8k
    if (U_FAILURE(err)) {
252
0
        return;
253
0
    }
254
20.8k
    const char* language = lsr.language;
255
20.8k
    if (uprv_strcmp(language, "und") == 0) {
256
21
        language = "";
257
21
    }
258
20.8k
    createTagStringWithAlternates(
259
20.8k
        language,
260
20.8k
        static_cast<int32_t>(uprv_strlen(language)),
261
20.8k
        lsr.script,
262
20.8k
        static_cast<int32_t>(uprv_strlen(lsr.script)),
263
20.8k
        lsr.region,
264
20.8k
        static_cast<int32_t>(uprv_strlen(lsr.region)),
265
20.8k
        variant.data(),
266
20.8k
        variant.length(),
267
20.8k
        trailing,
268
20.8k
        trailingLength,
269
20.8k
        sink,
270
20.8k
        err);
271
20.8k
}
272
273
}  // namespace
274
275
// C API entry point: runs ulocimp_addLikelySubtags() on localeID and returns
// the result of ByteSinkUtil::viaByteSinkToTerminatedChars() for the
// caller-supplied buffer maximizedLocaleID of maximizedLocaleIDCapacity chars.
U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char* localeID,
                      char* maximizedLocaleID,
                      int32_t maximizedLocaleIDCapacity,
                      UErrorCode* status) {
    auto writeMaximized = [&](icu::ByteSink& out, UErrorCode& errorCode) {
        ulocimp_addLikelySubtags(localeID, out, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        maximizedLocaleID, maximizedLocaleIDCapacity,
        writeMaximized,
        *status);
}
287
288
// Convenience overload: collects the output of the ByteSink overload of
// ulocimp_addLikelySubtags() into a CharString.
U_EXPORT icu::CharString
ulocimp_addLikelySubtags(const char* localeID,
                         UErrorCode& status) {
    auto writeMaximized = [&](icu::ByteSink& out, UErrorCode& errorCode) {
        ulocimp_addLikelySubtags(localeID, out, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToCharString(writeMaximized, status);
}
297
298
// Canonicalizes localeID (the default locale when localeID is nullptr) and
// writes its maximized form to sink. Does nothing if status already holds a
// failure on entry.
U_EXPORT void
ulocimp_addLikelySubtags(const char* localeID,
                         icu::ByteSink& sink,
                         UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    const char* const source = (localeID != nullptr) ? localeID : uloc_getDefault();
    icu::CharString canonical = ulocimp_canonicalize(source, status);
    _uloc_addLikelySubtags(canonical.data(), sink, status);
}
309
310
// C API entry point: runs ulocimp_minimizeSubtags() on localeID with
// favorScript == false and returns the result of
// ByteSinkUtil::viaByteSinkToTerminatedChars() for the caller-supplied buffer
// minimizedLocaleID of minimizedLocaleIDCapacity chars.
U_CAPI int32_t U_EXPORT2
uloc_minimizeSubtags(const char* localeID,
                     char* minimizedLocaleID,
                     int32_t minimizedLocaleIDCapacity,
                     UErrorCode* status) {
    auto writeMinimized = [&](icu::ByteSink& out, UErrorCode& errorCode) {
        ulocimp_minimizeSubtags(localeID, out, false, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        minimizedLocaleID, minimizedLocaleIDCapacity,
        writeMinimized,
        *status);
}
322
323
// Convenience overload: collects the output of the ByteSink overload of
// ulocimp_minimizeSubtags() into a CharString.
U_EXPORT icu::CharString
ulocimp_minimizeSubtags(const char* localeID,
                        bool favorScript,
                        UErrorCode& status) {
    auto writeMinimized = [&](icu::ByteSink& out, UErrorCode& errorCode) {
        ulocimp_minimizeSubtags(localeID, out, favorScript, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToCharString(writeMinimized, status);
}
333
334
// Canonicalizes localeID (the default locale when localeID is nullptr) and
// writes its minimized form to sink. Does nothing if status already holds a
// failure on entry.
U_EXPORT void
ulocimp_minimizeSubtags(const char* localeID,
                        icu::ByteSink& sink,
                        bool favorScript,
                        UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
    const char* const source = (localeID != nullptr) ? localeID : uloc_getDefault();
    icu::CharString canonical = ulocimp_canonicalize(source, status);
    _uloc_minimizeSubtags(canonical.data(), sink, favorScript, status);
}
346
347
// Pairs of (language subtag, + or -) for finding out fast if common languages
348
// are LTR (minus) or RTL (plus).
349
static const char LANG_DIR_STRING[] =
350
        "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
351
352
// Implemented here because this calls ulocimp_addLikelySubtags().
353
U_CAPI UBool U_EXPORT2
354
4.35k
uloc_isRightToLeft(const char *locale) {
355
4.35k
    UErrorCode errorCode = U_ZERO_ERROR;
356
4.35k
    icu::CharString lang;
357
4.35k
    icu::CharString script;
358
4.35k
    ulocimp_getSubtags(
359
4.35k
        locale == nullptr ? uloc_getDefault() : locale,
360
4.35k
        &lang, &script, nullptr, nullptr, nullptr, errorCode);
361
4.35k
    if (U_FAILURE(errorCode) || script.isEmpty()) {
362
        // Fastpath: We know the likely scripts and their writing direction
363
        // for some common languages.
364
4.33k
        if (!lang.isEmpty()) {
365
2.93k
            const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data());
366
2.93k
            if (langPtr != nullptr) {
367
81
                switch (langPtr[lang.length()]) {
368
11
                case '-': return false;
369
1
                case '+': return true;
370
69
                default: break;  // partial match of a longer code
371
81
                }
372
81
            }
373
2.93k
        }
374
        // Otherwise, find the likely script.
375
4.32k
        errorCode = U_ZERO_ERROR;
376
4.32k
        icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
377
4.32k
        if (U_FAILURE(errorCode)) {
378
749
            return false;
379
749
        }
380
3.57k
        ulocimp_getSubtags(likely.toStringPiece(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
381
3.57k
        if (U_FAILURE(errorCode) || script.isEmpty()) {
382
1.48k
            return false;
383
1.48k
        }
384
3.57k
    }
385
2.10k
    UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
386
2.10k
    return uscript_isRightToLeft(scriptCode);
387
4.35k
}
388
389
U_NAMESPACE_BEGIN
390
391
// Delegates to uloc_isRightToLeft() using this locale's base name.
UBool
Locale::isRightToLeft() const {
    const char* const baseName = getBaseName();
    return uloc_isRightToLeft(baseName);
}
395
396
U_NAMESPACE_END
397
398
namespace {
399
icu::CharString
400
509k
GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) {
401
509k
    icu::CharString result;
402
    // First check for keyword value
403
509k
    icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
404
509k
    int32_t len = kw.length();
405
    // In UTS35
406
    //   type = alphanum{3,8} (sep alphanum{3,8})* ;
407
    // so we know the subdivision must fit the type already.
408
    //
409
    //   unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ;
410
    //   unicode_region_subtag = (alpha{2} | digit{3}) ;
411
    //   unicode_subdivision_suffix = alphanum{1,4} ;
412
    // But we also know there are no id in start with digit{3} in
413
    // https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml
414
    // Therefore we can simplify as
415
    // unicode_subdivision_id = alpha{2} alphanum{1,4}
416
    //
417
    // and only need to accept/reject the code based on the alpha{2} and the length.
418
509k
    if (U_SUCCESS(status) && len >= 3 && len <= 6 &&
419
509k
        uprv_isASCIILetter(kw[0]) && uprv_isASCIILetter(kw[1])) {
420
        // Additional Check
421
273
        static icu::RegionValidateMap valid;
422
273
        const char region[] = {kw[0], kw[1], '\0'};
423
273
        if (valid.isSet(region)) {
424
225
            result.append(uprv_toupper(kw[0]), status);
425
225
            result.append(uprv_toupper(kw[1]), status);
426
225
        }
427
273
    }
428
509k
    return result;
429
509k
}
430
}  // namespace
431
432
// Determines the region to use for localeID, trying in order:
//   1. the "rg" keyword value,
//   2. the locale's own region subtag (of the default locale when localeID
//      is nullptr),
//   3. if inferRegion is true: the "sd" keyword value, then the region of the
//      likely-subtags expansion of localeID.
// A failure of the likely-subtags expansion is not propagated to status; the
// result is simply left empty in that case.
U_EXPORT icu::CharString
ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
                                     UErrorCode& status) {
    if (U_FAILURE(status)) {
        return {};
    }

    icu::CharString rgBuf = GetRegionFromKey(localeID, "rg", status);
    if (U_FAILURE(status) || !rgBuf.isEmpty()) {
        return rgBuf;
    }

    // No valid rg keyword value, try for unicode_region_subtag
    rgBuf = ulocimp_getRegion(localeID == nullptr ? uloc_getDefault() : localeID, status);
    if (U_FAILURE(status) || !rgBuf.isEmpty() || !inferRegion) {
        return rgBuf;
    }

    // Second check for sd keyword value
    rgBuf = GetRegionFromKey(localeID, "sd", status);
    if (U_FAILURE(status) || !rgBuf.isEmpty()) {
        return rgBuf;
    }

    // no unicode_region_subtag but inferRegion true, try likely subtags
    UErrorCode rgStatus = U_ZERO_ERROR;
    icu::CharString locBuf = ulocimp_addLikelySubtags(localeID, rgStatus);
    if (U_SUCCESS(rgStatus)) {
        rgBuf = ulocimp_getRegion(locBuf.toStringPiece(), status);
    }
    return rgBuf;
}
458
459
namespace {

// Bitmap of valid two-letter region codes, copied into RegionValidateMap.
// The data is generated by unit test code inside
// test/intltest/regiontst.cpp from the resource data; that test prints the
// values when it fails.
constexpr uint32_t gValidRegionMap[] = {
    0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580,
    0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f,
    0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b,
    0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7,
    0x0410419a, 0x00408557, 0x00004002, 0x00100001,
    0x00400408, 0x00000001,
};

}  // namespace
474
   //
475
U_NAMESPACE_BEGIN
476
2
// Initializes the bitmap from the generated gValidRegionMap table.
RegionValidateMap::RegionValidateMap() {
    // The copy below is sized by the member array; make sure a regenerated
    // table of a different size cannot cause an out-of-bounds read or leave
    // part of the table uncopied.
    static_assert(sizeof(map) == sizeof(gValidRegionMap),
                  "gValidRegionMap must have the same size as RegionValidateMap::map");
    uprv_memcpy(map, gValidRegionMap, sizeof(map));
}
479
480
2
// Nothing to release: the destructor body is empty.
RegionValidateMap::~RegionValidateMap() = default;
482
483
273
bool RegionValidateMap::isSet(const char* region) const {
484
273
    int32_t index = value(region);
485
273
    if (index < 0) {
486
0
        return false;
487
0
    }
488
273
    return 0 != (map[index / 32] & (1L << (index % 32)));
489
273
}
490
491
0
bool RegionValidateMap::equals(const RegionValidateMap& that) const {
492
0
    return uprv_memcmp(map, that.map, sizeof(map)) == 0;
493
0
}
494
495
// The code transform two letter a-z to a integer valued between -1, 26x26.
496
// -1 indicate the region is outside the range of two letter a-z
497
// the rest of value is between 0 and 676 (= 26x26) and used as an index
498
// the the bigmap in map. The map is an array of 22 int32_t.
499
// since 32x21 < 676/32 < 32x22 we store this 676 bits bitmap into 22 int32_t.
500
273
int32_t RegionValidateMap::value(const char* region) const {
501
273
    if (uprv_isASCIILetter(region[0]) && uprv_isASCIILetter(region[1]) &&
502
273
        region[2] == '\0') {
503
273
        return (uprv_toupper(region[0])-'A') * 26 +
504
273
               (uprv_toupper(region[1])-'A');
505
273
    }
506
0
    return -1;
507
273
}
508
509
U_NAMESPACE_END