Coverage Report

Created: 2025-11-07 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/uloc.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 1997-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*
9
* File ULOC.CPP
10
*
11
* Modification History:
12
*
13
*   Date        Name        Description
14
*   04/01/97    aliu        Creation.
15
*   08/21/98    stephen     JDK 1.2 sync
16
*   12/08/98    rtg         New Locale implementation and C API
17
*   03/15/99    damiba      overhaul.
18
*   04/06/99    stephen     changed setDefault() to realloc and copy
19
*   06/14/99    stephen     Changed calls to ures_open for new params
20
*   07/21/99    stephen     Modified setDefault() to propagate to C++
21
*   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22
*                           brought canonicalization code into line with spec
23
*****************************************************************************/
24
25
/*
26
   POSIX's locale format, from putil.c: [no spaces]
27
28
     ll [ _CC ] [ . MM ] [ @ VV]
29
30
     l = lang, C = ctry, M = charmap, V = variant
31
*/
32
33
#include <algorithm>
34
#include <optional>
35
#include <string_view>
36
37
#include "unicode/bytestream.h"
38
#include "unicode/errorcode.h"
39
#include "unicode/stringpiece.h"
40
#include "unicode/utypes.h"
41
#include "unicode/ustring.h"
42
#include "unicode/uloc.h"
43
44
#include "bytesinkutil.h"
45
#include "putilimp.h"
46
#include "ustr_imp.h"
47
#include "ulocimp.h"
48
#include "umutex.h"
49
#include "cstring.h"
50
#include "cmemory.h"
51
#include "locmap.h"
52
#include "uarrsort.h"
53
#include "uenumimp.h"
54
#include "uassert.h"
55
#include "charstr.h"
56
57
U_NAMESPACE_USE
58
59
/* ### Declarations **************************************************/
60
61
/* Locale stuff from locid.cpp */
62
U_CFUNC void locale_set_default(const char *id);
63
U_CFUNC const char *locale_get_default();
64
65
namespace {
66
67
/* ### Data tables **************************************************/
68
69
/**
70
 * Table of language codes, both 2- and 3-letter, with preference
71
 * given to 2-letter codes where possible.  Includes 3-letter codes
72
 * that lack a 2-letter equivalent.
73
 *
74
 * This list must be in sorted order.  This list is returned directly
75
 * to the user by some API.
76
 *
77
 * This list must be kept in sync with LANGUAGES_3, with corresponding
78
 * entries matched.
79
 *
80
 * This table should be terminated with a nullptr entry, followed by a
81
 * second list, and another nullptr entry.  The first list is visible to
82
 * user code when this array is returned by API.  The second list
83
 * contains codes we support, but do not expose through user API.
84
 *
85
 * Notes
86
 *
87
 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
88
 * include the revisions up to 2001/7/27 *CWB*
89
 *
90
 * The 3 character codes are the terminology codes like RFC 3066.  This
91
 * is compatible with prior ICU codes
92
 *
93
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
94
 * table but now at the end of the table because 3 character codes are
95
 * duplicates.  This avoids bad searches going from 3 to 2 character
96
 * codes.
97
 *
98
 * The range qaa-qtz is reserved for local use
99
 */
100
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
101
/* ISO639 table version is 20150505 */
102
/* Subsequent hand addition of selected languages */
103
/*
 * Language codes (2-letter where one exists, otherwise 3-letter).
 * Layout: <primary list> nullptr <obsolete list> nullptr.
 * Entry i here corresponds to entry i of LANGUAGES_3.
 */
constexpr const char* LANGUAGES[] = {
    /* ---- primary list ---- */
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgc", "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "csw", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "kxv", "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni",
    "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tok", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vmw",
    "vo", "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
    nullptr,  /* end of primary list */
    /* ---- obsolete language codes ---- */
    "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",
    nullptr   /* end of obsolete list */
};
193
194
/*
 * Deprecated language codes. Entry i is paired with
 * REPLACEMENT_LANGUAGES[i]. The list ends with two nullptr entries.
 */
constexpr const char* DEPRECATED_LANGUAGES[] = {
    "in",
    "iw",
    "ji",
    "jw",
    "mo",
    nullptr,
    nullptr
};
197
/*
 * Current codes replacing the deprecated ones. Entry i replaces
 * DEPRECATED_LANGUAGES[i]. The list ends with two nullptr entries.
 */
constexpr const char* REPLACEMENT_LANGUAGES[] = {
    "id",  /* in */
    "he",  /* iw */
    "yi",  /* ji */
    "jv",  /* jw */
    "ro",  /* mo */
    nullptr,
    nullptr
};
200
201
/**
202
 * Table of 3-letter language codes.
203
 *
204
 * This is a lookup table used to convert 3-letter language codes to
205
 * their 2-letter equivalent, where possible.  It must be kept in sync
206
 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
207
 * same language as LANGUAGES_3[i].  The commented-out lines are
208
 * copied from LANGUAGES to make eyeballing this baby easier.
209
 *
210
 * Where a 3-letter language code has no 2-letter equivalent, the
211
 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
212
 *
213
 * This table should be terminated with a nullptr entry, followed by a
214
 * second list, and another nullptr entry.  The two lists correspond to
215
 * the two lists in LANGUAGES.
216
 */
217
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
218
/* ISO639 table version is 20150505 */
219
/* Subsequent hand addition of selected languages */
220
/*
 * 3-letter language codes, index-aligned with LANGUAGES.
 * Layout: <primary list> nullptr <obsolete list> nullptr.
 */
constexpr const char* LANGUAGES_3[] = {
    /* ---- primary list ---- */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "csw", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kxv", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
    "vol", "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
    nullptr,  /* end of primary list */
    /* ---- obsolete list; the 2-letter counterparts are: ---- */
/*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
    nullptr   /* end of obsolete list */
};
311
312
/**
313
 * Table of 2-letter country codes.
314
 *
315
 * This list must be in sorted order.  This list is returned directly
316
 * to the user by some API.
317
 *
318
 * This list must be kept in sync with COUNTRIES_3, with corresponding
319
 * entries matched.
320
 *
321
 * This table should be terminated with a nullptr entry, followed by a
322
 * second list, and another nullptr entry.  The first list is visible to
323
 * user code when this array is returned by API.  The second list
324
 * contains codes we support, but do not expose through user API.
325
 *
326
 * Notes:
327
 *
328
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
329
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
330
 * new codes keeping the old ones for compatibility updated to include
331
 * 1999/12/03 revisions *CWB*
332
 *
333
 * RO(ROM) is now RO(ROU) according to
334
 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
335
 */
336
/*
 * 2-letter country codes.
 * Layout: <primary list> nullptr <obsolete list> nullptr.
 * Entry i here corresponds to entry i of COUNTRIES_3.
 */
constexpr const char* COUNTRIES[] = {
    /* ---- primary list ---- */
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
    nullptr,  /* end of primary list */
    /* ---- obsolete country codes ---- */
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",
    nullptr   /* end of obsolete list */
};
371
372
/*
 * Deprecated country codes. Entry i is paired with
 * REPLACEMENT_COUNTRIES[i]. The list ends with two nullptr entries.
 */
constexpr const char* DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD",
    "DY", "FX", "HV", "NH",
    "RH", "SU", "TP", "UK",
    "VD", "YD", "YU", "ZR",
    nullptr,
    nullptr
};
375
/*
 * Current codes replacing the deprecated ones. Entry i replaces
 * DEPRECATED_COUNTRIES[i]; the deprecated code is repeated in the comment
 * row above each group. The list ends with two nullptr entries.
 */
constexpr const char* REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", */
    "CW", "MM", "RS", "DE",
/*  "DY", "FX", "HV", "NH", */
    "BJ", "FR", "BF", "VU",
/*  "RH", "SU", "TP", "UK", */
    "ZW", "RU", "TL", "GB",
/*  "VD", "YD", "YU", "ZR"  */
    "VN", "YE", "RS", "CD",
    nullptr,
    nullptr
};
379
380
/**
381
 * Table of 3-letter country codes.
382
 *
383
 * This is a lookup table used to convert 3-letter country codes to
384
 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
385
 * For all valid i, COUNTRIES[i] must refer to the same country as
386
 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
387
 * to make eyeballing this baby easier.
388
 *
389
 * This table should be terminated with a nullptr entry, followed by a
390
 * second list, and another nullptr entry.  The two lists correspond to
391
 * the two lists in COUNTRIES.
392
 */
393
/*
 * 3-letter country codes, index-aligned with COUNTRIES.
 * Layout: <primary list> nullptr <obsolete list> nullptr.
 * Each commented-out row repeats the matching 2-letter codes.
 */
constexpr const char* COUNTRIES_3[] = {
    /* ---- primary list ---- */
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
    nullptr,  /* end of primary list */
    /* ---- obsolete list ---- */
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
    nullptr   /* end of obsolete list */
};
459
460
/* One row of a locale-ID canonicalization table: maps `id` to `canonicalID`. */
struct CanonicalizationMap {
    /* input ID */
    const char *id;
    /* canonicalized output ID */
    const char *canonicalID;
};
464
465
/**
466
 * A map to canonicalize locale IDs.  This handles a variety of
467
 * different semantic kinds of transformations.
468
 */
469
/*
 * Locale IDs that are rewritten wholesale to another ID.
 * Each row is { input ID, canonical ID }.
 */
constexpr CanonicalizationMap CANONICALIZE_MAP[] = {
    /* registered name */
    { "art__LOJBAN", "jbo" },
    /* Registered IANA variants */
    { "hy__AREVELA", "hy"  },
    { "hy__AREVMDA", "hyw" },
    /* registered names */
    { "zh__GUOYU",   "zh"  },
    { "zh__HAKKA",   "hak" },
    { "zh__XIANG",   "hsn" },
    // subtags with 3 chars won't be treated as variants.
    /* registered names */
    { "zh_GAN",      "gan" },
    { "zh_MIN_NAN",  "nan" },
    { "zh_WUU",      "wuu" },
    { "zh_YUE",      "yue" },
};
482
483
/* ### BCP47 Conversion *******************************************/
484
/*
 * Gets the size of the shortest subtag in the given localeID.
 *
 * Subtags are the non-empty runs of characters between '_' / '-' separators.
 * A run is only measured when the separator that ends it is reached, so the
 * final subtag (the one not followed by a separator) is never taken into
 * account. When no subtag is measured, the full length of localeID is
 * returned.
 */
int32_t getShortestSubtagLength(std::string_view localeID) {
    const int32_t total = static_cast<int32_t>(localeID.length());
    int32_t shortest = total;
    int32_t run = 0;  /* length of the subtag currently being scanned */

    int32_t pos = 0;
    while (pos < total) {
        const char c = localeID[pos++];
        const bool isSeparator = (c == '_' || c == '-');
        if (!isSeparator) {
            ++run;
            continue;
        }
        /* A subtag just ended; empty runs (adjacent separators) are ignored. */
        if (run != 0 && run < shortest) {
            shortest = run;
        }
        run = 0;
    }

    return shortest;
}
509
/* Test if the locale id has BCP47 u extension and does not have '@' */
510
7.31M
inline bool _hasBCP47Extension(std::string_view id) {
511
7.31M
    return id.find('@') == std::string_view::npos && getShortestSubtagLength(id) == 1;
512
7.31M
}
513
514
/* ### Keywords **************************************************/
515
102M
/* True when c lies in the range '0'..'9'. */
inline bool UPRV_ISDIGIT(char c) {
    return '0' <= c && c <= '9';
}
516
301M
/* True when c is accepted by uprv_isASCIILetter() or by UPRV_ISDIGIT(). */
inline bool UPRV_ISALPHANUM(char c) {
    if (uprv_isASCIILetter(c)) {
        return true;
    }
    return UPRV_ISDIGIT(c);
}
517
/* Punctuation/symbols allowed in legacy key values: '_', '-', '+' and '/'. */
inline bool UPRV_OK_VALUE_PUNCTUATION(char c) {
    switch (c) {
    case '_':
    case '-':
    case '+':
    case '/':
        return true;
    default:
        return false;
    }
}
519
520
}  // namespace
521
522
389k
/* Size, in chars, of the fixed keyword-name buffer in KeywordStruct. */
#define ULOC_KEYWORD_BUFFER_LEN 25
523
142k
/* NOTE(review): presumably the maximum number of keywords handled per locale ID; its use is not visible in this part of the file - confirm. */
#define ULOC_MAX_NO_KEYWORDS 25
524
525
/*
 * Locates the start of the keyword section of a locale ID.
 *
 * Returns a pointer into localeID's underlying buffer at the first '@', or
 * nullptr when no keyword separator is found. On EBCDIC builds a set of
 * alternative byte values for the '@' sign is searched as a fallback.
 */
U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(std::string_view localeID) {
    const size_t at = localeID.find('@');
    if (at != std::string_view::npos) {
        return localeID.data() + at;
    }
#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    {
        /* We do this because the @ sign is variant, and the @ sign used on one
        EBCDIC machine won't be compiled the same way on other EBCDIC based
        machines. */
        static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
        /* The table is zero-terminated; candidates are tried in table order. */
        for (const uint8_t *sign = ebcdicSigns; *sign != 0; ++sign) {
            const size_t pos = localeID.find(*sign);
            if (pos != std::string_view::npos) {
                return localeID.data() + pos;
            }
        }
    }
#endif
    return nullptr;
}
547
548
namespace {
549
550
/**
551
 * @param keywordName incoming name to be canonicalized
552
 * @param status return status (keyword too long)
553
 * @return the keyword name
554
 */
555
CharString locale_canonKeywordName(std::string_view keywordName, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return {};
    }

    CharString canonical;
    for (auto it = keywordName.begin(); it != keywordName.end(); ++it) {
        const char ch = *it;
        /* Only ASCII letters and digits may appear in a keyword name. */
        if (!UPRV_ISALPHANUM(ch)) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
            return {};
        }
        canonical.append(uprv_tolower(ch), status);
    }

    /* Nothing was accumulated: report it as an empty keyword name. */
    if (canonical.isEmpty()) {
        status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
        return {};
    }

    return canonical;
}
574
575
typedef struct {
576
    char keyword[ULOC_KEYWORD_BUFFER_LEN];
577
    int32_t keywordLen;
578
    const char *valueStart;
579
    int32_t valueLen;
580
} KeywordStruct;
581
582
int32_t U_CALLCONV
583
761k
compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
584
761k
    const char* leftString = static_cast<const KeywordStruct*>(left)->keyword;
585
761k
    const char* rightString = static_cast<const KeywordStruct*>(right)->keyword;
586
761k
    return uprv_strcmp(leftString, rightString);
587
761k
}
588
589
}  // namespace
590
591
/**
 * Convenience overload: runs the ByteSink-based ulocimp_getKeywords() and
 * returns what it wrote as a CharString (via
 * ByteSinkUtil::viaByteSinkToCharString).
 */
U_EXPORT CharString
ulocimp_getKeywords(std::string_view localeID,
                    char prev,
                    bool valuesToo,
                    UErrorCode& status)
{
    auto writeKeywords = [&](ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_getKeywords(localeID, prev, sink, valuesToo, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeKeywords, status);
}
607
608
/**
 * Parses a "key=value;key=value" keyword list and writes the keywords,
 * sorted by name, to a ByteSink.
 *
 * Keyword names are lowercased and have their spaces removed; spaces around
 * values are trimmed. A keyword name that repeats an earlier one is ignored.
 *
 * @param localeID  the text to parse (consumed pair by pair)
 * @param prev      parsing only happens when this is '@'; for any other value
 *                  nothing is written
 * @param sink      receives the output
 * @param valuesToo if true, output is "key=value" entries joined by ';';
 *                  if false, output is each key followed by a NUL byte
 * @param status    U_INVALID_FORMAT_ERROR for a pair without '=', a ';' before
 *                  the '=', an empty keyword or an empty value;
 *                  U_INTERNAL_PROGRAM_ERROR when there are too many keywords or
 *                  a keyword name does not fit the internal buffer. Nothing is
 *                  written to the sink when an error is reported.
 */
U_EXPORT void
ulocimp_getKeywords(std::string_view localeID,
                    char prev,
                    ByteSink& sink,
                    bool valuesToo,
                    UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }

    /* Only a preceding '@' introduces a keyword definition. */
    if (prev != '@') { return; }

    constexpr size_t NPOS = std::string_view::npos;

    KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
    int32_t numKeywords = 0;

    /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
    do {
        /* skip leading spaces; nothing but spaces left handles trailing "; " */
        const size_t firstNonSpace = localeID.find_first_not_of(' ');
        if (firstNonSpace == NPOS) {
            break;
        }
        localeID.remove_prefix(firstNonSpace);

        if (numKeywords == ULOC_MAX_NO_KEYWORDS) {
            status = U_INTERNAL_PROGRAM_ERROR;
            return;
        }

        const size_t equalSign = localeID.find('=');
        const size_t semicolon = localeID.find(';');
        /* lack of '=' [foo@currency] is illegal */
        /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
        if (equalSign == NPOS || (semicolon != NPOS && semicolon < equalSign)) {
            status = U_INVALID_FORMAT_ERROR;
            return;
        }
        /* zero-length keyword is an error. */
        if (equalSign == 0) {
            status = U_INVALID_FORMAT_ERROR;
            return;
        }
        /* need to normalize both keyword and keyword name */
        if (equalSign >= ULOC_KEYWORD_BUFFER_LEN) {
            /* keyword name too long for internal buffer */
            status = U_INTERNAL_PROGRAM_ERROR;
            return;
        }

        /* Tentatively fill the next free slot; it only becomes part of the
         * list if the keyword turns out not to be a duplicate. */
        KeywordStruct& entry = keywordList[numKeywords];
        int32_t nameLen = 0;
        for (const char c : localeID.substr(0, equalSign)) {
            if (c != ' ') {
                entry.keyword[nameLen++] = uprv_tolower(c);
            }
        }
        entry.keyword[nameLen] = 0;
        entry.keywordLen = nameLen;

        /* now grab the value part: skip the '=' and then any leading spaces */
        size_t valueBegin = equalSign + 1;
        while (valueBegin < localeID.length() && localeID[valueBegin] == ' ') {
            ++valueBegin;
        }

        /* Premature end or zero-length value */
        if (valueBegin == localeID.length() || valueBegin == semicolon) {
            status = U_INVALID_FORMAT_ERROR;
            return;
        }

        /* The value runs up to the ';' (or the end), minus trailing spaces. */
        const size_t valueEnd = semicolon != NPOS ? semicolon : localeID.length();
        std::string_view value = localeID.substr(valueBegin, valueEnd - valueBegin);
        const size_t lastNonSpace = value.find_last_not_of(' ');
        if (lastNonSpace != NPOS) {
            value = value.substr(0, lastNonSpace + 1);
        }
        entry.valueStart = value.data();
        entry.valueLen = static_cast<int32_t>(value.length());

        /* Move on to whatever follows the ';', if anything. */
        if (semicolon != NPOS) {
            localeID.remove_prefix(semicolon + 1);
        } else {
            localeID = {};
        }

        /* If this is a duplicate keyword, then ignore it */
        bool duplicate = false;
        for (int32_t k = 0; k < numKeywords && !duplicate; ++k) {
            duplicate = uprv_strcmp(keywordList[k].keyword, entry.keyword) == 0;
        }
        if (!duplicate) {
            ++numKeywords;
        }
    } while (!localeID.empty());

    /* now we have a list of keywords */
    /* we need to sort it */
    uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, &status);

    /* Now construct the keyword part */
    for (int32_t k = 0; k < numKeywords; ++k) {
        const KeywordStruct& kw = keywordList[k];
        sink.Append(kw.keyword, kw.keywordLen);
        if (!valuesToo) {
            /* keys only: each key is followed by a NUL byte */
            sink.Append("\0", 1);
            continue;
        }
        sink.Append("=", 1);
        sink.Append(kw.valueStart, kw.valueLen);
        if (k < numKeywords - 1) {
            sink.Append(";", 1);
        }
    }
}
727
728
/**
 * C API entry point: looks up the value of one keyword in a locale ID and
 * copies it into a caller-supplied buffer.
 *
 * A null or empty keywordName is rejected with U_ILLEGAL_ARGUMENT_ERROR and
 * a return value of 0. Otherwise the lookup is delegated to the ByteSink
 * overload of ulocimp_getKeywordValue(), and the result is whatever
 * ByteSinkUtil::viaByteSinkToTerminatedChars() returns for that output.
 */
U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char* localeID,
                     const char* keywordName,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{
    if (U_FAILURE(*status)) { return 0; }

    const bool haveName = keywordName != nullptr && *keywordName != '\0';
    if (!haveName) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    auto writeValue = [&](ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_getKeywordValue(localeID, keywordName, sink, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        buffer, bufferCapacity, writeValue, *status);
}
746
747
/**
 * Convenience overload: runs the ByteSink-based ulocimp_getKeywordValue()
 * and returns what it wrote as a CharString (via
 * ByteSinkUtil::viaByteSinkToCharString).
 */
U_EXPORT CharString
ulocimp_getKeywordValue(const char* localeID,
                        std::string_view keywordName,
                        UErrorCode& status)
{
    auto writeValue = [&](ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_getKeywordValue(localeID, keywordName, sink, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeValue, status);
}
758
759
/**
 * Looks up one keyword in a locale ID and writes its value to a ByteSink.
 *
 * The requested name is canonicalized with locale_canonKeywordName(); each
 * keyword name found in the locale ID is lowercased before being compared.
 * If the locale ID passes _hasBCP47Extension(), it is first converted with
 * ulocimp_forLanguageTag() and the converted form is searched instead
 * (falling back to the original ID if the conversion fails or is empty).
 *
 * @param localeID    NUL-terminated locale ID; nullptr is an error
 * @param keywordName keyword to look for; empty is an error
 * @param sink        receives the value as-is (not lowercased); nothing is
 *                    written when the keyword is absent
 * @param status      U_ILLEGAL_ARGUMENT_ERROR for bad arguments, for an entry
 *                    without '=', for an empty or malformed keyword name, and
 *                    for an empty or malformed value. When a value is malformed
 *                    part-way through, the well-formed characters before the
 *                    offending one have already been written to the sink.
 */
U_EXPORT void
ulocimp_getKeywordValue(const char* localeID,
                        std::string_view keywordName,
                        icu::ByteSink& sink,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }

    if (localeID == nullptr || keywordName.empty()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    const char* startSearchHere = nullptr;
    const char* nextSeparator = nullptr;

    CharString tempBuffer;
    const char* tmpLocaleID;

    CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
    if (U_FAILURE(status)) {
      return;
    }

    /* localeID is known to be non-null here (checked above). */
    if (_hasBCP47Extension(localeID)) {
        tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, status);
        tmpLocaleID = U_SUCCESS(status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
    } else {
        tmpLocaleID=localeID;
    }

    startSearchHere = locale_getKeywordsStart(tmpLocaleID);
    if(startSearchHere == nullptr) {
        /* no keywords, return at once */
        return;
    }

    /* find the first keyword */
    while(startSearchHere) {
        const char* keyValueTail;

        startSearchHere++; /* skip @ or ; */
        nextSeparator = uprv_strchr(startSearchHere, '=');
        if(!nextSeparator) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
            return;
        }
        /* strip leading & trailing spaces (TC decided to tolerate these) */
        while(*startSearchHere == ' ') {
            startSearchHere++;
        }
        keyValueTail = nextSeparator;
        while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
            keyValueTail--;
        }
        /* now keyValueTail points to first char after the keyName */
        /* copy & normalize keyName from locale */
        if (startSearchHere == keyValueTail) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
            return;
        }
        CharString localeKeywordName;
        while (startSearchHere < keyValueTail) {
          if (!UPRV_ISALPHANUM(*startSearchHere)) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
            return;
          }
          localeKeywordName.append(uprv_tolower(*startSearchHere++), status);
        }
        if (U_FAILURE(status)) {
            return;
        }

        /* Position of the next entry (or nullptr), searched from the '='. */
        startSearchHere = uprv_strchr(nextSeparator, ';');

        if (canonKeywordName == localeKeywordName) {
            /* current entry matches the keyword. */
            nextSeparator++; /* skip '=' */
            /* First strip leading & trailing spaces (TC decided to tolerate these) */
            while(*nextSeparator == ' ') {
              nextSeparator++;
            }
            keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
            while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
              keyValueTail--;
            }
            /* Now copy the value, but check well-formedness */
            if (nextSeparator == keyValueTail) {
              status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
              return;
            }
            /* Find the end of the well-formed run first, so that the value can
             * be handed to the sink in one Append() call rather than one
             * virtual call per byte. */
            const char* const valueStart = nextSeparator;
            while (nextSeparator < keyValueTail &&
                   (UPRV_ISALPHANUM(*nextSeparator) || UPRV_OK_VALUE_PUNCTUATION(*nextSeparator))) {
              nextSeparator++;
            }
            /* Should we lowercase value to return here? Tests expect as-is. */
            if (nextSeparator > valueStart) {
              sink.Append(valueStart, static_cast<int32_t>(nextSeparator - valueStart));
            }
            if (nextSeparator < keyValueTail) {
              /* Stopped early on a character that is not allowed in a value. */
              status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
            }
            return;
        }
    }
}
862
863
/**
 * C API entry point: sets, replaces or removes one keyword in the locale ID
 * held in buffer, editing the buffer in place.
 *
 * @param keywordName    keyword to set; null or empty is an error
 * @param keywordValue   new value; null is passed on as an empty value
 * @param buffer         in/out: NUL-terminated locale ID; null is an error
 * @param bufferCapacity total capacity of buffer; must be greater than 1 and
 *                       not smaller than the current string length
 * @param status         U_ILLEGAL_ARGUMENT_ERROR for the argument errors above;
 *                       otherwise whatever the ByteSink overload of
 *                       ulocimp_setKeywordValue() reports
 * @return 0 on error, except for U_BUFFER_OVERFLOW_ERROR where the length the
 *         result would have needed is returned; on success, the value returned
 *         by u_terminateChars() for the new total length.
 */
U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char* keywordName,
                     const char* keywordValue,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{
    if (U_FAILURE(*status)) { return 0; }

    if (keywordName == nullptr || *keywordName == 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* buffer carries the input locale ID, so it must exist (its length is
     * measured below) and must have room for more than a lone NUL. */
    if (buffer == nullptr || bufferCapacity <= 1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    const int32_t bufLen = static_cast<int32_t>(uprv_strlen(buffer));
    if(bufferCapacity<bufLen) {
        /* The capacity is less than the length?! Is this NUL terminated? */
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    char* keywords = const_cast<char*>(
        locale_getKeywordsStart({buffer, static_cast<std::string_view::size_type>(bufLen)}));
    /* Length of the part of the ID that precedes the keyword section. */
    const int32_t baseLen = keywords == nullptr ? bufLen : static_cast<int32_t>(keywords - buffer);
    // Remove -1 from the capacity so that this function can guarantee NUL termination.
    CheckedArrayByteSink sink(keywords == nullptr ? buffer + bufLen : keywords,
                              bufferCapacity - baseLen - 1);
    int32_t reslen = ulocimp_setKeywordValue(
        keywords == nullptr ? std::string_view() : keywords,
        keywordName,
        keywordValue == nullptr ? std::string_view() : keywordValue,
        sink,
        *status);

    if (U_FAILURE(*status)) {
        return *status == U_BUFFER_OVERFLOW_ERROR ? reslen + baseLen : 0;
    }

    // See the documentation for this function, it's guaranteed to never
    // overflow the buffer but instead abort with BUFFER_OVERFLOW_ERROR.
    // In this case, nothing has been written to the sink, so it cannot have Overflowed().
    U_ASSERT(!sink.Overflowed());
    U_ASSERT(reslen >= 0);
    return u_terminateChars(buffer, bufferCapacity, reslen + baseLen, status);
}
912
913
/**
 * Sets, replaces or removes one keyword in a locale ID held in a CharString.
 *
 * The CharString is cut back to the part before its keyword section (if it
 * has one) and the ByteSink overload then appends the rewritten keyword
 * section to it.
 */
U_EXPORT void
ulocimp_setKeywordValue(std::string_view keywordName,
                        std::string_view keywordValue,
                        CharString& localeID,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }

    std::string_view keywords;
    const char* const start = locale_getKeywordsStart(localeID.toStringPiece());
    if (start != nullptr) {
        // This is safe because CharString::truncate() doesn't actually erase any
        // data, but simply sets the position for where new data will be written.
        const int32_t baseLen = static_cast<int32_t>(start - localeID.data());
        keywords = localeID.toStringPiece();
        keywords.remove_prefix(baseLen);
        localeID.truncate(baseLen);
    }

    CharStringByteSink sink(&localeID);
    ulocimp_setKeywordValue(keywords, keywordName, keywordValue, sink, status);
}
932
933
/**
 * Rebuilds the keyword section of a locale ID with one keyword set, replaced
 * or removed, and writes the new section to a ByteSink.
 *
 * @param keywords     the existing keyword section, starting at its '@'
 *                     (a size of 0 or 1 means "no keywords")
 * @param keywordName  keyword to set; canonicalized with
 *                     locale_canonKeywordName(); empty is an error
 * @param keywordValue new value; empty means "remove this keyword". Only
 *                     ASCII letters, digits and '_', '-', '+', '/' are allowed
 * @param sink         receives the new keyword section
 * @param status       U_ILLEGAL_ARGUMENT_ERROR for malformed input,
 *                     U_BUFFER_OVERFLOW_ERROR when the sink cannot provide
 *                     enough room (in which case nothing is appended)
 * @return the length of the keyword section that was (or would have been)
 *         written; 0 for argument errors and when there is nothing to remove
 */
U_EXPORT int32_t
ulocimp_setKeywordValue(std::string_view keywords,
                        std::string_view keywordName,
                        std::string_view keywordValue,
                        ByteSink& sink,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return 0; }

    /* TODO: sorting. removal. */
    constexpr size_t NPOS = std::string_view::npos;

    if (status == U_STRING_NOT_TERMINATED_WARNING) {
        status = U_ZERO_ERROR;
    }
    if (keywordName.empty()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
    if (U_FAILURE(status)) {
        return 0;
    }

    CharString canonKeywordValue;
    for (const char c : keywordValue) {
        const bool wellFormed = UPRV_ISALPHANUM(c) || UPRV_OK_VALUE_PUNCTUATION(c);
        if (!wellFormed) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
            return 0;
        }
        /* Should we force lowercase in value to set? */
        canonKeywordValue.append(c, status);
    }
    if (U_FAILURE(status)) {
        return 0;
    }

    /* Shortcut: there are no existing keywords (nothing, or just the '@'). */
    if (keywords.size() <= 1) {
        if (canonKeywordValue.isEmpty()) { /* no keywords = nothing to remove */
            U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
            return 0;
        }

        /* '@' + name + '=' + value */
        const int32_t total = 1 + canonKeywordName.length() + 1 + canonKeywordValue.length();
        int32_t capacity = 0;
        char* buffer = sink.GetAppendBuffer(
                total, total, nullptr, total, &capacity);
        if (capacity < total || buffer == nullptr) {
            status = U_BUFFER_OVERFLOW_ERROR;
            return total; /* no change */
        }

        char* out = buffer;
        *out++ = '@';
        uprv_memcpy(out, canonKeywordName.data(), canonKeywordName.length());
        out += canonKeywordName.length();
        *out++ = '=';
        uprv_memcpy(out, canonKeywordValue.data(), canonKeywordValue.length());
        sink.Append(buffer, total);
        U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
        return total;
    } /* end shortcut - no @ */

    CharString updatedKeysAndValues;
    bool handledInputKeyAndValue = false;
    char keyValuePrefix = '@';

    /* Appends "<prefix>name=value" for the input keyword and records that the
     * input has been dealt with. Every entry after the first is prefixed
     * with ';' instead of '@'. */
    auto emitInputEntry = [&]() {
        updatedKeysAndValues.append(keyValuePrefix, status);
        keyValuePrefix = ';';
        updatedKeysAndValues.append(canonKeywordName, status);
        updatedKeysAndValues.append('=', status);
        updatedKeysAndValues.append(canonKeywordValue, status);
        handledInputKeyAndValue = true;
    };

    /* search for keyword; 'separator' is the index of the '@' or ';' that
     * precedes the entry being examined */
    size_t separator = 0;
    while (separator != NPOS) {
        size_t nameBegin = separator + 1; /* skip @ or ; */
        const size_t equalsAt = keywords.find('=', nameBegin);
        if (equalsAt == NPOS) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
            return 0;
        }
        /* strip leading & trailing spaces (TC decided to tolerate these) */
        while (nameBegin < keywords.size() && keywords[nameBegin] == ' ') {
            ++nameBegin;
        }
        size_t nameEnd = equalsAt;
        while (nameEnd > nameBegin && keywords[nameEnd - 1] == ' ') {
            --nameEnd;
        }
        /* now nameEnd is the index of the first char after the keyName */
        if (nameBegin == nameEnd) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
            return 0;
        }
        /* copy & normalize keyName from locale */
        CharString localeKeywordName;
        for (size_t i = nameBegin; i < nameEnd; ++i) {
            if (!UPRV_ISALPHANUM(keywords[i])) {
                status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
                return 0;
            }
            localeKeywordName.append(uprv_tolower(keywords[i]), status);
        }
        if (U_FAILURE(status)) {
            return 0;
        }

        const size_t nextSeparator = keywords.find(';', equalsAt);

        /* start processing the value part */
        size_t valueBegin = equalsAt + 1; /* skip '=' */
        /* First strip leading & trailing spaces (TC decided to tolerate these) */
        while (valueBegin < keywords.size() && keywords[valueBegin] == ' ') {
            ++valueBegin;
        }
        size_t valueEnd = nextSeparator == NPOS ? keywords.size() : nextSeparator;
        while (valueEnd > valueBegin && keywords[valueEnd - 1] == ' ') {
            --valueEnd;
        }
        if (valueBegin == valueEnd) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
            return 0;
        }

        const int32_t rc = uprv_strcmp(canonKeywordName.data(), localeKeywordName.data());
        if (rc == 0) {
            /* Current entry matches the input keyword. Update the entry */
            if (!canonKeywordValue.isEmpty()) { /* updating a value */
                emitInputEntry();
            } /* else removing this entry, don't emit anything */
            handledInputKeyAndValue = true;
        } else {
            /* input keyword sorts earlier than current entry, add before current entry */
            if (rc < 0 && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
                /* insert new entry at this location */
                emitInputEntry();
            }
            /* copy the current entry */
            updatedKeysAndValues.append(keyValuePrefix, status);
            keyValuePrefix = ';'; /* for any subsequent key-value pair */
            updatedKeysAndValues.append(localeKeywordName, status);
            updatedKeysAndValues.append('=', status);
            updatedKeysAndValues.append(keywords.data() + valueBegin,
                                        static_cast<int32_t>(valueEnd - valueBegin), status);
        }
        if (nextSeparator == NPOS && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
            /* append new entry at the end, it sorts later than existing entries */
            emitInputEntry();
        }
        separator = nextSeparator;
    } /* end loop searching */

    /* Any error from updatedKeysAndValues.append above would be internal and not due to
     * problems with the passed-in locale. So if we did encounter problems with the
     * passed-in locale above, those errors took precedence and overrode any error
     * status from updatedKeysAndValues.append, and also caused a return of 0. If there
     * are errors here they are from updatedKeysAndValues.append; they do cause an
     * error return but the passed-in locale is unmodified and the original bufLen is
     * returned.
     */
    if (!handledInputKeyAndValue || U_FAILURE(status)) {
        /* if input key/value specified removal of a keyword not present in locale, or
         * there was an error in CharString.append, leave original locale alone. */
        U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
        // The sink is expected to be a buffer which already contains the full
        // locale string, so when it isn't going to be modified there's no need
        // to actually write any data to it, as the data is already there. Only
        // the first character needs to be overwritten (changing '\0' to '@').
        const int32_t unchangedLen = static_cast<int32_t>(keywords.size());
        int32_t capacity = 0;
        char* buffer = sink.GetAppendBuffer(
                unchangedLen, unchangedLen, nullptr, unchangedLen, &capacity);
        if (capacity < unchangedLen || buffer == nullptr) {
            status = U_BUFFER_OVERFLOW_ERROR;
        } else {
            *buffer = '@';
            sink.Append(buffer, unchangedLen);
        }
        return unchangedLen;
    }

    const int32_t needLen = updatedKeysAndValues.length();
    // Check to see can we fit the updatedKeysAndValues, if not, return
    // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
    // We do this because this API function does not behave like most others:
    // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
    // When the contents fits but without the terminating NUL, in this case we need to not change
    // the buffer contents and return with a buffer overflow error.
    if (needLen > 0) {
        int32_t capacity = 0;
        char* buffer = sink.GetAppendBuffer(
                needLen, needLen, nullptr, needLen, &capacity);
        if (capacity < needLen || buffer == nullptr) {
            status = U_BUFFER_OVERFLOW_ERROR;
            return needLen;
        }
        uprv_memcpy(buffer, updatedKeysAndValues.data(), needLen);
        sink.Append(buffer, needLen);
    }
    U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
    return needLen;
}
1143
1144
/* ### ID parsing implementation **************************************************/
1145
1146
namespace {
1147
1148
13.2M
/** True for 'x' and 'i' in either case. */
inline bool _isPrefixLetter(char a) {
    switch (a) {
    case 'x':
    case 'X':
    case 'i':
    case 'I':
        return true;
    default:
        return false;
    }
}
1149
1150
/*returns true if one of the special prefixes is here (s=string)
1151
  'x-' or 'i-' */
1152
13.5M
inline bool _isIDPrefix(std::string_view s) {
1153
13.5M
    return s.size() >= 2 && _isPrefixLetter(s[0]) && _isIDSeparator(s[1]);
1154
13.5M
}
1155
1156
/* Dot terminates it because of POSIX form  where dot precedes the codepage
1157
 * except for variant
1158
 */
1159
100M
inline bool _isTerminator(char a) {
    /* Only '.' and '@' terminate. */
    switch (a) {
    case '.':
    case '@':
        return true;
    default:
        return false;
    }
}
1160
1161
9.27M
/**
 * True when p begins with "-t-", "-u-" or "-x-" (the letter in either case).
 */
inline bool _isBCP47Extension(std::string_view p) {
    if (p.size() < 3 || p[0] != '-') {
        return false;
    }
    switch (p[1]) {
    case 't': case 'T':
    case 'u': case 'U':
    case 'x': case 'X':
        return p[2] == '-';
    default:
        return false;
    }
}
1169
1170
/**
1171
 * Lookup 'key' in the array 'list'.  The array 'list' should contain
1172
 * a nullptr entry, followed by more entries, and a second nullptr entry.
1173
 *
1174
 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1175
 * COUNTRIES_3.
1176
 */
1177
std::optional<int16_t> _findIndex(const char* const* list, const char* key)
1178
1.32M
{
1179
1.32M
    const char* const* anchor = list;
1180
1.32M
    int32_t pass = 0;
1181
1182
    /* Make two passes through two nullptr-terminated arrays at 'list' */
1183
1.56M
    while (pass++ < 2) {
1184
363M
        while (*list) {
1185
363M
            if (uprv_strcmp(key, *list) == 0) {
1186
1.19M
                return static_cast<int16_t>(list - anchor);
1187
1.19M
            }
1188
362M
            list++;
1189
362M
        }
1190
246k
        ++list;     /* skip final nullptr *CWB*/
1191
246k
    }
1192
123k
    return std::nullopt;
1193
1.32M
}
1194
1195
}  // namespace
1196
1197
/**
 * Looks oldID up in DEPRECATED_COUNTRIES and returns the entry at the same
 * index of REPLACEMENT_COUNTRIES; returns oldID itself when it is not found.
 */
U_CFUNC const char*
uloc_getCurrentCountryID(const char* oldID){
    const std::optional<int16_t> index = _findIndex(DEPRECATED_COUNTRIES, oldID);
    if (!index.has_value()) {
        return oldID;
    }
    return REPLACEMENT_COUNTRIES[*index];
}
1202
/**
 * Looks oldID up in DEPRECATED_LANGUAGES and returns the entry at the same
 * index of REPLACEMENT_LANGUAGES; returns oldID itself when it is not found.
 */
U_CFUNC const char*
uloc_getCurrentLanguageID(const char* oldID){
    const std::optional<int16_t> index = _findIndex(DEPRECATED_LANGUAGES, oldID);
    if (!index.has_value()) {
        return oldID;
    }
    return REPLACEMENT_LANGUAGES[*index];
}
1207
1208
namespace {
1209
1210
/*
 * The internal functions _getLanguage(), _getScript(), _getRegion() and
 * _getVariant() each parse one piece of a locale ID and return the number
 * of characters they consumed. The caller advances past the earlier pieces
 * with that length, so the code handling the earlier pieces is not
 * duplicated in the functions for the later ones.
 *
 * TODO try to use this in Locale
 */
1218
1219
13.5M
size_t _getLanguage(std::string_view localeID, ByteSink* sink, UErrorCode& status) {
1220
13.5M
    size_t skip = 0;
1221
13.5M
    if (localeID.size() == 4 && uprv_strnicmp(localeID.data(), "root", 4) == 0) {
1222
60.0k
        skip = 4;
1223
60.0k
        localeID.remove_prefix(skip);
1224
13.4M
    } else if (localeID.size() >= 3 && uprv_strnicmp(localeID.data(), "und", 3) == 0 &&
1225
102k
               (localeID.size() == 3 ||
1226
15.1k
                localeID[3] == '-' ||
1227
15.0k
                localeID[3] == '_' ||
1228
89.7k
                localeID[3] == '@')) {
1229
89.7k
        skip = 3;
1230
89.7k
        localeID.remove_prefix(skip);
1231
89.7k
    }
1232
1233
13.5M
    constexpr int32_t MAXLEN = ULOC_LANG_CAPACITY - 1;  // Minus NUL.
1234
1235
    /* if it starts with i- or x- then copy that prefix */
1236
13.5M
    size_t len = _isIDPrefix(localeID) ? 2 : 0;
1237
41.8M
    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1238
28.2M
        if (len == MAXLEN) {
1239
4.78k
            status = U_ILLEGAL_ARGUMENT_ERROR;
1240
4.78k
            return 0;
1241
4.78k
        }
1242
28.2M
        len++;
1243
28.2M
    }
1244
1245
13.5M
    if (sink == nullptr || len == 0) { return skip + len; }
1246
1247
8.88M
    int32_t minCapacity = uprv_max(static_cast<int32_t>(len), 4);  // Minimum 3 letters plus NUL.
1248
8.88M
    char scratch[MAXLEN];
1249
8.88M
    int32_t capacity = 0;
1250
8.88M
    char* buffer = sink->GetAppendBuffer(
1251
8.88M
            minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);
1252
1253
28.3M
    for (size_t i = 0; i < len; ++i) {
1254
19.4M
        buffer[i] = uprv_tolower(localeID[i]);
1255
19.4M
    }
1256
8.88M
    if (localeID.size() >= 2 && _isIDSeparator(localeID[1])) {
1257
59.4k
        buffer[1] = '-';
1258
59.4k
    }
1259
1260
8.88M
    if (len == 3) {
1261
        /* convert 3 character code to 2 character code if possible *CWB*/
1262
1.24M
        U_ASSERT(capacity >= 4);
1263
1.24M
        buffer[3] = '\0';
1264
1.24M
        std::optional<int16_t> offset = _findIndex(LANGUAGES_3, buffer);
1265
1.24M
        if (offset.has_value()) {
1266
1.19M
            const char* const alias = LANGUAGES[*offset];
1267
1.19M
            sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias)));
1268
1.19M
            return skip + len;
1269
1.19M
        }
1270
1.24M
    }
1271
1272
7.68M
    sink->Append(buffer, static_cast<int32_t>(len));
1273
7.68M
    return skip + len;
1274
8.88M
}
1275
1276
11.9M
size_t _getScript(std::string_view localeID, ByteSink* sink) {
1277
11.9M
    constexpr int32_t LENGTH = 4;
1278
1279
11.9M
    size_t len = 0;
1280
41.0M
    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len]) &&
1281
29.2M
            uprv_isASCIILetter(localeID[len])) {
1282
29.0M
        if (len == LENGTH) { return 0; }
1283
29.0M
        len++;
1284
29.0M
    }
1285
11.9M
    if (len != LENGTH) { return 0; }
1286
1287
3.02M
    if (sink == nullptr) { return len; }
1288
1289
1.67M
    char scratch[LENGTH];
1290
1.67M
    int32_t capacity = 0;
1291
1.67M
    char* buffer = sink->GetAppendBuffer(
1292
1.67M
            LENGTH, LENGTH, scratch, UPRV_LENGTHOF(scratch), &capacity);
1293
1294
1.67M
    buffer[0] = uprv_toupper(localeID[0]);
1295
6.69M
    for (int32_t i = 1; i < LENGTH; ++i) {
1296
5.02M
        buffer[i] = uprv_tolower(localeID[i]);
1297
5.02M
    }
1298
1299
1.67M
    sink->Append(buffer, LENGTH);
1300
1.67M
    return len;
1301
3.02M
}
1302
1303
8.24M
size_t _getRegion(std::string_view localeID, ByteSink* sink) {
1304
8.24M
    constexpr int32_t MINLEN = 2;
1305
8.24M
    constexpr int32_t MAXLEN = ULOC_COUNTRY_CAPACITY - 1;  // Minus NUL.
1306
1307
8.24M
    size_t len = 0;
1308
24.0M
    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1309
15.8M
        if (len == MAXLEN) { return 0; }
1310
15.8M
        len++;
1311
15.8M
    }
1312
8.23M
    if (len < MINLEN) { return 0; }
1313
1314
7.84M
    if (sink == nullptr) { return len; }
1315
1316
5.69M
    char scratch[ULOC_COUNTRY_CAPACITY];
1317
5.69M
    int32_t capacity = 0;
1318
5.69M
    char* buffer = sink->GetAppendBuffer(
1319
5.69M
            ULOC_COUNTRY_CAPACITY,
1320
5.69M
            ULOC_COUNTRY_CAPACITY,
1321
5.69M
            scratch,
1322
5.69M
            UPRV_LENGTHOF(scratch),
1323
5.69M
            &capacity);
1324
1325
17.1M
    for (size_t i = 0; i < len; ++i) {
1326
11.4M
        buffer[i] = uprv_toupper(localeID[i]);
1327
11.4M
    }
1328
1329
5.69M
    if (len == 3) {
1330
        /* convert 3 character code to 2 character code if possible *CWB*/
1331
78.8k
        U_ASSERT(capacity >= 4);
1332
78.8k
        buffer[3] = '\0';
1333
78.8k
        std::optional<int16_t> offset = _findIndex(COUNTRIES_3, buffer);
1334
78.8k
        if (offset.has_value()) {
1335
115
            const char* const alias = COUNTRIES[*offset];
1336
115
            sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias)));
1337
115
            return len;
1338
115
        }
1339
78.8k
    }
1340
1341
5.69M
    sink->Append(buffer, static_cast<int32_t>(len));
1342
5.69M
    return len;
1343
5.69M
}
1344
1345
/**
1346
 * @param needSeparator if true, then add leading '_' if any variants
1347
 * are added to 'variant'
1348
 */
1349
size_t
1350
_getVariant(std::string_view localeID,
1351
            char prev,
1352
            ByteSink* sink,
1353
            bool needSeparator,
1354
3.52M
            UErrorCode& status) {
1355
3.52M
    if (U_FAILURE(status) || localeID.empty()) return 0;
1356
1357
    // Reasonable upper limit for variants
1358
    // There are no strict limitation of the syntax of variant in the legacy
1359
    // locale format. If the locale is constructed from unicode_locale_id
1360
    // as defined in UTS35, then we know each unicode_variant_subtag
1361
    // could have max length of 8 ((alphanum{5,8} | digit alphanum{3})
1362
    // 179 would allow 20 unicode_variant_subtag with sep in the
1363
    // unicode_locale_id
1364
    // 8*20 + 1*(20-1) = 179
1365
3.52M
    constexpr int32_t MAX_VARIANTS_LENGTH = 179;
1366
1367
    /* get one or more variant tags and separate them with '_' */
1368
3.52M
    size_t index = 0;
1369
3.52M
    if (_isIDSeparator(prev)) {
1370
        /* get a variant string after a '-' or '_' */
1371
8.94M
        for (std::string_view sub = localeID;;) {
1372
8.94M
            size_t next = sub.find_first_of(".@_-");
1373
            // For historical reasons, a trailing separator is included in the variant.
1374
8.94M
            bool finished = next == std::string_view::npos || next + 1 == sub.length();
1375
8.94M
            size_t limit = finished ? sub.length() : next;
1376
8.94M
            index += limit;
1377
8.94M
            if (index > MAX_VARIANTS_LENGTH) {
1378
1.33k
                status = U_ILLEGAL_ARGUMENT_ERROR;
1379
1.33k
                return 0;
1380
1.33k
            }
1381
1382
8.94M
            if (sink != nullptr) {
1383
8.93M
                if (needSeparator) {
1384
5.41M
                    sink->Append("_", 1);
1385
5.41M
                } else {
1386
3.51M
                    needSeparator = true;
1387
3.51M
                }
1388
1389
8.93M
                int32_t length = static_cast<int32_t>(limit);
1390
8.93M
                int32_t minCapacity = uprv_min(length, MAX_VARIANTS_LENGTH);
1391
8.93M
                char scratch[MAX_VARIANTS_LENGTH];
1392
8.93M
                int32_t capacity = 0;
1393
8.93M
                char* buffer = sink->GetAppendBuffer(
1394
8.93M
                        minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);
1395
1396
31.0M
                for (size_t i = 0; i < limit; ++i) {
1397
22.1M
                    buffer[i] = uprv_toupper(sub[i]);
1398
22.1M
                }
1399
8.93M
                sink->Append(buffer, length);
1400
8.93M
            }
1401
1402
8.94M
            if (finished) { return index; }
1403
5.49M
            sub.remove_prefix(next);
1404
5.49M
            if (_isTerminator(sub.front()) || _isBCP47Extension(sub)) { return index; }
1405
5.42M
            sub.remove_prefix(1);
1406
5.42M
            index++;
1407
5.42M
        }
1408
3.51M
    }
1409
1410
4.44k
    size_t skip = 0;
1411
    /* if there is no variant tag after a '-' or '_' then look for '@' */
1412
4.44k
    if (prev == '@') {
1413
        /* keep localeID */
1414
4.44k
    } else if (const char* p = locale_getKeywordsStart(localeID); p != nullptr) {
1415
0
        skip = 1 + p - localeID.data(); /* point after the '@' */
1416
0
        localeID.remove_prefix(skip);
1417
0
    } else {
1418
0
        return 0;
1419
0
    }
1420
179k
    for (; index < localeID.size() && !_isTerminator(localeID[index]); index++) {
1421
175k
        if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
1422
541
            status = U_ILLEGAL_ARGUMENT_ERROR;
1423
541
            return 0;
1424
541
        }
1425
174k
        if (needSeparator) {
1426
1.82k
            if (sink != nullptr) {
1427
1.82k
                sink->Append("_", 1);
1428
1.82k
            }
1429
1.82k
            needSeparator = false;
1430
1.82k
        }
1431
174k
        if (sink != nullptr) {
1432
174k
            char c = uprv_toupper(localeID[index]);
1433
174k
            if (c == '-' || c == ',') c = '_';
1434
174k
            sink->Append(&c, 1);
1435
174k
        }
1436
174k
    }
1437
3.90k
    return skip + index;
1438
4.44k
}
1439
1440
}  // namespace
1441
1442
/**
 * Returns the language subtag of 'localeID' as a CharString, obtained by
 * running ulocimp_getSubtags() with only the language sink supplied.
 */
U_EXPORT CharString
ulocimp_getLanguage(std::string_view localeID, UErrorCode& status) {
    auto extract = [&localeID](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getSubtags(localeID, &out, nullptr, nullptr, nullptr, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(extract, status);
}
1457
1458
/**
 * Returns the script subtag of 'localeID' as a CharString, obtained by
 * running ulocimp_getSubtags() with only the script sink supplied.
 */
U_EXPORT CharString
ulocimp_getScript(std::string_view localeID, UErrorCode& status) {
    auto extract = [&localeID](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getSubtags(localeID, nullptr, &out, nullptr, nullptr, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(extract, status);
}
1473
1474
/**
 * Returns the region subtag of 'localeID' as a CharString, obtained by
 * running ulocimp_getSubtags() with only the region sink supplied.
 */
U_EXPORT CharString
ulocimp_getRegion(std::string_view localeID, UErrorCode& status) {
    auto extract = [&localeID](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getSubtags(localeID, nullptr, nullptr, &out, nullptr, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(extract, status);
}
1489
1490
/**
 * Returns the variant part of 'localeID' as a CharString, obtained by
 * running ulocimp_getSubtags() with only the variant sink supplied.
 */
U_EXPORT CharString
ulocimp_getVariant(std::string_view localeID, UErrorCode& status) {
    auto extract = [&localeID](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getSubtags(localeID, nullptr, nullptr, nullptr, &out, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(extract, status);
}
1505
1506
/**
 * CharString convenience overload of ulocimp_getSubtags(): every non-null
 * CharString is wrapped in a CharStringByteSink and the work is forwarded
 * to the ByteSink overload. A null CharString is forwarded as a null sink.
 */
U_EXPORT void
ulocimp_getSubtags(
        std::string_view localeID,
        CharString* language,
        CharString* script,
        CharString* region,
        CharString* variant,
        const char** pEnd,
        UErrorCode& status) {
    if (U_FAILURE(status)) { return; }

    // The sinks must stay alive for the duration of the forwarded call.
    std::optional<CharStringByteSink> languageSink;
    std::optional<CharStringByteSink> scriptSink;
    std::optional<CharStringByteSink> regionSink;
    std::optional<CharStringByteSink> variantSink;

    // Builds the sink for 'target' inside 'holder'; yields nullptr for a null target.
    const auto wrap = [](std::optional<CharStringByteSink>& holder, CharString* target) -> ByteSink* {
        if (target == nullptr) { return nullptr; }
        holder.emplace(target);
        return &*holder;
    };

    ByteSink* const languageOut = wrap(languageSink, language);
    ByteSink* const scriptOut = wrap(scriptSink, script);
    ByteSink* const regionOut = wrap(regionSink, region);
    ByteSink* const variantOut = wrap(variantSink, variant);

    ulocimp_getSubtags(localeID, languageOut, scriptOut, regionOut, variantOut, pEnd, status);
}
1536
1537
/**
 * Splits 'localeID' into language, script, region and variant.
 *
 * Each sink may be nullptr if that piece is not wanted. Parsing stops as
 * soon as no remaining output (including 'pEnd') needs the later pieces.
 *
 * @param localeID the locale ID to parse.
 * @param language receives the language, see _getLanguage().
 * @param script   receives the script, see _getScript().
 * @param region   receives the region, see _getRegion().
 * @param variant  receives the variant(s), see _getVariant(); a BCP47
 *                 "-va-posix" key/value pair additionally appends "POSIX".
 * @param pEnd     if not nullptr, receives a pointer to where parsing
 *                 stopped inside 'localeID'.
 * @param status   ICU error code; nothing is done if it already indicates
 *                 a failure.
 */
U_EXPORT void
ulocimp_getSubtags(
        std::string_view localeID,
        ByteSink* language,
        ByteSink* script,
        ByteSink* region,
        ByteSink* variant,
        const char** pEnd,
        UErrorCode& status) {
    if (U_FAILURE(status)) { return; }

    constexpr size_t npos = std::string_view::npos;
    const bool wantsEnd = pEnd != nullptr;
    // Publishes the current parse position to the caller, if requested.
    const auto reportEnd = [&localeID, pEnd] {
        if (pEnd != nullptr) { *pEnd = localeID.data(); }
    };

    reportEnd();
    if (!wantsEnd &&
            language == nullptr && script == nullptr &&
            region == nullptr && variant == nullptr) {
        return;
    }
    if (localeID.empty()) { return; }

    /* language */
    const size_t languageLength = _getLanguage(localeID, language, status);
    if (U_FAILURE(status)) { return; }
    localeID.remove_prefix(languageLength);  // No-op for a length of 0.

    reportEnd();
    if (!wantsEnd && script == nullptr && region == nullptr && variant == nullptr) {
        return;
    }
    if (localeID.empty()) { return; }

    /* script */
    if (_isIDSeparator(localeID.front())) {
        const size_t scriptLength = _getScript(localeID.substr(1), script);
        if (scriptLength > 0) {
            localeID.remove_prefix(scriptLength + 1);
            reportEnd();
        }
    }

    if (localeID.empty() || (!wantsEnd && region == nullptr && variant == nullptr)) { return; }

    /* region */
    bool hasRegion = false;
    if (_isIDSeparator(localeID.front())) {
        const size_t regionLength = _getRegion(localeID.substr(1), region);
        if (regionLength > 0) {
            hasRegion = true;
            localeID.remove_prefix(regionLength + 1);
            reportEnd();
        }
    }

    if (localeID.empty() || (!wantsEnd && variant == nullptr)) { return; }

    /* variant */
    bool hasVariant = false;
    if (_isIDSeparator(localeID.front()) && !_isBCP47Extension(localeID)) {
        /* If there was no country ID, skip a possible extra IDSeparator */
        const bool doubled = !hasRegion && localeID.size() > 1 && _isIDSeparator(localeID[1]);
        const size_t skip = doubled ? 2 : 1;
        const size_t variantLength =
            _getVariant(localeID.substr(skip), localeID[0], variant, false, status);
        if (U_FAILURE(status)) { return; }
        if (variantLength > 0) {
            hasVariant = true;
            localeID.remove_prefix(skip + variantLength);
            reportEnd();
        }
    }

    if (localeID.empty() || (!wantsEnd && variant == nullptr)) { return; }
    if (!_isBCP47Extension(localeID)) { return; }

    /* BCP47 extension: look through the "-key-value" pairs for "-va-posix" */
    localeID.remove_prefix(2);
    constexpr char vaposix[] = "-va-posix";
    constexpr size_t length = sizeof vaposix - 1;
    for (;;) {
        const size_t firstDash = localeID.find('-', 1);
        if (firstDash == npos) { break; }
        const size_t secondDash = localeID.find('-', firstDash + 1);
        const bool last = secondDash == npos;
        const std::string_view pair = last ? localeID : localeID.substr(0, secondDash);

        if (pair.length() == length && uprv_strnicmp(pair.data(), vaposix, length) == 0) {
            if (variant != nullptr) {
                if (hasVariant) { variant->Append("_", 1); }
                constexpr char posix[] = "POSIX";
                variant->Append(posix, sizeof posix - 1);
            }
            if (pEnd != nullptr) { *pEnd = localeID.data() + length; }
        }

        if (last) { break; }
        localeID.remove_prefix(secondDash);
    }
}
1647
1648
/* Keyword enumeration */
1649
1650
/**
 * State of a keyword enumeration: a heap-allocated list of NUL-terminated
 * keywords, ended by an empty string, plus the current read position.
 */
struct UKeywordsContext {
    char* keywords;  // start of the keyword list
    char* current;   // next keyword to be returned
};
1654
1655
U_CDECL_BEGIN
1656
1657
static void U_CALLCONV
1658
4.51k
uloc_kw_closeKeywords(UEnumeration *enumerator) {
1659
4.51k
    uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1660
4.51k
    uprv_free(enumerator->context);
1661
4.51k
    uprv_free(enumerator);
1662
4.51k
}
1663
1664
static int32_t U_CALLCONV
1665
968
uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1666
968
    char *kw = ((UKeywordsContext *)en->context)->keywords;
1667
968
    int32_t result = 0;
1668
2.50k
    while(*kw) {
1669
1.53k
        result++;
1670
1.53k
        kw += uprv_strlen(kw)+1;
1671
1.53k
    }
1672
968
    return result;
1673
968
}
1674
1675
static const char * U_CALLCONV
1676
uloc_kw_nextKeyword(UEnumeration* en,
1677
                    int32_t* resultLength,
1678
7.98k
                    UErrorCode* /*status*/) {
1679
7.98k
    const char* result = ((UKeywordsContext *)en->context)->current;
1680
7.98k
    int32_t len = 0;
1681
7.98k
    if(*result) {
1682
6.19k
        len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1683
6.19k
        ((UKeywordsContext *)en->context)->current += len+1;
1684
6.19k
    } else {
1685
1.79k
        result = nullptr;
1686
1.79k
    }
1687
7.98k
    if (resultLength) {
1688
7.98k
        *resultLength = len;
1689
7.98k
    }
1690
7.98k
    return result;
1691
7.98k
}
1692
1693
static void U_CALLCONV
1694
uloc_kw_resetKeywords(UEnumeration* en,
1695
0
                      UErrorCode* /*status*/) {
1696
0
    ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1697
0
}
1698
1699
U_CDECL_END
1700
1701
1702
/**
 * Template for keyword enumerations. uloc_openKeywordList() copies it into
 * each new UEnumeration and then sets that copy's context.
 */
static const UEnumeration gKeywordsEnum = {
    // The leading pointer slots are left null in the template.
    nullptr,
    nullptr,
    // Callbacks implementing the keyword enumeration.
    uloc_kw_closeKeywords,
    uloc_kw_countKeywords,
    uenum_unextDefault,
    uloc_kw_nextKeyword,
    uloc_kw_resetKeywords
};
1711
1712
/**
 * Creates a keyword enumeration over a private copy of 'keywordList'.
 *
 * @param keywordList     the keywords, each NUL-terminated and stored back
 *                        to back (the layout the uloc_kw_* callbacks walk).
 * @param keywordListSize number of bytes to copy from 'keywordList'; must
 *                        not be negative.
 * @param status          ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR for
 *                        a negative size or a null list with a non-zero
 *                        size, and to U_MEMORY_ALLOCATION_ERROR if an
 *                        allocation fails.
 * @return the new enumeration, or nullptr on failure. It is released by the
 *         uloc_kw_closeKeywords callback it carries.
 */
U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
{
    // Same guard as uloc_openKeywords(): never dereference a null status.
    if (status == nullptr || U_FAILURE(*status)) { return nullptr; }

    // Reject sizes that would otherwise reach uprv_malloc()/uprv_memcpy()
    // as a bogus byte count, and a null source that would have to be read.
    if (keywordListSize < 0 || (keywordList == nullptr && keywordListSize > 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    LocalMemory<UKeywordsContext> myContext;
    LocalMemory<UEnumeration> result;

    myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
    result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
    if (myContext.isNull() || result.isNull()) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));

    // Widen before adding the terminator so that INT32_MAX cannot overflow.
    const size_t listBytes = static_cast<size_t>(keywordListSize);
    myContext->keywords = static_cast<char *>(uprv_malloc(listBytes + 1));
    if (myContext->keywords == nullptr) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (listBytes > 0) {
        // Skipped for an empty list so that a null source is never passed on.
        uprv_memcpy(myContext->keywords, keywordList, listBytes);
    }
    // The extra NUL forms the empty string that ends the enumeration.
    myContext->keywords[listBytes] = 0;
    myContext->current = myContext->keywords;
    result->context = myContext.orphan();
    return result.orphan();
}
1738
1739
/**
 * Opens an enumeration over the keywords of 'localeID'.
 *
 * @param localeID the locale ID; nullptr selects uloc_getDefault(). An ID
 *                 with a BCP47 extension is first converted with
 *                 ulocimp_forLanguageTag().
 * @param status   ICU error code.
 * @return the enumeration, or nullptr if 'status' is null or a failure, if
 *         parsing fails, or if the ID has no keyword section.
 */
U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char* localeID,
                        UErrorCode* status)
{
    if (status == nullptr || U_FAILURE(*status)) {
        return nullptr;
    }

    CharString converted;  // owns the converted ID when a BCP47 extension is present
    const char* cursor = nullptr;

    if (localeID == nullptr) {
        localeID = uloc_getDefault();
        cursor = localeID;
    } else if (_hasBCP47Extension(localeID)) {
        converted = ulocimp_forLanguageTag(localeID, -1, nullptr, *status);
        if (U_SUCCESS(*status) && !converted.isEmpty()) {
            cursor = converted.data();
        } else {
            cursor = localeID;
        }
    } else {
        cursor = localeID;
    }

    // Skip language, script, region and variant; only the end position is needed.
    ulocimp_getSubtags(
            cursor,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            &cursor,
            *status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    /* keywords are located after '@' */
    cursor = locale_getKeywordsStart(cursor);
    if (cursor == nullptr) {
        return nullptr;
    }

    CharString keywords = ulocimp_getKeywords(cursor + 1, '@', false, *status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }
    return uloc_openKeywordList(keywords.data(), keywords.length(), status);
}
1782
1783
1784
/* bit-flags for 'options' parameter of _canonicalize */
/* canonicalize to level 2 instead of level 1 */
#define _ULOC_CANONICALIZE   0x1
/* do not append the keywords to the result */
#define _ULOC_STRIP_KEYWORDS 0x2
1787
1788
namespace {
1789
1790
22.6M
/** Returns true if at least one bit of 'mask' is set in 'options'. */
inline bool OPTION_SET(uint32_t options, uint32_t mask) {
    const uint32_t selected = options & mask;
    return selected != 0;
}
1791
1792
// The tag "i-default", stored without a terminating NUL, and its length.
constexpr char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
constexpr int32_t I_DEFAULT_LENGTH =
    static_cast<int32_t>(sizeof(i_default) / sizeof(i_default[0]));
1794
1795
/**
1796
 * Canonicalize the given localeID, to level 1 or to level 2,
1797
 * depending on the options.  To specify level 1, pass in options=0.
1798
 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1799
 *
1800
 * This is the code underlying uloc_getName and uloc_canonicalize.
1801
 */
1802
void
1803
_canonicalize(std::string_view localeID,
1804
              ByteSink& sink,
1805
              uint32_t options,
1806
5.68M
              UErrorCode& err) {
1807
5.68M
    if (U_FAILURE(err)) {
1808
0
        return;
1809
0
    }
1810
1811
5.68M
    int32_t j, fieldCount=0;
1812
5.68M
    CharString tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
1813
5.68M
    CharString localeIDWithHyphens;  // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
1814
5.68M
    std::string_view origLocaleID;
1815
5.68M
    std::string_view tmpLocaleID;
1816
5.68M
    size_t keywordAssign = std::string_view::npos;
1817
5.68M
    size_t separatorIndicator = std::string_view::npos;
1818
1819
5.68M
    if (_hasBCP47Extension(localeID)) {
1820
224k
        std::string_view localeIDPtr = localeID;
1821
1822
        // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
1823
224k
        if (localeID.size() >= 2 && localeID.find('_') != std::string_view::npos && localeID[1] != '-' && localeID[1] != '_') {
1824
100k
            localeIDWithHyphens.append(localeID, err);
1825
100k
            if (U_SUCCESS(err)) {
1826
250M
                for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
1827
250M
                    if (*p == '_') {
1828
7.16M
                        *p = '-';
1829
7.16M
                    }
1830
250M
                }
1831
100k
                localeIDPtr = localeIDWithHyphens.toStringPiece();
1832
100k
            }
1833
100k
        }
1834
1835
224k
        tempBuffer = ulocimp_forLanguageTag(localeIDPtr.data(), static_cast<int32_t>(localeIDPtr.size()), nullptr, err);
1836
224k
        tmpLocaleID = U_SUCCESS(err) && !tempBuffer.isEmpty() ? static_cast<std::string_view>(tempBuffer.toStringPiece()) : localeIDPtr;
1837
5.45M
    } else {
1838
5.45M
        tmpLocaleID=localeID;
1839
5.45M
    }
1840
1841
5.68M
    origLocaleID=tmpLocaleID;
1842
1843
    /* get all pieces, one after another, and separate with '_' */
1844
5.68M
    CharString tag;
1845
5.68M
    CharString script;
1846
5.68M
    CharString country;
1847
5.68M
    CharString variant;
1848
5.68M
    const char* end = nullptr;
1849
5.68M
    ulocimp_getSubtags(
1850
5.68M
            tmpLocaleID,
1851
5.68M
            &tag,
1852
5.68M
            &script,
1853
5.68M
            &country,
1854
5.68M
            &variant,
1855
5.68M
            &end,
1856
5.68M
            err);
1857
5.68M
    if (U_FAILURE(err)) {
1858
5.74k
        return;
1859
5.74k
    }
1860
5.67M
    U_ASSERT(end != nullptr);
1861
5.67M
    if (end > tmpLocaleID.data()) {
1862
5.20M
        tmpLocaleID.remove_prefix(end - tmpLocaleID.data());
1863
5.20M
    }
1864
1865
5.67M
    if (tag.length() == I_DEFAULT_LENGTH && origLocaleID.length() >= I_DEFAULT_LENGTH &&
1866
5.67M
            uprv_strncmp(origLocaleID.data(), i_default, I_DEFAULT_LENGTH) == 0) {
1867
206
        tag.clear();
1868
206
        tag.append(uloc_getDefault(), err);
1869
5.67M
    } else {
1870
5.67M
        if (!script.isEmpty()) {
1871
380k
            ++fieldCount;
1872
380k
            tag.append('_', err);
1873
380k
            tag.append(script, err);
1874
380k
        }
1875
5.67M
        if (!country.isEmpty()) {
1876
3.99M
            ++fieldCount;
1877
3.99M
            tag.append('_', err);
1878
3.99M
            tag.append(country, err);
1879
3.99M
        }
1880
5.67M
        if (!variant.isEmpty()) {
1881
2.39M
            ++fieldCount;
1882
2.39M
            if (country.isEmpty()) {
1883
134k
                tag.append('_', err);
1884
134k
            }
1885
2.39M
            tag.append('_', err);
1886
2.39M
            tag.append(variant, err);
1887
2.39M
        }
1888
5.67M
    }
1889
1890
    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1891
5.67M
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && !tmpLocaleID.empty() && tmpLocaleID.front() == '.') {
1892
16.9k
        tag.append('.', err);
1893
16.9k
        tmpLocaleID.remove_prefix(1);
1894
16.9k
        size_t length;
1895
16.9k
        if (size_t atPos = tmpLocaleID.find('@'); atPos != std::string_view::npos) {
1896
3.68k
            length = atPos;
1897
13.2k
        } else {
1898
13.2k
            length = tmpLocaleID.length();
1899
13.2k
        }
1900
        // The longest charset name we found in IANA charset registry
1901
        // https://www.iana.org/assignments/character-sets/ is
1902
        // "Extended_UNIX_Code_Packed_Format_for_Japanese" in length 45.
1903
        // we therefore restrict the length here to be 64 which is a power of 2
1904
        // number that is longer than 45.
1905
16.9k
        constexpr size_t kMaxCharsetLength = 64;
1906
16.9k
        if (length > kMaxCharsetLength) {
1907
278
           err = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1908
278
           return;
1909
278
        }
1910
16.6k
        if (length > 0) {
1911
13.8k
            tag.append(tmpLocaleID.data(), static_cast<int32_t>(length), err);
1912
13.8k
            tmpLocaleID.remove_prefix(length);
1913
13.8k
        }
1914
16.6k
    }
1915
1916
    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1917
       After this, tmpLocaleID either starts at '@' or is empty. */
1918
5.67M
    if (const char* start = locale_getKeywordsStart(tmpLocaleID); start != nullptr) {
1919
228k
        if (start > tmpLocaleID.data()) {
1920
1.94k
            tmpLocaleID.remove_prefix(start - tmpLocaleID.data());
1921
1.94k
        }
1922
228k
        keywordAssign = tmpLocaleID.find('=');
1923
228k
        separatorIndicator = tmpLocaleID.find(';');
1924
5.44M
    } else {
1925
5.44M
        tmpLocaleID = {};
1926
5.44M
    }
1927
1928
    /* Copy POSIX-style variant, if any [mr@FOO] */
1929
5.67M
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1930
5.55M
        !tmpLocaleID.empty() && keywordAssign == std::string_view::npos) {
1931
21.4k
        tag.append(tmpLocaleID, err);
1932
21.4k
        tmpLocaleID = {};
1933
21.4k
    }
1934
1935
5.67M
    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1936
        /* Handle @FOO variant if @ is present and not followed by = */
1937
117k
        if (!tmpLocaleID.empty() && keywordAssign == std::string_view::npos) {
1938
            /* Add missing '_' if needed */
1939
4.72k
            if (fieldCount < 2 || (fieldCount < 3 && !script.isEmpty())) {
1940
6.23k
                do {
1941
6.23k
                    tag.append('_', err);
1942
6.23k
                    ++fieldCount;
1943
6.23k
                } while(fieldCount<2);
1944
4.39k
            }
1945
1946
4.72k
            CharStringByteSink s(&tag);
1947
4.72k
            std::string_view sub = tmpLocaleID;
1948
4.72k
            sub.remove_prefix(1);
1949
4.72k
            _getVariant(sub, '@', &s, !variant.isEmpty(), err);
1950
4.72k
            if (U_FAILURE(err)) { return; }
1951
4.72k
        }
1952
1953
        /* Look up the ID in the canonicalization map */
1954
1.28M
        for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1955
1.16M
            StringPiece id(CANONICALIZE_MAP[j].id);
1956
1.16M
            if (tag == id) {
1957
65
                if (id.empty() && !tmpLocaleID.empty()) {
1958
0
                    break; /* Don't remap "" if keywords present */
1959
0
                }
1960
65
                tag.clear();
1961
65
                tag.append(CANONICALIZE_MAP[j].canonicalID, err);
1962
65
                break;
1963
65
            }
1964
1.16M
        }
1965
116k
    }
1966
1967
5.67M
    sink.Append(tag.data(), tag.length());
1968
1969
5.67M
    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1970
2.40M
        if (!tmpLocaleID.empty() && keywordAssign != std::string_view::npos &&
1971
130k
            (separatorIndicator == std::string_view::npos || separatorIndicator > keywordAssign)) {
1972
129k
            sink.Append("@", 1);
1973
129k
            ++fieldCount;
1974
129k
            tmpLocaleID.remove_prefix(1);
1975
129k
            ulocimp_getKeywords(tmpLocaleID, '@', sink, true, err);
1976
129k
        }
1977
2.40M
    }
1978
5.67M
}
1979
1980
}  // namespace
1981
1982
/* ### ID parsing API **************************************************/
1983
1984
/**
 * C API: writes the parent of localeID into the caller-supplied buffer.
 *
 * The parent is computed by the ByteSink overload of ulocimp_getParent();
 * copying into `parent`, and the value returned, are handled by
 * ByteSinkUtil::viaByteSinkToTerminatedChars (defined elsewhere).
 *
 * @param localeID        Locale ID to truncate; forwarded unchanged.
 * @param parent          Destination buffer.
 * @param parentCapacity  Capacity of the destination buffer.
 * @param err             In/out ICU error code; dereferenced, so must not be null.
 */
U_CAPI int32_t U_EXPORT2
uloc_getParent(const char* localeID,
               char* parent,
               int32_t parentCapacity,
               UErrorCode* err)
{
    auto writeParent = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getParent(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        parent, parentCapacity, writeParent, *err);
}
1997
1998
/**
 * Convenience overload: returns the parent of localeID as a CharString.
 *
 * Runs the ByteSink overload of ulocimp_getParent() through
 * ByteSinkUtil::viaByteSinkToCharString and returns whatever that helper produces.
 *
 * @param localeID  Locale ID to truncate; forwarded unchanged.
 * @param err       In/out ICU error code.
 */
U_EXPORT CharString
ulocimp_getParent(const char* localeID, UErrorCode& err)
{
    auto writeParent = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getParent(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeParent, err);
}
2008
2009
/**
 * Appends the parent of localeID to sink: everything in front of the last '_'.
 *
 * Nothing is appended when err already signals failure, when the ID contains
 * no '_', or when the only '_' is the first character. A null localeID is
 * replaced by uloc_getDefault(). If the ID starts with "und_" (compared
 * case-insensitively), the leading three characters are skipped before appending.
 *
 * @param localeID  Locale ID to truncate, or nullptr for the default locale.
 * @param sink      Receives the parent ID.
 * @param err       In/out ICU error code; only read by this function.
 */
U_EXPORT void
ulocimp_getParent(const char* localeID,
                  icu::ByteSink& sink,
                  UErrorCode& err)
{
    if (U_FAILURE(err)) { return; }

    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }

    // Number of characters that precede the last underscore (0 if there is none).
    const char* const lastSeparator = uprv_strrchr(localeID, '_');
    int32_t parentLength =
        (lastSeparator == nullptr) ? 0 : static_cast<int32_t>(lastSeparator - localeID);
    if (parentLength <= 0) {
        return;
    }

    // Drop a leading "und" so that the output starts at the following '_'.
    if (uprv_strnicmp(localeID, "und_", 4) == 0) {
        localeID += 3;
        parentLength -= 3;
    }
    sink.Append(localeID, parentLength);
}
2037
2038
/**
 * C API: extracts the language subtag of localeID into a caller-supplied buffer.
 *
 * A null localeID is replaced by uloc_getDefault(). Per the original author's
 * note (CWB), a 2-character ISO 639 code is returned if one exists.
 * Copying into `language`, and the value returned, are handled by
 * ByteSinkUtil::viaByteSinkToTerminatedChars (defined elsewhere).
 *
 * @param localeID          Locale ID, or nullptr for the default locale.
 * @param language          Destination buffer.
 * @param languageCapacity  Capacity of the destination buffer.
 * @param err               In/out ICU error code; dereferenced, so must not be null.
 */
U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char* localeID,
                 char* language,
                 int32_t languageCapacity,
                 UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    // Only the first output slot of ulocimp_getSubtags is requested; the rest are null.
    auto writeLanguage = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getSubtags(id, &out, nullptr, nullptr, nullptr, nullptr, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        language, languageCapacity, writeLanguage, *err);
}
2063
2064
/**
 * C API: extracts the script subtag of localeID into a caller-supplied buffer.
 *
 * A null localeID is replaced by uloc_getDefault(). Copying into `script`,
 * and the value returned, are handled by
 * ByteSinkUtil::viaByteSinkToTerminatedChars (defined elsewhere).
 *
 * @param localeID        Locale ID, or nullptr for the default locale.
 * @param script          Destination buffer.
 * @param scriptCapacity  Capacity of the destination buffer.
 * @param err             In/out ICU error code; dereferenced, so must not be null.
 */
U_CAPI int32_t U_EXPORT2
uloc_getScript(const char* localeID,
               char* script,
               int32_t scriptCapacity,
               UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    // Only the second output slot of ulocimp_getSubtags is requested; the rest are null.
    auto writeScript = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getSubtags(id, nullptr, &out, nullptr, nullptr, nullptr, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        script, scriptCapacity, writeScript, *err);
}
2088
2089
/**
 * C API: extracts the country (region) subtag of localeID into a caller-supplied buffer.
 *
 * A null localeID is replaced by uloc_getDefault(). Copying into `country`,
 * and the value returned, are handled by
 * ByteSinkUtil::viaByteSinkToTerminatedChars (defined elsewhere).
 *
 * @param localeID         Locale ID, or nullptr for the default locale.
 * @param country          Destination buffer.
 * @param countryCapacity  Capacity of the destination buffer.
 * @param err              In/out ICU error code; dereferenced, so must not be null.
 */
U_CAPI int32_t U_EXPORT2
uloc_getCountry(const char* localeID,
                char* country,
                int32_t countryCapacity,
                UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    // Only the third output slot of ulocimp_getSubtags is requested; the rest are null.
    auto writeCountry = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getSubtags(id, nullptr, nullptr, &out, nullptr, nullptr, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        country, countryCapacity, writeCountry, *err);
}
2113
2114
/**
 * C API: extracts the variant subtag of localeID into a caller-supplied buffer.
 *
 * A null localeID is replaced by uloc_getDefault(). Copying into `variant`,
 * and the value returned, are handled by
 * ByteSinkUtil::viaByteSinkToTerminatedChars (defined elsewhere).
 *
 * @param localeID         Locale ID, or nullptr for the default locale.
 * @param variant          Destination buffer.
 * @param variantCapacity  Capacity of the destination buffer.
 * @param err              In/out ICU error code; dereferenced, so must not be null.
 */
U_CAPI int32_t U_EXPORT2
uloc_getVariant(const char* localeID,
                char* variant,
                int32_t variantCapacity,
                UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    // Only the fourth output slot of ulocimp_getSubtags is requested; the rest are null.
    auto writeVariant = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getSubtags(id, nullptr, nullptr, nullptr, &out, nullptr, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        variant, variantCapacity, writeVariant, *err);
}
2138
2139
/**
 * C API: writes the full name of localeID, as produced by ulocimp_getName(),
 * into a caller-supplied buffer.
 *
 * A null localeID is replaced by uloc_getDefault(). Copying into `name`,
 * and the value returned, are handled by
 * ByteSinkUtil::viaByteSinkToTerminatedChars (defined elsewhere).
 *
 * @param localeID      Locale ID, or nullptr for the default locale.
 * @param name          Destination buffer.
 * @param nameCapacity  Capacity of the destination buffer.
 * @param err           In/out ICU error code; dereferenced, so must not be null.
 */
U_CAPI int32_t U_EXPORT2
uloc_getName(const char* localeID,
             char* name,
             int32_t nameCapacity,
             UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeName = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getName(id, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity, writeName, *err);
}
2155
2156
/**
 * Convenience overload: returns the full name of localeID as a CharString.
 *
 * Runs the ByteSink overload of ulocimp_getName() through
 * ByteSinkUtil::viaByteSinkToCharString and returns whatever that helper produces.
 *
 * @param localeID  Locale ID to process.
 * @param err       In/out ICU error code.
 */
U_EXPORT CharString
ulocimp_getName(std::string_view localeID, UErrorCode& err)
{
    auto writeName = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getName(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeName, err);
}
2166
2167
/**
 * Appends the full name of localeID to sink.
 *
 * Thin wrapper around _canonicalize() with no option bits set: neither the
 * _ULOC_CANONICALIZE step nor _ULOC_STRIP_KEYWORDS is requested.
 *
 * @param localeID  Locale ID to process.
 * @param sink      Receives the resulting name.
 * @param err       In/out ICU error code, passed through to _canonicalize().
 */
U_EXPORT void
ulocimp_getName(std::string_view localeID, ByteSink& sink, UErrorCode& err)
{
    _canonicalize(localeID, sink, /* options = */ 0, err);
}
2174
2175
/**
 * C API: writes the base name of localeID, as produced by ulocimp_getBaseName(),
 * into a caller-supplied buffer.
 *
 * A null localeID is replaced by uloc_getDefault(). Copying into `name`,
 * and the value returned, are handled by
 * ByteSinkUtil::viaByteSinkToTerminatedChars (defined elsewhere).
 *
 * @param localeID      Locale ID, or nullptr for the default locale.
 * @param name          Destination buffer.
 * @param nameCapacity  Capacity of the destination buffer.
 * @param err           In/out ICU error code; dereferenced, so must not be null.
 */
U_CAPI int32_t U_EXPORT2
uloc_getBaseName(const char* localeID,
                 char* name,
                 int32_t nameCapacity,
                 UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeBaseName = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getBaseName(id, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity, writeBaseName, *err);
}
2191
2192
/**
 * Convenience overload: returns the base name of localeID as a CharString.
 *
 * Runs the ByteSink overload of ulocimp_getBaseName() through
 * ByteSinkUtil::viaByteSinkToCharString and returns whatever that helper produces.
 *
 * @param localeID  Locale ID to process.
 * @param err       In/out ICU error code.
 */
U_EXPORT CharString
ulocimp_getBaseName(std::string_view localeID, UErrorCode& err)
{
    auto writeBaseName = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_getBaseName(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeBaseName, err);
}
2202
2203
/**
 * Appends the base name of localeID to sink.
 *
 * Thin wrapper around _canonicalize() with _ULOC_STRIP_KEYWORDS set, which
 * makes _canonicalize() skip its keyword-appending step.
 *
 * @param localeID  Locale ID to process.
 * @param sink      Receives the resulting base name.
 * @param err       In/out ICU error code, passed through to _canonicalize().
 */
U_EXPORT void
ulocimp_getBaseName(std::string_view localeID, ByteSink& sink, UErrorCode& err)
{
    _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
}
2210
2211
/**
 * C API: writes the canonical form of localeID, as produced by
 * ulocimp_canonicalize(), into a caller-supplied buffer.
 *
 * A null localeID is replaced by uloc_getDefault(). Copying into `name`,
 * and the value returned, are handled by
 * ByteSinkUtil::viaByteSinkToTerminatedChars (defined elsewhere).
 *
 * @param localeID      Locale ID, or nullptr for the default locale.
 * @param name          Destination buffer.
 * @param nameCapacity  Capacity of the destination buffer.
 * @param err           In/out ICU error code; dereferenced, so must not be null.
 */
U_CAPI int32_t U_EXPORT2
uloc_canonicalize(const char* localeID,
                  char* name,
                  int32_t nameCapacity,
                  UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeCanonical = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_canonicalize(id, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity, writeCanonical, *err);
}
2227
2228
/**
 * Convenience overload: returns the canonical form of localeID as a CharString.
 *
 * Runs the ByteSink overload of ulocimp_canonicalize() through
 * ByteSinkUtil::viaByteSinkToCharString and returns whatever that helper produces.
 *
 * @param localeID  Locale ID to canonicalize.
 * @param err       In/out ICU error code.
 */
U_EXPORT CharString
ulocimp_canonicalize(std::string_view localeID, UErrorCode& err)
{
    auto writeCanonical = [&](ByteSink& out, UErrorCode& ec) {
        ulocimp_canonicalize(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeCanonical, err);
}
2238
2239
/**
 * Appends the canonical form of localeID to sink.
 *
 * Thin wrapper around _canonicalize() with _ULOC_CANONICALIZE set, which
 * enables its "@FOO" variant handling and CANONICALIZE_MAP lookup.
 *
 * @param localeID  Locale ID to canonicalize.
 * @param sink      Receives the canonical ID.
 * @param err       In/out ICU error code, passed through to _canonicalize().
 */
U_EXPORT void
ulocimp_canonicalize(std::string_view localeID, ByteSink& sink, UErrorCode& err)
{
    _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
}
2246
2247
/**
 * Returns the LANGUAGES_3 entry that corresponds to the language of localeID.
 *
 * The language is obtained with ulocimp_getLanguage() and looked up in
 * LANGUAGES; the entry at the same index of LANGUAGES_3 is returned.
 * An empty string is returned when the language cannot be extracted or is
 * not found in LANGUAGES.
 *
 * @param localeID  Locale ID, or nullptr for the default locale.
 */
U_CAPI const char* U_EXPORT2
uloc_getISO3Language(const char* localeID)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    UErrorCode status = U_ZERO_ERROR;
    CharString language = ulocimp_getLanguage(id, status);
    if (U_FAILURE(status)) {
        return "";
    }

    const std::optional<int16_t> index = _findIndex(LANGUAGES, language.data());
    if (!index.has_value()) {
        return "";
    }
    return LANGUAGES_3[*index];
}
2262
2263
/**
 * Returns the COUNTRIES_3 entry that corresponds to the region of localeID.
 *
 * The region is obtained with ulocimp_getRegion() and looked up in
 * COUNTRIES; the entry at the same index of COUNTRIES_3 is returned.
 * An empty string is returned when the region cannot be extracted or is
 * not found in COUNTRIES.
 *
 * @param localeID  Locale ID, or nullptr for the default locale.
 */
U_CAPI const char* U_EXPORT2
uloc_getISO3Country(const char* localeID)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    UErrorCode status = U_ZERO_ERROR;
    CharString region = ulocimp_getRegion(id, status);
    if (U_FAILURE(status)) {
        return "";
    }

    const std::optional<int16_t> index = _findIndex(COUNTRIES, region.data());
    if (!index.has_value()) {
        return "";
    }
    return COUNTRIES_3[*index];
}
2278
2279
/**
 * Maps localeID to a Windows LCID.
 *
 * Lookup order:
 *   1. IDs that are null or shorter than two characters yield 0.
 *   2. uprv_convertToLCIDPlatform() is tried first; a failure yields 0 and a
 *      non-zero result is returned as-is.
 *   3. Otherwise uprv_convertToLCID() is used. If the ID contains '@', only a
 *      non-empty "collation" keyword is retained (re-attached to the base
 *      name); any other keywords are ignored.
 *
 * @param localeID  Locale ID to convert.
 * @return The LCID, or 0 for an incomplete ID or a failed lookup step.
 */
U_CAPI uint32_t U_EXPORT2
uloc_getLCID(const char* localeID)
{
    /* Check for incomplete id. */
    if (localeID == nullptr || uprv_strlen(localeID) < 2) {
        return 0;
    }

    UErrorCode ec = U_ZERO_ERROR;

    // First, attempt Windows platform lookup if available, but fall
    // through to catch any special cases (ICU vs Windows name differences).
    const uint32_t platformLcid = uprv_convertToLCIDPlatform(localeID, &ec);
    if (U_FAILURE(ec)) {
        return 0;
    }
    if (platformLcid != 0) {
        // Windows found an LCID, return that
        return platformLcid;
    }

    CharString language = ulocimp_getLanguage(localeID, ec);
    if (U_FAILURE(ec)) {
        return 0;
    }

    if (uprv_strchr(localeID, '@') != nullptr) {
        // uprv_convertToLCID does not support keywords other than collation.
        // Remove all keywords except collation.
        CharString collation = ulocimp_getKeywordValue(localeID, "collation", ec);
        if (U_SUCCESS(ec) && !collation.isEmpty()) {
            CharString collationOnlyID = ulocimp_getBaseName(localeID, ec);
            ulocimp_setKeywordValue("collation", collation.toStringPiece(), collationOnlyID, ec);
            if (U_SUCCESS(ec)) {
                return uprv_convertToLCID(language.data(), collationOnlyID.data(), &ec);
            }
        }

        // fall through - all keywords are simply ignored
        ec = U_ZERO_ERROR;
    }

    return uprv_convertToLCID(language.data(), localeID, &ec);
}
2324
2325
/**
 * C API: converts a host LCID to a locale ID string.
 *
 * Pure pass-through to uprv_convertToPosix(); all arguments and the return
 * value are forwarded unchanged.
 *
 * @param hostid          LCID to convert.
 * @param locale          Destination buffer.
 * @param localeCapacity  Capacity of the destination buffer.
 * @param status          In/out ICU error code.
 */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,
                      char* locale,
                      int32_t localeCapacity,
                      UErrorCode* status)
{
    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
}
2331
2332
/* ### Default locale **************************************************/
2333
2334
/**
 * C API: returns the current default locale ID.
 * Pure pass-through to locale_get_default().
 */
U_CAPI const char* U_EXPORT2
uloc_getDefault()
{
    return locale_get_default();
}
2339
2340
/**
 * C API: sets the default locale.
 *
 * Does nothing when *err already signals failure. Apart from that check the
 * error code is not used by this function.
 *
 * @param newDefaultLocale  Locale ID forwarded to locale_set_default().
 * @param err               In/out ICU error code; dereferenced, so must not be null.
 */
U_CAPI void U_EXPORT2
uloc_setDefault(const char* newDefaultLocale,
                UErrorCode* err)
{
    if (U_SUCCESS(*err)) {
        /* propagate change to C++ */
        locale_set_default(newDefaultLocale);
    }
}
2351
2352
/**
2353
 * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2354
 * to an array of pointers to arrays of char.  All of these pointers are owned
2355
 * by ICU-- do not delete them, and do not write through them.  The array is
2356
 * terminated with a null pointer.
2357
 */
2358
U_CAPI const char* const*  U_EXPORT2
2359
uloc_getISOLanguages()
2360
0
{
2361
0
    return LANGUAGES;
2362
0
}
2363
2364
/**
2365
 * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2366
 * pointer to an array of pointers to arrays of char.  All of these pointers are
2367
 * owned by ICU-- do not delete them, and do not write through them.  The array is
2368
 * terminated with a null pointer.
2369
 */
2370
U_CAPI const char* const*  U_EXPORT2
2371
uloc_getISOCountries()
2372
0
{
2373
0
    return COUNTRIES;
2374
0
}
2375
2376
/**
 * C API: converts a locale keyword to its Unicode (BCP 47) locale key.
 *
 * @param keyword  NUL-terminated keyword.
 * @return The result of ulocimp_toBcpKeyWithFallback(), or nullptr when
 *         keyword is null or empty, or when no key could be determined.
 */
U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char* keyword)
{
    if (keyword == nullptr || keyword[0] == '\0') {
        return nullptr;
    }

    const std::optional<std::string_view> bcpKey = ulocimp_toBcpKeyWithFallback(keyword);
    if (!bcpKey.has_value()) {
        return nullptr;
    }
    // Per the original note, the view is known to be NUL terminated,
    // so handing out its data pointer as a C string is safe.
    return bcpKey->data();
}
2383
2384
/**
 * Maps keyword to a BCP 47 key, falling back to the keyword itself.
 *
 * @param keyword  Keyword to convert.
 * @return The mapping from ulocimp_toBcpKey() when there is one; otherwise
 *         `keyword` itself if ultag_isUnicodeLocaleKey() accepts it;
 *         otherwise an empty optional.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toBcpKeyWithFallback(std::string_view keyword)
{
    if (std::optional<std::string_view> mapped = ulocimp_toBcpKey(keyword); mapped.has_value()) {
        return mapped;
    }

    // unknown keyword, but syntax is fine..
    if (ultag_isUnicodeLocaleKey(keyword.data(), static_cast<int32_t>(keyword.size()))) {
        return keyword;
    }
    return std::nullopt;
}
2395
2396
/**
 * C API: converts a locale keyword value to its Unicode (BCP 47) locale type.
 *
 * @param keyword  NUL-terminated keyword the value belongs to.
 * @param value    NUL-terminated keyword value.
 * @return The result of ulocimp_toBcpTypeWithFallback(), or nullptr when
 *         either argument is null or empty, or when no type could be determined.
 */
U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char* keyword, const char* value)
{
    if (keyword == nullptr || keyword[0] == '\0') {
        return nullptr;
    }
    if (value == nullptr || value[0] == '\0') {
        return nullptr;
    }

    const std::optional<std::string_view> bcpType = ulocimp_toBcpTypeWithFallback(keyword, value);
    if (!bcpType.has_value()) {
        return nullptr;
    }
    // Per the original note, the view is known to be NUL terminated,
    // so handing out its data pointer as a C string is safe.
    return bcpType->data();
}
2404
2405
/**
 * Maps a keyword value to a BCP 47 type, falling back to the value itself.
 *
 * @param keyword  Keyword the value belongs to.
 * @param value    Keyword value to convert.
 * @return The mapping from ulocimp_toBcpType() when there is one; otherwise
 *         `value` itself if ultag_isUnicodeLocaleType() accepts it;
 *         otherwise an empty optional.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toBcpTypeWithFallback(std::string_view keyword, std::string_view value)
{
    if (std::optional<std::string_view> mapped = ulocimp_toBcpType(keyword, value);
        mapped.has_value()) {
        return mapped;
    }

    // unknown type, but syntax is fine..
    if (ultag_isUnicodeLocaleType(value.data(), static_cast<int32_t>(value.size()))) {
        return value;
    }
    return std::nullopt;
}
2416
2417
namespace {
2418
2419
bool
2420
isWellFormedLegacyKey(std::string_view key)
2421
17.1M
{
2422
17.1M
    return std::all_of(key.begin(), key.end(), UPRV_ISALPHANUM);
2423
17.1M
}
2424
2425
bool
2426
isWellFormedLegacyType(std::string_view legacyType)
2427
89.9k
{
2428
89.9k
    int32_t alphaNumLen = 0;
2429
52.2M
    for (char c : legacyType) {
2430
52.2M
        if (c == '_' || c == '/' || c == '-') {
2431
10.9M
            if (alphaNumLen == 0) {
2432
0
                return false;
2433
0
            }
2434
10.9M
            alphaNumLen = 0;
2435
41.3M
        } else if (UPRV_ISALPHANUM(c)) {
2436
41.3M
            alphaNumLen++;
2437
41.3M
        } else {
2438
0
            return false;
2439
0
        }
2440
52.2M
    }
2441
89.9k
    return alphaNumLen != 0;
2442
89.9k
}
2443
2444
}  // namespace
2445
2446
/**
 * C API: converts a locale keyword to its legacy key.
 *
 * @param keyword  NUL-terminated keyword.
 * @return The result of ulocimp_toLegacyKeyWithFallback(), or nullptr when
 *         keyword is null or empty, or when no key could be determined.
 */
U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char* keyword)
{
    if (keyword == nullptr || keyword[0] == '\0') {
        return nullptr;
    }

    const std::optional<std::string_view> legacyKey = ulocimp_toLegacyKeyWithFallback(keyword);
    if (!legacyKey.has_value()) {
        return nullptr;
    }
    // Per the original note, the view is known to be NUL terminated,
    // so handing out its data pointer as a C string is safe.
    return legacyKey->data();
}
2453
2454
/**
 * Maps keyword to a legacy key, falling back to the keyword itself.
 *
 * @param keyword  Keyword to convert.
 * @return The mapping from ulocimp_toLegacyKey() when there is one; otherwise
 *         `keyword` itself if it is well-formed under the legacy locale
 *         syntax; otherwise an empty optional.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyKeyWithFallback(std::string_view keyword)
{
    if (std::optional<std::string_view> mapped = ulocimp_toLegacyKey(keyword);
        mapped.has_value()) {
        return mapped;
    }

    // No known mapping: accept the keyword as-is when it is well-formed with
    // the legacy locale syntax.
    //
    // Note:
    //  LDML/CLDR provides some definition of keyword syntax in
    //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
    //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
    //  Keys can only consist of [0-9a-zA-Z].
    if (isWellFormedLegacyKey(keyword)) {
        return keyword;
    }
    return std::nullopt;
}
2470
2471
/**
 * C API: converts a locale keyword value to its legacy type.
 *
 * @param keyword  NUL-terminated keyword the value belongs to.
 * @param value    NUL-terminated keyword value.
 * @return The result of ulocimp_toLegacyTypeWithFallback(), or nullptr when
 *         either argument is null or empty, or when no type could be determined.
 */
U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char* keyword, const char* value)
{
    if (keyword == nullptr || keyword[0] == '\0') {
        return nullptr;
    }
    if (value == nullptr || value[0] == '\0') {
        return nullptr;
    }

    const std::optional<std::string_view> legacyType =
        ulocimp_toLegacyTypeWithFallback(keyword, value);
    if (!legacyType.has_value()) {
        return nullptr;
    }
    // Per the original note, the view is known to be NUL terminated,
    // so handing out its data pointer as a C string is safe.
    return legacyType->data();
}
2479
2480
/**
 * Maps a keyword value to a legacy type, falling back to the value itself.
 *
 * @param keyword  Keyword the value belongs to.
 * @param value    Keyword value to convert.
 * @return The mapping from ulocimp_toLegacyType() when there is one;
 *         otherwise `value` itself if it is well-formed under the legacy
 *         locale syntax; otherwise an empty optional.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyTypeWithFallback(std::string_view keyword, std::string_view value)
{
    if (std::optional<std::string_view> mapped = ulocimp_toLegacyType(keyword, value);
        mapped.has_value()) {
        return mapped;
    }

    // No known mapping: accept the value as-is when it is well-formed with
    // the legacy locale syntax.
    //
    // Note:
    //  LDML/CLDR provides some definition of keyword syntax in
    //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
    //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
    //  Values (types) can only consist of [0-9a-zA-Z]; for legacy values the
    //  separators '/', '_' and '-' are additionally allowed in the middle
    //  (e.g. "Asia/Tel_Aviv").
    if (isWellFormedLegacyType(value)) {
        return value;
    }
    return std::nullopt;
}
2497
2498
/*eof*/