Coverage Report

Created: 2026-03-31 06:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/uloc.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 1997-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*
9
* File ULOC.CPP
10
*
11
* Modification History:
12
*
13
*   Date        Name        Description
14
*   04/01/97    aliu        Creation.
15
*   08/21/98    stephen     JDK 1.2 sync
16
*   12/08/98    rtg         New Locale implementation and C API
17
*   03/15/99    damiba      overhaul.
18
*   04/06/99    stephen     changed setDefault() to realloc and copy
19
*   06/14/99    stephen     Changed calls to ures_open for new params
20
*   07/21/99    stephen     Modified setDefault() to propagate to C++
21
*   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22
*                           brought canonicalization code into line with spec
23
*****************************************************************************/
24
25
/*
26
   POSIX's locale format, from putil.c: [no spaces]
27
28
     ll [ _CC ] [ . MM ] [ @ VV]
29
30
     l = lang, C = ctry, M = charmap, V = variant
31
*/
32
33
#include <algorithm>
34
#include <optional>
35
#include <string_view>
36
37
#include "unicode/bytestream.h"
38
#include "unicode/errorcode.h"
39
#include "unicode/stringpiece.h"
40
#include "unicode/utypes.h"
41
#include "unicode/ustring.h"
42
#include "unicode/uloc.h"
43
44
#include "bytesinkutil.h"
45
#include "putilimp.h"
46
#include "ustr_imp.h"
47
#include "ulocimp.h"
48
#include "umutex.h"
49
#include "cstring.h"
50
#include "cmemory.h"
51
#include "locmap.h"
52
#include "uarrsort.h"
53
#include "uenumimp.h"
54
#include "uassert.h"
55
#include "charstr.h"
56
57
U_NAMESPACE_USE
58
59
/* ### Declarations **************************************************/
60
61
/* Locale stuff from locid.cpp: the two functions below are only declared
 * here; this file does not define them in the part visible to review. */
62
/* Takes a locale ID string and returns nothing. */
U_CFUNC void locale_set_default(const char *id);
63
/* Takes no arguments and returns a C string.
 * NOTE(review): presumably the current default locale ID -- confirm in locid.cpp. */
U_CFUNC const char *locale_get_default();
64
65
namespace {
66
67
/* ### Data tables **************************************************/
68
69
/**
70
 * Table of language codes, both 2- and 3-letter, with preference
71
 * given to 2-letter codes where possible.  Includes 3-letter codes
72
 * that lack a 2-letter equivalent.
73
 *
74
 * This list must be in sorted order.  This list is returned directly
75
 * to the user by some API.
76
 *
77
 * This list must be kept in sync with LANGUAGES_3, with corresponding
78
 * entries matched.
79
 *
80
 * This table should be terminated with a nullptr entry, followed by a
81
 * second list, and another nullptr entry.  The first list is visible to
82
 * user code when this array is returned by API.  The second list
83
 * contains codes we support, but do not expose through user API.
84
 *
85
 * Notes
86
 *
87
 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
88
 * include the revisions up to 2001/7/27 *CWB*
89
 *
90
 * The 3 character codes are the terminology codes like RFC 3066.  This
91
 * is compatible with prior ICU codes
92
 *
93
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
94
 * table but now at the end of the table because 3 character codes are
95
 * duplicates.  This avoids bad searches going from 3 to 2 character
96
 * codes.
97
 *
98
 * The range qaa-qtz is reserved for local use
99
 */
100
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
101
/* ISO639 table version is 20150505 */
102
/* Subsequent hand addition of selected languages */
103
/* Sorted language-code table made of two nullptr-terminated lists; entry i
 * must describe the same language as LANGUAGES_3[i]. */
constexpr const char* LANGUAGES[] = {
104
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
105
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
106
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
107
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
108
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
109
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
110
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
111
    "bgc", "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
112
    "blo", "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
113
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
114
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
115
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
116
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
117
    "cs",  "csb", "csw", "cu",  "cv",  "cy",
118
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
119
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
120
    "dyo", "dyu", "dz",  "dzg",
121
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
122
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
123
    "ext",
124
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
125
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
126
    "frs", "fur", "fy",
127
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
128
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
129
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
130
    "gur", "guz", "gv",  "gwi",
131
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
132
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
133
    "hup", "hy",  "hz",
134
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
135
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
136
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
137
    "jv",
138
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
139
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
140
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
141
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
142
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
143
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
144
    "kv",  "kw",  "kxv", "ky",
145
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
146
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
147
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
148
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
149
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
150
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
151
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
152
    "ml",  "mn",  "mnc", "mni",
153
    "moh", "mos", "mr",  "mrj",
154
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
155
    "my",  "mye", "myv", "mzn",
156
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
157
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
158
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
159
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
160
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
161
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
162
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
163
    "pon", "prg", "pro", "ps",  "pt",
164
    "qu",  "quc", "qug",
165
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
166
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
167
    "rw",  "rwk",
168
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
169
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
170
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
171
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
172
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
173
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
174
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
175
    "sv",  "sw",  "swb", "syc", "syr", "szl",
176
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
177
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
178
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tok", "tpi",
179
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
180
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
181
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
182
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vmw",
183
    "vo", "vot", "vro", "vun",
184
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
185
    "xal", "xh",  "xmf", "xnr", "xog",
186
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
187
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
188
    "zun", "zxx", "zza",
189
nullptr,  /* ends the first list: the codes visible to user code via API */
190
    "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  /* obsolete language codes */
191
nullptr   /* ends the second list: supported codes that are not exposed via API */
192
};
193
194
/* Deprecated language codes; the table ends with two nullptr entries. */
constexpr const char* DEPRECATED_LANGUAGES[] = {
    "in",
    "iw",
    "ji",
    "jw",
    "mo",
    nullptr,
    nullptr
};
197
/* Replacement language codes; the table ends with two nullptr entries. */
constexpr const char* REPLACEMENT_LANGUAGES[] = {
    "id",
    "he",
    "yi",
    "jv",
    "ro",
    nullptr,
    nullptr
};
200
201
/**
202
 * Table of 3-letter language codes.
203
 *
204
 * This is a lookup table used to convert 3-letter language codes to
205
 * their 2-letter equivalent, where possible.  It must be kept in sync
206
 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
207
 * same language as LANGUAGES_3[i].  The commented-out lines are
208
 * copied from LANGUAGES to make eyeballing this baby easier.
209
 *
210
 * Where a 3-letter language code has no 2-letter equivalent, the
211
 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
212
 *
213
 * This table should be terminated with a nullptr entry, followed by a
214
 * second list, and another nullptr entry.  The two lists correspond to
215
 * the two lists in LANGUAGES.
216
 */
217
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
218
/* ISO639 table version is 20150505 */
219
/* Subsequent hand addition of selected languages */
220
/* 3-letter language codes in two nullptr-terminated lists; entry i must
 * describe the same language as LANGUAGES[i], so keep both tables aligned. */
constexpr const char* LANGUAGES_3[] = {
221
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
222
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
223
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
224
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
225
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
226
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
227
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
228
    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
229
    "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
230
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
231
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
232
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
233
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
234
    "ces", "csb", "csw", "chu", "chv", "cym",
235
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
236
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
237
    "dyo", "dyu", "dzo", "dzg",
238
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
239
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
240
    "ext",
241
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
242
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
243
    "frs", "fur", "fry",
244
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
245
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
246
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
247
    "gur", "guz", "glv", "gwi",
248
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
249
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
250
    "hup", "hye", "her",
251
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
252
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
253
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
254
    "jav",
255
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
256
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
257
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
258
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
259
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
260
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
261
    "kom", "cor", "kxv", "kir",
262
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
263
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
264
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
265
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
266
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
267
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
268
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
269
    "mal", "mon", "mnc", "mni",
270
    "moh", "mos", "mar", "mrj",
271
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
272
    "mya", "mye", "myv", "mzn",
273
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
274
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
275
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
276
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
277
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
278
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
279
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
280
    "pon", "prg", "pro", "pus", "por",
281
    "que", "quc", "qug",
282
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
283
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
284
    "kin", "rwk",
285
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
286
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
287
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
288
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
289
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
290
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
291
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
292
    "swe", "swa", "swb", "syc", "syr", "szl",
293
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
294
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
295
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
296
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
297
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
298
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
299
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
300
    "vol", "vot", "vro", "vun",
301
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
302
    "xal", "xho", "xmf", "xnr", "xog",
303
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
304
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
305
    "zun", "zxx", "zza",
306
nullptr,  /* ends the first list; pairs with the first nullptr in LANGUAGES */
307
/*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
308
    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
309
nullptr   /* ends the second list; pairs with the final nullptr in LANGUAGES */
310
};
311
312
/**
313
 * Table of 2-letter country codes.
314
 *
315
 * This list must be in sorted order.  This list is returned directly
316
 * to the user by some API.
317
 *
318
 * This list must be kept in sync with COUNTRIES_3, with corresponding
319
 * entries matched.
320
 *
321
 * This table should be terminated with a nullptr entry, followed by a
322
 * second list, and another nullptr entry.  The first list is visible to
323
 * user code when this array is returned by API.  The second list
324
 * contains codes we support, but do not expose through user API.
325
 *
326
 * Notes:
327
 *
328
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
329
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
330
 * new codes keeping the old ones for compatibility updated to include
331
 * 1999/12/03 revisions *CWB*
332
 *
333
 * RO(ROM) is now RO(ROU) according to
334
 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
335
 */
336
/* Sorted 2-letter country-code table made of two nullptr-terminated lists;
 * entry i must describe the same country as COUNTRIES_3[i]. */
constexpr const char* COUNTRIES[] = {
337
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
338
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
339
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
340
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
341
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
342
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",
343
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
344
    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
345
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
346
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
347
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
348
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
349
    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
350
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
351
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
352
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
353
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
354
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
355
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
356
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
357
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
358
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
359
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
360
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
361
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
362
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
363
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
364
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
365
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
366
    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
367
nullptr,  /* ends the first list: the codes visible to user code via API */
368
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
369
nullptr   /* ends the second list: supported codes that are not exposed via API */
370
};
371
372
/* Deprecated country codes; the table ends with two nullptr entries. */
constexpr const char* DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD",
    "DY", "FX", "HV", "NH",
    "RH", "SU", "TP", "UK",
    "VD", "YD", "YU", "ZR",
    nullptr,
    nullptr
};
375
/*
 * Replacement country codes.  The trailing comment on each row repeats the
 * deprecated codes listed at the same positions, for eyeballing; the table
 * ends with two nullptr entries.
 */
constexpr const char* REPLACEMENT_COUNTRIES[] = {
    "CW", "MM", "RS", "DE",  /* "AN", "BU", "CS", "DD" */
    "BJ", "FR", "BF", "VU",  /* "DY", "FX", "HV", "NH" */
    "ZW", "RU", "TL", "GB",  /* "RH", "SU", "TP", "UK" */
    "VN", "YE", "RS", "CD",  /* "VD", "YD", "YU", "ZR" */
    nullptr,
    nullptr
};
379
380
/**
381
 * Table of 3-letter country codes.
382
 *
383
 * This is a lookup table used to convert 3-letter country codes to
384
 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
385
 * For all valid i, COUNTRIES[i] must refer to the same country as
386
 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
387
 * to make eyeballing this baby easier.
388
 *
389
 * This table should be terminated with a nullptr entry, followed by a
390
 * second list, and another nullptr entry.  The two lists correspond to
391
 * the two lists in COUNTRIES.
392
 */
393
/* 3-letter country codes in two nullptr-terminated lists; entry i must
 * describe the same country as COUNTRIES[i].  Each commented-out row repeats
 * the COUNTRIES entries for the row that follows it. */
constexpr const char* COUNTRIES_3[] = {
394
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
395
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
396
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
397
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
398
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
399
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
400
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
401
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
402
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
403
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
404
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",     */
405
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
406
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
407
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
408
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
409
    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
410
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
411
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
412
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
413
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
414
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
415
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
416
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
417
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
418
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
419
    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
420
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
421
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
422
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
423
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
424
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
425
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
426
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
427
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
428
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
429
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
430
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
431
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
432
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
433
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
434
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
435
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
436
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
437
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
438
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
439
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
440
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
441
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
442
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
443
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
444
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
445
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
446
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
447
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
448
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
449
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
450
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
451
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
452
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
453
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
454
nullptr,  /* ends the first list; pairs with the first nullptr in COUNTRIES */
455
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
456
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
457
nullptr   /* ends the second list; pairs with the final nullptr in COUNTRIES */
458
};
459
460
/* One row of the canonicalization table: an input ID and its canonical form. */
struct CanonicalizationMap {
    /* input ID */
    const char *id;
    /* canonicalized output ID */
    const char *canonicalID;
};
464
465
/**
466
 * A map to canonicalize locale IDs.  This handles a variety of
467
 * different semantic kinds of transformations.
468
 */
469
constexpr CanonicalizationMap CANONICALIZE_MAP[] = {
470
    { "art__LOJBAN",    "jbo" }, /* registered name */
471
    { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
472
    { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
473
    { "zh__GUOYU",      "zh" }, /* registered name */
474
    { "zh__HAKKA",      "hak" }, /* registered name */
475
    { "zh__XIANG",      "hsn" }, /* registered name */
476
    // subtags with 3 chars won't be treated as variants.
477
    { "zh_GAN",         "gan" }, /* registered name */
478
    { "zh_MIN_NAN",     "nan" }, /* registered name */
479
    { "zh_WUU",         "wuu" }, /* registered name */
480
    { "zh_YUE",         "yue" }, /* registered name */
481
};
482
483
/* ### BCP47 Conversion *******************************************/
484
/*
 * Returns the length of the shortest non-empty subtag of localeID that is
 * followed by a separator ('_' or '-').  The final subtag is never compared,
 * because no separator closes it, so an ID without separators yields its
 * full length (0 for an empty ID).
 */
int32_t getShortestSubtagLength(std::string_view localeID) {
    const int32_t idLength = static_cast<int32_t>(localeID.length());
    int32_t shortest = idLength;
    int32_t runLength = 0;
    bool startNewRun = true;

    int32_t pos = 0;
    while (pos < idLength) {
        const char ch = localeID[pos++];
        if (ch == '_' || ch == '-') {
            // A separator closes the current run; empty runs are ignored.
            if (runLength != 0 && runLength < shortest) {
                shortest = runLength;
            }
            startNewRun = true;
            continue;
        }
        if (startNewRun) {
            runLength = 0;
            startNewRun = false;
        }
        ++runLength;
    }

    return shortest;
}
509
/* Test if the locale id has BCP47 u extension and does not have '@' */
510
8.82M
inline bool _hasBCP47Extension(std::string_view id) {
511
8.82M
    return id.find('@') == std::string_view::npos && getShortestSubtagLength(id) == 1;
512
8.82M
}
513
514
/* ### Keywords **************************************************/
515
89.7M
/* True when c is one of the characters '0' through '9'. */
inline bool UPRV_ISDIGIT(char c) {
    return '0' <= c && c <= '9';
}
516
299M
/* True when c is accepted by uprv_isASCIILetter() or by UPRV_ISDIGIT(). */
inline bool UPRV_ISALPHANUM(char c) {
    if (uprv_isASCIILetter(c)) {
        return true;
    }
    return UPRV_ISDIGIT(c);
}
517
/* Punctuation/symbols allowed in legacy key values: '_', '-', '+' and '/'. */
inline bool UPRV_OK_VALUE_PUNCTUATION(char c) {
    switch (c) {
    case '_':
    case '-':
    case '+':
    case '/':
        return true;
    default:
        return false;
    }
}
519
520
}  // namespace
521
522
389k
/* Size, in chars, of the fixed keyword buffer inside KeywordStruct. */
#define ULOC_KEYWORD_BUFFER_LEN 25
523
142k
/* NOTE(review): presumably the maximum number of keywords handled for one
 * locale ID -- the uses are outside the visible part of the file; confirm. */
#define ULOC_MAX_NO_KEYWORDS 25
524
525
/*
 * Returns a pointer to the first '@' inside localeID, or nullptr when the ID
 * contains none.  On EBCDIC builds, a set of alternative code points for the
 * '@' sign is also searched before giving up.
 */
U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(std::string_view localeID) {
    const size_t atPos = localeID.find('@');
    if (atPos != std::string_view::npos) {
        return localeID.data() + atPos;
    }
#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    /* We do this because the @ sign is variant, and the @ sign used on one
    EBCDIC machine won't be compiled the same way on other EBCDIC based
    machines. */
    static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
    /* The list is zero-terminated; candidates are tried in table order. */
    for (const uint8_t *sign = ebcdicSigns; *sign; ++sign) {
        const size_t signPos = localeID.find(*sign);
        if (signPos != std::string_view::npos) {
            return localeID.data() + signPos;
        }
    }
#endif
    return nullptr;
}
547
548
namespace {
549
550
/**
551
 * @param keywordName incoming name to be canonicalized
552
 * @param status return status (keyword too long)
553
 * @return the keyword name
554
 */
555
CharString locale_canonKeywordName(std::string_view keywordName, UErrorCode& status)
556
1.93M
{
557
1.93M
  if (U_FAILURE(status)) { return {}; }
558
1.93M
  CharString result;
559
560
11.0M
  for (char c : keywordName) {
561
11.0M
    if (!UPRV_ISALPHANUM(c)) {
562
174
      status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
563
174
      return {};
564
174
    }
565
11.0M
    result.append(uprv_tolower(c), status);
566
11.0M
  }
567
1.93M
  if (result.isEmpty()) {
568
0
    status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
569
0
    return {};
570
0
  }
571
572
1.93M
  return result;
573
1.93M
}
574
575
typedef struct {
576
    char keyword[ULOC_KEYWORD_BUFFER_LEN];
577
    int32_t keywordLen;
578
    const char *valueStart;
579
    int32_t valueLen;
580
} KeywordStruct;
581
582
int32_t U_CALLCONV
583
760k
compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
584
760k
    const char* leftString = static_cast<const KeywordStruct*>(left)->keyword;
585
760k
    const char* rightString = static_cast<const KeywordStruct*>(right)->keyword;
586
760k
    return uprv_strcmp(leftString, rightString);
587
760k
}
588
589
}  // namespace
590
591
/* Convenience overload: runs the ByteSink-based ulocimp_getKeywords() and
   returns what it wrote as a CharString. Parameters are forwarded as-is. */
U_EXPORT CharString
ulocimp_getKeywords(std::string_view localeID,
                    char prev,
                    bool valuesToo,
                    UErrorCode& status)
{
    auto writeKeywords = [&](ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_getKeywords(localeID, prev, sink, valuesToo, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeKeywords, status);
}
607
608
/**
 * Parses a ';'-separated list of "keyword=value" pairs and writes it to the
 * sink with keyword names lowercased and stripped of spaces, repeated
 * keywords dropped (the first occurrence is kept) and the entries sorted by
 * keyword name.
 *
 * @param localeID  the keyword list text (what follows the '@')
 * @param prev      the character preceding localeID; nothing is parsed or
 *                  written unless it is '@'
 * @param sink      receives "key=value;key=value" when valuesToo is true,
 *                  otherwise each key followed by a NUL byte
 * @param valuesToo whether the values are written along with the keys
 * @param status    U_INVALID_FORMAT_ERROR for a malformed pair;
 *                  U_INTERNAL_PROGRAM_ERROR when a keyword name does not fit
 *                  ULOC_KEYWORD_BUFFER_LEN or more than ULOC_MAX_NO_KEYWORDS
 *                  entries are needed
 */
U_EXPORT void
ulocimp_getKeywords(std::string_view localeID,
                    char prev,
                    ByteSink& sink,
                    bool valuesToo,
                    UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }
    if (prev != '@') { return; } /* not the start of a keyword definition */

    constexpr size_t npos = std::string_view::npos;
    KeywordStruct entries[ULOC_MAX_NO_KEYWORDS];
    int32_t count = 0;

    /* Grab the pairs one at a time: trim spaces and lowercase the names. */
    do {
        /* Skip leading spaces. Running out of text here handles a
           trailing "; ". */
        const size_t firstNonSpace = localeID.find_first_not_of(' ');
        if (firstNonSpace == npos) {
            break;
        }
        localeID.remove_prefix(firstNonSpace);

        if (count == ULOC_MAX_NO_KEYWORDS) {
            status = U_INTERNAL_PROGRAM_ERROR;
            return;
        }

        const size_t equals = localeID.find('=');
        const size_t semicolon = localeID.find(';');
        /* Lack of '=' [foo@currency] is illegal, and so is a ';' before
           the '=' [foo@currency;collation=pinyin]. npos is the largest
           size_t, so "semicolon < equals" can only hold for a real ';'. */
        if (equals == npos || semicolon < equals) {
            status = U_INVALID_FORMAT_ERROR;
            return;
        }
        /* A zero-length keyword is an error. */
        if (equals == 0) {
            status = U_INVALID_FORMAT_ERROR;
            return;
        }
        /* The keyword name must fit the fixed-size buffer with its NUL. */
        if (equals >= ULOC_KEYWORD_BUFFER_LEN) {
            status = U_INTERNAL_PROGRAM_ERROR;
            return;
        }

        /* Normalize the name: drop spaces, lowercase everything else. */
        KeywordStruct& entry = entries[count];
        int32_t nameLen = 0;
        for (const char c : localeID.substr(0, equals)) {
            if (c != ' ') {
                entry.keyword[nameLen++] = uprv_tolower(c);
            }
        }
        entry.keyword[nameLen] = 0;
        entry.keywordLen = nameLen;

        /* The value starts after the '=' and any spaces that follow it. */
        size_t valueBegin = equals + 1;
        while (valueBegin < localeID.length() && localeID[valueBegin] == ' ') {
            ++valueBegin;
        }
        /* Premature end or zero-length value. */
        if (valueBegin == localeID.length() || valueBegin == semicolon) {
            status = U_INVALID_FORMAT_ERROR;
            return;
        }

        /* The value runs up to the ';' (or the end of the text), minus
           trailing spaces. */
        std::string_view value =
            localeID.substr(valueBegin, semicolon == npos ? npos : semicolon - valueBegin);
        if (const size_t last = value.find_last_not_of(' '); last != npos) {
            value = value.substr(0, last + 1);
        }
        entry.valueStart = value.data();
        entry.valueLen = static_cast<int32_t>(value.length());

        /* Consume this pair from the input. */
        if (semicolon == npos) {
            localeID = {};
        } else {
            localeID.remove_prefix(semicolon + 1);
        }

        /* A repeated keyword is ignored: its slot is simply reused. */
        bool duplicate = false;
        for (int32_t k = 0; k < count && !duplicate; ++k) {
            duplicate = uprv_strcmp(entries[k].keyword, entry.keyword) == 0;
        }
        if (!duplicate) {
            ++count;
        }
    } while (!localeID.empty());

    /* Sort the collected keywords by name. */
    uprv_sortArray(entries, count, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, &status);

    /* Now construct the keyword part. */
    for (int32_t k = 0; k < count; ++k) {
        const KeywordStruct& entry = entries[k];
        sink.Append(entry.keyword, entry.keywordLen);
        if (!valuesToo) {
            sink.Append("\0", 1);
            continue;
        }
        sink.Append("=", 1);
        sink.Append(entry.valueStart, entry.valueLen);
        if (k < count - 1) {
            sink.Append(";", 1);
        }
    }
}
727
728
/* C API entry point: looks up keywordName in localeID and writes its value
   into buffer through ByteSinkUtil::viaByteSinkToTerminatedChars(). A null
   or empty keywordName is rejected with U_ILLEGAL_ARGUMENT_ERROR. */
U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char* localeID,
                     const char* keywordName,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{
    if (U_FAILURE(*status)) { return 0; }

    const bool nameMissing = keywordName == nullptr || *keywordName == '\0';
    if (nameMissing) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    auto writeValue = [&](ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_getKeywordValue(localeID, keywordName, sink, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        buffer, bufferCapacity, writeValue, *status);
}
746
747
/* Convenience overload: runs the ByteSink-based ulocimp_getKeywordValue()
   and returns what it wrote as a CharString. */
U_EXPORT CharString
ulocimp_getKeywordValue(const char* localeID,
                        std::string_view keywordName,
                        UErrorCode& status)
{
    auto writeValue = [&](ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_getKeywordValue(localeID, keywordName, sink, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeValue, status);
}
758
759
/**
 * Finds keywordName among the "@key=value;key=value" keywords of localeID
 * and appends the matching value to the sink. Nothing is appended when the
 * locale has no keywords or the keyword is absent.
 *
 * @param localeID    NUL-terminated locale ID; must not be nullptr. When
 *                    _hasBCP47Extension() accepts it, it is first converted
 *                    with ulocimp_forLanguageTag() and the converted form is
 *                    searched if that succeeded and is non-empty.
 * @param keywordName keyword to look for; compared after canonicalization
 *                    by locale_canonKeywordName(), must not be empty
 * @param sink        receives the value, as-is (not lowercased)
 * @param status      U_ILLEGAL_ARGUMENT_ERROR for bad arguments, a keyword
 *                    without "=value", an empty or malformed keyword name,
 *                    or an empty or malformed value in the locale
 */
U_EXPORT void
ulocimp_getKeywordValue(const char* localeID,
                        std::string_view keywordName,
                        icu::ByteSink& sink,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }

    if (localeID == nullptr || keywordName.empty()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
    if (U_FAILURE(status)) {
      return;
    }

    /* Search the converted ID only if the conversion produced something
       usable; otherwise fall back to the ID exactly as passed in. */
    CharString converted;
    const char* searchID = localeID;
    if (_hasBCP47Extension(localeID)) {
        converted = ulocimp_forLanguageTag(localeID, -1, nullptr, status);
        if (U_SUCCESS(status) && !converted.isEmpty()) {
            searchID = converted.data();
        }
    }

    /* cursor sits on the '@' or ';' in front of the pair being examined;
       nullptr means there are no (more) keywords. */
    const char* cursor = locale_getKeywordsStart(searchID);
    while (cursor != nullptr) {
        ++cursor; /* skip @ or ; */
        const char* const equals = uprv_strchr(cursor, '=');
        if (equals == nullptr) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
            return;
        }

        /* strip leading & trailing spaces (TC decided to tolerate these) */
        while (*cursor == ' ') {
            ++cursor;
        }
        const char* nameEnd = equals;
        while (nameEnd > cursor && nameEnd[-1] == ' ') {
            --nameEnd;
        }
        /* now nameEnd points to the first char after the key name */
        if (cursor == nameEnd) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
            return;
        }

        /* copy & normalize the key name from the locale */
        CharString localeKeywordName;
        for (; cursor < nameEnd; ++cursor) {
            if (!UPRV_ISALPHANUM(*cursor)) {
                status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
                return;
            }
            localeKeywordName.append(uprv_tolower(*cursor), status);
        }
        if (U_FAILURE(status)) {
            return;
        }

        /* Move on to the separator that ends this pair, if there is one. */
        cursor = uprv_strchr(equals, ';');

        if (canonKeywordName == localeKeywordName) {
            /* current entry matches the keyword. */
            const char* valueBegin = equals + 1; /* skip '=' */
            /* First strip leading & trailing spaces (TC decided to tolerate these) */
            while (*valueBegin == ' ') {
                ++valueBegin;
            }
            const char* valueEnd =
                cursor != nullptr ? cursor : valueBegin + uprv_strlen(valueBegin);
            while (valueEnd > valueBegin && valueEnd[-1] == ' ') {
                --valueEnd;
            }
            /* Now copy the value, but check well-formedness */
            if (valueBegin == valueEnd) {
                status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
                return;
            }
            for (; valueBegin < valueEnd; ++valueBegin) {
                if (!UPRV_ISALPHANUM(*valueBegin) && !UPRV_OK_VALUE_PUNCTUATION(*valueBegin)) {
                    status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
                    return;
                }
                /* Should we lowercase value to return here? Tests expect as-is. */
                sink.Append(valueBegin, 1);
            }
            return;
        }
    }
}
862
863
/**
 * C API entry point: sets, replaces or removes one keyword in the
 * NUL-terminated locale ID held in buffer, editing it in place.
 *
 * @param keywordName    keyword to set; must not be nullptr or empty
 * @param keywordValue   new value; nullptr is treated like an empty value
 * @param buffer         in/out, NUL-terminated locale ID; must not be nullptr
 * @param bufferCapacity capacity of buffer; must be larger than 1 and not
 *                       smaller than the current string length
 * @param status         U_ILLEGAL_ARGUMENT_ERROR for bad arguments;
 *                       otherwise whatever ulocimp_setKeywordValue() reports
 * @return the length reported by ulocimp_setKeywordValue() plus the length
 *         of the text before the keywords, both on success and on
 *         U_BUFFER_OVERFLOW_ERROR; 0 on any other error
 */
U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char* keywordName,
                     const char* keywordValue,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{
    if (U_FAILURE(*status)) { return 0; }

    if (keywordName == nullptr || *keywordName == 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* A null buffer cannot hold a locale ID; reject it before measuring it. */
    if (buffer == nullptr || bufferCapacity <= 1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    const int32_t bufLen = static_cast<int32_t>(uprv_strlen(buffer));
    if (bufferCapacity < bufLen) {
        /* The capacity is less than the length?! Is this NUL terminated? */
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    char* keywords = const_cast<char*>(
        locale_getKeywordsStart({buffer, static_cast<std::string_view::size_type>(bufLen)}));
    /* Length of the part of the ID that precedes the keywords. */
    const int32_t baseLen =
        keywords == nullptr ? bufLen : static_cast<int32_t>(keywords - buffer);
    // Remove -1 from the capacity so that this function can guarantee NUL termination.
    CheckedArrayByteSink sink(keywords == nullptr ? buffer + bufLen : keywords,
                              bufferCapacity - baseLen - 1);
    int32_t reslen = ulocimp_setKeywordValue(
        keywords == nullptr ? std::string_view() : keywords,
        keywordName,
        keywordValue == nullptr ? std::string_view() : keywordValue,
        sink,
        *status);

    if (U_FAILURE(*status)) {
        return *status == U_BUFFER_OVERFLOW_ERROR ? reslen + baseLen : 0;
    }

    // See the documentation for this function, it's guaranteed to never
    // overflow the buffer but instead abort with BUFFER_OVERFLOW_ERROR.
    // In this case, nothing has been written to the sink, so it cannot have Overflowed().
    U_ASSERT(!sink.Overflowed());
    U_ASSERT(reslen >= 0);
    return u_terminateChars(buffer, bufferCapacity, reslen + baseLen, status);
}
912
913
/* Sets, replaces or removes one keyword directly in a CharString locale ID:
   the existing keyword section (if any) is cut off and then regenerated by
   the ByteSink-based ulocimp_setKeywordValue(), appending to localeID. */
U_EXPORT void
ulocimp_setKeywordValue(std::string_view keywordName,
                        std::string_view keywordValue,
                        CharString& localeID,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }

    std::string_view keywords;
    const char* const keywordsStart = locale_getKeywordsStart(localeID.toStringPiece());
    if (keywordsStart != nullptr) {
        // This is safe because CharString::truncate() doesn't actually erase any
        // data, but simply sets the position for where new data will be written,
        // so the view taken here keeps pointing at the old keyword text.
        const int32_t baseLen = static_cast<int32_t>(keywordsStart - localeID.data());
        keywords = localeID.toStringPiece();
        keywords.remove_prefix(baseLen);
        localeID.truncate(baseLen);
    }

    CharStringByteSink sink(&localeID);
    ulocimp_setKeywordValue(keywords, keywordName, keywordValue, sink, status);
}
932
933
/**
 * Produces an updated keyword section in which keywordName is set to
 * keywordValue (or removed, when keywordValue is empty) and appends it to
 * the sink.
 *
 * @param keywords     current keyword section including its leading '@';
 *                     a size of 0 or 1 is treated as "no keywords"
 * @param keywordName  keyword to set; canonicalized with
 *                     locale_canonKeywordName(), must not be empty
 * @param keywordValue new value, made of UPRV_ISALPHANUM /
 *                     UPRV_OK_VALUE_PUNCTUATION characters; empty to remove
 * @param sink         receives the new keyword section
 * @param status       U_ILLEGAL_ARGUMENT_ERROR for bad arguments or
 *                     malformed existing keywords; U_BUFFER_OVERFLOW_ERROR
 *                     when the sink cannot supply a large enough buffer
 * @return the length of the keyword section written (or required, on
 *         overflow); 0 on U_ILLEGAL_ARGUMENT_ERROR or when there was
 *         nothing to remove from an empty keyword section
 */
U_EXPORT int32_t
ulocimp_setKeywordValue(std::string_view keywords,
                        std::string_view keywordName,
                        std::string_view keywordValue,
                        ByteSink& sink,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return 0; }

    /* TODO: sorting. removal. */
    constexpr size_t npos = std::string_view::npos;

    if (status == U_STRING_NOT_TERMINATED_WARNING) {
        status = U_ZERO_ERROR;
    }
    if (keywordName.empty()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
    if (U_FAILURE(status)) {
        return 0;
    }

    /* Validate the value while copying it. */
    CharString canonKeywordValue;
    for (const char c : keywordValue) {
        if (!UPRV_ISALPHANUM(c) && !UPRV_OK_VALUE_PUNCTUATION(c)) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
            return 0;
        }
        /* Should we force lowercase in value to set? */
        canonKeywordValue.append(c, status);
    }
    if (U_FAILURE(status)) {
        return 0;
    }

    /* An empty new value means "remove the keyword". */
    const bool hasNewValue = !canonKeywordValue.isEmpty();

    if (keywords.size() <= 1) {
        /* Shortcut: no existing keywords. */
        if (!hasNewValue) { /* no keywords = nothing to remove */
            U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
            return 0;
        }

        const int32_t nameLen = canonKeywordName.length();
        const int32_t valueLen = canonKeywordValue.length();
        const int32_t totalLen = 1 + nameLen + 1 + valueLen;
        int32_t capacity = 0;
        char* buffer = sink.GetAppendBuffer(
                totalLen, totalLen, nullptr, totalLen, &capacity);
        if (buffer == nullptr || capacity < totalLen) {
            status = U_BUFFER_OVERFLOW_ERROR;
            return totalLen; /* no change */
        }

        /* Lay out "@name=value" directly in the sink's buffer. */
        buffer[0] = '@';
        uprv_memcpy(buffer + 1, canonKeywordName.data(), nameLen);
        buffer[1 + nameLen] = '=';
        uprv_memcpy(buffer + 1 + nameLen + 1, canonKeywordValue.data(), valueLen);
        sink.Append(buffer, totalLen);
        U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
        return totalLen;
    } /* end shortcut - no @ */

    CharString updatedKeysAndValues;
    bool handledInputKeyAndValue = false;
    char keyValuePrefix = '@'; /* becomes ';' once the first pair is written */

    /* Appends the caller's "name=value" pair and marks the input as handled. */
    auto appendInputPair = [&]() {
        updatedKeysAndValues.append(keyValuePrefix, status);
        keyValuePrefix = ';'; /* for any subsequent key-value pair */
        updatedKeysAndValues.append(canonKeywordName, status);
        updatedKeysAndValues.append('=', status);
        updatedKeysAndValues.append(canonKeywordValue, status);
        handledInputKeyAndValue = true;
    };

    /* Walk the existing pairs; separator is the index of the '@' or ';'
       in front of the pair being examined. */
    size_t separator = 0;
    do {
        size_t nameBegin = separator + 1; /* skip @ or ; */
        const size_t equals = keywords.find('=', nameBegin);
        if (equals == npos) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
            return 0;
        }
        /* strip leading & trailing spaces (TC decided to tolerate these) */
        while (nameBegin < keywords.size() && keywords[nameBegin] == ' ') {
            ++nameBegin;
        }
        size_t nameEnd = equals;
        while (nameEnd > nameBegin && keywords[nameEnd - 1] == ' ') {
            --nameEnd;
        }
        /* now nameEnd is the index of the first char after the key name */
        if (nameBegin == nameEnd) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
            return 0;
        }

        /* copy & normalize the key name from the locale */
        CharString localeKeywordName;
        for (; nameBegin < nameEnd; ++nameBegin) {
            const char c = keywords[nameBegin];
            if (!UPRV_ISALPHANUM(c)) {
                status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
                return 0;
            }
            localeKeywordName.append(uprv_tolower(c), status);
        }
        if (U_FAILURE(status)) {
            return 0;
        }

        const size_t nextSeparator = keywords.find(';', equals);

        /* start processing the value part */
        size_t valueBegin = equals + 1; /* skip '=' */
        /* First strip leading & trailing spaces (TC decided to tolerate these) */
        while (valueBegin < keywords.size() && keywords[valueBegin] == ' ') {
            ++valueBegin;
        }
        size_t valueEnd = nextSeparator == npos ? keywords.size() : nextSeparator;
        while (valueEnd > valueBegin && keywords[valueEnd - 1] == ' ') {
            --valueEnd;
        }
        if (valueBegin == valueEnd) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
            return 0;
        }

        const int32_t order = uprv_strcmp(canonKeywordName.data(), localeKeywordName.data());
        if (order == 0) {
            /* Current entry matches the input keyword. Update the entry,
               or emit nothing at all when the entry is being removed. */
            if (hasNewValue) {
                appendInputPair();
            }
            handledInputKeyAndValue = true;
        } else {
            /* input keyword sorts earlier than current entry, add before current entry */
            if (order < 0 && hasNewValue && !handledInputKeyAndValue) {
                appendInputPair();
            }
            /* copy the current entry */
            updatedKeysAndValues.append(keyValuePrefix, status);
            keyValuePrefix = ';'; /* for any subsequent key-value pair */
            updatedKeysAndValues.append(localeKeywordName, status);
            updatedKeysAndValues.append('=', status);
            updatedKeysAndValues.append(keywords.data() + valueBegin,
                                        static_cast<int32_t>(valueEnd - valueBegin), status);
        }
        if (nextSeparator == npos && hasNewValue && !handledInputKeyAndValue) {
            /* append new entry at the end, it sorts later than existing entries */
            appendInputPair();
        }
        separator = nextSeparator;
    } while (separator != npos); /* end loop searching */

    /* Any error from updatedKeysAndValues.append above would be internal and not due to
     * problems with the passed-in locale. So if we did encounter problems with the
     * passed-in locale above, those errors took precedence and overrode any error
     * status from updatedKeysAndValues.append, and also caused a return of 0. If there
     * are errors here they are from updatedKeysAndValues.append; they do cause an
     * error return but the passed-in locale is unmodified and the original length is
     * returned.
     */
    if (!handledInputKeyAndValue || U_FAILURE(status)) {
        /* if input key/value specified removal of a keyword not present in locale, or
         * there was an error in CharString.append, leave original locale alone. */
        U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
        // The sink is expected to be a buffer which already contains the full
        // locale string, so when it isn't going to be modified there's no need
        // to actually write any data to it, as the data is already there. Only
        // the first character needs to be overwritten (changing '\0' to '@').
        const int32_t unchangedLen = static_cast<int32_t>(keywords.size());
        int32_t capacity = 0;
        char* buffer = sink.GetAppendBuffer(
                unchangedLen, unchangedLen, nullptr, unchangedLen, &capacity);
        if (buffer == nullptr || capacity < unchangedLen) {
            status = U_BUFFER_OVERFLOW_ERROR;
        } else {
            *buffer = '@';
            sink.Append(buffer, unchangedLen);
        }
        return unchangedLen;
    }

    // Check whether updatedKeysAndValues fits; if not, return
    // U_BUFFER_OVERFLOW_ERROR without copying updatedKeysAndValues.
    // We do this because this API function does not behave like most others:
    // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
    // When the contents fit but without the terminating NUL, we need to leave
    // the buffer contents unchanged and return with a buffer overflow error.
    const int32_t updatedLen = updatedKeysAndValues.length();
    if (updatedLen > 0) {
        int32_t capacity = 0;
        char* buffer = sink.GetAppendBuffer(
                updatedLen, updatedLen, nullptr, updatedLen, &capacity);
        if (buffer == nullptr || capacity < updatedLen) {
            status = U_BUFFER_OVERFLOW_ERROR;
            return updatedLen;
        }
        uprv_memcpy(buffer, updatedKeysAndValues.data(), updatedLen);
        sink.Append(buffer, updatedLen);
    }
    U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
    return updatedLen;
}
1143
1144
/* ### ID parsing implementation **************************************************/
1145
1146
namespace {
1147
1148
16.1M
inline bool _isPrefixLetter(char a) {
    /* Only 'x' and 'i', in either case, qualify. */
    switch (a) {
        case 'x':
        case 'X':
        case 'i':
        case 'I':
            return true;
        default:
            return false;
    }
}
1149
1150
/*returns true if one of the special prefixes is here (s=string)
1151
  'x-' or 'i-' */
1152
16.3M
inline bool _isIDPrefix(std::string_view s) {
1153
16.3M
    return s.size() >= 2 && _isPrefixLetter(s[0]) && _isIDSeparator(s[1]);
1154
16.3M
}
1155
1156
/* Dot terminates it because of POSIX form  where dot precedes the codepage
1157
 * except for variant
1158
 */
1159
120M
inline bool _isTerminator(char a) {
    switch (a) {
        case '.':
        case '@':
            return true;
        default:
            return false;
    }
}
1160
1161
9.76M
/* True when p starts with "-t-", "-u-" or "-x-" (the middle letter in
   either case). */
inline bool _isBCP47Extension(std::string_view p) {
    if (p.size() < 3 || p[0] != '-') {
        return false;
    }
    switch (p[1]) {
        case 't': case 'T':
        case 'u': case 'U':
        case 'x': case 'X':
            return p[2] == '-';
        default:
            return false;
    }
}
1169
1170
/**
1171
 * Lookup 'key' in the array 'list'.  The array 'list' should contain
1172
 * a nullptr entry, followed by more entries, and a second nullptr entry.
1173
 *
1174
 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1175
 * COUNTRIES_3.
1176
 */
1177
std::optional<int16_t> _findIndex(const char* const* list, const char* key)
1178
1.69M
{
1179
1.69M
    const char* const* anchor = list;
1180
1.69M
    int32_t pass = 0;
1181
1182
    /* Make two passes through two nullptr-terminated arrays at 'list' */
1183
1.96M
    while (pass++ < 2) {
1184
448M
        while (*list) {
1185
448M
            if (uprv_strcmp(key, *list) == 0) {
1186
1.55M
                return static_cast<int16_t>(list - anchor);
1187
1.55M
            }
1188
446M
            list++;
1189
446M
        }
1190
271k
        ++list;     /* skip final nullptr *CWB*/
1191
271k
    }
1192
135k
    return std::nullopt;
1193
1.69M
}
1194
1195
}  // namespace
1196
1197
/* Returns the REPLACEMENT_COUNTRIES entry paired with oldID when oldID is
   listed in DEPRECATED_COUNTRIES; otherwise returns oldID itself. */
U_CFUNC const char*
uloc_getCurrentCountryID(const char* oldID){
    if (const std::optional<int16_t> index = _findIndex(DEPRECATED_COUNTRIES, oldID)) {
        return REPLACEMENT_COUNTRIES[*index];
    }
    return oldID;
}
1202
/* Returns the REPLACEMENT_LANGUAGES entry paired with oldID when oldID is
   listed in DEPRECATED_LANGUAGES; otherwise returns oldID itself. */
U_CFUNC const char*
uloc_getCurrentLanguageID(const char* oldID){
    if (const std::optional<int16_t> index = _findIndex(DEPRECATED_LANGUAGES, oldID)) {
        return REPLACEMENT_LANGUAGES[*index];
    }
    return oldID;
}
1207
1208
namespace {
1209
1210
/*
1211
 * the internal functions _getLanguage(), _getScript(), _getRegion(), _getVariant()
1212
 * avoid duplicating code to handle the earlier locale ID pieces
1213
 * in the functions for the later ones by
1214
 * setting the *pEnd pointer to where they stopped parsing
1215
 *
1216
 * TODO try to use this in Locale
1217
 */
1218
1219
16.3M
size_t _getLanguage(std::string_view localeID, ByteSink* sink, UErrorCode& status) {
1220
16.3M
    size_t skip = 0;
1221
16.3M
    if (localeID.size() == 4 && uprv_strnicmp(localeID.data(), "root", 4) == 0) {
1222
62.4k
        skip = 4;
1223
62.4k
        localeID.remove_prefix(skip);
1224
16.3M
    } else if (localeID.size() >= 3 && uprv_strnicmp(localeID.data(), "und", 3) == 0 &&
1225
98.7k
               (localeID.size() == 3 ||
1226
11.4k
                localeID[3] == '-' ||
1227
11.2k
                localeID[3] == '_' ||
1228
89.1k
                localeID[3] == '@')) {
1229
89.1k
        skip = 3;
1230
89.1k
        localeID.remove_prefix(skip);
1231
89.1k
    }
1232
1233
16.3M
    constexpr int32_t MAXLEN = ULOC_LANG_CAPACITY - 1;  // Minus NUL.
1234
1235
    /* if it starts with i- or x- then copy that prefix */
1236
16.3M
    size_t len = _isIDPrefix(localeID) ? 2 : 0;
1237
50.6M
    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1238
34.3M
        if (len == MAXLEN) {
1239
4.64k
            status = U_ILLEGAL_ARGUMENT_ERROR;
1240
4.64k
            return 0;
1241
4.64k
        }
1242
34.3M
        len++;
1243
34.3M
    }
1244
1245
16.3M
    if (sink == nullptr || len == 0) { return skip + len; }
1246
1247
10.8M
    int32_t minCapacity = uprv_max(static_cast<int32_t>(len), 4);  // Minimum 3 letters plus NUL.
1248
10.8M
    char scratch[MAXLEN];
1249
10.8M
    int32_t capacity = 0;
1250
10.8M
    char* buffer = sink->GetAppendBuffer(
1251
10.8M
            minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);
1252
1253
34.6M
    for (size_t i = 0; i < len; ++i) {
1254
23.8M
        buffer[i] = uprv_tolower(localeID[i]);
1255
23.8M
    }
1256
10.8M
    if (localeID.size() >= 2 && _isIDSeparator(localeID[1])) {
1257
60.3k
        buffer[1] = '-';
1258
60.3k
    }
1259
1260
10.8M
    if (len == 3) {
1261
        /* convert 3 character code to 2 character code if possible *CWB*/
1262
1.59M
        U_ASSERT(capacity >= 4);
1263
1.59M
        buffer[3] = '\0';
1264
1.59M
        std::optional<int16_t> offset = _findIndex(LANGUAGES_3, buffer);
1265
1.59M
        if (offset.has_value()) {
1266
1.55M
            const char* const alias = LANGUAGES[*offset];
1267
1.55M
            sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias)));
1268
1.55M
            return skip + len;
1269
1.55M
        }
1270
1.59M
    }
1271
1272
9.32M
    sink->Append(buffer, static_cast<int32_t>(len));
1273
9.32M
    return skip + len;
1274
10.8M
}
1275
1276
14.5M
size_t _getScript(std::string_view localeID, ByteSink* sink) {
1277
14.5M
    constexpr int32_t LENGTH = 4;
1278
1279
14.5M
    size_t len = 0;
1280
50.1M
    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len]) &&
1281
35.7M
            uprv_isASCIILetter(localeID[len])) {
1282
35.5M
        if (len == LENGTH) { return 0; }
1283
35.5M
        len++;
1284
35.5M
    }
1285
14.5M
    if (len != LENGTH) { return 0; }
1286
1287
3.70M
    if (sink == nullptr) { return len; }
1288
1289
2.05M
    char scratch[LENGTH];
1290
2.05M
    int32_t capacity = 0;
1291
2.05M
    char* buffer = sink->GetAppendBuffer(
1292
2.05M
            LENGTH, LENGTH, scratch, UPRV_LENGTHOF(scratch), &capacity);
1293
1294
2.05M
    buffer[0] = uprv_toupper(localeID[0]);
1295
8.22M
    for (int32_t i = 1; i < LENGTH; ++i) {
1296
6.16M
        buffer[i] = uprv_tolower(localeID[i]);
1297
6.16M
    }
1298
1299
2.05M
    sink->Append(buffer, LENGTH);
1300
2.05M
    return len;
1301
3.70M
}
1302
1303
10.0M
size_t _getRegion(std::string_view localeID, ByteSink* sink) {
1304
10.0M
    constexpr int32_t MINLEN = 2;
1305
10.0M
    constexpr int32_t MAXLEN = ULOC_COUNTRY_CAPACITY - 1;  // Minus NUL.
1306
1307
10.0M
    size_t len = 0;
1308
29.3M
    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1309
19.3M
        if (len == MAXLEN) { return 0; }
1310
19.3M
        len++;
1311
19.3M
    }
1312
10.0M
    if (len < MINLEN) { return 0; }
1313
1314
9.59M
    if (sink == nullptr) { return len; }
1315
1316
6.98M
    char scratch[ULOC_COUNTRY_CAPACITY];
1317
6.98M
    int32_t capacity = 0;
1318
6.98M
    char* buffer = sink->GetAppendBuffer(
1319
6.98M
            ULOC_COUNTRY_CAPACITY,
1320
6.98M
            ULOC_COUNTRY_CAPACITY,
1321
6.98M
            scratch,
1322
6.98M
            UPRV_LENGTHOF(scratch),
1323
6.98M
            &capacity);
1324
1325
21.0M
    for (size_t i = 0; i < len; ++i) {
1326
14.0M
        buffer[i] = uprv_toupper(localeID[i]);
1327
14.0M
    }
1328
1329
6.98M
    if (len == 3) {
1330
        /* convert 3 character code to 2 character code if possible *CWB*/
1331
91.1k
        U_ASSERT(capacity >= 4);
1332
91.1k
        buffer[3] = '\0';
1333
91.1k
        std::optional<int16_t> offset = _findIndex(COUNTRIES_3, buffer);
1334
91.1k
        if (offset.has_value()) {
1335
144
            const char* const alias = COUNTRIES[*offset];
1336
144
            sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias)));
1337
144
            return len;
1338
144
        }
1339
91.1k
    }
1340
1341
6.98M
    sink->Append(buffer, static_cast<int32_t>(len));
1342
6.98M
    return len;
1343
6.98M
}
1344
1345
/**
1346
 * @param needSeparator if true, then add leading '_' if any variants
1347
 * are added to 'variant'
1348
 */
1349
size_t
1350
_getVariant(std::string_view localeID,
1351
            char prev,
1352
            ByteSink* sink,
1353
            bool needSeparator,
1354
4.18M
            UErrorCode& status) {
1355
4.18M
    if (U_FAILURE(status) || localeID.empty()) return 0;
1356
1357
    // Reasonable upper limit for variants
1358
    // There are no strict limitation of the syntax of variant in the legacy
1359
    // locale format. If the locale is constructed from unicode_locale_id
1360
    // as defined in UTS35, then we know each unicode_variant_subtag
1361
    // could have max length of 8 ((alphanum{5,8} | digit alphanum{3})
1362
    // 179 would allow 20 unicode_variant_subtag with sep in the
1363
    // unicode_locale_id
1364
    // 8*20 + 1*(20-1) = 179
1365
4.17M
    constexpr int32_t MAX_VARIANTS_LENGTH = 179;
1366
1367
    /* get one or more variant tags and separate them with '_' */
1368
4.17M
    size_t index = 0;
1369
4.17M
    if (_isIDSeparator(prev)) {
1370
        /* get a variant string after a '-' or '_' */
1371
9.44M
        for (std::string_view sub = localeID;;) {
1372
9.44M
            size_t next = sub.find_first_of(".@_-");
1373
            // For historical reasons, a trailing separator is included in the variant.
1374
9.44M
            bool finished = next == std::string_view::npos || next + 1 == sub.length();
1375
9.44M
            size_t limit = finished ? sub.length() : next;
1376
9.44M
            index += limit;
1377
9.44M
            if (index > MAX_VARIANTS_LENGTH) {
1378
1.46k
                status = U_ILLEGAL_ARGUMENT_ERROR;
1379
1.46k
                return 0;
1380
1.46k
            }
1381
1382
9.44M
            if (sink != nullptr) {
1383
9.43M
                if (needSeparator) {
1384
5.26M
                    sink->Append("_", 1);
1385
5.26M
                } else {
1386
4.17M
                    needSeparator = true;
1387
4.17M
                }
1388
1389
9.43M
                int32_t length = static_cast<int32_t>(limit);
1390
9.43M
                int32_t minCapacity = uprv_min(length, MAX_VARIANTS_LENGTH);
1391
9.43M
                char scratch[MAX_VARIANTS_LENGTH];
1392
9.43M
                int32_t capacity = 0;
1393
9.43M
                char* buffer = sink->GetAppendBuffer(
1394
9.43M
                        minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);
1395
1396
35.6M
                for (size_t i = 0; i < limit; ++i) {
1397
26.1M
                    buffer[i] = uprv_toupper(sub[i]);
1398
26.1M
                }
1399
9.43M
                sink->Append(buffer, length);
1400
9.43M
            }
1401
1402
9.44M
            if (finished) { return index; }
1403
5.33M
            sub.remove_prefix(next);
1404
5.33M
            if (_isTerminator(sub.front()) || _isBCP47Extension(sub)) { return index; }
1405
5.27M
            sub.remove_prefix(1);
1406
5.27M
            index++;
1407
5.27M
        }
1408
4.17M
    }
1409
1410
4.32k
    size_t skip = 0;
1411
    /* if there is no variant tag after a '-' or '_' then look for '@' */
1412
4.32k
    if (prev == '@') {
1413
        /* keep localeID */
1414
4.32k
    } else if (const char* p = locale_getKeywordsStart(localeID); p != nullptr) {
1415
0
        skip = 1 + p - localeID.data(); /* point after the '@' */
1416
0
        localeID.remove_prefix(skip);
1417
0
    } else {
1418
0
        return 0;
1419
0
    }
1420
185k
    for (; index < localeID.size() && !_isTerminator(localeID[index]); index++) {
1421
181k
        if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
1422
578
            status = U_ILLEGAL_ARGUMENT_ERROR;
1423
578
            return 0;
1424
578
        }
1425
180k
        if (needSeparator) {
1426
1.81k
            if (sink != nullptr) {
1427
1.81k
                sink->Append("_", 1);
1428
1.81k
            }
1429
1.81k
            needSeparator = false;
1430
1.81k
        }
1431
180k
        if (sink != nullptr) {
1432
180k
            char c = uprv_toupper(localeID[index]);
1433
180k
            if (c == '-' || c == ',') c = '_';
1434
180k
            sink->Append(&c, 1);
1435
180k
        }
1436
180k
    }
1437
3.75k
    return skip + index;
1438
4.32k
}
1439
1440
}  // namespace
1441
1442
/**
 * Extracts the language subtag of 'localeID'.
 *
 * @param localeID the locale ID to parse
 * @param status   ICU error code, passed through to ulocimp_getSubtags()
 * @return the language as produced by ulocimp_getSubtags()
 */
U_EXPORT CharString
ulocimp_getLanguage(std::string_view localeID, UErrorCode& status) {
    const auto extract = [localeID](ByteSink& sink, UErrorCode& errorCode) {
        // Only the language slot is given a sink.
        ulocimp_getSubtags(localeID, &sink, nullptr, nullptr, nullptr, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(extract, status);
}
1457
1458
/**
 * Extracts the script subtag of 'localeID'.
 *
 * @param localeID the locale ID to parse
 * @param status   ICU error code, passed through to ulocimp_getSubtags()
 * @return the script as produced by ulocimp_getSubtags()
 */
U_EXPORT CharString
ulocimp_getScript(std::string_view localeID, UErrorCode& status) {
    const auto extract = [localeID](ByteSink& sink, UErrorCode& errorCode) {
        // Only the script slot is given a sink.
        ulocimp_getSubtags(localeID, nullptr, &sink, nullptr, nullptr, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(extract, status);
}
1473
1474
/**
 * Extracts the region subtag of 'localeID'.
 *
 * @param localeID the locale ID to parse
 * @param status   ICU error code, passed through to ulocimp_getSubtags()
 * @return the region as produced by ulocimp_getSubtags()
 */
U_EXPORT CharString
ulocimp_getRegion(std::string_view localeID, UErrorCode& status) {
    const auto extract = [localeID](ByteSink& sink, UErrorCode& errorCode) {
        // Only the region slot is given a sink.
        ulocimp_getSubtags(localeID, nullptr, nullptr, &sink, nullptr, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(extract, status);
}
1489
1490
/**
 * Extracts the variant of 'localeID'.
 *
 * @param localeID the locale ID to parse
 * @param status   ICU error code, passed through to ulocimp_getSubtags()
 * @return the variant as produced by ulocimp_getSubtags()
 */
U_EXPORT CharString
ulocimp_getVariant(std::string_view localeID, UErrorCode& status) {
    const auto extract = [localeID](ByteSink& sink, UErrorCode& errorCode) {
        // Only the variant slot is given a sink.
        ulocimp_getSubtags(localeID, nullptr, nullptr, nullptr, &sink, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(extract, status);
}
1505
1506
/**
 * CharString convenience overload of ulocimp_getSubtags().
 *
 * Every non-null CharString is wrapped in a CharStringByteSink and the work
 * is delegated to the ByteSink overload; a null output stays null there.
 *
 * @param localeID the locale ID to parse
 * @param language receives the language, or nullptr
 * @param script   receives the script, or nullptr
 * @param region   receives the region, or nullptr
 * @param variant  receives the variant, or nullptr
 * @param pEnd     forwarded unchanged to the ByteSink overload
 * @param status   ICU error code; nothing is done if it already indicates
 *                 a failure
 */
U_EXPORT void
ulocimp_getSubtags(
        std::string_view localeID,
        CharString* language,
        CharString* script,
        CharString* region,
        CharString* variant,
        const char** pEnd,
        UErrorCode& status) {
    if (U_FAILURE(status)) { return; }

    std::optional<CharStringByteSink> languageSink;
    std::optional<CharStringByteSink> scriptSink;
    std::optional<CharStringByteSink> regionSink;
    std::optional<CharStringByteSink> variantSink;

    /* Builds the sink for a requested output and yields it as a ByteSink. */
    const auto bind = [](std::optional<CharStringByteSink>& slot, CharString* dest) -> ByteSink* {
        if (dest == nullptr) { return nullptr; }
        slot.emplace(dest);
        return &*slot;
    };

    ByteSink* const languageOut = bind(languageSink, language);
    ByteSink* const scriptOut = bind(scriptSink, script);
    ByteSink* const regionOut = bind(regionSink, region);
    ByteSink* const variantOut = bind(variantSink, variant);

    ulocimp_getSubtags(
            localeID,
            languageOut,
            scriptOut,
            regionOut,
            variantOut,
            pEnd,
            status);
}
1536
1537
/**
 * Splits 'localeID' into language, script, region and variant.
 *
 * The pieces are parsed in that order with _getLanguage(), _getScript(),
 * _getRegion() and _getVariant(). Parsing stops early as soon as none of
 * the remaining outputs (nor 'pEnd') has been requested. A trailing BCP 47
 * extension is scanned for "-va-posix", which contributes "POSIX" to the
 * variant.
 *
 * @param localeID the locale ID to parse
 * @param language receives the language, or nullptr
 * @param script   receives the script, or nullptr
 * @param region   receives the region, or nullptr
 * @param variant  receives the variant, or nullptr
 * @param pEnd     if not nullptr, is updated as parsing advances to point
 *                 just past the last piece that was recognized
 * @param status   ICU error code; nothing is done if it already indicates
 *                 a failure
 */
U_EXPORT void
ulocimp_getSubtags(
        std::string_view localeID,
        ByteSink* language,
        ByteSink* script,
        ByteSink* region,
        ByteSink* variant,
        const char** pEnd,
        UErrorCode& status) {
    if (U_FAILURE(status)) { return; }

    const bool wantEnd = pEnd != nullptr;
    /* Publishes the current parse position, if the caller asked for it. */
    const auto markEnd = [&localeID, pEnd]() {
        if (pEnd != nullptr) { *pEnd = localeID.data(); }
    };

    if (!wantEnd &&
            language == nullptr &&
            script == nullptr &&
            region == nullptr &&
            variant == nullptr) {
        return;
    }
    markEnd();

    if (localeID.empty()) { return; }

    /* Language. */
    const size_t languageLength = _getLanguage(localeID, language, status);
    if (U_FAILURE(status)) { return; }
    localeID.remove_prefix(languageLength);

    if (!wantEnd &&
            script == nullptr &&
            region == nullptr &&
            variant == nullptr) {
        return;
    }
    markEnd();

    if (localeID.empty()) { return; }

    /* Script. */
    if (_isIDSeparator(localeID.front())) {
        const size_t len = _getScript(localeID.substr(1), script);
        if (len > 0) {
            localeID.remove_prefix(len + 1);
            markEnd();
        }
    }

    if (!wantEnd && region == nullptr && variant == nullptr) { return; }
    if (localeID.empty()) { return; }

    /* Region. */
    bool hasRegion = false;
    if (_isIDSeparator(localeID.front())) {
        const size_t len = _getRegion(localeID.substr(1), region);
        if (len > 0) {
            hasRegion = true;
            localeID.remove_prefix(len + 1);
            markEnd();
        }
    }

    /* From here on only the variant and the end position are of interest. */
    if (!wantEnd && variant == nullptr) { return; }
    if (localeID.empty()) { return; }

    /* Variant. */
    bool hasVariant = false;
    if (_isIDSeparator(localeID.front()) && !_isBCP47Extension(localeID)) {
        /* If there was no country ID, skip a possible extra IDSeparator */
        const bool extraSeparator =
            !hasRegion && localeID.size() > 1 && _isIDSeparator(localeID[1]);
        const size_t skip = extraSeparator ? 2 : 1;
        const size_t len = _getVariant(localeID.substr(skip), localeID[0], variant, false, status);
        if (U_FAILURE(status)) { return; }
        if (len > 0) {
            hasVariant = true;
            localeID.remove_prefix(skip + len);
            markEnd();
        }
    }

    if (localeID.empty()) { return; }

    /* BCP 47 extension: look for "-va-posix" among its hyphen-delimited pairs. */
    if (_isBCP47Extension(localeID)) {
        localeID.remove_prefix(2);
        constexpr char vaposix[] = "-va-posix";
        constexpr size_t length = sizeof vaposix - 1;
        while (true) {
            const size_t firstDash = localeID.find('-', 1);
            if (firstDash == std::string_view::npos) { break; }
            const size_t secondDash = localeID.find('-', firstDash + 1);
            /* Up to the second hyphen, or everything that is left. */
            const std::string_view sub = localeID.substr(0, secondDash);

            if (sub.length() == length && uprv_strnicmp(sub.data(), vaposix, length) == 0) {
                if (variant != nullptr) {
                    if (hasVariant) { variant->Append("_", 1); }
                    constexpr char posix[] = "POSIX";
                    variant->Append(posix, sizeof posix - 1);
                }
                if (pEnd != nullptr) { *pEnd = localeID.data() + length; }
            }

            if (secondDash == std::string_view::npos) { break; }
            localeID.remove_prefix(secondDash);
        }
    }
}
1647
1648
/* Keyword enumeration */
1649
1650
/** Per-enumeration state of the keyword enumerator. */
struct UKeywordsContext {
    /* Start of the NUL-separated keyword list; released with uprv_free()
       when the enumeration is closed. */
    char* keywords;
    /* Next keyword to be returned; points at an empty string once the
       enumeration is exhausted. */
    char* current;
};
1654
1655
U_CDECL_BEGIN
1656
1657
static void U_CALLCONV
1658
4.42k
uloc_kw_closeKeywords(UEnumeration *enumerator) {
1659
4.42k
    uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1660
4.42k
    uprv_free(enumerator->context);
1661
4.42k
    uprv_free(enumerator);
1662
4.42k
}
1663
1664
static int32_t U_CALLCONV
1665
945
uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1666
945
    char *kw = ((UKeywordsContext *)en->context)->keywords;
1667
945
    int32_t result = 0;
1668
2.43k
    while(*kw) {
1669
1.49k
        result++;
1670
1.49k
        kw += uprv_strlen(kw)+1;
1671
1.49k
    }
1672
945
    return result;
1673
945
}
1674
1675
static const char * U_CALLCONV
1676
uloc_kw_nextKeyword(UEnumeration* en,
1677
                    int32_t* resultLength,
1678
7.83k
                    UErrorCode* /*status*/) {
1679
7.83k
    const char* result = ((UKeywordsContext *)en->context)->current;
1680
7.83k
    int32_t len = 0;
1681
7.83k
    if(*result) {
1682
6.07k
        len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1683
6.07k
        ((UKeywordsContext *)en->context)->current += len+1;
1684
6.07k
    } else {
1685
1.75k
        result = nullptr;
1686
1.75k
    }
1687
7.83k
    if (resultLength) {
1688
7.83k
        *resultLength = len;
1689
7.83k
    }
1690
7.83k
    return result;
1691
7.83k
}
1692
1693
static void U_CALLCONV
1694
uloc_kw_resetKeywords(UEnumeration* en,
1695
0
                      UErrorCode* /*status*/) {
1696
0
    ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1697
0
}
1698
1699
U_CDECL_END
1700
1701
1702
/*
 * Template for keyword enumerations. uloc_openKeywordList() copies it into
 * each new UEnumeration and then stores the per-instance context in the
 * copy; the two leading pointer slots are therefore left null here.
 */
static const UEnumeration gKeywordsEnum = {
1703
    nullptr,
1704
    nullptr,
1705
    uloc_kw_closeKeywords, /* frees list, context and enumeration */
1706
    uloc_kw_countKeywords, /* number of keywords in the list */
1707
    uenum_unextDefault,
1708
    uloc_kw_nextKeyword, /* current keyword, then advance */
1709
    uloc_kw_resetKeywords /* rewind to the first keyword */
1710
};
1711
1712
/**
 * Creates an enumeration over a list of NUL-terminated keywords.
 *
 * The list is copied, so the caller keeps ownership of 'keywordList'. The
 * returned enumeration is based on gKeywordsEnum; closing it releases the
 * copy, the context and the enumeration itself.
 *
 * @param keywordList     keywords, each followed by a NUL; may be nullptr
 *                        only when 'keywordListSize' is 0
 * @param keywordListSize number of bytes to copy from 'keywordList';
 *                        must not be negative
 * @param status          ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR for
 *                        an invalid list/size combination and to
 *                        U_MEMORY_ALLOCATION_ERROR when out of memory.
 * @return the new enumeration, or nullptr on failure (including a null or
 *         already failed 'status')
 */
U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
{
    /* Same guard as uloc_openKeywords(): a null status must not be dereferenced. */
    if (status == nullptr || U_FAILURE(*status)) { return nullptr; }

    /* A negative size would wrap to a huge length in the allocation and copy below. */
    if (keywordListSize < 0 || (keywordList == nullptr && keywordListSize > 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    LocalMemory<UKeywordsContext> myContext;
    LocalMemory<UEnumeration> result;

    myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
    result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
    if (myContext.isNull() || result.isNull()) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));

    /* Do the size arithmetic in size_t so that INT32_MAX cannot overflow. */
    const size_t listBytes = static_cast<size_t>(keywordListSize);

    /*
     * Two trailing NULs: the enumeration callbacks stop at the first empty
     * string, so even a list whose last keyword lacks its own NUL is still
     * followed by an empty string inside the allocation.
     */
    myContext->keywords = static_cast<char *>(uprv_malloc(listBytes + 2));
    if (myContext->keywords == nullptr) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (listBytes > 0) {
        uprv_memcpy(myContext->keywords, keywordList, listBytes);
    }
    myContext->keywords[listBytes] = 0;
    myContext->keywords[listBytes + 1] = 0;
    myContext->current = myContext->keywords;
    result->context = myContext.orphan();
    return result.orphan();
}
1738
1739
/**
 * Opens an enumeration over the keywords of a locale ID.
 *
 * An ID with a BCP 47 extension is first converted with
 * ulocimp_forLanguageTag(); if that fails or yields nothing, the ID is used
 * as given. A null 'localeID' stands for the default locale.
 *
 * @param localeID the locale ID, or nullptr for uloc_getDefault()
 * @param status   ICU error code
 * @return the enumeration, or nullptr when 'status' is null or indicates a
 *         failure, or when the ID has no keyword section
 */
U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char* localeID,
                        UErrorCode* status)
{
    if (status == nullptr) { return nullptr; }
    if (U_FAILURE(*status)) { return nullptr; }

    CharString tempBuffer;  // backs 'source' when the ID was converted from BCP 47 form
    const char* source = nullptr;

    if (localeID == nullptr) {
        source = uloc_getDefault();
    } else if (_hasBCP47Extension(localeID)) {
        tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, *status);
        const bool converted = U_SUCCESS(*status) && !tempBuffer.isEmpty();
        source = converted ? tempBuffer.data() : localeID;
    } else {
        source = localeID;
    }

    /* Skip language, script, region and variant; 'rest' ends up after them. */
    const char* rest = source;
    ulocimp_getSubtags(
            source,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            &rest,
            *status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    /* keywords are located after '@' */
    const char* const keywordsStart = locale_getKeywordsStart(rest);
    if (keywordsStart == nullptr) {
        return nullptr;
    }

    CharString keywords = ulocimp_getKeywords(keywordsStart + 1, '@', false, *status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }
    return uloc_openKeywordList(keywords.data(), keywords.length(), status);
}
1782
1783
1784
/* bit-flags for 'options' parameter of _canonicalize; they may be combined
   and are tested with OPTION_SET() */
1785
10.8M
/* When set, _canonicalize does not append the "@keyword=value" section. */
#define _ULOC_STRIP_KEYWORDS 0x2
1786
20.8M
/* When set, _canonicalize performs level 2 canonicalization. */
#define _ULOC_CANONICALIZE   0x1
1787
1788
namespace {
1789
1790
27.5M
/** Tells whether any of the bits in 'mask' is set in 'options'. */
inline bool OPTION_SET(uint32_t options, uint32_t mask) {
    const uint32_t selected = options & mask;
    return selected != 0;
}
1791
1792
/* The tag "i-default", deliberately stored without a terminating NUL so that
   the array length is the tag length. */
constexpr char i_default[] = {
    'i', '-',
    'd', 'e', 'f', 'a', 'u', 'l', 't'
};

/* Number of characters in i_default. */
constexpr int32_t I_DEFAULT_LENGTH = static_cast<int32_t>(sizeof i_default / sizeof i_default[0]);
1794
1795
/**
1796
 * Canonicalize the given localeID, to level 1 or to level 2,
1797
 * depending on the options.  To specify level 1, pass in options=0.
1798
 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1799
 *
1800
 * This is the code underlying uloc_getName and uloc_canonicalize.
1801
 */
1802
void
1803
_canonicalize(std::string_view localeID,
1804
              ByteSink& sink,
1805
              uint32_t options,
1806
6.90M
              UErrorCode& err) {
1807
6.90M
    if (U_FAILURE(err)) {
1808
0
        return;
1809
0
    }
1810
1811
6.90M
    int32_t j, fieldCount=0;
1812
6.90M
    CharString tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
1813
6.90M
    CharString localeIDWithHyphens;  // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
1814
6.90M
    std::string_view origLocaleID;
1815
6.90M
    std::string_view tmpLocaleID;
1816
6.90M
    size_t keywordAssign = std::string_view::npos;
1817
6.90M
    size_t separatorIndicator = std::string_view::npos;
1818
1819
6.90M
    if (_hasBCP47Extension(localeID)) {
1820
228k
        std::string_view localeIDPtr = localeID;
1821
1822
        // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
1823
228k
        if (localeID.size() >= 2 && localeID.find('_') != std::string_view::npos && localeID[1] != '-' && localeID[1] != '_') {
1824
102k
            localeIDWithHyphens.append(localeID, err);
1825
102k
            if (U_SUCCESS(err)) {
1826
276M
                for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
1827
276M
                    if (*p == '_') {
1828
4.96M
                        *p = '-';
1829
4.96M
                    }
1830
276M
                }
1831
102k
                localeIDPtr = localeIDWithHyphens.toStringPiece();
1832
102k
            }
1833
102k
        }
1834
1835
228k
        tempBuffer = ulocimp_forLanguageTag(localeIDPtr.data(), static_cast<int32_t>(localeIDPtr.size()), nullptr, err);
1836
228k
        tmpLocaleID = U_SUCCESS(err) && !tempBuffer.isEmpty() ? static_cast<std::string_view>(tempBuffer.toStringPiece()) : localeIDPtr;
1837
6.67M
    } else {
1838
6.67M
        tmpLocaleID=localeID;
1839
6.67M
    }
1840
1841
6.90M
    origLocaleID=tmpLocaleID;
1842
1843
    /* get all pieces, one after another, and separate with '_' */
1844
6.90M
    CharString tag;
1845
6.90M
    CharString script;
1846
6.90M
    CharString country;
1847
6.90M
    CharString variant;
1848
6.90M
    const char* end = nullptr;
1849
6.90M
    ulocimp_getSubtags(
1850
6.90M
            tmpLocaleID,
1851
6.90M
            &tag,
1852
6.90M
            &script,
1853
6.90M
            &country,
1854
6.90M
            &variant,
1855
6.90M
            &end,
1856
6.90M
            err);
1857
6.90M
    if (U_FAILURE(err)) {
1858
5.73k
        return;
1859
5.73k
    }
1860
6.90M
    U_ASSERT(end != nullptr);
1861
6.90M
    if (end > tmpLocaleID.data()) {
1862
6.36M
        tmpLocaleID.remove_prefix(end - tmpLocaleID.data());
1863
6.36M
    }
1864
1865
6.90M
    if (tag.length() == I_DEFAULT_LENGTH && origLocaleID.length() >= I_DEFAULT_LENGTH &&
1866
6.90M
            uprv_strncmp(origLocaleID.data(), i_default, I_DEFAULT_LENGTH) == 0) {
1867
241
        tag.clear();
1868
241
        tag.append(uloc_getDefault(), err);
1869
6.90M
    } else {
1870
6.90M
        if (!script.isEmpty()) {
1871
461k
            ++fieldCount;
1872
461k
            tag.append('_', err);
1873
461k
            tag.append(script, err);
1874
461k
        }
1875
6.90M
        if (!country.isEmpty()) {
1876
4.92M
            ++fieldCount;
1877
4.92M
            tag.append('_', err);
1878
4.92M
            tag.append(country, err);
1879
4.92M
        }
1880
6.90M
        if (!variant.isEmpty()) {
1881
2.87M
            ++fieldCount;
1882
2.87M
            if (country.isEmpty()) {
1883
137k
                tag.append('_', err);
1884
137k
            }
1885
2.87M
            tag.append('_', err);
1886
2.87M
            tag.append(variant, err);
1887
2.87M
        }
1888
6.90M
    }
1889
1890
    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1891
6.90M
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && !tmpLocaleID.empty() && tmpLocaleID.front() == '.') {
1892
15.5k
        tag.append('.', err);
1893
15.5k
        tmpLocaleID.remove_prefix(1);
1894
15.5k
        size_t length;
1895
15.5k
        if (size_t atPos = tmpLocaleID.find('@'); atPos != std::string_view::npos) {
1896
3.54k
            length = atPos;
1897
12.0k
        } else {
1898
12.0k
            length = tmpLocaleID.length();
1899
12.0k
        }
1900
        // The longest charset name we found in IANA charset registry
1901
        // https://www.iana.org/assignments/character-sets/ is
1902
        // "Extended_UNIX_Code_Packed_Format_for_Japanese" in length 45.
1903
        // we therefore restrict the length here to be 64 which is a power of 2
1904
        // number that is longer than 45.
1905
15.5k
        constexpr size_t kMaxCharsetLength = 64;
1906
15.5k
        if (length > kMaxCharsetLength) {
1907
250
           err = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1908
250
           return;
1909
250
        }
1910
15.3k
        if (length > 0) {
1911
12.5k
            tag.append(tmpLocaleID.data(), static_cast<int32_t>(length), err);
1912
12.5k
            tmpLocaleID.remove_prefix(length);
1913
12.5k
        }
1914
15.3k
    }
1915
1916
    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1917
       After this, tmpLocaleID either starts at '@' or is empty. */
1918
6.89M
    if (const char* start = locale_getKeywordsStart(tmpLocaleID); start != nullptr) {
1919
226k
        if (start > tmpLocaleID.data()) {
1920
1.76k
            tmpLocaleID.remove_prefix(start - tmpLocaleID.data());
1921
1.76k
        }
1922
226k
        keywordAssign = tmpLocaleID.find('=');
1923
226k
        separatorIndicator = tmpLocaleID.find(';');
1924
6.67M
    } else {
1925
6.67M
        tmpLocaleID = {};
1926
6.67M
    }
1927
1928
    /* Copy POSIX-style variant, if any [mr@FOO] */
1929
6.89M
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1930
6.77M
        !tmpLocaleID.empty() && keywordAssign == std::string_view::npos) {
1931
21.6k
        tag.append(tmpLocaleID, err);
1932
21.6k
        tmpLocaleID = {};
1933
21.6k
    }
1934
1935
6.89M
    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1936
        /* Handle @FOO variant if @ is present and not followed by = */
1937
121k
        if (!tmpLocaleID.empty() && keywordAssign == std::string_view::npos) {
1938
            /* Add missing '_' if needed */
1939
4.65k
            if (fieldCount < 2 || (fieldCount < 3 && !script.isEmpty())) {
1940
6.13k
                do {
1941
6.13k
                    tag.append('_', err);
1942
6.13k
                    ++fieldCount;
1943
6.13k
                } while(fieldCount<2);
1944
4.34k
            }
1945
1946
4.65k
            CharStringByteSink s(&tag);
1947
4.65k
            std::string_view sub = tmpLocaleID;
1948
4.65k
            sub.remove_prefix(1);
1949
4.65k
            _getVariant(sub, '@', &s, !variant.isEmpty(), err);
1950
4.65k
            if (U_FAILURE(err)) { return; }
1951
4.65k
        }
1952
1953
        /* Look up the ID in the canonicalization map */
1954
1.32M
        for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1955
1.20M
            StringPiece id(CANONICALIZE_MAP[j].id);
1956
1.20M
            if (tag == id) {
1957
80
                if (id.empty() && !tmpLocaleID.empty()) {
1958
0
                    break; /* Don't remap "" if keywords present */
1959
0
                }
1960
80
                tag.clear();
1961
80
                tag.append(CANONICALIZE_MAP[j].canonicalID, err);
1962
80
                break;
1963
80
            }
1964
1.20M
        }
1965
120k
    }
1966
1967
6.89M
    sink.Append(tag.data(), tag.length());
1968
1969
6.89M
    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1970
2.93M
        if (!tmpLocaleID.empty() && keywordAssign != std::string_view::npos &&
1971
130k
            (separatorIndicator == std::string_view::npos || separatorIndicator > keywordAssign)) {
1972
129k
            sink.Append("@", 1);
1973
129k
            ++fieldCount;
1974
129k
            tmpLocaleID.remove_prefix(1);
1975
129k
            ulocimp_getKeywords(tmpLocaleID, '@', sink, true, err);
1976
129k
        }
1977
2.93M
    }
1978
6.89M
}
1979
1980
}  // namespace
1981
1982
/* ### ID parsing API **************************************************/
1983
1984
/*
 * C API wrapper: writes the parent of localeID into the caller-supplied
 * buffer. The actual work is done by the ByteSink overload of
 * ulocimp_getParent(); ByteSinkUtil::viaByteSinkToTerminatedChars() provides
 * the sink over parent/parentCapacity and yields the int32_t result.
 */
U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char*    localeID,
               char* parent,
               int32_t parentCapacity,
               UErrorCode* err)
{
    auto writeParent = [localeID](ByteSink& out, UErrorCode& ec) {
        ulocimp_getParent(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        parent, parentCapacity, writeParent, *err);
}
1997
1998
/*
 * Convenience overload: returns the parent of localeID as a CharString by
 * routing the ByteSink overload through ByteSinkUtil::viaByteSinkToCharString().
 */
U_EXPORT CharString
ulocimp_getParent(const char* localeID,
                  UErrorCode& err)
{
    auto writeParent = [localeID](ByteSink& out, UErrorCode& ec) {
        ulocimp_getParent(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeParent, err);
}
2008
2009
/*
 * Appends the parent of localeID to sink: everything that precedes the last
 * '_' in the ID. Nothing is appended when there is no '_' or when the last
 * '_' is the first character. A nullptr localeID means the default locale.
 * A leading "und" (matched case-insensitively as "und_") is dropped from the
 * output, so the appended text then starts at the '_' that followed it.
 */
U_EXPORT void
ulocimp_getParent(const char* localeID,
                  icu::ByteSink& sink,
                  UErrorCode& err)
{
    if (U_FAILURE(err)) { return; }

    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }

    const char* const cut = uprv_strrchr(localeID, '_');
    int32_t parentLength =
        (cut != nullptr) ? static_cast<int32_t>(cut - localeID) : 0;
    if (parentLength <= 0) {
        return;  // empty parent: nothing to write
    }

    if (uprv_strnicmp(localeID, "und_", 4) == 0) {
        // Skip the three letters of "und" but keep the underscore.
        localeID += 3;
        parentLength -= 3;
    }
    sink.Append(localeID, parentLength);
}
2037
2038
/*
 * C API wrapper: copies the language subtag of localeID into the caller's
 * buffer. A nullptr localeID selects the default locale.
 */
U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char*    localeID,
         char* language,
         int32_t languageCapacity,
         UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
    auto writeLanguage = [id](ByteSink& out, UErrorCode& ec) {
        // Only the first output slot (language) is requested.
        ulocimp_getSubtags(id, &out, nullptr, nullptr, nullptr, nullptr, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        language, languageCapacity, writeLanguage, *err);
}
2063
2064
/*
 * C API wrapper: copies the script subtag of localeID into the caller's
 * buffer. A nullptr localeID selects the default locale.
 */
U_CAPI int32_t U_EXPORT2
uloc_getScript(const char*    localeID,
         char* script,
         int32_t scriptCapacity,
         UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeScript = [id](ByteSink& out, UErrorCode& ec) {
        // Only the second output slot (script) is requested.
        ulocimp_getSubtags(id, nullptr, &out, nullptr, nullptr, nullptr, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        script, scriptCapacity, writeScript, *err);
}
2088
2089
/*
 * C API wrapper: copies the country (region) subtag of localeID into the
 * caller's buffer. A nullptr localeID selects the default locale.
 */
U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char* localeID,
            char* country,
            int32_t countryCapacity,
            UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeCountry = [id](ByteSink& out, UErrorCode& ec) {
        // Only the third output slot (country/region) is requested.
        ulocimp_getSubtags(id, nullptr, nullptr, &out, nullptr, nullptr, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        country, countryCapacity, writeCountry, *err);
}
2113
2114
/*
 * C API wrapper: copies the variant subtag of localeID into the caller's
 * buffer. A nullptr localeID selects the default locale.
 */
U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char* localeID,
                char* variant,
                int32_t variantCapacity,
                UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeVariant = [id](ByteSink& out, UErrorCode& ec) {
        // Only the fourth output slot (variant) is requested.
        ulocimp_getSubtags(id, nullptr, nullptr, nullptr, &out, nullptr, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        variant, variantCapacity, writeVariant, *err);
}
2138
2139
/*
 * C API wrapper around ulocimp_getName(): writes the result for localeID
 * into the caller's buffer. A nullptr localeID selects the default locale.
 */
U_CAPI int32_t  U_EXPORT2
uloc_getName(const char* localeID,
             char* name,
             int32_t nameCapacity,
             UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeName = [id](ByteSink& out, UErrorCode& ec) {
        ulocimp_getName(id, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity, writeName, *err);
}
2155
2156
/*
 * Convenience overload: collects the output of the ByteSink overload of
 * ulocimp_getName() into a CharString.
 */
U_EXPORT CharString
ulocimp_getName(std::string_view localeID,
                UErrorCode& err)
{
    auto writeName = [localeID](ByteSink& out, UErrorCode& ec) {
        ulocimp_getName(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeName, err);
}
2166
2167
/*
 * Runs _canonicalize() on localeID with no option flags set (options == 0)
 * and sends the result to sink.
 */
U_EXPORT void
ulocimp_getName(std::string_view localeID,
                ByteSink& sink,
                UErrorCode& err)
{
    constexpr uint32_t kNoOptions = 0;
    _canonicalize(localeID, sink, kNoOptions, err);
}
2174
2175
/*
 * C API wrapper around ulocimp_getBaseName(): writes the result for localeID
 * into the caller's buffer. A nullptr localeID selects the default locale.
 */
U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char* localeID,
                 char* name,
                 int32_t nameCapacity,
                 UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeBaseName = [id](ByteSink& out, UErrorCode& ec) {
        ulocimp_getBaseName(id, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity, writeBaseName, *err);
}
2191
2192
/*
 * Convenience overload: collects the output of the ByteSink overload of
 * ulocimp_getBaseName() into a CharString.
 */
U_EXPORT CharString
ulocimp_getBaseName(std::string_view localeID,
                    UErrorCode& err)
{
    auto writeBaseName = [localeID](ByteSink& out, UErrorCode& ec) {
        ulocimp_getBaseName(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeBaseName, err);
}
2202
2203
/*
 * Runs _canonicalize() on localeID with the _ULOC_STRIP_KEYWORDS option, so
 * the keyword section is not written, and sends the result to sink.
 */
U_EXPORT void
ulocimp_getBaseName(std::string_view localeID,
                    ByteSink& sink,
                    UErrorCode& err)
{
    _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
}
2210
2211
/*
 * C API wrapper around ulocimp_canonicalize(): writes the result for
 * localeID into the caller's buffer. A nullptr localeID selects the default
 * locale.
 */
U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char* localeID,
                  char* name,
                  int32_t nameCapacity,
                  UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeCanonical = [id](ByteSink& out, UErrorCode& ec) {
        ulocimp_canonicalize(id, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity, writeCanonical, *err);
}
2227
2228
/*
 * Convenience overload: collects the output of the ByteSink overload of
 * ulocimp_canonicalize() into a CharString.
 */
U_EXPORT CharString
ulocimp_canonicalize(std::string_view localeID,
                     UErrorCode& err)
{
    auto writeCanonical = [localeID](ByteSink& out, UErrorCode& ec) {
        ulocimp_canonicalize(localeID, out, ec);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeCanonical, err);
}
2238
2239
/*
 * Runs _canonicalize() on localeID with the _ULOC_CANONICALIZE option and
 * sends the result to sink.
 */
U_EXPORT void
ulocimp_canonicalize(std::string_view localeID,
                     ByteSink& sink,
                     UErrorCode& err)
{
    _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
}
2246
2247
/*
 * Returns the LANGUAGES_3 entry that sits at the same index as the locale's
 * language code in LANGUAGES. Returns "" when the language cannot be
 * extracted or is not found in LANGUAGES. A nullptr localeID selects the
 * default locale.
 */
U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char* localeID)
{
    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }

    UErrorCode status = U_ZERO_ERROR;
    CharString language = ulocimp_getLanguage(localeID, status);
    if (U_FAILURE(status)) {
        return "";
    }

    std::optional<int16_t> index = _findIndex(LANGUAGES, language.data());
    if (!index.has_value()) {
        return "";
    }
    return LANGUAGES_3[*index];
}
2262
2263
/*
 * Returns the COUNTRIES_3 entry that sits at the same index as the locale's
 * region code in COUNTRIES. Returns "" when the region cannot be extracted
 * or is not found in COUNTRIES. A nullptr localeID selects the default
 * locale.
 */
U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char* localeID)
{
    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }

    UErrorCode status = U_ZERO_ERROR;
    CharString region = ulocimp_getRegion(localeID, status);
    if (U_FAILURE(status)) {
        return "";
    }

    std::optional<int16_t> index = _findIndex(COUNTRIES, region.data());
    if (!index.has_value()) {
        return "";
    }
    return COUNTRIES_3[*index];
}
2278
2279
/*
 * Maps localeID to an LCID. Returns 0 for a null ID, an ID shorter than two
 * characters, or when a lookup step reports failure.
 *
 * Order of attempts:
 *   1. uprv_convertToLCIDPlatform(); a non-zero answer is returned as is.
 *   2. If the ID contains '@' and has a non-empty "collation" keyword, the
 *      base name plus only that keyword is passed to uprv_convertToLCID().
 *   3. Otherwise (or if step 2 could not build the ID) the original localeID
 *      is passed to uprv_convertToLCID().
 */
U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char* localeID)
{
    /* Check for incomplete id. */
    if (localeID == nullptr || uprv_strlen(localeID) < 2) {
        return 0;
    }

    UErrorCode status = U_ZERO_ERROR;

    // First, attempt Windows platform lookup if available, but fall
    // through to catch any special cases (ICU vs Windows name differences).
    const uint32_t platformLcid = uprv_convertToLCIDPlatform(localeID, &status);
    if (U_FAILURE(status)) {
        return 0;
    }
    if (platformLcid > 0) {
        // Windows found an LCID, return that
        return platformLcid;
    }

    CharString langID = ulocimp_getLanguage(localeID, status);
    if (U_FAILURE(status)) {
        return 0;
    }

    if (uprv_strchr(localeID, '@') != nullptr) {
        // uprv_convertToLCID does not support keywords other than collation.
        // Remove all keywords except collation.
        CharString collation = ulocimp_getKeywordValue(localeID, "collation", status);
        if (U_SUCCESS(status) && !collation.isEmpty()) {
            CharString collationOnlyID = ulocimp_getBaseName(localeID, status);
            ulocimp_setKeywordValue("collation", collation.toStringPiece(), collationOnlyID, status);
            if (U_SUCCESS(status)) {
                return uprv_convertToLCID(langID.data(), collationOnlyID.data(), &status);
            }
        }

        // fall through - all keywords are simply ignored
        status = U_ZERO_ERROR;
    }

    return uprv_convertToLCID(langID.data(), localeID, &status);
}
2324
2325
/*
 * Thin forwarder: hands hostid and the output buffer to uprv_convertToPosix()
 * and returns its result unchanged.
 */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                UErrorCode *status)
{
    const int32_t length = uprv_convertToPosix(hostid, locale, localeCapacity, status);
    return length;
}
2331
2332
/* ### Default locale **************************************************/
2333
2334
/* Returns whatever locale_get_default() reports as the current default. */
U_CAPI const char*  U_EXPORT2
uloc_getDefault()
{
    const char* const current = locale_get_default();
    return current;
}
2339
2340
/*
 * Sets the default locale by forwarding newDefaultLocale to
 * locale_set_default(). Does nothing when *err already holds a failure.
 * The error code is only read here; this function never writes to it.
 */
U_CAPI void  U_EXPORT2
uloc_setDefault(const char*   newDefaultLocale,
             UErrorCode* err)
{
    if (U_SUCCESS(*err)) {
        /* propagate change to C++ */
        locale_set_default(newDefaultLocale);
    }
}
2351
2352
/**
2353
 * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2354
 * to an array of pointers to arrays of char.  All of these pointers are owned
2355
 * by ICU-- do not delete them, and do not write through them.  The array is
2356
 * terminated with a null pointer.
2357
 */
2358
U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()
{
    // Hands out the file's static LANGUAGES table directly; nothing is copied.
    const char* const* const table = LANGUAGES;
    return table;
}
2363
2364
/**
2365
 * Returns a list of all 2-letter country codes defined in ISO 3166.  This is a
2366
 * pointer to an array of pointers to arrays of char.  All of these pointers are
2367
 * owned by ICU-- do not delete them, and do not write through them.  The array is
2368
 * terminated with a null pointer.
2369
 */
2370
U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()
{
    // Hands out the file's static COUNTRIES table directly; nothing is copied.
    const char* const* const table = COUNTRIES;
    return table;
}
2375
2376
/*
 * C API wrapper around ulocimp_toBcpKeyWithFallback(). Returns nullptr for a
 * null or empty keyword, or when no key is produced.
 */
U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char* keyword)
{
    const bool noKeyword = (keyword == nullptr) || (*keyword == '\0');
    if (noKeyword) {
        return nullptr;
    }

    std::optional<std::string_view> bcpKey = ulocimp_toBcpKeyWithFallback(keyword);
    if (!bcpKey.has_value()) {
        return nullptr;
    }
    return bcpKey->data();  // Known to be NUL terminated.
}
2383
2384
/*
 * Maps keyword through ulocimp_toBcpKey(). When that yields nothing but
 * ultag_isUnicodeLocaleKey() accepts the keyword's syntax, the keyword itself
 * is returned. Otherwise the result is empty.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toBcpKeyWithFallback(std::string_view keyword)
{
    if (std::optional<std::string_view> mapped = ulocimp_toBcpKey(keyword);
        mapped.has_value()) {
        return mapped;
    }

    if (ultag_isUnicodeLocaleKey(keyword.data(), static_cast<int32_t>(keyword.size()))) {
        // unknown keyword, but syntax is fine..
        return keyword;
    }
    return std::nullopt;
}
2395
2396
/*
 * C API wrapper around ulocimp_toBcpTypeWithFallback(). Returns nullptr when
 * keyword or value is null or empty, or when no type is produced.
 */
U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char* keyword, const char* value)
{
    const bool noKeyword = (keyword == nullptr) || (*keyword == '\0');
    const bool noValue = (value == nullptr) || (*value == '\0');
    if (noKeyword || noValue) {
        return nullptr;
    }

    std::optional<std::string_view> bcpType = ulocimp_toBcpTypeWithFallback(keyword, value);
    if (!bcpType.has_value()) {
        return nullptr;
    }
    return bcpType->data();  // Known to be NUL terminated.
}
2404
2405
/*
 * Maps value through ulocimp_toBcpType() for the given keyword. When that
 * yields nothing but ultag_isUnicodeLocaleType() accepts the value's syntax,
 * the value itself is returned. Otherwise the result is empty.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toBcpTypeWithFallback(std::string_view keyword, std::string_view value)
{
    if (std::optional<std::string_view> mapped = ulocimp_toBcpType(keyword, value);
        mapped.has_value()) {
        return mapped;
    }

    if (ultag_isUnicodeLocaleType(value.data(), static_cast<int32_t>(value.size()))) {
        // unknown type, but syntax is fine..
        return value;
    }
    return std::nullopt;
}
2416
2417
namespace {
2418
2419
bool
2420
isWellFormedLegacyKey(std::string_view key)
2421
16.5M
{
2422
16.5M
    return std::all_of(key.begin(), key.end(), UPRV_ISALPHANUM);
2423
16.5M
}
2424
2425
bool
2426
isWellFormedLegacyType(std::string_view legacyType)
2427
129k
{
2428
129k
    int32_t alphaNumLen = 0;
2429
49.3M
    for (char c : legacyType) {
2430
49.3M
        if (c == '_' || c == '/' || c == '-') {
2431
10.3M
            if (alphaNumLen == 0) {
2432
0
                return false;
2433
0
            }
2434
10.3M
            alphaNumLen = 0;
2435
38.9M
        } else if (UPRV_ISALPHANUM(c)) {
2436
38.9M
            alphaNumLen++;
2437
38.9M
        } else {
2438
0
            return false;
2439
0
        }
2440
49.3M
    }
2441
129k
    return alphaNumLen != 0;
2442
129k
}
2443
2444
}  // namespace
2445
2446
/*
 * C API wrapper around ulocimp_toLegacyKeyWithFallback(). Returns nullptr for
 * a null or empty keyword, or when no key is produced.
 */
U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char* keyword)
{
    const bool noKeyword = (keyword == nullptr) || (*keyword == '\0');
    if (noKeyword) {
        return nullptr;
    }

    std::optional<std::string_view> legacyKey = ulocimp_toLegacyKeyWithFallback(keyword);
    if (!legacyKey.has_value()) {
        return nullptr;
    }
    return legacyKey->data();  // Known to be NUL terminated.
}
2453
2454
/*
 * Maps keyword through ulocimp_toLegacyKey(). When that yields nothing but
 * the keyword passes isWellFormedLegacyKey(), the keyword itself is returned.
 * Otherwise the result is empty.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyKeyWithFallback(std::string_view keyword)
{
    if (std::optional<std::string_view> mapped = ulocimp_toLegacyKey(keyword);
        mapped.has_value()) {
        return mapped;
    }

    // Checks if the specified locale key is well-formed with the legacy locale syntax.
    //
    // Note:
    //  LDML/CLDR provides some definition of keyword syntax in
    //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
    //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
    //  Keys can only consist of [0-9a-zA-Z].
    if (isWellFormedLegacyKey(keyword)) {
        return keyword;
    }
    return std::nullopt;
}
2470
2471
/*
 * C API wrapper around ulocimp_toLegacyTypeWithFallback(). Returns nullptr
 * when keyword or value is null or empty, or when no type is produced.
 */
U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char* keyword, const char* value)
{
    const bool noKeyword = (keyword == nullptr) || (*keyword == '\0');
    const bool noValue = (value == nullptr) || (*value == '\0');
    if (noKeyword || noValue) {
        return nullptr;
    }

    std::optional<std::string_view> legacyType = ulocimp_toLegacyTypeWithFallback(keyword, value);
    if (!legacyType.has_value()) {
        return nullptr;
    }
    return legacyType->data();  // Known to be NUL terminated.
}
2479
2480
/*
 * Maps value through ulocimp_toLegacyType() for the given keyword. When that
 * yields nothing but the value passes isWellFormedLegacyType(), the value
 * itself is returned. Otherwise the result is empty.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyTypeWithFallback(std::string_view keyword, std::string_view value)
{
    if (std::optional<std::string_view> mapped = ulocimp_toLegacyType(keyword, value);
        mapped.has_value()) {
        return mapped;
    }

    // Checks if the specified locale type is well-formed with the legacy locale syntax.
    //
    // Note:
    //  LDML/CLDR provides some definition of keyword syntax in
    //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
    //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
    //  Values (types) consist of [0-9a-zA-Z] runs; for legacy values the
    //  separators '/', '_' and '-' are allowed between runs
    //  (e.g. "Asia/Tel_Aviv").
    //  NOTE(review): this comment used to list '+' as well (with the example
    //  "Etc/GMT+1"), but isWellFormedLegacyType() rejects '+'. Confirm which
    //  of the two is intended.
    if (isWellFormedLegacyType(value)) {
        return value;
    }
    return std::nullopt;
}
2497
2498
/*eof*/