Coverage Report

Created: 2025-12-07 06:36

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/uloc.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 1997-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*
9
* File ULOC.CPP
10
*
11
* Modification History:
12
*
13
*   Date        Name        Description
14
*   04/01/97    aliu        Creation.
15
*   08/21/98    stephen     JDK 1.2 sync
16
*   12/08/98    rtg         New Locale implementation and C API
17
*   03/15/99    damiba      overhaul.
18
*   04/06/99    stephen     changed setDefault() to realloc and copy
19
*   06/14/99    stephen     Changed calls to ures_open for new params
20
*   07/21/99    stephen     Modified setDefault() to propagate to C++
21
*   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22
*                           brought canonicalization code into line with spec
23
*****************************************************************************/
24
25
/*
26
   POSIX's locale format, from putil.c: [no spaces]
27
28
     ll [ _CC ] [ . MM ] [ @ VV]
29
30
     l = lang, C = ctry, M = charmap, V = variant
31
*/
32
33
#include <algorithm>
34
#include <optional>
35
#include <string_view>
36
37
#include "unicode/bytestream.h"
38
#include "unicode/errorcode.h"
39
#include "unicode/stringpiece.h"
40
#include "unicode/utypes.h"
41
#include "unicode/ustring.h"
42
#include "unicode/uloc.h"
43
44
#include "bytesinkutil.h"
45
#include "putilimp.h"
46
#include "ustr_imp.h"
47
#include "ulocimp.h"
48
#include "umutex.h"
49
#include "cstring.h"
50
#include "cmemory.h"
51
#include "locmap.h"
52
#include "uarrsort.h"
53
#include "uenumimp.h"
54
#include "uassert.h"
55
#include "charstr.h"
56
57
U_NAMESPACE_USE
58
59
/* ### Declarations **************************************************/
60
61
/* Locale stuff from locid.cpp */
62
U_CFUNC void locale_set_default(const char *id);
63
U_CFUNC const char *locale_get_default();
64
65
namespace {
66
67
/* ### Data tables **************************************************/
68
69
/**
70
 * Table of language codes, both 2- and 3-letter, with preference
71
 * given to 2-letter codes where possible.  Includes 3-letter codes
72
 * that lack a 2-letter equivalent.
73
 *
74
 * This list must be in sorted order.  This list is returned directly
75
 * to the user by some API.
76
 *
77
 * This list must be kept in sync with LANGUAGES_3, with corresponding
78
 * entries matched.
79
 *
80
 * This table should be terminated with a nullptr entry, followed by a
81
 * second list, and another nullptr entry.  The first list is visible to
82
 * user code when this array is returned by API.  The second list
83
 * contains codes we support, but do not expose through user API.
84
 *
85
 * Notes
86
 *
87
 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
88
 * include the revisions up to 2001/7/27 *CWB*
89
 *
90
 * The 3 character codes are the terminology codes like RFC 3066.  This
91
 * is compatible with prior ICU codes
92
 *
93
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
94
 * table but now at the end of the table because 3 character codes are
95
 * duplicates.  This avoids bad searches going from 3 to 2 character
96
 * codes.
97
 *
98
 * The range qaa-qtz is reserved for local use
99
 */
100
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
101
/* ISO639 table version is 20150505 */
102
/* Subsequent hand addition of selected languages */
103
constexpr const char* LANGUAGES[] = {
104
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
105
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
106
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
107
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
108
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
109
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
110
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
111
    "bgc", "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
112
    "blo", "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
113
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
114
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
115
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
116
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
117
    "cs",  "csb", "csw", "cu",  "cv",  "cy",
118
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
119
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
120
    "dyo", "dyu", "dz",  "dzg",
121
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
122
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
123
    "ext",
124
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
125
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
126
    "frs", "fur", "fy",
127
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
128
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
129
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
130
    "gur", "guz", "gv",  "gwi",
131
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
132
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
133
    "hup", "hy",  "hz",
134
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
135
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
136
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
137
    "jv",
138
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
139
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
140
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
141
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
142
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
143
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
144
    "kv",  "kw",  "kxv", "ky",
145
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
146
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
147
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
148
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
149
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
150
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
151
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
152
    "ml",  "mn",  "mnc", "mni",
153
    "moh", "mos", "mr",  "mrj",
154
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
155
    "my",  "mye", "myv", "mzn",
156
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
157
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
158
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
159
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
160
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
161
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
162
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
163
    "pon", "prg", "pro", "ps",  "pt",
164
    "qu",  "quc", "qug",
165
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
166
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
167
    "rw",  "rwk",
168
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
169
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
170
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
171
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
172
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
173
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
174
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
175
    "sv",  "sw",  "swb", "syc", "syr", "szl",
176
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
177
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
178
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tok", "tpi",
179
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
180
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
181
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
182
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vmw",
183
    "vo", "vot", "vro", "vun",
184
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
185
    "xal", "xh",  "xmf", "xnr", "xog",
186
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
187
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
188
    "zun", "zxx", "zza",
189
nullptr,
190
    "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  /* obsolete language codes */
191
nullptr
192
};
193
194
/*
 * Withdrawn two-letter language codes.  The list ends with two nullptr
 * entries.  Presumably index-aligned with REPLACEMENT_LANGUAGES (both
 * tables have the same number of entries) -- confirm against the lookup code.
 */
constexpr const char* DEPRECATED_LANGUAGES[] = {
    "in",
    "iw",
    "ji",
    "jw",
    "mo",
    nullptr,
    nullptr
};
197
/*
 * Current language codes listed in the same positions as the withdrawn
 * codes in DEPRECATED_LANGUAGES (shown in the trailing comments).  The list
 * ends with two nullptr entries.
 */
constexpr const char* REPLACEMENT_LANGUAGES[] = {
    "id",      /* position of "in" */
    "he",      /* position of "iw" */
    "yi",      /* position of "ji" */
    "jv",      /* position of "jw" */
    "ro",      /* position of "mo" */
    nullptr,
    nullptr
};
200
201
/**
202
 * Table of 3-letter language codes.
203
 *
204
 * This is a lookup table used to convert 3-letter language codes to
205
 * their 2-letter equivalent, where possible.  It must be kept in sync
206
 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
207
 * same language as LANGUAGES_3[i].  The commented-out lines are
208
 * copied from LANGUAGES to make eyeballing this baby easier.
209
 *
210
 * Where a 3-letter language code has no 2-letter equivalent, the
211
 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
212
 *
213
 * This table should be terminated with a nullptr entry, followed by a
214
 * second list, and another nullptr entry.  The two lists correspond to
215
 * the two lists in LANGUAGES.
216
 */
217
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
218
/* ISO639 table version is 20150505 */
219
/* Subsequent hand addition of selected languages */
220
constexpr const char* LANGUAGES_3[] = {
221
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
222
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
223
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
224
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
225
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
226
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
227
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
228
    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
229
    "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
230
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
231
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
232
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
233
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
234
    "ces", "csb", "csw", "chu", "chv", "cym",
235
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
236
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
237
    "dyo", "dyu", "dzo", "dzg",
238
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
239
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
240
    "ext",
241
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
242
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
243
    "frs", "fur", "fry",
244
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
245
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
246
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
247
    "gur", "guz", "glv", "gwi",
248
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
249
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
250
    "hup", "hye", "her",
251
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
252
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
253
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
254
    "jav",
255
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
256
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
257
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
258
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
259
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
260
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
261
    "kom", "cor", "kxv", "kir",
262
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
263
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
264
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
265
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
266
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
267
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
268
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
269
    "mal", "mon", "mnc", "mni",
270
    "moh", "mos", "mar", "mrj",
271
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
272
    "mya", "mye", "myv", "mzn",
273
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
274
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
275
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
276
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
277
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
278
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
279
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
280
    "pon", "prg", "pro", "pus", "por",
281
    "que", "quc", "qug",
282
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
283
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
284
    "kin", "rwk",
285
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
286
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
287
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
288
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
289
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
290
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
291
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
292
    "swe", "swa", "swb", "syc", "syr", "szl",
293
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
294
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
295
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
296
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
297
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
298
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
299
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
300
    "vol", "vot", "vro", "vun",
301
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
302
    "xal", "xho", "xmf", "xnr", "xog",
303
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
304
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
305
    "zun", "zxx", "zza",
306
nullptr,
307
/*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
308
    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
309
nullptr
310
};
311
312
/**
313
 * Table of 2-letter country codes.
314
 *
315
 * This list must be in sorted order.  This list is returned directly
316
 * to the user by some API.
317
 *
318
 * This list must be kept in sync with COUNTRIES_3, with corresponding
319
 * entries matched.
320
 *
321
 * This table should be terminated with a nullptr entry, followed by a
322
 * second list, and another nullptr entry.  The first list is visible to
323
 * user code when this array is returned by API.  The second list
324
 * contains codes we support, but do not expose through user API.
325
 *
326
 * Notes:
327
 *
328
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
329
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
330
 * new codes keeping the old ones for compatibility updated to include
331
 * 1999/12/03 revisions *CWB*
332
 *
333
 * RO(ROM) is now RO(ROU) according to
334
 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
335
 */
336
constexpr const char* COUNTRIES[] = {
337
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
338
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
339
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
340
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
341
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
342
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",
343
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
344
    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
345
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
346
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
347
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
348
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
349
    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
350
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
351
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
352
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
353
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
354
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
355
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
356
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
357
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
358
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
359
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
360
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
361
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
362
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
363
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
364
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
365
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
366
    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
367
nullptr,
368
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
369
nullptr
370
};
371
372
/*
 * Deprecated country list: sixteen two-letter codes followed by two
 * nullptr entries.
 */
constexpr const char* DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD",
    "DY", "FX", "HV", "NH",
    "RH", "SU", "TP", "UK",
    "VD", "YD", "YU", "ZR",
    nullptr,
    nullptr
};
375
/*
 * Replacement country codes.  Each commented row shows the deprecated codes
 * that occupy the same positions in DEPRECATED_COUNTRIES.  The list ends
 * with two nullptr entries.
 */
constexpr const char* REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", */
    "CW", "MM", "RS", "DE",
/*  "DY", "FX", "HV", "NH", */
    "BJ", "FR", "BF", "VU",
/*  "RH", "SU", "TP", "UK", */
    "ZW", "RU", "TL", "GB",
/*  "VD", "YD", "YU", "ZR"  */
    "VN", "YE", "RS", "CD",
    nullptr,
    nullptr
};
379
380
/**
381
 * Table of 3-letter country codes.
382
 *
383
 * This is a lookup table used to convert 3-letter country codes to
384
 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
385
 * For all valid i, COUNTRIES[i] must refer to the same country as
386
 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
387
 * to make eyeballing this baby easier.
388
 *
389
 * This table should be terminated with a nullptr entry, followed by a
390
 * second list, and another nullptr entry.  The two lists correspond to
391
 * the two lists in COUNTRIES.
392
 */
393
constexpr const char* COUNTRIES_3[] = {
394
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
395
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
396
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
397
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
398
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
399
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
400
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
401
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
402
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
403
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
404
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",     */
405
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
406
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
407
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
408
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
409
    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
410
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
411
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
412
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
413
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
414
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
415
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
416
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
417
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
418
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
419
    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
420
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
421
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
422
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
423
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
424
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
425
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
426
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
427
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
428
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
429
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
430
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
431
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
432
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
433
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
434
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
435
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
436
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
437
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
438
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
439
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
440
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
441
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
442
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
443
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
444
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
445
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
446
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
447
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
448
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
449
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
450
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
451
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
452
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
453
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
454
nullptr,
455
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
456
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
457
nullptr
458
};
459
460
/* One row of a locale-ID canonicalization table: an input ID and its output. */
struct CanonicalizationMap {
    /* input ID */
    const char *id;
    /* canonicalized output ID */
    const char *canonicalID;
};
464
465
/**
466
 * A map to canonicalize locale IDs.  This handles a variety of
467
 * different semantic kinds of transformations.
468
 */
469
constexpr CanonicalizationMap CANONICALIZE_MAP[] = {
    /* registered name */
    { "art__LOJBAN", "jbo" },

    /* Registered IANA variants */
    { "hy__AREVELA", "hy"  },
    { "hy__AREVMDA", "hyw" },

    /* registered names */
    { "zh__GUOYU",   "zh"  },
    { "zh__HAKKA",   "hak" },
    { "zh__XIANG",   "hsn" },

    // subtags with 3 chars won't be treated as variants.
    /* registered names */
    { "zh_GAN",      "gan" },
    { "zh_MIN_NAN",  "nan" },
    { "zh_WUU",      "wuu" },
    { "zh_YUE",      "yue" },
};
482
483
/* ### BCP47 Conversion *******************************************/
484
/*
 * Gets the size of the shortest subtag in the given localeID.
 *
 * Subtags are runs of characters delimited by '_' or '-'.  Only non-empty
 * runs that are followed by a separator are measured; the run after the last
 * separator is not compared.  When no run qualifies, the length of the whole
 * localeID is returned.
 */
int32_t getShortestSubtagLength(std::string_view localeID) {
    int32_t shortest = static_cast<int32_t>(localeID.length());
    int32_t runLength = 0;

    for (const char ch : localeID) {
        const bool isSeparator = (ch == '_' || ch == '-');
        if (!isSeparator) {
            ++runLength;
            continue;
        }
        /* A separator closes the current run; empty runs are ignored. */
        if (runLength != 0 && runLength < shortest) {
            shortest = runLength;
        }
        runLength = 0;
    }

    return shortest;
}
509
/* Test if the locale id has BCP47 u extension and does not have '@' */
510
8.53M
inline bool _hasBCP47Extension(std::string_view id) {
511
8.53M
    return id.find('@') == std::string_view::npos && getShortestSubtagLength(id) == 1;
512
8.53M
}
513
514
/* ### Keywords **************************************************/
515
91.4M
/* True when c lies in the range '0'..'9'. */
inline bool UPRV_ISDIGIT(char c) {
    return '0' <= c && c <= '9';
}
516
279M
/* True when c is accepted by uprv_isASCIILetter() or by UPRV_ISDIGIT(). */
inline bool UPRV_ISALPHANUM(char c) {
    if (uprv_isASCIILetter(c)) {
        return true;
    }
    return UPRV_ISDIGIT(c);
}
517
/* Punctuation/symbols allowed in legacy key values: '_', '-', '+' and '/'. */
inline bool UPRV_OK_VALUE_PUNCTUATION(char c) {
    switch (c) {
    case '_':
    case '-':
    case '+':
    case '/':
        return true;
    default:
        return false;
    }
}
519
520
}  // namespace
521
522
381k
/* Size, in chars, of the fixed keyword-name buffer KeywordStruct::keyword. */
#define ULOC_KEYWORD_BUFFER_LEN 25
523
139k
/* NOTE(review): presumably the maximum number of keywords handled for one
   locale ID; its use is not visible in this part of the file -- confirm. */
#define ULOC_MAX_NO_KEYWORDS 25
524
525
/*
 * Returns a pointer to the first '@' inside localeID, or nullptr when there
 * is none.  The returned pointer points into the buffer viewed by localeID.
 * On EBCDIC builds, when no '@' is found, a list of alternative byte values
 * is tried in order and the first one found is returned instead.
 */
U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(std::string_view localeID) {
    const size_t atPos = localeID.find('@');
    if (atPos != std::string_view::npos) {
        return localeID.data() + atPos;
    }
#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    /* We do this because the @ sign is variant, and the @ sign used on one
    EBCDIC machine won't be compiled the same way on other EBCDIC based
    machines. */
    static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
    /* The table is terminated by its trailing 0x00 entry. */
    for (const uint8_t *candidate = ebcdicSigns; *candidate; ++candidate) {
        const size_t signPos = localeID.find(*candidate);
        if (signPos != std::string_view::npos) {
            return localeID.data() + signPos;
        }
    }
#endif
    return nullptr;
}
547
548
namespace {
549
550
/**
551
 * @param keywordName incoming name to be canonicalized
552
 * @param status return status (keyword too long)
553
 * @return the keyword name
554
 */
555
CharString locale_canonKeywordName(std::string_view keywordName, UErrorCode& status)
556
1.87M
{
557
1.87M
  if (U_FAILURE(status)) { return {}; }
558
1.87M
  CharString result;
559
560
10.7M
  for (char c : keywordName) {
561
10.7M
    if (!UPRV_ISALPHANUM(c)) {
562
166
      status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
563
166
      return {};
564
166
    }
565
10.7M
    result.append(uprv_tolower(c), status);
566
10.7M
  }
567
1.87M
  if (result.isEmpty()) {
568
0
    status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
569
0
    return {};
570
0
  }
571
572
1.87M
  return result;
573
1.87M
}
574
575
typedef struct {
576
    char keyword[ULOC_KEYWORD_BUFFER_LEN];
577
    int32_t keywordLen;
578
    const char *valueStart;
579
    int32_t valueLen;
580
} KeywordStruct;
581
582
int32_t U_CALLCONV
583
752k
compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
584
752k
    const char* leftString = static_cast<const KeywordStruct*>(left)->keyword;
585
752k
    const char* rightString = static_cast<const KeywordStruct*>(right)->keyword;
586
752k
    return uprv_strcmp(leftString, rightString);
587
752k
}
588
589
}  // namespace
590
591
/*
 * Convenience overload: runs the ByteSink-based ulocimp_getKeywords() through
 * ByteSinkUtil::viaByteSinkToCharString() and returns what that helper produces.
 * All arguments are forwarded unchanged to the ByteSink overload.
 */
U_EXPORT CharString
ulocimp_getKeywords(std::string_view localeID,
                    char prev,
                    bool valuesToo,
                    UErrorCode& status)
{
    return ByteSinkUtil::viaByteSinkToCharString(
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getKeywords(localeID,
                                prev,
                                sink,
                                valuesToo,
                                status);
        },
        status);
}
607
608
/*
 * Parse a "key=value;key=value" keyword list and write it to sink in
 * normalized form.
 *
 * Keyword names are lowercased and have their spaces removed; leading and
 * trailing spaces around values are dropped. If a keyword occurs more than
 * once, only its first occurrence is kept. Entries are sorted with
 * compareKeywordStructs() before being written.
 *
 * @param localeID  the keyword list, i.e. the text following the '@'
 * @param prev      the character preceding localeID; nothing is written
 *                  unless it is '@'
 * @param sink      receives the result. With valuesToo the output is
 *                  "key=value" entries joined by ';'; without it, each
 *                  keyword name followed by a NUL byte.
 * @param valuesToo whether values are written along with the keyword names
 * @param status    in/out error code. Set to U_INVALID_FORMAT_ERROR for a
 *                  missing '=', a ';' before the '=', an empty keyword or an
 *                  empty value; to U_INTERNAL_PROGRAM_ERROR when there are
 *                  too many keywords or the '=' lies at or beyond
 *                  ULOC_KEYWORD_BUFFER_LEN.
 */
U_EXPORT void
ulocimp_getKeywords(std::string_view localeID,
                    char prev,
                    ByteSink& sink,
                    bool valuesToo,
                    UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }

    /* Only a '@' starts a keyword definition; otherwise there is nothing to emit. */
    if (prev != '@') { return; }

    KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
    constexpr int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
    int32_t numKeywords = 0;

    /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
    do {
        /* skip leading spaces */
        while (!localeID.empty() && localeID.front() == ' ') {
            localeID.remove_prefix(1);
        }
        if (localeID.empty()) { /* handle trailing "; " */
            break;
        }
        if (numKeywords == maxKeywords) {
            status = U_INTERNAL_PROGRAM_ERROR;
            return;
        }
        const size_t equalSign = localeID.find('=');
        const size_t semicolon = localeID.find(';');
        /* lack of '=' [foo@currency] is illegal */
        /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
        if (equalSign == std::string_view::npos ||
            (semicolon != std::string_view::npos && semicolon < equalSign)) {
            status = U_INVALID_FORMAT_ERROR;
            return;
        }
        /* zero-length keyword is an error. */
        if (equalSign == 0) {
            status = U_INVALID_FORMAT_ERROR;
            return;
        }
        /* need to normalize both keyword and keyword name */
        if (equalSign >= ULOC_KEYWORD_BUFFER_LEN) {
            /* keyword name too long for internal buffer */
            status = U_INTERNAL_PROGRAM_ERROR;
            return;
        }

        /* The slot is only kept (numKeywords advanced) if the keyword is new. */
        KeywordStruct& entry = keywordList[numKeywords];
        int32_t n = 0;
        for (size_t i = 0; i < equalSign; ++i) {
            if (localeID[i] != ' ') {
                entry.keyword[n++] = uprv_tolower(localeID[i]);
            }
        }
        entry.keyword[n] = 0;
        entry.keywordLen = n;

        /* now grab the value part. First we skip the '=' ... */
        size_t valuePos = equalSign + 1;
        /* ... then we skip leading spaces */
        while (valuePos < localeID.length() && localeID[valuePos] == ' ') {
            ++valuePos;
        }

        /* Premature end or zero-length value */
        if (valuePos == localeID.length() || valuePos == semicolon) {
            status = U_INVALID_FORMAT_ERROR;
            return;
        }

        entry.valueStart = localeID.data() + valuePos;

        /* Carve out the value and advance localeID past this entry. */
        std::string_view value = localeID;
        if (semicolon != std::string_view::npos) {
            value.remove_suffix(value.length() - semicolon);
            localeID.remove_prefix(semicolon + 1);
        } else {
            localeID = {};
        }
        value.remove_prefix(valuePos);
        /* drop trailing spaces from the value */
        if (size_t last = value.find_last_not_of(' '); last != std::string_view::npos) {
            value.remove_suffix(value.length() - last - 1);
        }
        entry.valueLen = static_cast<int32_t>(value.length());

        /* If this is a duplicate keyword, then ignore it */
        bool duplicate = false;
        for (int32_t j = 0; j < numKeywords; ++j) {
            if (uprv_strcmp(keywordList[j].keyword, entry.keyword) == 0) {
                duplicate = true;
                break;
            }
        }
        if (!duplicate) {
            ++numKeywords;
        }
    } while (!localeID.empty());

    /* now we have a list of keywords */
    /* we need to sort it */
    uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, &status);

    /* Now construct the keyword part */
    for (int32_t i = 0; i < numKeywords; i++) {
        sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
        if (valuesToo) {
            sink.Append("=", 1);
            sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
            if (i < numKeywords - 1) {
                sink.Append(";", 1);
            }
        } else {
            sink.Append("\0", 1);
        }
    }
}
727
728
/*
 * C API entry point: look up the value of keywordName in localeID and deliver
 * it into buffer via ByteSinkUtil::viaByteSinkToTerminatedChars().
 *
 * Returns 0 without doing anything if *status already indicates a failure.
 * Sets U_ILLEGAL_ARGUMENT_ERROR and returns 0 when keywordName is null or
 * empty. Otherwise returns whatever viaByteSinkToTerminatedChars() returns;
 * the lookup itself is done by ulocimp_getKeywordValue().
 */
U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char* localeID,
                     const char* keywordName,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{
    if (U_FAILURE(*status)) { return 0; }
    if (keywordName == nullptr || *keywordName == '\0') {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        buffer, bufferCapacity,
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getKeywordValue(localeID, keywordName, sink, status);
        },
        *status);
}
746
747
/*
 * Convenience overload: runs the ByteSink-based ulocimp_getKeywordValue()
 * through ByteSinkUtil::viaByteSinkToCharString() and returns what that
 * helper produces. Arguments are forwarded unchanged.
 */
U_EXPORT CharString
ulocimp_getKeywordValue(const char* localeID,
                        std::string_view keywordName,
                        UErrorCode& status)
{
    return ByteSinkUtil::viaByteSinkToCharString(
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getKeywordValue(localeID, keywordName, sink, status);
        },
        status);
}
758
759
/*
 * Find keywordName in the keyword section of localeID and append its value
 * to sink.
 *
 * The requested name is canonicalized with locale_canonKeywordName() and
 * compared against each (lowercased) keyword name in the locale. When
 * _hasBCP47Extension(localeID) is true, the ID is first converted with
 * ulocimp_forLanguageTag() and the converted form is searched instead,
 * provided the conversion succeeded and is non-empty.
 *
 * Nothing is appended when the locale has no keyword section or the keyword
 * is not present. Spaces around keyword names and values are tolerated.
 *
 * @param localeID    NUL-terminated locale ID; must not be null
 * @param keywordName keyword to look up; must not be empty
 * @param sink        receives the value exactly as written in the locale
 *                    (not lowercased)
 * @param status      in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for
 *                    null/empty arguments, a keyword without "=value", an
 *                    empty or malformed keyword name, or an empty or
 *                    malformed value. For a malformed value, the well-formed
 *                    characters preceding the offending one have already
 *                    been appended to sink.
 */
U_EXPORT void
ulocimp_getKeywordValue(const char* localeID,
                        std::string_view keywordName,
                        icu::ByteSink& sink,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }

    if (localeID == nullptr || keywordName.empty()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
    if (U_FAILURE(status)) {
        return;
    }

    /* tempBuffer must outlive tmpLocaleID, which may point into it. */
    CharString tempBuffer;
    const char* tmpLocaleID = localeID;
    if (_hasBCP47Extension(localeID)) {
        tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, status);
        if (U_SUCCESS(status) && !tempBuffer.isEmpty()) {
            tmpLocaleID = tempBuffer.data();
        }
    }

    /* A null start means there are no keywords: the loop is skipped entirely. */
    const char* startSearchHere = locale_getKeywordsStart(tmpLocaleID);

    /* find the first keyword */
    while (startSearchHere != nullptr) {
        startSearchHere++; /* skip @ or ; */
        const char* nextSeparator = uprv_strchr(startSearchHere, '=');
        if (nextSeparator == nullptr) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
            return;
        }
        /* strip leading & trailing spaces (TC decided to tolerate these) */
        while (*startSearchHere == ' ') {
            startSearchHere++;
        }
        const char* keyValueTail = nextSeparator;
        while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
            keyValueTail--;
        }
        /* now keyValueTail points to first char after the keyName */
        /* copy & normalize keyName from locale */
        if (startSearchHere == keyValueTail) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
            return;
        }
        CharString localeKeywordName;
        while (startSearchHere < keyValueTail) {
            if (!UPRV_ISALPHANUM(*startSearchHere)) {
                status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
                return;
            }
            localeKeywordName.append(uprv_tolower(*startSearchHere++), status);
        }
        if (U_FAILURE(status)) {
            return;
        }

        /* Start of the next entry, or nullptr if this is the last one. */
        startSearchHere = uprv_strchr(nextSeparator, ';');

        if (canonKeywordName == localeKeywordName) {
            /* current entry matches the keyword. */
            nextSeparator++; /* skip '=' */
            /* First strip leading & trailing spaces (TC decided to tolerate these) */
            while (*nextSeparator == ' ') {
                nextSeparator++;
            }
            keyValueTail = (startSearchHere != nullptr)
                    ? startSearchHere
                    : nextSeparator + uprv_strlen(nextSeparator);
            while (keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
                keyValueTail--;
            }
            /* Now copy the value, but check well-formedness */
            if (nextSeparator == keyValueTail) {
                status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
                return;
            }
            /* Should we lowercase value to return here? Tests expect as-is. */
            /* Append the longest well-formed prefix with a single call rather
             * than one virtual Append() per character. */
            const char* valueEnd = std::find_if_not(
                nextSeparator, keyValueTail,
                [](char c) { return UPRV_ISALPHANUM(c) || UPRV_OK_VALUE_PUNCTUATION(c); });
            if (valueEnd != nextSeparator) {
                sink.Append(nextSeparator, static_cast<int32_t>(valueEnd - nextSeparator));
            }
            if (valueEnd != keyValueTail) {
                status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
            }
            return;
        }
    }
}
862
863
/*
 * C API entry point: set, replace or remove one keyword in the locale ID held
 * in buffer, in place.
 *
 * The keyword section of buffer (from locale_getKeywordsStart() onwards, or
 * the end of the string when there is none) is rewritten through a
 * CheckedArrayByteSink whose capacity leaves one byte for the terminating NUL.
 *
 * @param keywordName    keyword to set; must not be null or empty
 * @param keywordValue   new value; null is passed on as an empty value
 * @param buffer         NUL-terminated locale ID, modified in place; must not be null
 * @param bufferCapacity capacity of buffer; must be greater than 1 and not
 *                       smaller than the current string length
 * @param status         in/out error code; U_ILLEGAL_ARGUMENT_ERROR for the
 *                       argument violations above
 * @return 0 on failure, except for U_BUFFER_OVERFLOW_ERROR where the base
 *         length plus the length reported by ulocimp_setKeywordValue() is
 *         returned; on success the result of u_terminateChars() for the
 *         combined length
 */
U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char* keywordName,
                     const char* keywordValue,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{
    if (U_FAILURE(*status)) { return 0; }

    if (keywordName == nullptr || *keywordName == 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* buffer is read with uprv_strlen() below, so a null buffer must be rejected here. */
    if (buffer == nullptr || bufferCapacity <= 1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t bufLen = static_cast<int32_t>(uprv_strlen(buffer));
    if (bufferCapacity < bufLen) {
        /* The capacity is less than the length?! Is this NUL terminated? */
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    char* keywords = const_cast<char*>(
        locale_getKeywordsStart({buffer, static_cast<std::string_view::size_type>(bufLen)}));
    /* Length of the part of the locale ID that precedes the keyword section. */
    int32_t baseLen = keywords == nullptr ? bufLen : static_cast<int32_t>(keywords - buffer);
    // Remove -1 from the capacity so that this function can guarantee NUL termination.
    CheckedArrayByteSink sink(keywords == nullptr ? buffer + bufLen : keywords,
                              bufferCapacity - baseLen - 1);
    int32_t reslen = ulocimp_setKeywordValue(
        keywords == nullptr ? std::string_view() : keywords,
        keywordName,
        keywordValue == nullptr ? std::string_view() : keywordValue,
        sink,
        *status);

    if (U_FAILURE(*status)) {
        return *status == U_BUFFER_OVERFLOW_ERROR ? reslen + baseLen : 0;
    }

    // See the documentation for this function, it's guaranteed to never
    // overflow the buffer but instead abort with BUFFER_OVERFLOW_ERROR.
    // In this case, nothing has been written to the sink, so it cannot have Overflowed().
    U_ASSERT(!sink.Overflowed());
    U_ASSERT(reslen >= 0);
    return u_terminateChars(buffer, bufferCapacity, reslen + baseLen, status);
}
912
913
/*
 * Set, replace or remove one keyword in a locale ID held in a CharString.
 *
 * The existing keyword section (if any) is split off as a string_view, the
 * CharString is truncated to the part before it, and the ByteSink-based
 * ulocimp_setKeywordValue() then appends the updated keyword section.
 *
 * @param keywordName  keyword to set
 * @param keywordValue new value for the keyword
 * @param localeID     locale ID to update in place
 * @param status       in/out error code; nothing is done if it already
 *                     indicates a failure
 */
U_EXPORT void
ulocimp_setKeywordValue(std::string_view keywordName,
                        std::string_view keywordValue,
                        CharString& localeID,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }
    std::string_view keywords;
    if (const char* start = locale_getKeywordsStart(localeID.toStringPiece()); start != nullptr) {
        // This is safe because CharString::truncate() doesn't actually erase any
        // data, but simply sets the position for where new data will be written.
        int32_t size = static_cast<int32_t>(start - localeID.data());
        keywords = localeID.toStringPiece();
        keywords.remove_prefix(size);
        localeID.truncate(size);
    }
    CharStringByteSink sink(&localeID);
    ulocimp_setKeywordValue(keywords, keywordName, keywordValue, sink, status);
}
932
933
/*
 * Build an updated keyword section and append it to sink.
 *
 * The existing entries in keywords are copied (names lowercased, surrounding
 * spaces dropped). The input keyword replaces a matching entry, is inserted
 * before the first entry whose name compares greater (uprv_strcmp), or is
 * appended at the end. An empty keywordValue removes a matching entry.
 *
 * @param keywords     the existing keyword section including its leading '@';
 *                     a size of 0 or 1 is treated as "no keywords"
 * @param keywordName  keyword to set; must not be empty and must consist of
 *                     alphanumeric characters
 * @param keywordValue new value; may contain alphanumerics and the characters
 *                     accepted by UPRV_OK_VALUE_PUNCTUATION(); empty requests
 *                     removal
 * @param sink         receives the new keyword section. Space is obtained via
 *                     GetAppendBuffer() first, and nothing is appended if it
 *                     is too small.
 * @param status       in/out error code; U_ILLEGAL_ARGUMENT_ERROR for
 *                     malformed input or malformed existing keywords,
 *                     U_BUFFER_OVERFLOW_ERROR when the sink cannot provide
 *                     enough space
 * @return the length of the keyword section that was (or, on
 *         U_BUFFER_OVERFLOW_ERROR, would have been) written; 0 for other
 *         errors or when there was nothing to do
 */
U_EXPORT int32_t
ulocimp_setKeywordValue(std::string_view keywords,
                        std::string_view keywordName,
                        std::string_view keywordValue,
                        ByteSink& sink,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return 0; }

    /* TODO: sorting. removal. */
    int32_t needLen = 0;
    CharString updatedKeysAndValues;
    bool handledInputKeyAndValue = false;
    char keyValuePrefix = '@';

    if (status == U_STRING_NOT_TERMINATED_WARNING) {
        status = U_ZERO_ERROR;
    }
    if (keywordName.empty()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
    if (U_FAILURE(status)) {
        return 0;
    }

    /* Validate the whole value first, then copy it with a single append. */
    for (char c : keywordValue) {
        if (!UPRV_ISALPHANUM(c) && !UPRV_OK_VALUE_PUNCTUATION(c)) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
            return 0;
        }
    }
    /* Should we force lowercase in value to set? */
    CharString canonKeywordValue;
    if (!keywordValue.empty()) {
        canonKeywordValue.append(keywordValue.data(),
                                 static_cast<int32_t>(keywordValue.size()), status);
    }
    if (U_FAILURE(status)) {
        return 0;
    }

    if (keywords.size() <= 1) {
        if (canonKeywordValue.isEmpty()) { /* no keywords = nothing to remove */
            U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
            return 0;
        }

        /* '@' + name + '=' + value */
        needLen = 1 + canonKeywordName.length() + 1 + canonKeywordValue.length();
        int32_t capacity = 0;
        char* buffer = sink.GetAppendBuffer(
                needLen, needLen, nullptr, needLen, &capacity);
        if (capacity < needLen || buffer == nullptr) {
            status = U_BUFFER_OVERFLOW_ERROR;
            return needLen; /* no change */
        }
        char* it = buffer;

        *it++ = '@';
        uprv_memcpy(it, canonKeywordName.data(), canonKeywordName.length());
        it += canonKeywordName.length();
        *it++ = '=';
        uprv_memcpy(it, canonKeywordValue.data(), canonKeywordValue.length());
        sink.Append(buffer, needLen);
        U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
        return needLen;
    } /* end shortcut - no @ */

    /* Emits "<prefix>name=value" for the input keyword and marks it as handled. */
    auto appendInputEntry = [&]() {
        updatedKeysAndValues.append(keyValuePrefix, status);
        keyValuePrefix = ';'; /* for any subsequent key-value pair */
        updatedKeysAndValues.append(canonKeywordName, status);
        updatedKeysAndValues.append('=', status);
        updatedKeysAndValues.append(canonKeywordValue, status);
        handledInputKeyAndValue = true;
    };

    /* search for keyword */
    for (size_t keywordStart = 0; keywordStart != std::string_view::npos;) {
        keywordStart++; /* skip @ or ; */
        size_t nextEqualsign = keywords.find('=', keywordStart);
        if (nextEqualsign == std::string_view::npos) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
            return 0;
        }
        /* strip leading & trailing spaces (TC decided to tolerate these) */
        while (keywordStart < keywords.size() && keywords[keywordStart] == ' ') {
            keywordStart++;
        }
        size_t keyValueTail = nextEqualsign;
        while (keyValueTail > keywordStart && keywords[keyValueTail - 1] == ' ') {
            keyValueTail--;
        }
        /* now keyValueTail points to first char after the keyName */
        /* copy & normalize keyName from locale */
        if (keywordStart == keyValueTail) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
            return 0;
        }
        CharString localeKeywordName;
        while (keywordStart < keyValueTail) {
            if (!UPRV_ISALPHANUM(keywords[keywordStart])) {
                status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
                return 0;
            }
            localeKeywordName.append(uprv_tolower(keywords[keywordStart++]), status);
        }
        if (U_FAILURE(status)) {
            return 0;
        }

        size_t nextSeparator = keywords.find(';', nextEqualsign);

        /* start processing the value part */
        nextEqualsign++; /* skip '=' */
        /* First strip leading & trailing spaces (TC decided to tolerate these) */
        while (nextEqualsign < keywords.size() && keywords[nextEqualsign] == ' ') {
            nextEqualsign++;
        }
        keyValueTail = nextSeparator == std::string_view::npos ? keywords.size() : nextSeparator;
        while (keyValueTail > nextEqualsign && keywords[keyValueTail - 1] == ' ') {
            keyValueTail--;
        }
        if (nextEqualsign == keyValueTail) {
            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
            return 0;
        }

        const int32_t rc = uprv_strcmp(canonKeywordName.data(), localeKeywordName.data());
        if (rc == 0) {
            /* Current entry matches the input keyword. Update the entry */
            if (!canonKeywordValue.isEmpty()) { /* updating a value */
                appendInputEntry();
            } /* else removing this entry, don't emit anything */
            handledInputKeyAndValue = true;
        } else {
            /* input keyword sorts earlier than current entry, add before current entry */
            if (rc < 0 && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
                /* insert new entry at this location */
                appendInputEntry();
            }
            /* copy the current entry */
            updatedKeysAndValues.append(keyValuePrefix, status);
            keyValuePrefix = ';'; /* for any subsequent key-value pair */
            updatedKeysAndValues.append(localeKeywordName, status);
            updatedKeysAndValues.append('=', status);
            updatedKeysAndValues.append(keywords.data() + nextEqualsign,
                                        static_cast<int32_t>(keyValueTail - nextEqualsign), status);
        }
        if (nextSeparator == std::string_view::npos && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
            /* append new entry at the end, it sorts later than existing entries */
            appendInputEntry();
        }
        keywordStart = nextSeparator;
    } /* end loop searching */

    /* Any error from updatedKeysAndValues.append above would be internal and not due to
     * problems with the passed-in locale. So if we did encounter problems with the
     * passed-in locale above, those errors took precedence and overrode any error
     * status from updatedKeysAndValues.append, and also caused a return of 0. If there
     * are errors here they are from updatedKeysAndValues.append; they do cause an
     * error return but the passed-in locale is unmodified and the original bufLen is
     * returned.
     */
    if (!handledInputKeyAndValue || U_FAILURE(status)) {
        /* if input key/value specified removal of a keyword not present in locale, or
         * there was an error in CharString.append, leave original locale alone. */
        U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
        // The sink is expected to be a buffer which already contains the full
        // locale string, so when it isn't going to be modified there's no need
        // to actually write any data to it, as the data is already there. Only
        // the first character needs to be overwritten (changing '\0' to '@').
        needLen = static_cast<int32_t>(keywords.size());
        int32_t capacity = 0;
        char* buffer = sink.GetAppendBuffer(
                needLen, needLen, nullptr, needLen, &capacity);
        if (capacity < needLen || buffer == nullptr) {
            status = U_BUFFER_OVERFLOW_ERROR;
        } else {
            *buffer = '@';
            sink.Append(buffer, needLen);
        }
        return needLen;
    }

    needLen = updatedKeysAndValues.length();
    // Check to see can we fit the updatedKeysAndValues, if not, return
    // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
    // We do this because this API function does not behave like most others:
    // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
    // When the contents fits but without the terminating NUL, in this case we need to not change
    // the buffer contents and return with a buffer overflow error.
    if (needLen > 0) {
        int32_t capacity = 0;
        char* buffer = sink.GetAppendBuffer(
                needLen, needLen, nullptr, needLen, &capacity);
        if (capacity < needLen || buffer == nullptr) {
            status = U_BUFFER_OVERFLOW_ERROR;
            return needLen;
        }
        uprv_memcpy(buffer, updatedKeysAndValues.data(), needLen);
        sink.Append(buffer, needLen);
    }
    U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
    return needLen;
}
1143
1144
/* ### ID parsing implementation **************************************************/
1145
1146
namespace {
1147
1148
16.1M
/* True for the letters that may start a special ID prefix: 'x' or 'i', in either case. */
inline bool _isPrefixLetter(char a) {
    switch (a) {
    case 'x':
    case 'X':
    case 'i':
    case 'I':
        return true;
    default:
        return false;
    }
}
1149
1150
/*returns true if one of the special prefixes is here (s=string)
1151
  'x-' or 'i-' */
1152
16.4M
inline bool _isIDPrefix(std::string_view s) {
1153
16.4M
    return s.size() >= 2 && _isPrefixLetter(s[0]) && _isIDSeparator(s[1]);
1154
16.4M
}
1155
1156
/* Dot terminates it because of POSIX form  where dot precedes the codepage
1157
 * except for variant
1158
 */
1159
121M
/* True when a is '.' or '@'. */
inline bool _isTerminator(char a) {
    if (a == '@') {
        return true;
    }
    return a == '.';
}
1160
1161
10.1M
/*
 * Returns true if p begins with "-t-", "-u-" or "-x-" (the letter in either
 * case), i.e. a '-'-delimited singleton 't', 'u' or 'x'.
 */
inline bool _isBCP47Extension(std::string_view p) {
    return p.size() >= 3 &&
           p[0] == '-' &&
           (p[1] == 't' || p[1] == 'T' ||
            p[1] == 'u' || p[1] == 'U' ||
            p[1] == 'x' || p[1] == 'X') &&
           p[2] == '-';
}
1169
1170
/**
1171
 * Lookup 'key' in the array 'list'.  The array 'list' should contain
1172
 * a nullptr entry, followed by more entries, and a second nullptr entry.
1173
 *
1174
 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1175
 * COUNTRIES_3.
1176
 */
1177
std::optional<int16_t> _findIndex(const char* const* list, const char* key)
1178
1.68M
{
1179
1.68M
    const char* const* anchor = list;
1180
1.68M
    int32_t pass = 0;
1181
1182
    /* Make two passes through two nullptr-terminated arrays at 'list' */
1183
1.95M
    while (pass++ < 2) {
1184
450M
        while (*list) {
1185
450M
            if (uprv_strcmp(key, *list) == 0) {
1186
1.54M
                return static_cast<int16_t>(list - anchor);
1187
1.54M
            }
1188
448M
            list++;
1189
448M
        }
1190
272k
        ++list;     /* skip final nullptr *CWB*/
1191
272k
    }
1192
136k
    return std::nullopt;
1193
1.68M
}
1194
1195
}  // namespace
1196
1197
/*
 * Map a country ID listed in DEPRECATED_COUNTRIES to the entry at the same
 * index in REPLACEMENT_COUNTRIES. An ID that is not found in
 * DEPRECATED_COUNTRIES is returned unchanged (the same pointer).
 */
U_CFUNC const char*
uloc_getCurrentCountryID(const char* oldID){
    const std::optional<int16_t> offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
    if (!offset.has_value()) {
        return oldID;
    }
    return REPLACEMENT_COUNTRIES[*offset];
}
1202
/*
 * Map a language ID listed in DEPRECATED_LANGUAGES to the entry at the same
 * index in REPLACEMENT_LANGUAGES. An ID that is not found in
 * DEPRECATED_LANGUAGES is returned unchanged (the same pointer).
 */
U_CFUNC const char*
uloc_getCurrentLanguageID(const char* oldID){
    const std::optional<int16_t> offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
    if (!offset.has_value()) {
        return oldID;
    }
    return REPLACEMENT_LANGUAGES[*offset];
}
1207
1208
namespace {
1209
1210
/*
1211
 * the internal functions _getLanguage(), _getScript(), _getRegion(), _getVariant()
1212
 * avoid duplicating code to handle the earlier locale ID pieces
1213
 * in the functions for the later ones by
1214
 * setting the *pEnd pointer to where they stopped parsing
1215
 *
1216
 * TODO try to use this in Locale
1217
 */
1218
1219
16.4M
size_t _getLanguage(std::string_view localeID, ByteSink* sink, UErrorCode& status) {
1220
16.4M
    size_t skip = 0;
1221
16.4M
    if (localeID.size() == 4 && uprv_strnicmp(localeID.data(), "root", 4) == 0) {
1222
59.7k
        skip = 4;
1223
59.7k
        localeID.remove_prefix(skip);
1224
16.3M
    } else if (localeID.size() >= 3 && uprv_strnicmp(localeID.data(), "und", 3) == 0 &&
1225
98.0k
               (localeID.size() == 3 ||
1226
11.7k
                localeID[3] == '-' ||
1227
11.6k
                localeID[3] == '_' ||
1228
88.2k
                localeID[3] == '@')) {
1229
88.2k
        skip = 3;
1230
88.2k
        localeID.remove_prefix(skip);
1231
88.2k
    }
1232
1233
16.4M
    constexpr int32_t MAXLEN = ULOC_LANG_CAPACITY - 1;  // Minus NUL.
1234
1235
    /* if it starts with i- or x- then copy that prefix */
1236
16.4M
    size_t len = _isIDPrefix(localeID) ? 2 : 0;
1237
50.8M
    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1238
34.3M
        if (len == MAXLEN) {
1239
4.73k
            status = U_ILLEGAL_ARGUMENT_ERROR;
1240
4.73k
            return 0;
1241
4.73k
        }
1242
34.3M
        len++;
1243
34.3M
    }
1244
1245
16.4M
    if (sink == nullptr || len == 0) { return skip + len; }
1246
1247
10.8M
    int32_t minCapacity = uprv_max(static_cast<int32_t>(len), 4);  // Minimum 3 letters plus NUL.
1248
10.8M
    char scratch[MAXLEN];
1249
10.8M
    int32_t capacity = 0;
1250
10.8M
    char* buffer = sink->GetAppendBuffer(
1251
10.8M
            minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);
1252
1253
34.4M
    for (size_t i = 0; i < len; ++i) {
1254
23.6M
        buffer[i] = uprv_tolower(localeID[i]);
1255
23.6M
    }
1256
10.8M
    if (localeID.size() >= 2 && _isIDSeparator(localeID[1])) {
1257
58.8k
        buffer[1] = '-';
1258
58.8k
    }
1259
1260
10.8M
    if (len == 3) {
1261
        /* convert 3 character code to 2 character code if possible *CWB*/
1262
1.58M
        U_ASSERT(capacity >= 4);
1263
1.58M
        buffer[3] = '\0';
1264
1.58M
        std::optional<int16_t> offset = _findIndex(LANGUAGES_3, buffer);
1265
1.58M
        if (offset.has_value()) {
1266
1.54M
            const char* const alias = LANGUAGES[*offset];
1267
1.54M
            sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias)));
1268
1.54M
            return skip + len;
1269
1.54M
        }
1270
1.58M
    }
1271
1272
9.26M
    sink->Append(buffer, static_cast<int32_t>(len));
1273
9.26M
    return skip + len;
1274
10.8M
}
1275
1276
14.7M
size_t _getScript(std::string_view localeID, ByteSink* sink) {
1277
14.7M
    constexpr int32_t LENGTH = 4;
1278
1279
14.7M
    size_t len = 0;
1280
50.8M
    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len]) &&
1281
36.2M
            uprv_isASCIILetter(localeID[len])) {
1282
36.1M
        if (len == LENGTH) { return 0; }
1283
36.1M
        len++;
1284
36.1M
    }
1285
14.7M
    if (len != LENGTH) { return 0; }
1286
1287
3.78M
    if (sink == nullptr) { return len; }
1288
1289
2.10M
    char scratch[LENGTH];
1290
2.10M
    int32_t capacity = 0;
1291
2.10M
    char* buffer = sink->GetAppendBuffer(
1292
2.10M
            LENGTH, LENGTH, scratch, UPRV_LENGTHOF(scratch), &capacity);
1293
1294
2.10M
    buffer[0] = uprv_toupper(localeID[0]);
1295
8.41M
    for (int32_t i = 1; i < LENGTH; ++i) {
1296
6.31M
        buffer[i] = uprv_tolower(localeID[i]);
1297
6.31M
    }
1298
1299
2.10M
    sink->Append(buffer, LENGTH);
1300
2.10M
    return len;
1301
3.78M
}
1302
1303
10.0M
size_t _getRegion(std::string_view localeID, ByteSink* sink) {
1304
10.0M
    constexpr int32_t MINLEN = 2;
1305
10.0M
    constexpr int32_t MAXLEN = ULOC_COUNTRY_CAPACITY - 1;  // Minus NUL.
1306
1307
10.0M
    size_t len = 0;
1308
29.6M
    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1309
19.5M
        if (len == MAXLEN) { return 0; }
1310
19.5M
        len++;
1311
19.5M
    }
1312
10.0M
    if (len < MINLEN) { return 0; }
1313
1314
9.68M
    if (sink == nullptr) { return len; }
1315
1316
6.99M
    char scratch[ULOC_COUNTRY_CAPACITY];
1317
6.99M
    int32_t capacity = 0;
1318
6.99M
    char* buffer = sink->GetAppendBuffer(
1319
6.99M
            ULOC_COUNTRY_CAPACITY,
1320
6.99M
            ULOC_COUNTRY_CAPACITY,
1321
6.99M
            scratch,
1322
6.99M
            UPRV_LENGTHOF(scratch),
1323
6.99M
            &capacity);
1324
1325
21.0M
    for (size_t i = 0; i < len; ++i) {
1326
14.0M
        buffer[i] = uprv_toupper(localeID[i]);
1327
14.0M
    }
1328
1329
6.99M
    if (len == 3) {
1330
        /* convert 3 character code to 2 character code if possible *CWB*/
1331
95.7k
        U_ASSERT(capacity >= 4);
1332
95.7k
        buffer[3] = '\0';
1333
95.7k
        std::optional<int16_t> offset = _findIndex(COUNTRIES_3, buffer);
1334
95.7k
        if (offset.has_value()) {
1335
133
            const char* const alias = COUNTRIES[*offset];
1336
133
            sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias)));
1337
133
            return len;
1338
133
        }
1339
95.7k
    }
1340
1341
6.99M
    sink->Append(buffer, static_cast<int32_t>(len));
1342
6.99M
    return len;
1343
6.99M
}
1344
1345
/**
1346
 * @param needSeparator if true, then add leading '_' if any variants
1347
 * are added to 'variant'
1348
 */
1349
size_t
1350
_getVariant(std::string_view localeID,
1351
            char prev,
1352
            ByteSink* sink,
1353
            bool needSeparator,
1354
4.25M
            UErrorCode& status) {
1355
4.25M
    if (U_FAILURE(status) || localeID.empty()) return 0;
1356
1357
    // Reasonable upper limit for variants
1358
    // There are no strict limitation of the syntax of variant in the legacy
1359
    // locale format. If the locale is constructed from unicode_locale_id
1360
    // as defined in UTS35, then we know each unicode_variant_subtag
1361
    // could have max length of 8 ((alphanum{5,8} | digit alphanum{3})
1362
    // 179 would allow 20 unicode_variant_subtag with sep in the
1363
    // unicode_locale_id
1364
    // 8*20 + 1*(20-1) = 179
1365
4.24M
    constexpr int32_t MAX_VARIANTS_LENGTH = 179;
1366
1367
    /* get one or more variant tags and separate them with '_' */
1368
4.24M
    size_t index = 0;
1369
4.24M
    if (_isIDSeparator(prev)) {
1370
        /* get a variant string after a '-' or '_' */
1371
9.79M
        for (std::string_view sub = localeID;;) {
1372
9.79M
            size_t next = sub.find_first_of(".@_-");
1373
            // For historical reasons, a trailing separator is included in the variant.
1374
9.79M
            bool finished = next == std::string_view::npos || next + 1 == sub.length();
1375
9.79M
            size_t limit = finished ? sub.length() : next;
1376
9.79M
            index += limit;
1377
9.79M
            if (index > MAX_VARIANTS_LENGTH) {
1378
1.35k
                status = U_ILLEGAL_ARGUMENT_ERROR;
1379
1.35k
                return 0;
1380
1.35k
            }
1381
1382
9.79M
            if (sink != nullptr) {
1383
9.78M
                if (needSeparator) {
1384
5.54M
                    sink->Append("_", 1);
1385
5.54M
                } else {
1386
4.24M
                    needSeparator = true;
1387
4.24M
                }
1388
1389
9.78M
                int32_t length = static_cast<int32_t>(limit);
1390
9.78M
                int32_t minCapacity = uprv_min(length, MAX_VARIANTS_LENGTH);
1391
9.78M
                char scratch[MAX_VARIANTS_LENGTH];
1392
9.78M
                int32_t capacity = 0;
1393
9.78M
                char* buffer = sink->GetAppendBuffer(
1394
9.78M
                        minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);
1395
1396
36.0M
                for (size_t i = 0; i < limit; ++i) {
1397
26.3M
                    buffer[i] = uprv_toupper(sub[i]);
1398
26.3M
                }
1399
9.78M
                sink->Append(buffer, length);
1400
9.78M
            }
1401
1402
9.79M
            if (finished) { return index; }
1403
5.61M
            sub.remove_prefix(next);
1404
5.61M
            if (_isTerminator(sub.front()) || _isBCP47Extension(sub)) { return index; }
1405
5.55M
            sub.remove_prefix(1);
1406
5.55M
            index++;
1407
5.55M
        }
1408
4.24M
    }
1409
1410
4.39k
    size_t skip = 0;
1411
    /* if there is no variant tag after a '-' or '_' then look for '@' */
1412
4.39k
    if (prev == '@') {
1413
        /* keep localeID */
1414
4.39k
    } else if (const char* p = locale_getKeywordsStart(localeID); p != nullptr) {
1415
0
        skip = 1 + p - localeID.data(); /* point after the '@' */
1416
0
        localeID.remove_prefix(skip);
1417
0
    } else {
1418
0
        return 0;
1419
0
    }
1420
183k
    for (; index < localeID.size() && !_isTerminator(localeID[index]); index++) {
1421
179k
        if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
1422
562
            status = U_ILLEGAL_ARGUMENT_ERROR;
1423
562
            return 0;
1424
562
        }
1425
179k
        if (needSeparator) {
1426
1.78k
            if (sink != nullptr) {
1427
1.78k
                sink->Append("_", 1);
1428
1.78k
            }
1429
1.78k
            needSeparator = false;
1430
1.78k
        }
1431
179k
        if (sink != nullptr) {
1432
179k
            char c = uprv_toupper(localeID[index]);
1433
179k
            if (c == '-' || c == ',') c = '_';
1434
179k
            sink->Append(&c, 1);
1435
179k
        }
1436
179k
    }
1437
3.83k
    return skip + index;
1438
4.39k
}
1439
1440
}  // namespace
1441
1442
/**
 * Returns the language subtag of localeID as a CharString by requesting only
 * the language from ulocimp_getSubtags.
 */
U_EXPORT CharString
ulocimp_getLanguage(std::string_view localeID, UErrorCode& status) {
    return ByteSinkUtil::viaByteSinkToCharString(
        [&](ByteSink& sink, UErrorCode& errorCode) {
            ByteSink* const unwanted = nullptr;
            ulocimp_getSubtags(
                    localeID, &sink, unwanted, unwanted, unwanted, nullptr, errorCode);
        },
        status);
}
1457
1458
/**
 * Returns the script subtag of localeID as a CharString by requesting only
 * the script from ulocimp_getSubtags.
 */
U_EXPORT CharString
ulocimp_getScript(std::string_view localeID, UErrorCode& status) {
    return ByteSinkUtil::viaByteSinkToCharString(
        [&](ByteSink& sink, UErrorCode& errorCode) {
            ByteSink* const unwanted = nullptr;
            ulocimp_getSubtags(
                    localeID, unwanted, &sink, unwanted, unwanted, nullptr, errorCode);
        },
        status);
}
1473
1474
/**
 * Returns the region subtag of localeID as a CharString by requesting only
 * the region from ulocimp_getSubtags.
 */
U_EXPORT CharString
ulocimp_getRegion(std::string_view localeID, UErrorCode& status) {
    return ByteSinkUtil::viaByteSinkToCharString(
        [&](ByteSink& sink, UErrorCode& errorCode) {
            ByteSink* const unwanted = nullptr;
            ulocimp_getSubtags(
                    localeID, unwanted, unwanted, &sink, unwanted, nullptr, errorCode);
        },
        status);
}
1489
1490
/**
 * Returns the variant part of localeID as a CharString by requesting only
 * the variant from ulocimp_getSubtags.
 */
U_EXPORT CharString
ulocimp_getVariant(std::string_view localeID, UErrorCode& status) {
    return ByteSinkUtil::viaByteSinkToCharString(
        [&](ByteSink& sink, UErrorCode& errorCode) {
            ByteSink* const unwanted = nullptr;
            ulocimp_getSubtags(
                    localeID, unwanted, unwanted, unwanted, &sink, nullptr, errorCode);
        },
        status);
}
1505
1506
/**
 * CharString convenience overload: wraps every non-null destination in a
 * CharStringByteSink and forwards to the ByteSink overload.
 *
 * @param localeID the locale ID to split.
 * @param language destination for the language, or nullptr if not wanted.
 * @param script   destination for the script, or nullptr if not wanted.
 * @param region   destination for the region, or nullptr if not wanted.
 * @param variant  destination for the variant, or nullptr if not wanted.
 * @param pEnd     forwarded unchanged to the ByteSink overload.
 * @param status   ICU error code; nothing is done if it already holds a
 *                 failure.
 */
U_EXPORT void
ulocimp_getSubtags(
        std::string_view localeID,
        CharString* language,
        CharString* script,
        CharString* region,
        CharString* variant,
        const char** pEnd,
        UErrorCode& status) {
    if (U_FAILURE(status)) { return; }

    // The sinks must live until the forwarded call returns, so they are kept
    // in optionals that are engaged only for the destinations actually given.
    std::optional<CharStringByteSink> languageSink;
    std::optional<CharStringByteSink> scriptSink;
    std::optional<CharStringByteSink> regionSink;
    std::optional<CharStringByteSink> variantSink;

    const auto wrap = [](std::optional<CharStringByteSink>& slot,
                         CharString* dest) -> ByteSink* {
        if (dest == nullptr) { return nullptr; }
        return &slot.emplace(dest);
    };

    ByteSink* const languageOut = wrap(languageSink, language);
    ByteSink* const scriptOut = wrap(scriptSink, script);
    ByteSink* const regionOut = wrap(regionSink, region);
    ByteSink* const variantOut = wrap(variantSink, variant);

    ulocimp_getSubtags(
            localeID, languageOut, scriptOut, regionOut, variantOut, pEnd, status);
}
1536
1537
/**
 * Splits a locale ID into language, script, region and variant.
 *
 * The pieces are parsed in that order. Parsing stops early once no remaining
 * output was requested and pEnd is nullptr. After the variant, a BCP47
 * extension is scanned for "-va-posix" pairs, each of which appends "POSIX"
 * to the variant.
 *
 * @param localeID the locale ID to split.
 * @param language receives the language, or nullptr if not wanted.
 * @param script   receives the script, or nullptr if not wanted.
 * @param region   receives the region, or nullptr if not wanted.
 * @param variant  receives the variant, or nullptr if not wanted.
 * @param pEnd     if not nullptr, receives a pointer into localeID just past
 *                 the last piece that was parsed.
 * @param status   ICU error code; nothing is done if it already holds a
 *                 failure.
 */
U_EXPORT void
ulocimp_getSubtags(
        std::string_view localeID,
        ByteSink* language,
        ByteSink* script,
        ByteSink* region,
        ByteSink* variant,
        const char** pEnd,
        UErrorCode& status) {
    if (U_FAILURE(status)) { return; }

    const bool wantEnd = pEnd != nullptr;
    // Publishes the current parse position to the caller, if requested.
    const auto reportEnd = [&localeID, pEnd]() {
        if (pEnd != nullptr) { *pEnd = localeID.data(); }
    };

    reportEnd();
    if (!wantEnd &&
        language == nullptr && script == nullptr &&
        region == nullptr && variant == nullptr) {
        return;
    }
    if (localeID.empty()) { return; }

    /* language */
    const size_t languageLength = _getLanguage(localeID, language, status);
    if (U_FAILURE(status)) { return; }
    localeID.remove_prefix(languageLength);
    reportEnd();
    if (!wantEnd && script == nullptr && region == nullptr && variant == nullptr) {
        return;
    }
    if (localeID.empty()) { return; }

    /* script */
    if (_isIDSeparator(localeID.front())) {
        const size_t scriptLength = _getScript(localeID.substr(1), script);
        if (scriptLength > 0) {
            localeID.remove_prefix(scriptLength + 1);  // separator + subtag
            reportEnd();
        }
    }
    if (!wantEnd && region == nullptr && variant == nullptr) { return; }
    if (localeID.empty()) { return; }

    /* region */
    bool hasRegion = false;
    if (_isIDSeparator(localeID.front())) {
        const size_t regionLength = _getRegion(localeID.substr(1), region);
        if (regionLength > 0) {
            hasRegion = true;
            localeID.remove_prefix(regionLength + 1);  // separator + subtag
            reportEnd();
        }
    }
    if (!wantEnd && variant == nullptr) { return; }
    if (localeID.empty()) { return; }

    /* variant */
    bool hasVariant = false;
    if (_isIDSeparator(localeID.front()) && !_isBCP47Extension(localeID)) {
        /* If there was no country ID, skip a possible extra IDSeparator */
        size_t skip = 1;
        if (!hasRegion && localeID.size() > 1 && _isIDSeparator(localeID[1])) {
            skip = 2;
        }
        const size_t variantLength =
            _getVariant(localeID.substr(skip), localeID[0], variant, false, status);
        if (U_FAILURE(status)) { return; }
        if (variantLength > 0) {
            hasVariant = true;
            localeID.remove_prefix(skip + variantLength);
            reportEnd();
        }
    }
    if (localeID.empty()) { return; }

    /* "-va-posix" inside a BCP47 extension */
    if (!_isBCP47Extension(localeID)) { return; }
    localeID.remove_prefix(2);  // drop the '-' and the singleton letter

    constexpr char vaposix[] = "-va-posix";
    constexpr size_t length = sizeof vaposix - 1;
    while (true) {
        // Each step looks at one "-key-value" pair at the front of localeID.
        const size_t valueStart = localeID.find('-', 1);
        if (valueStart == std::string_view::npos) { break; }
        const size_t pairEnd = localeID.find('-', valueStart + 1);
        const std::string_view pair = localeID.substr(0, pairEnd);

        if (pair.length() == length && uprv_strnicmp(pair.data(), vaposix, length) == 0) {
            if (variant != nullptr) {
                if (hasVariant) { variant->Append("_", 1); }
                constexpr char posix[] = "POSIX";
                variant->Append(posix, sizeof posix - 1);
            }
            if (pEnd != nullptr) { *pEnd = localeID.data() + length; }
        }

        if (pairEnd == std::string_view::npos) { break; }
        localeID.remove_prefix(pairEnd);
    }
}
1647
1648
/* Keyword enumeration */
1649
1650
/** Per-enumeration state of a keyword enumeration. */
struct UKeywordsContext {
    char* keywords;  /* start of the keyword list: NUL-terminated strings back to back */
    char* current;   /* keyword the next call to uloc_kw_nextKeyword will return */
};
1654
1655
U_CDECL_BEGIN
1656
1657
static void U_CALLCONV
1658
4.42k
uloc_kw_closeKeywords(UEnumeration *enumerator) {
1659
4.42k
    uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1660
4.42k
    uprv_free(enumerator->context);
1661
4.42k
    uprv_free(enumerator);
1662
4.42k
}
1663
1664
static int32_t U_CALLCONV
1665
957
uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1666
957
    char *kw = ((UKeywordsContext *)en->context)->keywords;
1667
957
    int32_t result = 0;
1668
2.48k
    while(*kw) {
1669
1.52k
        result++;
1670
1.52k
        kw += uprv_strlen(kw)+1;
1671
1.52k
    }
1672
957
    return result;
1673
957
}
1674
1675
static const char * U_CALLCONV
1676
uloc_kw_nextKeyword(UEnumeration* en,
1677
                    int32_t* resultLength,
1678
8.08k
                    UErrorCode* /*status*/) {
1679
8.08k
    const char* result = ((UKeywordsContext *)en->context)->current;
1680
8.08k
    int32_t len = 0;
1681
8.08k
    if(*result) {
1682
6.31k
        len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1683
6.31k
        ((UKeywordsContext *)en->context)->current += len+1;
1684
6.31k
    } else {
1685
1.77k
        result = nullptr;
1686
1.77k
    }
1687
8.08k
    if (resultLength) {
1688
8.08k
        *resultLength = len;
1689
8.08k
    }
1690
8.08k
    return result;
1691
8.08k
}
1692
1693
static void U_CALLCONV
1694
uloc_kw_resetKeywords(UEnumeration* en,
1695
0
                      UErrorCode* /*status*/) {
1696
0
    ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1697
0
}
1698
1699
U_CDECL_END
1700
1701
1702
/*
 * Template copied into every enumeration created by uloc_openKeywordList.
 * The entries follow the member order of UEnumeration, which is declared
 * outside this file. NOTE(review): the first two entries are presumably the
 * base-context and context slots (the context is assigned per instance after
 * the copy) -- confirm against the UEnumeration declaration.
 */
static const UEnumeration gKeywordsEnum = {
    nullptr,
    nullptr,
    uloc_kw_closeKeywords,
    uloc_kw_countKeywords,
    uenum_unextDefault,
    uloc_kw_nextKeyword,
    uloc_kw_resetKeywords
};
1711
1712
/**
 * Creates a UEnumeration over a private copy of a keyword list.
 *
 * The list is a sequence of NUL-terminated keyword strings laid out back to
 * back. The copy gets one extra NUL appended so that an empty string marks
 * the end of the list for the enumeration callbacks.
 *
 * @param keywordList     the keyword bytes to copy; may be nullptr only when
 *                        keywordListSize is 0.
 * @param keywordListSize number of bytes to copy from keywordList; must not
 *                        be negative.
 * @param status          ICU error code; set to U_ILLEGAL_ARGUMENT_ERROR for
 *                        an invalid list/size combination and to
 *                        U_MEMORY_ALLOCATION_ERROR when an allocation fails.
 * @return the new enumeration, or nullptr on failure.
 */
U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
{
    if (status == nullptr || U_FAILURE(*status)) { return nullptr; }

    // A negative size would wrap around once converted for the allocation and
    // the copy, and a null list cannot supply any bytes.
    if (keywordListSize < 0 || (keywordList == nullptr && keywordListSize > 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    LocalMemory<UKeywordsContext> myContext;
    LocalMemory<UEnumeration> result;

    myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
    result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
    if (myContext.isNull() || result.isNull()) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));

    // Compute the size in size_t so that "+ 1" cannot overflow int32_t.
    const size_t listSize = static_cast<size_t>(keywordListSize);
    myContext->keywords = static_cast<char *>(uprv_malloc(listSize + 1));
    if (myContext->keywords == nullptr) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    // Skip the copy for an empty list: the source pointer may be null then.
    if (listSize > 0) {
        uprv_memcpy(myContext->keywords, keywordList, listSize);
    }
    myContext->keywords[listSize] = 0;  // end-of-list marker
    myContext->current = myContext->keywords;

    // Ownership of both allocations passes to the returned enumeration.
    result->context = myContext.orphan();
    return result.orphan();
}
1738
1739
/**
 * Opens an enumeration over the keywords of a locale ID.
 *
 * A nullptr localeID selects the default locale. An ID with a BCP47 extension
 * is first converted with ulocimp_forLanguageTag; the original ID is used
 * when the conversion yields nothing.
 *
 * @param localeID the locale ID, or nullptr for the default locale.
 * @param status   ICU error code.
 * @return the enumeration, or nullptr when the ID has no keywords or an error
 *         occurred.
 */
U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char* localeID,
                        UErrorCode* status)
{
    if (status == nullptr || U_FAILURE(*status)) {
        return nullptr;
    }

    CharString tempBuffer;  // owns the converted ID when a conversion happened
    const char* tmpLocaleID = nullptr;

    if (localeID == nullptr) {
        tmpLocaleID = uloc_getDefault();
    } else if (_hasBCP47Extension(localeID)) {
        tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, *status);
        if (U_SUCCESS(*status) && !tempBuffer.isEmpty()) {
            tmpLocaleID = tempBuffer.data();
        } else {
            tmpLocaleID = localeID;
        }
    } else {
        tmpLocaleID = localeID;
    }

    // Only the end position of the subtags is wanted here.
    ulocimp_getSubtags(
            tmpLocaleID,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            &tmpLocaleID,
            *status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    /* keywords are located after '@' */
    const char* const keywordsStart = locale_getKeywordsStart(tmpLocaleID);
    if (keywordsStart == nullptr) {
        return nullptr;
    }

    CharString keywords = ulocimp_getKeywords(keywordsStart + 1, '@', false, *status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }
    return uloc_openKeywordList(keywords.data(), keywords.length(), status);
}
1782
1783
1784
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_CANONICALIZE   0x1  /* canonicalize to level 2 instead of level 1 */
#define _ULOC_STRIP_KEYWORDS 0x2  /* leave the keywords out of the result */
1787
1788
namespace {
1789
1790
26.6M
/** Tells whether any bit of 'mask' is set in 'options'. */
inline bool OPTION_SET(uint32_t options, uint32_t mask) {
    const uint32_t selected = options & mask;
    return selected != 0;
}
1791
1792
constexpr char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
1793
constexpr int32_t I_DEFAULT_LENGTH = UPRV_LENGTHOF(i_default);
1794
1795
/**
1796
 * Canonicalize the given localeID, to level 1 or to level 2,
1797
 * depending on the options.  To specify level 1, pass in options=0.
1798
 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1799
 *
1800
 * This is the code underlying uloc_getName and uloc_canonicalize.
1801
 */
1802
void
1803
_canonicalize(std::string_view localeID,
1804
              ByteSink& sink,
1805
              uint32_t options,
1806
6.66M
              UErrorCode& err) {
1807
6.66M
    if (U_FAILURE(err)) {
1808
0
        return;
1809
0
    }
1810
1811
6.66M
    int32_t j, fieldCount=0;
1812
6.66M
    CharString tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
1813
6.66M
    CharString localeIDWithHyphens;  // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
1814
6.66M
    std::string_view origLocaleID;
1815
6.66M
    std::string_view tmpLocaleID;
1816
6.66M
    size_t keywordAssign = std::string_view::npos;
1817
6.66M
    size_t separatorIndicator = std::string_view::npos;
1818
1819
6.66M
    if (_hasBCP47Extension(localeID)) {
1820
220k
        std::string_view localeIDPtr = localeID;
1821
1822
        // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
1823
220k
        if (localeID.size() >= 2 && localeID.find('_') != std::string_view::npos && localeID[1] != '-' && localeID[1] != '_') {
1824
98.5k
            localeIDWithHyphens.append(localeID, err);
1825
98.5k
            if (U_SUCCESS(err)) {
1826
217M
                for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
1827
217M
                    if (*p == '_') {
1828
4.60M
                        *p = '-';
1829
4.60M
                    }
1830
217M
                }
1831
98.5k
                localeIDPtr = localeIDWithHyphens.toStringPiece();
1832
98.5k
            }
1833
98.5k
        }
1834
1835
220k
        tempBuffer = ulocimp_forLanguageTag(localeIDPtr.data(), static_cast<int32_t>(localeIDPtr.size()), nullptr, err);
1836
220k
        tmpLocaleID = U_SUCCESS(err) && !tempBuffer.isEmpty() ? static_cast<std::string_view>(tempBuffer.toStringPiece()) : localeIDPtr;
1837
6.44M
    } else {
1838
6.44M
        tmpLocaleID=localeID;
1839
6.44M
    }
1840
1841
6.66M
    origLocaleID=tmpLocaleID;
1842
1843
    /* get all pieces, one after another, and separate with '_' */
1844
6.66M
    CharString tag;
1845
6.66M
    CharString script;
1846
6.66M
    CharString country;
1847
6.66M
    CharString variant;
1848
6.66M
    const char* end = nullptr;
1849
6.66M
    ulocimp_getSubtags(
1850
6.66M
            tmpLocaleID,
1851
6.66M
            &tag,
1852
6.66M
            &script,
1853
6.66M
            &country,
1854
6.66M
            &variant,
1855
6.66M
            &end,
1856
6.66M
            err);
1857
6.66M
    if (U_FAILURE(err)) {
1858
5.73k
        return;
1859
5.73k
    }
1860
6.65M
    U_ASSERT(end != nullptr);
1861
6.65M
    if (end > tmpLocaleID.data()) {
1862
6.19M
        tmpLocaleID.remove_prefix(end - tmpLocaleID.data());
1863
6.19M
    }
1864
1865
6.65M
    if (tag.length() == I_DEFAULT_LENGTH && origLocaleID.length() >= I_DEFAULT_LENGTH &&
1866
6.65M
            uprv_strncmp(origLocaleID.data(), i_default, I_DEFAULT_LENGTH) == 0) {
1867
271
        tag.clear();
1868
271
        tag.append(uloc_getDefault(), err);
1869
6.65M
    } else {
1870
6.65M
        if (!script.isEmpty()) {
1871
471k
            ++fieldCount;
1872
471k
            tag.append('_', err);
1873
471k
            tag.append(script, err);
1874
471k
        }
1875
6.65M
        if (!country.isEmpty()) {
1876
4.88M
            ++fieldCount;
1877
4.88M
            tag.append('_', err);
1878
4.88M
            tag.append(country, err);
1879
4.88M
        }
1880
6.65M
        if (!variant.isEmpty()) {
1881
2.90M
            ++fieldCount;
1882
2.90M
            if (country.isEmpty()) {
1883
132k
                tag.append('_', err);
1884
132k
            }
1885
2.90M
            tag.append('_', err);
1886
2.90M
            tag.append(variant, err);
1887
2.90M
        }
1888
6.65M
    }
1889
1890
    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1891
6.65M
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && !tmpLocaleID.empty() && tmpLocaleID.front() == '.') {
1892
16.2k
        tag.append('.', err);
1893
16.2k
        tmpLocaleID.remove_prefix(1);
1894
16.2k
        size_t length;
1895
16.2k
        if (size_t atPos = tmpLocaleID.find('@'); atPos != std::string_view::npos) {
1896
3.66k
            length = atPos;
1897
12.5k
        } else {
1898
12.5k
            length = tmpLocaleID.length();
1899
12.5k
        }
1900
        // The longest charset name we found in IANA charset registry
1901
        // https://www.iana.org/assignments/character-sets/ is
1902
        // "Extended_UNIX_Code_Packed_Format_for_Japanese" in length 45.
1903
        // we therefore restrict the length here to be 64 which is a power of 2
1904
        // number that is longer than 45.
1905
16.2k
        constexpr size_t kMaxCharsetLength = 64;
1906
16.2k
        if (length > kMaxCharsetLength) {
1907
267
           err = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1908
267
           return;
1909
267
        }
1910
15.9k
        if (length > 0) {
1911
13.0k
            tag.append(tmpLocaleID.data(), static_cast<int32_t>(length), err);
1912
13.0k
            tmpLocaleID.remove_prefix(length);
1913
13.0k
        }
1914
15.9k
    }
1915
1916
    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1917
       After this, tmpLocaleID either starts at '@' or is empty. */
1918
6.65M
    if (const char* start = locale_getKeywordsStart(tmpLocaleID); start != nullptr) {
1919
224k
        if (start > tmpLocaleID.data()) {
1920
1.94k
            tmpLocaleID.remove_prefix(start - tmpLocaleID.data());
1921
1.94k
        }
1922
224k
        keywordAssign = tmpLocaleID.find('=');
1923
224k
        separatorIndicator = tmpLocaleID.find(';');
1924
6.43M
    } else {
1925
6.43M
        tmpLocaleID = {};
1926
6.43M
    }
1927
1928
    /* Copy POSIX-style variant, if any [mr@FOO] */
1929
6.65M
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1930
6.54M
        !tmpLocaleID.empty() && keywordAssign == std::string_view::npos) {
1931
22.0k
        tag.append(tmpLocaleID, err);
1932
22.0k
        tmpLocaleID = {};
1933
22.0k
    }
1934
1935
6.65M
    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1936
        /* Handle @FOO variant if @ is present and not followed by = */
1937
113k
        if (!tmpLocaleID.empty() && keywordAssign == std::string_view::npos) {
1938
            /* Add missing '_' if needed */
1939
4.71k
            if (fieldCount < 2 || (fieldCount < 3 && !script.isEmpty())) {
1940
6.28k
                do {
1941
6.28k
                    tag.append('_', err);
1942
6.28k
                    ++fieldCount;
1943
6.28k
                } while(fieldCount<2);
1944
4.41k
            }
1945
1946
4.71k
            CharStringByteSink s(&tag);
1947
4.71k
            std::string_view sub = tmpLocaleID;
1948
4.71k
            sub.remove_prefix(1);
1949
4.71k
            _getVariant(sub, '@', &s, !variant.isEmpty(), err);
1950
4.71k
            if (U_FAILURE(err)) { return; }
1951
4.71k
        }
1952
1953
        /* Look up the ID in the canonicalization map */
1954
1.24M
        for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1955
1.13M
            StringPiece id(CANONICALIZE_MAP[j].id);
1956
1.13M
            if (tag == id) {
1957
75
                if (id.empty() && !tmpLocaleID.empty()) {
1958
0
                    break; /* Don't remap "" if keywords present */
1959
0
                }
1960
75
                tag.clear();
1961
75
                tag.append(CANONICALIZE_MAP[j].canonicalID, err);
1962
75
                break;
1963
75
            }
1964
1.13M
        }
1965
113k
    }
1966
1967
6.65M
    sink.Append(tag.data(), tag.length());
1968
1969
6.65M
    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1970
2.82M
        if (!tmpLocaleID.empty() && keywordAssign != std::string_view::npos &&
1971
127k
            (separatorIndicator == std::string_view::npos || separatorIndicator > keywordAssign)) {
1972
126k
            sink.Append("@", 1);
1973
126k
            ++fieldCount;
1974
126k
            tmpLocaleID.remove_prefix(1);
1975
126k
            ulocimp_getKeywords(tmpLocaleID, '@', sink, true, err);
1976
126k
        }
1977
2.82M
    }
1978
6.65M
}
1979
1980
}  // namespace
1981
1982
/* ### ID parsing API **************************************************/
1983
1984
/**
 * C API wrapper: computes the parent of localeID and hands it to the caller's
 * buffer (parent / parentCapacity).
 *
 * The actual work is done by the ByteSink overload of ulocimp_getParent();
 * ByteSinkUtil::viaByteSinkToTerminatedChars() provides the sink over the
 * caller's buffer and produces the value returned here.
 */
U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char*    localeID,
               char* parent,
               int32_t parentCapacity,
               UErrorCode* err)
{
    auto writeParent = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getParent(localeID, out, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        parent, parentCapacity, writeParent, *err);
}
1997
1998
/**
 * Convenience overload: returns the parent of localeID as a CharString by
 * running the ByteSink overload through ByteSinkUtil::viaByteSinkToCharString().
 */
U_EXPORT CharString
ulocimp_getParent(const char* localeID,
                  UErrorCode& err)
{
    auto writeParent = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getParent(localeID, out, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeParent, err);
}
2008
2009
/**
 * Appends the parent of localeID to sink: everything before the last '_'.
 *
 * - Does nothing if err already indicates failure.
 * - A null localeID is replaced by uloc_getDefault().
 * - Nothing is appended when there is no '_' or when the last '_' is the
 *   first character.
 * - If the ID starts with "und_" (compared case-insensitively), the leading
 *   three characters are dropped, so the appended text starts at that '_'.
 */
U_EXPORT void
ulocimp_getParent(const char* localeID,
                  icu::ByteSink& sink,
                  UErrorCode& err)
{
    if (U_FAILURE(err)) { return; }

    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }

    const char* const separator = uprv_strrchr(localeID, '_');
    if (separator == nullptr) { return; }

    int32_t parentLength = static_cast<int32_t>(separator - localeID);
    if (parentLength <= 0) { return; }

    if (uprv_strnicmp(localeID, "und_", 4) == 0) {
        // Skip "und" but keep the underscore that follows it.
        localeID += 3;
        parentLength -= 3;
    }
    sink.Append(localeID, parentLength);
}
2037
2038
/**
 * C API wrapper: extracts the language subtag of localeID into the caller's
 * buffer. A null localeID means the default locale (uloc_getDefault()).
 *
 * Note kept from the original code: the result is a 2 character iso-639 code
 * if one exists (that logic lives in ulocimp_getSubtags, not here).
 */
U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char*    localeID,
         char* language,
         int32_t languageCapacity,
         UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    // Only the language slot receives a sink; all other subtags are skipped.
    auto writeLanguage = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getSubtags(id, &out, nullptr, nullptr, nullptr, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        language, languageCapacity, writeLanguage, *err);
}
2063
2064
/**
 * C API wrapper: extracts the script subtag of localeID into the caller's
 * buffer. A null localeID means the default locale (uloc_getDefault()).
 */
U_CAPI int32_t U_EXPORT2
uloc_getScript(const char*    localeID,
         char* script,
         int32_t scriptCapacity,
         UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    // Only the script slot (second subtag argument) receives a sink.
    auto writeScript = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getSubtags(id, nullptr, &out, nullptr, nullptr, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        script, scriptCapacity, writeScript, *err);
}
2088
2089
/**
 * C API wrapper: extracts the country/region subtag of localeID into the
 * caller's buffer. A null localeID means the default locale
 * (uloc_getDefault()).
 */
U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char* localeID,
            char* country,
            int32_t countryCapacity,
            UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    // Only the region slot (third subtag argument) receives a sink.
    auto writeCountry = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getSubtags(id, nullptr, nullptr, &out, nullptr, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        country, countryCapacity, writeCountry, *err);
}
2113
2114
/**
 * C API wrapper: extracts the variant subtag of localeID into the caller's
 * buffer. A null localeID means the default locale (uloc_getDefault()).
 */
U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char* localeID,
                char* variant,
                int32_t variantCapacity,
                UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    // Only the variant slot (fourth subtag argument) receives a sink.
    auto writeVariant = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getSubtags(id, nullptr, nullptr, nullptr, &out, nullptr, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        variant, variantCapacity, writeVariant, *err);
}
2138
2139
/**
 * C API wrapper around ulocimp_getName(): writes the full name of localeID
 * into the caller's buffer. A null localeID means the default locale
 * (uloc_getDefault()).
 */
U_CAPI int32_t  U_EXPORT2
uloc_getName(const char* localeID,
             char* name,
             int32_t nameCapacity,
             UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeName = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getName(id, out, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity, writeName, *err);
}
2155
2156
/**
 * Convenience overload: returns the full name of localeID as a CharString by
 * running the ByteSink overload through ByteSinkUtil::viaByteSinkToCharString().
 */
U_EXPORT CharString
ulocimp_getName(std::string_view localeID,
                UErrorCode& err)
{
    auto writeName = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getName(localeID, out, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeName, err);
}
2166
2167
/**
 * Appends the full name of localeID to sink.
 *
 * Implemented as _canonicalize() with no option bits set: keywords are kept
 * (no _ULOC_STRIP_KEYWORDS) and the canonicalization-map lookup is not
 * performed (no _ULOC_CANONICALIZE).
 */
U_EXPORT void
ulocimp_getName(std::string_view localeID,
                ByteSink& sink,
                UErrorCode& err)
{
    _canonicalize(localeID, sink, /* options= */ 0, err);
}
2174
2175
/**
 * C API wrapper around ulocimp_getBaseName(): writes the base name of
 * localeID (the name without its keyword section) into the caller's buffer.
 * A null localeID means the default locale (uloc_getDefault()).
 */
U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char* localeID,
                 char* name,
                 int32_t nameCapacity,
                 UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeBaseName = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getBaseName(id, out, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity, writeBaseName, *err);
}
2191
2192
/**
 * Convenience overload: returns the base name of localeID as a CharString by
 * running the ByteSink overload through ByteSinkUtil::viaByteSinkToCharString().
 */
U_EXPORT CharString
ulocimp_getBaseName(std::string_view localeID,
                    UErrorCode& err)
{
    auto writeBaseName = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_getBaseName(localeID, out, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeBaseName, err);
}
2202
2203
/**
 * Appends the base name of localeID to sink.
 *
 * Implemented as _canonicalize() with _ULOC_STRIP_KEYWORDS, which suppresses
 * the trailing "@keyword=value" section of the output.
 */
U_EXPORT void
ulocimp_getBaseName(std::string_view localeID,
                    ByteSink& sink,
                    UErrorCode& err)
{
    _canonicalize(localeID, sink, /* options= */ _ULOC_STRIP_KEYWORDS, err);
}
2210
2211
/**
 * C API wrapper around ulocimp_canonicalize(): writes the canonicalized form
 * of localeID into the caller's buffer. A null localeID means the default
 * locale (uloc_getDefault()).
 */
U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char* localeID,
                  char* name,
                  int32_t nameCapacity,
                  UErrorCode* err)
{
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    auto writeCanonical = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_canonicalize(id, out, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity, writeCanonical, *err);
}
2227
2228
/**
 * Convenience overload: returns the canonicalized form of localeID as a
 * CharString by running the ByteSink overload through
 * ByteSinkUtil::viaByteSinkToCharString().
 */
U_EXPORT CharString
ulocimp_canonicalize(std::string_view localeID,
                     UErrorCode& err)
{
    auto writeCanonical = [&](ByteSink& out, UErrorCode& errorCode) {
        ulocimp_canonicalize(localeID, out, errorCode);
    };
    return ByteSinkUtil::viaByteSinkToCharString(writeCanonical, err);
}
2238
2239
/**
 * Appends the canonicalized form of localeID to sink.
 *
 * Implemented as _canonicalize() with _ULOC_CANONICALIZE, which enables the
 * "@VARIANT" handling and the CANONICALIZE_MAP lookup; keywords are kept.
 */
U_EXPORT void
ulocimp_canonicalize(std::string_view localeID,
                     ByteSink& sink,
                     UErrorCode& err)
{
    _canonicalize(localeID, sink, /* options= */ _ULOC_CANONICALIZE, err);
}
2246
2247
/**
 * Returns the LANGUAGES_3 entry that sits at the same index as localeID's
 * language in LANGUAGES.
 *
 * A null localeID means the default locale (uloc_getDefault()). The empty
 * string is returned when the language cannot be extracted or is not found
 * in LANGUAGES.
 */
U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char* localeID)
{
    UErrorCode status = U_ZERO_ERROR;
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    CharString language = ulocimp_getLanguage(id, status);
    if (U_FAILURE(status)) { return ""; }

    const std::optional<int16_t> index = _findIndex(LANGUAGES, language.data());
    if (!index.has_value()) { return ""; }
    return LANGUAGES_3[*index];
}
2262
2263
/**
 * Returns the COUNTRIES_3 entry that sits at the same index as localeID's
 * region in COUNTRIES.
 *
 * A null localeID means the default locale (uloc_getDefault()). The empty
 * string is returned when the region cannot be extracted or is not found in
 * COUNTRIES.
 */
U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char* localeID)
{
    UErrorCode status = U_ZERO_ERROR;
    const char* const id = (localeID != nullptr) ? localeID : uloc_getDefault();

    CharString region = ulocimp_getRegion(id, status);
    if (U_FAILURE(status)) { return ""; }

    const std::optional<int16_t> index = _findIndex(COUNTRIES, region.data());
    if (!index.has_value()) { return ""; }
    return COUNTRIES_3[*index];
}
2278
2279
/**
 * Maps localeID to a Windows LCID.
 *
 * Returns 0 when localeID is null or shorter than two characters, or when the
 * platform lookup or the language extraction reports a failure.
 *
 * Lookup order:
 *  1. uprv_convertToLCIDPlatform(); a non-zero answer is returned directly.
 *  2. If the ID contains '@' and has a non-empty "collation" keyword, the
 *     base name plus only that keyword is passed to uprv_convertToLCID().
 *  3. Otherwise (or if step 2 failed) the ID is passed to
 *     uprv_convertToLCID() as given, after resetting the error status.
 */
U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char* localeID)
{
    /* Check for incomplete id. */
    if (localeID == nullptr || uprv_strlen(localeID) < 2) {
        return 0;
    }

    UErrorCode status = U_ZERO_ERROR;

    // First, attempt Windows platform lookup if available, but fall
    // through to catch any special cases (ICU vs Windows name differences).
    const uint32_t platformLcid = uprv_convertToLCIDPlatform(localeID, &status);
    if (U_FAILURE(status)) { return 0; }
    if (platformLcid > 0) {
        // Windows found an LCID, return that
        return platformLcid;
    }

    CharString language = ulocimp_getLanguage(localeID, status);
    if (U_FAILURE(status)) { return 0; }

    if (uprv_strchr(localeID, '@') != nullptr) {
        // uprv_convertToLCID does not support keywords other than collation.
        // Remove all keywords except collation.
        CharString collation = ulocimp_getKeywordValue(localeID, "collation", status);
        if (U_SUCCESS(status) && !collation.isEmpty()) {
            CharString collationOnlyID = ulocimp_getBaseName(localeID, status);
            ulocimp_setKeywordValue("collation", collation.toStringPiece(), collationOnlyID, status);
            if (U_SUCCESS(status)) {
                return uprv_convertToLCID(language.data(), collationOnlyID.data(), &status);
            }
        }

        // fall through - all keywords are simply ignored
        status = U_ZERO_ERROR;
    }

    return uprv_convertToLCID(language.data(), localeID, &status);
}
2324
2325
/**
 * C API wrapper: converts the Windows LCID hostid to a locale ID written into
 * locale / localeCapacity. All arguments are forwarded unchanged to
 * uprv_convertToPosix(), whose result is returned.
 */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                UErrorCode *status)
{
    const int32_t result = uprv_convertToPosix(hostid, locale, localeCapacity, status);
    return result;
}
2331
2332
/* ### Default locale **************************************************/
2333
2334
/**
 * C API accessor for the default locale ID; simply forwards to
 * locale_get_default().
 */
U_CAPI const char*  U_EXPORT2
uloc_getDefault()
{
    const char* const defaultID = locale_get_default();
    return defaultID;
}
2339
2340
/**
 * C API setter for the default locale.
 *
 * Does nothing if *err already indicates failure. Otherwise the new default
 * is propagated to the C++ side via locale_set_default(). The error code is
 * only read here, never written.
 */
U_CAPI void  U_EXPORT2
uloc_setDefault(const char*   newDefaultLocale,
             UErrorCode* err)
{
    if (U_SUCCESS(*err)) {
        /* propagate change to C++ */
        locale_set_default(newDefaultLocale);
    }
}
2351
2352
/**
2353
 * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2354
 * to an array of pointers to arrays of char.  All of these pointers are owned
2355
 * by ICU-- do not delete them, and do not write through them.  The array is
2356
 * terminated with a null pointer.
2357
 */
2358
U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()
{
    // Exposes the file's static LANGUAGES table directly; nothing is copied.
    const char* const* const table = LANGUAGES;
    return table;
}
2363
2364
/**
2365
 * Returns a list of all 2-letter country codes defined in ISO 3166.  This is a
2366
 * pointer to an array of pointers to arrays of char.  All of these pointers are
2367
 * owned by ICU-- do not delete them, and do not write through them.  The array is
2368
 * terminated with a null pointer.
2369
 */
2370
U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()
{
    // Exposes the file's static COUNTRIES table directly; nothing is copied.
    const char* const* const table = COUNTRIES;
    return table;
}
2375
2376
/**
 * C API wrapper around ulocimp_toBcpKeyWithFallback().
 *
 * Returns nullptr for a null or empty keyword, or when no BCP 47 key is
 * available for it.
 */
U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char* keyword)
{
    const bool hasKeyword = keyword != nullptr && *keyword != '\0';
    if (!hasKeyword) { return nullptr; }

    const std::optional<std::string_view> bcpKey = ulocimp_toBcpKeyWithFallback(keyword);
    if (!bcpKey.has_value()) { return nullptr; }

    // Handing out data() as a C string relies on the view being NUL
    // terminated, which the original code notes as known to hold.
    return bcpKey->data();
}
2383
2384
/**
 * Maps keyword to a BCP 47 key.
 *
 * Returns the result of ulocimp_toBcpKey() when it has one. Otherwise, if
 * keyword itself is accepted by ultag_isUnicodeLocaleKey(), the keyword is
 * returned unchanged (the returned view aliases the argument). In all other
 * cases the result is empty.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toBcpKeyWithFallback(std::string_view keyword)
{
    std::optional<std::string_view> mapped = ulocimp_toBcpKey(keyword);
    if (mapped.has_value()) { return mapped; }

    const int32_t keywordLength = static_cast<int32_t>(keyword.size());
    if (ultag_isUnicodeLocaleKey(keyword.data(), keywordLength)) {
        // No known mapping, but the keyword is syntactically fine as-is.
        return keyword;
    }
    return std::nullopt;
}
2395
2396
/**
 * C API wrapper around ulocimp_toBcpTypeWithFallback().
 *
 * Returns nullptr when keyword or value is null or empty, or when no BCP 47
 * type is available for the pair.
 */
U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char* keyword, const char* value)
{
    const bool hasKeyword = keyword != nullptr && *keyword != '\0';
    if (!hasKeyword) { return nullptr; }
    const bool hasValue = value != nullptr && *value != '\0';
    if (!hasValue) { return nullptr; }

    const std::optional<std::string_view> bcpType = ulocimp_toBcpTypeWithFallback(keyword, value);
    if (!bcpType.has_value()) { return nullptr; }

    // Handing out data() as a C string relies on the view being NUL
    // terminated, which the original code notes as known to hold.
    return bcpType->data();
}
2404
2405
/**
 * Maps a (keyword, value) pair to a BCP 47 type.
 *
 * Returns the result of ulocimp_toBcpType() when it has one. Otherwise, if
 * value itself is accepted by ultag_isUnicodeLocaleType(), the value is
 * returned unchanged (the returned view aliases the argument). In all other
 * cases the result is empty.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toBcpTypeWithFallback(std::string_view keyword, std::string_view value)
{
    std::optional<std::string_view> mapped = ulocimp_toBcpType(keyword, value);
    if (mapped.has_value()) { return mapped; }

    const int32_t valueLength = static_cast<int32_t>(value.size());
    if (ultag_isUnicodeLocaleType(value.data(), valueLength)) {
        // No known mapping, but the type value is syntactically fine as-is.
        return value;
    }
    return std::nullopt;
}
2416
2417
namespace {
2418
2419
bool
2420
isWellFormedLegacyKey(std::string_view key)
2421
16.6M
{
2422
16.6M
    return std::all_of(key.begin(), key.end(), UPRV_ISALPHANUM);
2423
16.6M
}
2424
2425
bool
2426
isWellFormedLegacyType(std::string_view legacyType)
2427
85.5k
{
2428
85.5k
    int32_t alphaNumLen = 0;
2429
49.7M
    for (char c : legacyType) {
2430
49.7M
        if (c == '_' || c == '/' || c == '-') {
2431
10.3M
            if (alphaNumLen == 0) {
2432
0
                return false;
2433
0
            }
2434
10.3M
            alphaNumLen = 0;
2435
39.3M
        } else if (UPRV_ISALPHANUM(c)) {
2436
39.3M
            alphaNumLen++;
2437
39.3M
        } else {
2438
0
            return false;
2439
0
        }
2440
49.7M
    }
2441
85.5k
    return alphaNumLen != 0;
2442
85.5k
}
2443
2444
}  // namespace
2445
2446
/**
 * C API wrapper around ulocimp_toLegacyKeyWithFallback().
 *
 * Returns nullptr for a null or empty keyword, or when no legacy key is
 * available for it.
 */
U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char* keyword)
{
    const bool hasKeyword = keyword != nullptr && *keyword != '\0';
    if (!hasKeyword) { return nullptr; }

    const std::optional<std::string_view> legacyKey = ulocimp_toLegacyKeyWithFallback(keyword);
    if (!legacyKey.has_value()) { return nullptr; }

    // Handing out data() as a C string relies on the view being NUL
    // terminated, which the original code notes as known to hold.
    return legacyKey->data();
}
2453
2454
/**
 * Maps keyword to a legacy (old-style) locale key.
 *
 * Returns the result of ulocimp_toLegacyKey() when it has one. Otherwise, if
 * keyword passes isWellFormedLegacyKey(), the keyword is returned unchanged
 * (the returned view aliases the argument). In all other cases the result is
 * empty.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyKeyWithFallback(std::string_view keyword)
{
    std::optional<std::string_view> mapped = ulocimp_toLegacyKey(keyword);
    if (mapped.has_value()) { return mapped; }

    // Fallback: accept the keyword as-is if it is well-formed with the legacy
    // locale syntax.
    //
    // Note:
    //  LDML/CLDR provides some definition of keyword syntax in
    //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
    //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
    //  Keys can only consist of [0-9a-zA-Z].
    if (isWellFormedLegacyKey(keyword)) {
        return keyword;
    }
    return std::nullopt;
}
2470
2471
/**
 * C API wrapper around ulocimp_toLegacyTypeWithFallback().
 *
 * Returns nullptr when keyword or value is null or empty, or when no legacy
 * type is available for the pair.
 */
U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char* keyword, const char* value)
{
    const bool hasKeyword = keyword != nullptr && *keyword != '\0';
    if (!hasKeyword) { return nullptr; }
    const bool hasValue = value != nullptr && *value != '\0';
    if (!hasValue) { return nullptr; }

    const std::optional<std::string_view> legacyType = ulocimp_toLegacyTypeWithFallback(keyword, value);
    if (!legacyType.has_value()) { return nullptr; }

    // Handing out data() as a C string relies on the view being NUL
    // terminated, which the original code notes as known to hold.
    return legacyType->data();
}
2479
2480
/**
 * Maps a (keyword, value) pair to a legacy (old-style) locale type.
 *
 * Returns the result of ulocimp_toLegacyType() when it has one. Otherwise, if
 * value passes isWellFormedLegacyType(), the value is returned unchanged (the
 * returned view aliases the argument). In all other cases the result is
 * empty.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyTypeWithFallback(std::string_view keyword, std::string_view value)
{
    std::optional<std::string_view> mapped = ulocimp_toLegacyType(keyword, value);
    if (mapped.has_value()) { return mapped; }

    // Fallback: accept the value as-is if it is well-formed with the legacy
    // locale syntax.
    //
    // Note:
    //  LDML/CLDR provides some definition of keyword syntax in
    //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
    //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
    //  Values (types) can only consist of [0-9a-zA-Z]; for legacy values the
    //  check used here additionally allows a single '/', '_' or '-' between
    //  alphanumeric runs (e.g. "Asia/Tel_Aviv").
    if (isWellFormedLegacyType(value)) {
        return value;
    }
    return std::nullopt;
}
2497
2498
/*eof*/