/src/icu/icu4c/source/common/ucasemap.cpp
Line | Count | Source |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * |
6 | | * Copyright (C) 2005-2016, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ******************************************************************************* |
10 | | * file name: ucasemap.cpp |
11 | | * encoding: UTF-8 |
12 | | * tab size: 8 (not used) |
13 | | * indentation:4 |
14 | | * |
15 | | * created on: 2005may06 |
16 | | * created by: Markus W. Scherer |
17 | | * |
18 | | * Case mapping service object and functions using it. |
19 | | */ |
20 | | |
21 | | #include "unicode/utypes.h" |
22 | | #include "unicode/brkiter.h" |
23 | | #include "unicode/bytestream.h" |
24 | | #include "unicode/casemap.h" |
25 | | #include "unicode/edits.h" |
26 | | #include "unicode/stringoptions.h" |
27 | | #include "unicode/stringpiece.h" |
28 | | #include "unicode/ubrk.h" |
29 | | #include "unicode/uloc.h" |
30 | | #include "unicode/ustring.h" |
31 | | #include "unicode/ucasemap.h" |
32 | | #if !UCONFIG_NO_BREAK_ITERATION |
33 | | #include "unicode/utext.h" |
34 | | #endif |
35 | | #include "unicode/utf.h" |
36 | | #include "unicode/utf8.h" |
37 | | #include "unicode/utf16.h" |
38 | | #include "bytesinkutil.h" |
39 | | #include "cmemory.h" |
40 | | #include "cstring.h" |
41 | | #include "uassert.h" |
42 | | #include "ucase.h" |
43 | | #include "ucasemap_imp.h" |
44 | | |
45 | | U_NAMESPACE_USE |
46 | | |
47 | | /* UCaseMap service object -------------------------------------------------- */ |
48 | | |
49 | | UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) : |
50 | | #if !UCONFIG_NO_BREAK_ITERATION |
51 | 7.58k | iter(nullptr), |
52 | | #endif |
53 | 7.58k | caseLocale(UCASE_LOC_UNKNOWN), options(opts) { |
54 | 7.58k | ucasemap_setLocale(this, localeID, pErrorCode); |
55 | 7.58k | } |
56 | | |
57 | 7.58k | UCaseMap::~UCaseMap() { |
58 | 7.58k | #if !UCONFIG_NO_BREAK_ITERATION |
59 | 7.58k | delete iter; |
60 | 7.58k | #endif |
61 | 7.58k | } |
62 | | |
63 | | U_CAPI UCaseMap * U_EXPORT2 |
64 | 7.58k | ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { |
65 | 7.58k | if(U_FAILURE(*pErrorCode)) { |
66 | 0 | return nullptr; |
67 | 0 | } |
68 | 7.58k | UCaseMap *csm = new UCaseMap(locale, options, pErrorCode); |
69 | 7.58k | if(csm==nullptr) { |
70 | 0 | *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
71 | 0 | return nullptr; |
72 | 7.58k | } else if (U_FAILURE(*pErrorCode)) { |
73 | 0 | delete csm; |
74 | 0 | return nullptr; |
75 | 0 | } |
76 | 7.58k | return csm; |
77 | 7.58k | } |
78 | | |
79 | | U_CAPI void U_EXPORT2 |
80 | 7.58k | ucasemap_close(UCaseMap *csm) { |
81 | 7.58k | delete csm; |
82 | 7.58k | } |
83 | | |
84 | | U_CAPI const char * U_EXPORT2 |
85 | 0 | ucasemap_getLocale(const UCaseMap *csm) { |
86 | 0 | return csm->locale; |
87 | 0 | } |
88 | | |
89 | | U_CAPI uint32_t U_EXPORT2 |
90 | 0 | ucasemap_getOptions(const UCaseMap *csm) { |
91 | 0 | return csm->options; |
92 | 0 | } |
93 | | |
94 | | U_CAPI void U_EXPORT2 |
95 | 7.58k | ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { |
96 | 7.58k | if(U_FAILURE(*pErrorCode)) { |
97 | 0 | return; |
98 | 0 | } |
99 | 7.58k | if (locale != nullptr && *locale == 0) { |
100 | 0 | csm->locale[0] = 0; |
101 | 0 | csm->caseLocale = UCASE_LOC_ROOT; |
102 | 0 | return; |
103 | 0 | } |
104 | | |
105 | 7.58k | UErrorCode bufferStatus = U_ZERO_ERROR; |
106 | 7.58k | int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), &bufferStatus); |
107 | 7.58k | if(bufferStatus==U_BUFFER_OVERFLOW_ERROR || (U_SUCCESS(bufferStatus) && length==sizeof(csm->locale))) { |
108 | 0 | bufferStatus = U_ZERO_ERROR; |
109 | | /* we only really need the language code for case mappings */ |
110 | 0 | length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), &bufferStatus); |
111 | 0 | } |
112 | 7.58k | if(U_FAILURE(bufferStatus)) { |
113 | 0 | *pErrorCode=bufferStatus; |
114 | 7.58k | } else if(length==sizeof(csm->locale)) { |
115 | 0 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
116 | 0 | } |
117 | 7.58k | if(U_SUCCESS(*pErrorCode)) { |
118 | 7.58k | csm->caseLocale = ucase_getCaseLocale(csm->locale); |
119 | 7.58k | } else { |
120 | 0 | csm->locale[0]=0; |
121 | 0 | csm->caseLocale = UCASE_LOC_ROOT; |
122 | 0 | } |
123 | 7.58k | } |
124 | | |
125 | | U_CAPI void U_EXPORT2 |
126 | 0 | ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) { |
127 | 0 | if(U_FAILURE(*pErrorCode)) { |
128 | 0 | return; |
129 | 0 | } |
130 | 0 | csm->options=options; |
131 | 0 | } |
132 | | |
133 | | /* UTF-8 string case mappings ----------------------------------------------- */ |
134 | | |
135 | | /* TODO(markus): Move to a new, separate utf8case.cpp file. */ |
136 | | |
137 | | namespace { |
138 | | |
139 | | /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ |
140 | | inline UBool |
141 | | appendResult(int32_t cpLength, int32_t result, const char16_t *s, |
142 | 30.3M | ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { |
143 | 30.3M | U_ASSERT(U_SUCCESS(errorCode)); |
144 | | |
145 | | /* decode the result */ |
146 | 30.3M | if(result<0) { |
147 | | /* (not) original code point */ |
148 | 27.4M | if(edits!=nullptr) { |
149 | 0 | edits->addUnchanged(cpLength); |
150 | 0 | } |
151 | 27.4M | if((options & U_OMIT_UNCHANGED_TEXT) == 0) { |
152 | 10.1M | ByteSinkUtil::appendCodePoint(cpLength, ~result, sink); |
153 | 10.1M | } |
154 | 27.4M | } else { |
155 | 2.88M | if(result<=UCASE_MAX_STRING_LENGTH) { |
156 | | // string: "result" is the UTF-16 length |
157 | 474k | return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode); |
158 | 2.41M | } else { |
159 | 2.41M | ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits); |
160 | 2.41M | } |
161 | 2.88M | } |
162 | 29.9M | return true; |
163 | 30.3M | } |
164 | | |
165 | | // See unicode/utf8.h U8_APPEND_UNSAFE(). |
166 | 297k | inline uint8_t getTwoByteLead(UChar32 c) { return static_cast<uint8_t>((c >> 6) | 0xc0); } |
167 | 259k | inline uint8_t getTwoByteTrail(UChar32 c) { return static_cast<uint8_t>((c & 0x3f) | 0x80); } |
168 | | |
169 | | UChar32 U_CALLCONV |
170 | 253k | utf8_caseContextIterator(void *context, int8_t dir) { |
171 | 253k | UCaseContext* csc = static_cast<UCaseContext*>(context); |
172 | 253k | UChar32 c; |
173 | | |
174 | 253k | if(dir<0) { |
175 | | /* reset for backward iteration */ |
176 | 87.3k | csc->index=csc->cpStart; |
177 | 87.3k | csc->dir=dir; |
178 | 166k | } else if(dir>0) { |
179 | | /* reset for forward iteration */ |
180 | 117k | csc->index=csc->cpLimit; |
181 | 117k | csc->dir=dir; |
182 | 117k | } else { |
183 | | /* continue current iteration direction */ |
184 | 48.2k | dir=csc->dir; |
185 | 48.2k | } |
186 | | |
187 | 253k | if(dir<0) { |
188 | 124k | if(csc->start<csc->index) { |
189 | 124k | U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c); |
190 | 124k | return c; |
191 | 124k | } |
192 | 129k | } else { |
193 | 129k | if(csc->index<csc->limit) { |
194 | 128k | U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c); |
195 | 128k | return c; |
196 | 128k | } |
197 | 129k | } |
198 | 518 | return U_SENTINEL; |
199 | 253k | } |
200 | | |
// toLower(): shared UTF-8 worker for lowercasing and for case folding.
//
// Scans src[srcStart..srcLimit[ one lead byte at a time. Text that needs no change is
// not copied right away: `prev` marks the start of the pending unchanged run, which is
// flushed through ByteSinkUtil::appendUnchanged() just before each changed piece is
// written and once more after the loop.
//
// Parameters:
//   caseLocale  >= 0 selects lowercasing for that case locale, < 0 selects case folding
//   options     passed to appendUnchanged()/appendResult() and to ucase_toFullFolding();
//               also decides the Latin table when folding
//   src         UTF-8 text
//   csc         context for utf8_caseContextIterator; cpStart/cpLimit are updated here
//               before each ucase_toFullLower() call
//   sink        receives the output bytes
//   edits       optional edit log, may be nullptr
//   errorCode   scanning stops as soon as this indicates a failure
201 | | /** |
202 | | * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. |
203 | | * caseLocale < 0: Case-folds [srcStart..srcLimit[. |
204 | | */ |
205 | | void toLower(int32_t caseLocale, uint32_t options, |
206 | | const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit, |
207 | 1.29M | icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { |
// Pick the lookup table for U+0000..U+017F: the normal table for the root locale,
// for lowercasing in any locale other than Turkish/Lithuanian, and for folding with
// default fold options; otherwise the TR/LT table.
208 | 1.29M | const int8_t *latinToLower; |
209 | 1.29M | if (caseLocale == UCASE_LOC_ROOT || |
210 | 314k | (caseLocale >= 0 ? |
211 | 314k | !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) : |
212 | 1.29M | (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) { |
213 | 1.29M | latinToLower = LatinCase::TO_LOWER_NORMAL; |
214 | 1.29M | } else { |
215 | 8.85k | latinToLower = LatinCase::TO_LOWER_TR_LT; |
216 | 8.85k | } |
217 | 1.29M | const UTrie2 *trie = ucase_getTrie(); |
218 | 1.29M | int32_t prev = srcStart; |
219 | 1.29M | int32_t srcIndex = srcStart; |
220 | 2.03M | for (;;) { |
221 | | // fast path for simple cases |
222 | 2.03M | int32_t cpStart; |
223 | 2.03M | UChar32 c; |
// Inner loop: leaves with c < 0 at the end of input or on error, or with c >= 0 and
// cpStart set when the code point needs the slow path below.
224 | 46.2M | for (;;) { |
225 | 46.2M | if (U_FAILURE(errorCode) || srcIndex >= srcLimit) { |
226 | 1.29M | c = U_SENTINEL; |
227 | 1.29M | break; |
228 | 1.29M | } |
229 | 44.9M | uint8_t lead = src[srcIndex++]; |
// ASCII: the table entry is either EXC (go to the slow path), 0 (no change),
// or a delta that is added to the byte.
230 | 44.9M | if (lead <= 0x7f) { |
231 | 27.1M | int8_t d = latinToLower[lead]; |
232 | 27.1M | if (d == LatinCase::EXC) { |
233 | 58.3k | cpStart = srcIndex - 1; |
234 | 58.3k | c = lead; |
235 | 58.3k | break; |
236 | 58.3k | } |
237 | 27.1M | if (d == 0) { continue; } |
238 | 3.50M | ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev, |
239 | 3.50M | sink, options, edits, errorCode); |
240 | 3.50M | char ascii = static_cast<char>(lead + d); |
241 | 3.50M | sink.Append(&ascii, 1); |
242 | 3.50M | if (edits != nullptr) { |
243 | 0 | edits->addReplace(1, 1); |
244 | 0 | } |
245 | 3.50M | prev = srcIndex; |
246 | 3.50M | continue; |
247 | 27.1M | } else if (lead < 0xe3) { |
// Two-byte sequences with lead 0xc2..0xc5 and a valid trail byte are handled with
// the same table; any other lead below 0xe3 falls through to the general case.
248 | 10.6M | uint8_t t; |
249 | 10.6M | if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit && |
250 | 711k | (t = src[srcIndex] - 0x80) <= 0x3f) { |
251 | | // U+0080..U+017F |
252 | 507k | ++srcIndex; |
253 | 507k | c = ((lead - 0xc0) << 6) | t; |
254 | 507k | int8_t d = latinToLower[c]; |
255 | 507k | if (d == LatinCase::EXC) { |
256 | 418k | cpStart = srcIndex - 2; |
257 | 418k | break; |
258 | 418k | } |
259 | 88.8k | if (d == 0) { continue; } |
260 | 47.6k | ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev, |
261 | 47.6k | sink, options, edits, errorCode); |
262 | 47.6k | ByteSinkUtil::appendTwoBytes(c + d, sink); |
263 | 47.6k | if (edits != nullptr) { |
264 | 0 | edits->addReplace(2, 2); |
265 | 0 | } |
266 | 47.6k | prev = srcIndex; |
267 | 47.6k | continue; |
268 | 88.8k | } |
269 | 10.6M | } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) && |
270 | 1.41M | (srcIndex + 2) <= srcLimit && |
271 | 1.40M | U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) { |
272 | | // most of CJK: no case mappings |
273 | 776k | srcIndex += 2; |
274 | 776k | continue; |
275 | 776k | } |
// General case: step back to the lead byte and decode a whole code point.
276 | 16.5M | cpStart = --srcIndex; |
277 | 16.5M | U8_NEXT(src, srcIndex, srcLimit, c); |
278 | 16.5M | if (c < 0) { |
279 | | // ill-formed UTF-8 |
280 | 10.7M | continue; |
281 | 10.7M | } |
282 | 5.74M | uint16_t props = UTRIE2_GET16(trie, c); |
283 | 5.74M | if (UCASE_HAS_EXCEPTION(props)) { break; } |
284 | 5.49M | int32_t delta; |
285 | 5.49M | if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) { |
286 | 5.38M | continue; |
287 | 5.38M | } |
288 | 108k | ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, |
289 | 108k | sink, options, edits, errorCode); |
290 | 108k | ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits); |
291 | 108k | prev = srcIndex; |
292 | 108k | } |
293 | 2.03M | if (c < 0) { |
294 | 1.29M | break; |
295 | 1.29M | } |
296 | | // slow path |
// A negative result from the full mapping function is not written here; those bytes
// stay in the pending unchanged run because `prev` is not advanced.
297 | 733k | const char16_t *s; |
298 | 733k | if (caseLocale >= 0) { |
299 | 720k | csc->cpStart = cpStart; |
300 | 720k | csc->cpLimit = srcIndex; |
301 | 720k | c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale); |
302 | 720k | } else { |
303 | 13.3k | c = ucase_toFullFolding(c, &s, options); |
304 | 13.3k | } |
305 | 733k | if (c >= 0) { |
306 | 570k | ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, |
307 | 570k | sink, options, edits, errorCode); |
308 | 570k | appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode); |
309 | 570k | prev = srcIndex; |
310 | 570k | } |
311 | 733k | } |
// Flush whatever unchanged text is still pending.
312 | 1.29M | ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev, |
313 | 1.29M | sink, options, edits, errorCode); |
314 | 1.29M | } |
315 | | |
// toUpper(): UTF-8 uppercasing of src[0..srcLength[.
//
// Uses the same scheme as toLower() in this file: a fast inner loop handles ASCII and
// U+0080..U+017F through a Latin lookup table, skips three-byte sequences with leads
// 0xe3..0xe9, 0xeb and 0xec, and applies simple trie deltas; everything else goes
// through ucase_toFullUpper(). Unchanged text is accumulated from `prev` and flushed
// with ByteSinkUtil::appendUnchanged() before each changed piece and after the loop.
//
// Parameters:
//   caseLocale  case locale; UCASE_LOC_TURKISH selects the TR Latin table
//   options     passed to appendUnchanged()/appendResult()
//   src         UTF-8 text
//   csc         context for utf8_caseContextIterator; cpStart/cpLimit are updated here
//   srcLength   number of bytes in src
//   sink        receives the output bytes
//   edits       optional edit log, may be nullptr
//   errorCode   scanning stops as soon as this indicates a failure
316 | | void toUpper(int32_t caseLocale, uint32_t options, |
317 | | const uint8_t *src, UCaseContext *csc, int32_t srcLength, |
318 | 677 | icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { |
319 | 677 | const int8_t *latinToUpper; |
320 | 677 | if (caseLocale == UCASE_LOC_TURKISH) { |
321 | 27 | latinToUpper = LatinCase::TO_UPPER_TR; |
322 | 650 | } else { |
323 | 650 | latinToUpper = LatinCase::TO_UPPER_NORMAL; |
324 | 650 | } |
325 | 677 | const UTrie2 *trie = ucase_getTrie(); |
326 | 677 | int32_t prev = 0; |
327 | 677 | int32_t srcIndex = 0; |
328 | 62.4k | for (;;) { |
329 | | // fast path for simple cases |
330 | 62.4k | int32_t cpStart; |
331 | 62.4k | UChar32 c; |
// Inner loop: leaves with c < 0 at the end of input or on error, or with c >= 0 and
// cpStart set when the code point needs the slow path below.
332 | 13.6M | for (;;) { |
333 | 13.6M | if (U_FAILURE(errorCode) || srcIndex >= srcLength) { |
334 | 677 | c = U_SENTINEL; |
335 | 677 | break; |
336 | 677 | } |
337 | 13.6M | uint8_t lead = src[srcIndex++]; |
// ASCII: the table entry is either EXC (go to the slow path), 0 (no change),
// or a delta that is added to the byte.
338 | 13.6M | if (lead <= 0x7f) { |
339 | 8.38M | int8_t d = latinToUpper[lead]; |
340 | 8.38M | if (d == LatinCase::EXC) { |
341 | 440 | cpStart = srcIndex - 1; |
342 | 440 | c = lead; |
343 | 440 | break; |
344 | 440 | } |
345 | 8.38M | if (d == 0) { continue; } |
346 | 911k | ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev, |
347 | 911k | sink, options, edits, errorCode); |
348 | 911k | char ascii = static_cast<char>(lead + d); |
349 | 911k | sink.Append(&ascii, 1); |
350 | 911k | if (edits != nullptr) { |
351 | 0 | edits->addReplace(1, 1); |
352 | 0 | } |
353 | 911k | prev = srcIndex; |
354 | 911k | continue; |
355 | 8.38M | } else if (lead < 0xe3) { |
// Two-byte sequences with lead 0xc2..0xc5 and a valid trail byte are handled with
// the same table; any other lead below 0xe3 falls through to the general case.
356 | 3.37M | uint8_t t; |
357 | 3.37M | if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength && |
358 | 126k | (t = src[srcIndex] - 0x80) <= 0x3f) { |
359 | | // U+0080..U+017F |
360 | 32.7k | ++srcIndex; |
361 | 32.7k | c = ((lead - 0xc0) << 6) | t; |
362 | 32.7k | int8_t d = latinToUpper[c]; |
363 | 32.7k | if (d == LatinCase::EXC) { |
364 | 854 | cpStart = srcIndex - 2; |
365 | 854 | break; |
366 | 854 | } |
367 | 31.9k | if (d == 0) { continue; } |
368 | 11.2k | ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev, |
369 | 11.2k | sink, options, edits, errorCode); |
370 | 11.2k | ByteSinkUtil::appendTwoBytes(c + d, sink); |
371 | 11.2k | if (edits != nullptr) { |
372 | 0 | edits->addReplace(2, 2); |
373 | 0 | } |
374 | 11.2k | prev = srcIndex; |
375 | 11.2k | continue; |
376 | 31.9k | } |
377 | 3.37M | } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) && |
378 | 732k | (srcIndex + 2) <= srcLength && |
379 | 732k | U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) { |
380 | | // most of CJK: no case mappings |
381 | 293k | srcIndex += 2; |
382 | 293k | continue; |
383 | 293k | } |
// General case: step back to the lead byte and decode a whole code point.
384 | 4.92M | cpStart = --srcIndex; |
385 | 4.92M | U8_NEXT(src, srcIndex, srcLength, c); |
386 | 4.92M | if (c < 0) { |
387 | | // ill-formed UTF-8 |
388 | 4.08M | continue; |
389 | 4.08M | } |
390 | 842k | uint16_t props = UTRIE2_GET16(trie, c); |
391 | 842k | if (UCASE_HAS_EXCEPTION(props)) { break; } |
392 | 781k | int32_t delta; |
393 | 781k | if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) { |
394 | 709k | continue; |
395 | 709k | } |
396 | 72.8k | ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, |
397 | 72.8k | sink, options, edits, errorCode); |
398 | 72.8k | ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits); |
399 | 72.8k | prev = srcIndex; |
400 | 72.8k | } |
401 | 62.4k | if (c < 0) { |
402 | 677 | break; |
403 | 677 | } |
404 | | // slow path |
// A negative result from ucase_toFullUpper() is not written here; those bytes stay
// in the pending unchanged run because `prev` is not advanced.
405 | 61.8k | csc->cpStart = cpStart; |
406 | 61.8k | csc->cpLimit = srcIndex; |
407 | 61.8k | const char16_t *s; |
408 | 61.8k | c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale); |
409 | 61.8k | if (c >= 0) { |
410 | 34.1k | ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, |
411 | 34.1k | sink, options, edits, errorCode); |
412 | 34.1k | appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode); |
413 | 34.1k | prev = srcIndex; |
414 | 34.1k | } |
415 | 61.8k | } |
// Flush whatever unchanged text is still pending.
416 | 677 | ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev, |
417 | 677 | sink, options, edits, errorCode); |
418 | 677 | } |
419 | | |
420 | | } // namespace |
421 | | |
422 | | #if !UCONFIG_NO_BREAK_ITERATION |
423 | | |
424 | | namespace { |
425 | | |
// The two bytes of U+0301 in UTF-8, used when matching an acute accent
// byte-by-byte in maybeTitleDutchIJ().
constexpr uint8_t ACUTE_BYTE0 = static_cast<uint8_t>(u8"\u0301"[0]);
constexpr uint8_t ACUTE_BYTE1 = static_cast<uint8_t>(u8"\u0301"[1]);
429 | | |
// maybeTitleDutchIJ(): helper for the Dutch "IJ" special case in
// ucasemap_internalUTF8ToTitle(), which calls it after the first letter has been written.
//
// Nothing is appended to sink unless the whole pattern matches: every early
// `return start` happens before the first append. On a match, the bytes after the
// first letter are appended, with a lowercase j replaced by 'J'.
//
// Parameters:
//   src           UTF-8 text
//   c             the first letter as seen by the caller (u'I' or the precomposed form)
//   start         index in src right after the first letter
//   segmentLimit  end of the current segment; no byte at or beyond it is read
//   sink/options/edits/errorCode  forwarded to the ByteSinkUtil append functions
430 | | /** |
431 | | * Input: c is a letter I with or without acute accent. |
432 | | * start is the index in src after c, and is less than segmentLimit. |
433 | | * If a plain i/I is followed by a plain j/J, |
434 | | * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, |
435 | | * then we output accordingly. |
436 | | * |
437 | | * @return the src index after the titlecased sequence, or the start index if no Dutch IJ |
438 | | */ |
439 | | int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit, |
440 | 57.0k | ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { |
441 | 57.0k | U_ASSERT(start < segmentLimit); |
442 | | |
443 | 57.0k | int32_t index = start; |
444 | 57.0k | bool withAcute = false; |
445 | | |
446 | | // If the conditions are met, then the following variables tell us what to output. |
447 | 57.0k | int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) |
448 | 57.0k | bool doTitleJ = false; // true if the j needs to be titlecased |
449 | 57.0k | int32_t unchanged2 = 0; // after the j (0 or 1) |
450 | | |
451 | | // next character after the first letter |
452 | 57.0k | UChar32 c2; |
453 | 57.0k | c2 = src[index++]; |
454 | | |
455 | | // Is the first letter an i/I with accent? |
// Note: the condition below advances index as a side effect while testing the second
// accent byte. If that byte does not match, c2 still holds the first accent byte,
// so the j/J test further down fails and the function returns start.
456 | 57.0k | if (c == u'I') { |
457 | 56.7k | if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) { |
458 | 11.3k | withAcute = true; |
459 | 11.3k | unchanged1 = 2; // ACUTE is 2 code units in UTF-8 |
460 | 11.3k | if (index == segmentLimit) { return start; } |
461 | 11.0k | c2 = src[index++]; |
462 | 11.0k | } |
463 | 56.7k | } else { // Í |
464 | 294 | withAcute = true; |
465 | 294 | } |
466 | | |
467 | | // Is the next character a j/J? |
// A lowercase j will be replaced; an uppercase J is counted as part of the
// unchanged prefix instead.
468 | 56.6k | if (c2 == u'j') { |
469 | 19.7k | doTitleJ = true; |
470 | 36.9k | } else if (c2 == u'J') { |
471 | 23.6k | ++unchanged1; |
472 | 23.6k | } else { |
473 | 13.2k | return start; |
474 | 13.2k | } |
475 | | |
476 | | // A plain i/I must be followed by a plain j/J. |
477 | | // An i/I with acute must be followed by a j/J with acute. |
// The first test guarantees that two more bytes are available before the two
// index++ reads run.
478 | 43.4k | if (withAcute) { |
479 | 9.61k | if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) { |
480 | 6.17k | return start; |
481 | 6.17k | } |
482 | 3.44k | if (doTitleJ) { |
483 | 318 | unchanged2 = 2; // ACUTE is 2 code units in UTF-8 |
484 | 3.12k | } else { |
485 | 3.12k | unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8 |
486 | 3.12k | } |
487 | 3.44k | } |
488 | | |
489 | | // There must not be another combining mark. |
// Uses a copy of index so that the lookahead does not consume the next character.
490 | 37.2k | if (index < segmentLimit) { |
491 | 34.5k | int32_t cp; |
492 | 34.5k | int32_t i = index; |
493 | 34.5k | U8_NEXT(src, i, segmentLimit, cp); |
494 | 34.5k | uint32_t typeMask = U_GET_GC_MASK(cp); |
495 | 34.5k | if ((typeMask & U_GC_M_MASK) != 0) { |
496 | 542 | return start; |
497 | 542 | } |
498 | 34.5k | } |
499 | | |
500 | | // Output the rest of the Dutch IJ. |
501 | 36.7k | ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode); |
502 | 36.7k | start += unchanged1; |
503 | 36.7k | if (doTitleJ) { |
504 | 18.3k | ByteSinkUtil::appendCodePoint(1, u'J', sink, edits); |
505 | 18.3k | ++start; |
506 | 18.3k | } |
507 | 36.7k | ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode); |
508 | | |
509 | 36.7k | U_ASSERT(start + unchanged2 == index); |
510 | 36.7k | return index; |
511 | 37.2k | } |
512 | | |
513 | | } // namespace |
514 | | |
// ucasemap_internalUTF8ToTitle(): titlecases UTF-8 text segment by segment.
//
// Segment boundaries come from iter->first() and then iter->next(). Within each
// segment the leading characters that are skipped by the adjustment option are copied
// unchanged, one character is titlecased with ucase_toFullTitle(), and the remainder
// is either lowercased with toLower() or copied unchanged (U_TITLECASE_NO_LOWERCASE).
//
// Parameters:
//   caseLocale  case locale; UCASE_LOC_DUTCH additionally enables the IJ special case
//   options     titlecasing options; validated first by
//               ustrcase_checkTitleAdjustmentOptions(), and nothing is written if that fails
//   iter        break iterator; dereferenced without a null check.
//               NOTE(review): presumably already set to iterate over the same text as
//               src -- confirm against the callers.
//   src         UTF-8 text
//   srcLength   number of bytes in src
//   sink        receives the output bytes
//   edits       optional edit log, may be nullptr
//   errorCode   the function returns early when an append step reports failure
515 | | U_CFUNC void U_CALLCONV |
516 | | ucasemap_internalUTF8ToTitle( |
517 | | int32_t caseLocale, uint32_t options, BreakIterator *iter, |
518 | | const uint8_t *src, int32_t srcLength, |
519 | | ByteSink &sink, icu::Edits *edits, |
520 | 4.53k | UErrorCode &errorCode) { |
521 | 4.53k | if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) { |
522 | 20 | return; |
523 | 20 | } |
524 | | |
525 | | /* set up local variables */ |
526 | 4.51k | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
527 | 4.51k | csc.p=(void *)src; |
528 | 4.51k | csc.limit=srcLength; |
529 | 4.51k | int32_t prev=0; |
530 | 4.51k | UBool isFirstIndex=true; |
531 | | |
532 | | /* titlecasing loop */ |
533 | 53.7M | while(prev<srcLength) { |
534 | | /* find next index where to titlecase */ |
535 | 53.7M | int32_t index; |
536 | 53.7M | if(isFirstIndex) { |
537 | 4.45k | isFirstIndex=false; |
538 | 4.45k | index=iter->first(); |
539 | 53.7M | } else { |
540 | 53.7M | index=iter->next(); |
541 | 53.7M | } |
// Clamp the boundary to the end of the text when the iterator is done or reports
// an index beyond srcLength.
542 | 53.7M | if(index==UBRK_DONE || index>srcLength) { |
543 | 0 | index=srcLength; |
544 | 0 | } |
545 | | |
546 | | /* |
547 | | * Segment [prev..index[ into 3 parts: |
548 | | * a) skipped characters (copy as-is) [prev..titleStart[ |
549 | | * b) first letter (titlecase) [titleStart..titleLimit[ |
550 | | * c) subsequent characters (lowercase) [titleLimit..index[ |
551 | | */ |
552 | 53.7M | if(prev<index) { |
553 | | /* find and copy skipped characters [prev..titleStart[ */ |
554 | 53.7M | int32_t titleStart=prev; |
555 | 53.7M | int32_t titleLimit=prev; |
556 | 53.7M | UChar32 c; |
557 | 53.7M | U8_NEXT(src, titleLimit, index, c); |
558 | 53.7M | if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) { |
559 | | // Adjust the titlecasing index to the next cased character, |
560 | | // or to the next letter/number/symbol/private use. |
561 | | // Stop with titleStart<titleLimit<=index |
562 | | // if there is a character to be titlecased, |
563 | | // or else stop with titleStart==titleLimit==index. |
564 | 34.1M | UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; |
565 | 39.8M | while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) { |
566 | 31.9M | titleStart=titleLimit; |
567 | 31.9M | if(titleLimit==index) { |
568 | 26.2M | break; |
569 | 26.2M | } |
570 | 5.65M | U8_NEXT(src, titleLimit, index, c); |
571 | 5.65M | } |
572 | 34.1M | if (prev < titleStart) { |
573 | 26.3M | if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev, |
574 | 26.3M | sink, options, edits, errorCode)) { |
575 | 0 | return; |
576 | 0 | } |
577 | 26.3M | } |
578 | 34.1M | } |
579 | | |
580 | 53.7M | if(titleStart<titleLimit) { |
581 | | /* titlecase c which is from [titleStart..titleLimit[ */ |
582 | 27.4M | if(c>=0) { |
583 | 23.8M | csc.cpStart=titleStart; |
584 | 23.8M | csc.cpLimit=titleLimit; |
585 | 23.8M | const char16_t *s; |
586 | 23.8M | c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale); |
587 | 23.8M | if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) { |
588 | 0 | return; |
589 | 0 | } |
590 | 23.8M | } else { |
591 | | // Malformed UTF-8. |
592 | 3.59M | if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart, |
593 | 3.59M | sink, options, edits, errorCode)) { |
594 | 0 | return; |
595 | 0 | } |
596 | 3.59M | } |
597 | | |
598 | | /* Special case Dutch IJ titlecasing */ |
// At this point c holds the ucase_toFullTitle() result (or the negative value from
// decoding malformed input). A negative value is complemented before the comparison
// with the two I letters.
599 | 27.4M | if (titleLimit < index && |
600 | 1.80M | caseLocale == UCASE_LOC_DUTCH) { |
601 | 335k | if (c < 0) { |
602 | 188k | c = ~c; |
603 | 188k | } |
604 | | |
605 | 335k | if (c == u'I' || c == u'Í') { |
606 | 57.0k | titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode); |
607 | 57.0k | } |
608 | 335k | } |
609 | | |
610 | | /* lowercase [titleLimit..index[ */ |
611 | 27.4M | if(titleLimit<index) { |
612 | 1.80M | if((options&U_TITLECASE_NO_LOWERCASE)==0) { |
613 | | /* Normal operation: Lowercase the rest of the word. */ |
614 | 1.29M | toLower(caseLocale, options, |
615 | 1.29M | src, &csc, titleLimit, index, |
616 | 1.29M | sink, edits, errorCode); |
617 | 1.29M | if(U_FAILURE(errorCode)) { |
618 | 0 | return; |
619 | 0 | } |
620 | 1.29M | } else { |
621 | | /* Optionally just copy the rest of the word unchanged. */ |
622 | 505k | if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit, |
623 | 505k | sink, options, edits, errorCode)) { |
624 | 0 | return; |
625 | 0 | } |
626 | 505k | } |
627 | 1.80M | } |
628 | 27.4M | } |
629 | 53.7M | } |
630 | | |
631 | 53.7M | prev=index; |
632 | 53.7M | } |
633 | 4.51k | } |
634 | | |
635 | | #endif |
636 | | |
637 | | U_NAMESPACE_BEGIN |
638 | | namespace GreekUpper { |
639 | | |
640 | 187k | UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) { |
641 | 190k | while (i < length) { |
642 | 190k | UChar32 c; |
643 | 190k | U8_NEXT(s, i, length, c); |
644 | 190k | int32_t type = ucase_getTypeOrIgnorable(c); |
645 | 190k | if ((type & UCASE_IGNORABLE) != 0) { |
646 | | // Case-ignorable, continue with the loop. |
647 | 187k | } else if (type != UCASE_NONE) { |
648 | 17.3k | return true; // Followed by cased letter. |
649 | 169k | } else { |
650 | 169k | return false; // Uncased and not case-ignorable. |
651 | 169k | } |
652 | 190k | } |
653 | 75 | return false; // Not followed by cased letter. |
654 | 187k | } |
655 | | |
// GreekUpper::toUpper(): Greek-specific UTF-8 uppercasing of src[0..srcLength[.
//
// Each iteration decodes one code point. If getLetterData() returns data for it, the
// letter and the Greek diacritics that follow it (per getDiacriticData()) are consumed
// together and replaced by the base uppercase letter, plus optionally a dialytika, a
// tonos, and one capital iota per ypogegrammeni. Other well-formed code points go
// through ucase_toFullUpper() with UCASE_LOC_GREEK; malformed bytes are copied unchanged.
//
// `state` carries flags about the previous character into the next iteration
// (AFTER_CASED and the two AFTER_VOWEL_WITH_*_ACCENT flags).
//
// Parameters:
//   options    only U_OMIT_UNCHANGED_TEXT is examined directly here; the value is also
//              forwarded to appendResult()/appendUnchanged()
//   src        UTF-8 text
//   srcLength  number of bytes in src
//   sink       receives the output bytes
//   edits      optional edit log, may be nullptr
//   errorCode  the function returns early when an append step reports failure
656 | | // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java. |
657 | | void toUpper(uint32_t options, |
658 | | const uint8_t *src, int32_t srcLength, |
659 | | ByteSink &sink, Edits *edits, |
660 | 879 | UErrorCode &errorCode) { |
661 | 879 | uint32_t state = 0; |
662 | 9.04M | for (int32_t i = 0; i < srcLength;) { |
663 | 9.04M | int32_t nextIndex = i; |
664 | 9.04M | UChar32 c; |
665 | 9.04M | U8_NEXT(src, nextIndex, srcLength, c); |
// nextState is built up for the following iteration: AFTER_CASED is kept across
// case-ignorable characters and set by cased ones.
666 | 9.04M | uint32_t nextState = 0; |
667 | 9.04M | int32_t type = ucase_getTypeOrIgnorable(c); |
668 | 9.04M | if ((type & UCASE_IGNORABLE) != 0) { |
669 | | // c is case-ignorable |
670 | 408k | nextState |= (state & AFTER_CASED); |
671 | 8.63M | } else if (type != UCASE_NONE) { |
672 | | // c is cased |
673 | 1.77M | nextState |= AFTER_CASED; |
674 | 1.77M | } |
675 | 9.04M | uint32_t data = getLetterData(c); |
676 | 9.04M | if (data > 0) { |
677 | 320k | uint32_t upper = data & UPPER_MASK; |
678 | | // Add a dialytika to this iota or ypsilon vowel |
679 | | // if we removed a tonos from the previous vowel, |
680 | | // and that previous vowel did not also have (or gain) a dialytika. |
681 | | // Adding one only to the final vowel in a longer sequence |
682 | | // (which does not occur in normal writing) would require lookahead. |
683 | | // Set the same flag as for preserving an existing dialytika. |
684 | 320k | if ((data & HAS_VOWEL) != 0 && |
685 | 252k | (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) != |
686 | 252k | 0 && |
687 | 17.0k | (upper == 0x399 || upper == 0x3A5)) { |
688 | 2.51k | data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA |
689 | 2.51k | : HAS_COMBINING_DIALYTIKA; |
690 | 2.51k | } |
691 | 320k | int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. |
692 | 320k | if ((data & HAS_YPOGEGRAMMENI) != 0) { |
693 | 14.3k | numYpogegrammeni = 1; |
694 | 14.3k | } |
// Remember whether the accent came from the letter itself, before the data of
// following combining diacritics is merged in below.
695 | 320k | const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0; |
696 | | // Skip combining diacritics after this Greek letter. |
// nextNextIndex looks one code point ahead; nextIndex only advances once that code
// point turns out to be a Greek diacritic.
697 | 320k | int32_t nextNextIndex = nextIndex; |
698 | 387k | while (nextIndex < srcLength) { |
699 | 387k | UChar32 c2; |
700 | 387k | U8_NEXT(src, nextNextIndex, srcLength, c2); |
701 | 387k | uint32_t diacriticData = getDiacriticData(c2); |
702 | 387k | if (diacriticData != 0) { |
703 | 67.5k | data |= diacriticData; |
704 | 67.5k | if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { |
705 | 843 | ++numYpogegrammeni; |
706 | 843 | } |
707 | 67.5k | nextIndex = nextNextIndex; |
708 | 319k | } else { |
709 | 319k | break; // not a Greek diacritic |
710 | 319k | } |
711 | 387k | } |
712 | 320k | if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { |
713 | 227k | nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT |
714 | 227k | : AFTER_VOWEL_WITH_COMBINING_ACCENT; |
715 | 227k | } |
716 | | // Map according to Greek rules. |
717 | 320k | UBool addTonos = false; |
718 | 320k | if (upper == 0x397 && |
719 | 213k | (data & HAS_ACCENT) != 0 && |
720 | 210k | numYpogegrammeni == 0 && |
721 | 209k | (state & AFTER_CASED) == 0 && |
722 | 187k | !isFollowedByCasedLetter(src, nextIndex, srcLength)) { |
723 | | // Keep disjunctive "or" with (only) a tonos. |
724 | | // We use the same "word boundary" conditions as for the Final_Sigma test. |
725 | 169k | if (hasPrecomposedAccent) { |
726 | 165k | upper = 0x389; // Preserve the precomposed form. |
727 | 165k | } else { |
728 | 4.61k | addTonos = true; |
729 | 4.61k | } |
730 | 169k | } else if ((data & HAS_DIALYTIKA) != 0) { |
731 | | // Preserve a vowel with dialytika in precomposed form if it exists. |
732 | 11.5k | if (upper == 0x399) { |
733 | 2.72k | upper = 0x3AA; |
734 | 2.72k | data &= ~HAS_EITHER_DIALYTIKA; |
735 | 8.80k | } else if (upper == 0x3A5) { |
736 | 7.29k | upper = 0x3AB; |
737 | 7.29k | data &= ~HAS_EITHER_DIALYTIKA; |
738 | 7.29k | } |
739 | 11.5k | } |
740 | | |
// `change` ends up true when output has to be written for this letter sequence.
// When edits are recorded or unchanged text is omitted, the source bytes are first
// compared with the bytes that would be written, so the edit can be logged correctly.
741 | 320k | UBool change; |
742 | 320k | if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) { |
743 | 23.1k | change = true; // common, simple usage |
744 | 297k | } else { |
745 | | // Find out first whether we are changing the text. |
746 | 297k | U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block |
747 | 297k | change = (i + 2) > nextIndex || |
748 | 297k | src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) || |
749 | 67.1k | numYpogegrammeni > 0; |
750 | 297k | int32_t i2 = i + 2; |
751 | 297k | if ((data & HAS_EITHER_DIALYTIKA) != 0) { |
752 | 8.39k | change |= (i2 + 2) > nextIndex || |
753 | 7.82k | src[i2] != static_cast<uint8_t>(u8"\u0308"[0]) || |
754 | 492 | src[i2 + 1] != static_cast<uint8_t>(u8"\u0308"[1]); |
755 | 8.39k | i2 += 2; |
756 | 8.39k | } |
757 | 297k | if (addTonos) { |
758 | 4.61k | change |= (i2 + 2) > nextIndex || |
759 | 771 | src[i2] != static_cast<uint8_t>(u8"\u0301"[0]) || |
760 | 449 | src[i2 + 1] != static_cast<uint8_t>(u8"\u0301"[1]); |
761 | 4.61k | i2 += 2; |
762 | 4.61k | } |
763 | 297k | int32_t oldLength = nextIndex - i; |
764 | 297k | int32_t newLength = (i2 - i) + numYpogegrammeni * 2; // 2 bytes per U+0399 |
765 | 297k | change |= oldLength != newLength; |
766 | 297k | if (change) { |
767 | 232k | if (edits != nullptr) { |
768 | 0 | edits->addReplace(oldLength, newLength); |
769 | 0 | } |
770 | 232k | } else { |
771 | 64.1k | if (edits != nullptr) { |
772 | 0 | edits->addUnchanged(oldLength); |
773 | 0 | } |
774 | | // Write unchanged text? |
775 | 64.1k | change = (options & U_OMIT_UNCHANGED_TEXT) == 0; |
776 | 64.1k | } |
777 | 297k | } |
778 | | |
// Emit the replacement: base letter, then dialytika, tonos and capital iotas as needed.
779 | 320k | if (change) { |
780 | 256k | ByteSinkUtil::appendTwoBytes(upper, sink); |
781 | 256k | if ((data & HAS_EITHER_DIALYTIKA) != 0) { |
782 | 9.37k | sink.AppendU8(u8"\u0308", 2); // restore or add a dialytika |
783 | 9.37k | } |
784 | 256k | if (addTonos) { |
785 | 4.45k | sink.AppendU8(u8"\u0301", 2); |
786 | 4.45k | } |
787 | 271k | while (numYpogegrammeni > 0) { |
788 | 15.2k | sink.AppendU8(u8"\u0399", 2); |
789 | 15.2k | --numYpogegrammeni; |
790 | 15.2k | } |
791 | 256k | } |
792 | 8.72M | } else if(c>=0) { |
793 | 5.88M | const char16_t *s; |
794 | 5.88M | c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK); |
795 | 5.88M | if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) { |
796 | 0 | return; |
797 | 0 | } |
798 | 5.88M | } else { |
799 | | // Malformed UTF-8. |
800 | 2.84M | if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i, |
801 | 2.84M | sink, options, edits, errorCode)) { |
802 | 0 | return; |
803 | 0 | } |
804 | 2.84M | } |
805 | 9.04M | i = nextIndex; |
806 | 9.04M | state = nextState; |
807 | 9.04M | } |
808 | 879 | } |
809 | | |
810 | | } // namespace GreekUpper |
811 | | U_NAMESPACE_END |
812 | | |
813 | | static void U_CALLCONV |
814 | | ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
815 | | const uint8_t *src, int32_t srcLength, |
816 | | icu::ByteSink &sink, icu::Edits *edits, |
817 | 1.24k | UErrorCode &errorCode) { |
818 | 1.24k | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
819 | 1.24k | csc.p=(void *)src; |
820 | 1.24k | csc.limit=srcLength; |
821 | 1.24k | toLower( |
822 | 1.24k | caseLocale, options, |
823 | 1.24k | src, &csc, 0, srcLength, |
824 | 1.24k | sink, edits, errorCode); |
825 | 1.24k | } |
826 | | |
827 | | static void U_CALLCONV |
828 | | ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
829 | | const uint8_t *src, int32_t srcLength, |
830 | | icu::ByteSink &sink, icu::Edits *edits, |
831 | 1.55k | UErrorCode &errorCode) { |
832 | 1.55k | if (caseLocale == UCASE_LOC_GREEK) { |
833 | 879 | GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode); |
834 | 879 | } else { |
835 | 677 | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
836 | 677 | csc.p=(void *)src; |
837 | 677 | csc.limit=srcLength; |
838 | 677 | toUpper( |
839 | 677 | caseLocale, options, |
840 | 677 | src, &csc, srcLength, |
841 | 677 | sink, edits, errorCode); |
842 | 677 | } |
843 | 1.55k | } |
844 | | |
845 | | static void U_CALLCONV |
846 | | ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
847 | | const uint8_t *src, int32_t srcLength, |
848 | | icu::ByteSink &sink, icu::Edits *edits, |
849 | 240 | UErrorCode &errorCode) { |
850 | 240 | toLower( |
851 | 240 | -1, options, |
852 | 240 | src, nullptr, 0, srcLength, |
853 | 240 | sink, edits, errorCode); |
854 | 240 | } |
855 | | |
856 | | void |
857 | | ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM |
858 | | const char *src, int32_t srcLength, |
859 | | UTF8CaseMapper *stringCaseMapper, |
860 | | icu::ByteSink &sink, icu::Edits *edits, |
861 | 0 | UErrorCode &errorCode) { |
862 | | /* check argument values */ |
863 | 0 | if (U_FAILURE(errorCode)) { |
864 | 0 | return; |
865 | 0 | } |
866 | 0 | if ((src == nullptr && srcLength != 0) || srcLength < -1) { |
867 | 0 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
868 | 0 | return; |
869 | 0 | } |
870 | | |
871 | | // Get the string length. |
872 | 0 | if (srcLength == -1) { |
873 | 0 | srcLength = static_cast<int32_t>(uprv_strlen(src)); |
874 | 0 | } |
875 | |
|
876 | 0 | if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { |
877 | 0 | edits->reset(); |
878 | 0 | } |
879 | 0 | stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR |
880 | 0 | reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, errorCode); |
881 | 0 | sink.Flush(); |
882 | 0 | if (U_SUCCESS(errorCode)) { |
883 | 0 | if (edits != nullptr) { |
884 | 0 | edits->copyErrorTo(errorCode); |
885 | 0 | } |
886 | 0 | } |
887 | 0 | } |
888 | | |
889 | | int32_t |
890 | | ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM |
891 | | char *dest, int32_t destCapacity, |
892 | | const char *src, int32_t srcLength, |
893 | | UTF8CaseMapper *stringCaseMapper, |
894 | | icu::Edits *edits, |
895 | 7.57k | UErrorCode &errorCode) { |
896 | | /* check argument values */ |
897 | 7.57k | if(U_FAILURE(errorCode)) { |
898 | 0 | return 0; |
899 | 0 | } |
900 | 7.57k | if( destCapacity<0 || |
901 | 7.57k | (dest==nullptr && destCapacity>0) || |
902 | 7.57k | (src==nullptr && srcLength!=0) || srcLength<-1 |
903 | 7.57k | ) { |
904 | 0 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
905 | 0 | return 0; |
906 | 0 | } |
907 | | |
908 | | /* get the string length */ |
909 | 7.57k | if(srcLength==-1) { |
910 | 0 | srcLength = static_cast<int32_t>(uprv_strlen(src)); |
911 | 0 | } |
912 | | |
913 | | /* check for overlapping source and destination */ |
914 | 7.57k | if( dest!=nullptr && |
915 | 7.57k | ((src>=dest && src<(dest+destCapacity)) || |
916 | 7.57k | (dest>=src && dest<(src+srcLength))) |
917 | 7.57k | ) { |
918 | 0 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
919 | 0 | return 0; |
920 | 0 | } |
921 | | |
922 | 7.57k | if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { |
923 | 0 | edits->reset(); |
924 | 0 | } |
925 | 7.57k | int32_t reslen = ByteSinkUtil::viaByteSinkToTerminatedChars( |
926 | 7.57k | dest, destCapacity, |
927 | 7.57k | [&](ByteSink& sink, UErrorCode& status) { |
928 | 7.57k | stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR |
929 | 7.57k | reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, status); |
930 | 7.57k | }, |
931 | 7.57k | errorCode); |
932 | 7.57k | if (U_SUCCESS(errorCode) && edits != nullptr) { |
933 | 0 | edits->copyErrorTo(errorCode); |
934 | 0 | } |
935 | 7.57k | return reslen; |
936 | 7.57k | } |
937 | | |
938 | | /* public API functions */ |
939 | | |
940 | | U_CAPI int32_t U_EXPORT2 |
941 | | ucasemap_utf8ToLower(const UCaseMap *csm, |
942 | | char *dest, int32_t destCapacity, |
943 | | const char *src, int32_t srcLength, |
944 | 1.24k | UErrorCode *pErrorCode) { |
945 | 1.24k | return ucasemap_mapUTF8( |
946 | 1.24k | csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL |
947 | 1.24k | dest, destCapacity, |
948 | 1.24k | src, srcLength, |
949 | 1.24k | ucasemap_internalUTF8ToLower, nullptr, *pErrorCode); |
950 | 1.24k | } |
951 | | |
952 | | U_CAPI int32_t U_EXPORT2 |
953 | | ucasemap_utf8ToUpper(const UCaseMap *csm, |
954 | | char *dest, int32_t destCapacity, |
955 | | const char *src, int32_t srcLength, |
956 | 1.55k | UErrorCode *pErrorCode) { |
957 | 1.55k | return ucasemap_mapUTF8( |
958 | 1.55k | csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL |
959 | 1.55k | dest, destCapacity, |
960 | 1.55k | src, srcLength, |
961 | 1.55k | ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode); |
962 | 1.55k | } |
963 | | |
964 | | U_CAPI int32_t U_EXPORT2 |
965 | | ucasemap_utf8FoldCase(const UCaseMap *csm, |
966 | | char *dest, int32_t destCapacity, |
967 | | const char *src, int32_t srcLength, |
968 | 240 | UErrorCode *pErrorCode) { |
969 | 240 | return ucasemap_mapUTF8( |
970 | 240 | UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL |
971 | 240 | dest, destCapacity, |
972 | 240 | src, srcLength, |
973 | 240 | ucasemap_internalUTF8Fold, nullptr, *pErrorCode); |
974 | 240 | } |
975 | | |
976 | | U_NAMESPACE_BEGIN |
977 | | |
978 | | void CaseMap::utf8ToLower( |
979 | | const char *locale, uint32_t options, |
980 | | StringPiece src, ByteSink &sink, Edits *edits, |
981 | 0 | UErrorCode &errorCode) { |
982 | 0 | ucasemap_mapUTF8( |
983 | 0 | ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL |
984 | 0 | src.data(), src.length(), |
985 | 0 | ucasemap_internalUTF8ToLower, sink, edits, errorCode); |
986 | 0 | } |
987 | | |
988 | | void CaseMap::utf8ToUpper( |
989 | | const char *locale, uint32_t options, |
990 | | StringPiece src, ByteSink &sink, Edits *edits, |
991 | 0 | UErrorCode &errorCode) { |
992 | 0 | ucasemap_mapUTF8( |
993 | 0 | ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL |
994 | 0 | src.data(), src.length(), |
995 | 0 | ucasemap_internalUTF8ToUpper, sink, edits, errorCode); |
996 | 0 | } |
997 | | |
998 | | void CaseMap::utf8Fold( |
999 | | uint32_t options, |
1000 | | StringPiece src, ByteSink &sink, Edits *edits, |
1001 | 0 | UErrorCode &errorCode) { |
1002 | 0 | ucasemap_mapUTF8( |
1003 | 0 | UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL |
1004 | 0 | src.data(), src.length(), |
1005 | 0 | ucasemap_internalUTF8Fold, sink, edits, errorCode); |
1006 | 0 | } |
1007 | | |
1008 | | int32_t CaseMap::utf8ToLower( |
1009 | | const char *locale, uint32_t options, |
1010 | | const char *src, int32_t srcLength, |
1011 | | char *dest, int32_t destCapacity, Edits *edits, |
1012 | 0 | UErrorCode &errorCode) { |
1013 | 0 | return ucasemap_mapUTF8( |
1014 | 0 | ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL |
1015 | 0 | dest, destCapacity, |
1016 | 0 | src, srcLength, |
1017 | 0 | ucasemap_internalUTF8ToLower, edits, errorCode); |
1018 | 0 | } |
1019 | | |
1020 | | int32_t CaseMap::utf8ToUpper( |
1021 | | const char *locale, uint32_t options, |
1022 | | const char *src, int32_t srcLength, |
1023 | | char *dest, int32_t destCapacity, Edits *edits, |
1024 | 0 | UErrorCode &errorCode) { |
1025 | 0 | return ucasemap_mapUTF8( |
1026 | 0 | ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL |
1027 | 0 | dest, destCapacity, |
1028 | 0 | src, srcLength, |
1029 | 0 | ucasemap_internalUTF8ToUpper, edits, errorCode); |
1030 | 0 | } |
1031 | | |
1032 | | int32_t CaseMap::utf8Fold( |
1033 | | uint32_t options, |
1034 | | const char *src, int32_t srcLength, |
1035 | | char *dest, int32_t destCapacity, Edits *edits, |
1036 | 0 | UErrorCode &errorCode) { |
1037 | 0 | return ucasemap_mapUTF8( |
1038 | | UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL |
1039 | 0 | dest, destCapacity, |
1040 | 0 | src, srcLength, |
1041 | 0 | ucasemap_internalUTF8Fold, edits, errorCode); |
1042 | 0 | } |
1043 | | |
1044 | | U_NAMESPACE_END |