/src/libreoffice/i18nutil/source/utility/casefolding.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | #include <i18nutil/casefolding.hxx> |
21 | | #include "casefolding_data.h" |
22 | | #include <i18nutil/oneToOneMapping.hxx> |
23 | | #include <i18nutil/widthfolding.hxx> |
24 | | #include <i18nutil/transliteration.hxx> |
25 | | #include <com/sun/star/lang/Locale.hpp> |
26 | | #include <com/sun/star/uno/RuntimeException.hpp> |
27 | | #include <rtl/character.hxx> |
28 | | |
29 | | #include <unicode/uchar.h> |
30 | | |
31 | | using namespace com::sun::star::lang; |
32 | | using namespace com::sun::star::uno; |
33 | | |
34 | | namespace i18nutil { |
35 | | |
36 | | const Mapping mapping_03a3[] = {{0, 1, {0x03c2, 0, 0}},{0, 1, {0x03c3, 0, 0}}}; |
37 | | const Mapping mapping_0307[] = {{0, 0, {0, 0, 0}},{0, 1, {0x0307, 0, 0}}}; |
38 | | const Mapping mapping_004a[] = {{0, 2, {0x006a, 0x0307, 0}},{0, 1, {0x006a, 0, 0}}}; |
39 | | const Mapping mapping_012e[] = {{0, 2, {0x012f, 0x0307, 0}},{0, 1, {0x012f, 0, 0}}}; |
40 | | const Mapping mapping_00cc[] = {{0, 3, {0x0069, 0x0307, 0x0300}},{0, 1, {0x00ec, 0, 0}}}; |
41 | | const Mapping mapping_00cd[] = {{0, 3, {0x0069, 0x0307, 0x0301}},{0, 1, {0x00ed, 0, 0}}}; |
42 | | const Mapping mapping_0128[] = {{0, 3, {0x0069, 0x0307, 0x0303}},{0, 1, {0x0129, 0, 0}}}; |
43 | | const Mapping mapping_0049[] = {{0, 2, {0x0069, 0x0307, 0}},{0, 1, {0x0131, 0, 0}},{0, 1, {0x0069, 0, 0}}}; |
44 | | const Mapping mapping_0069[] = {{0, 1, {0x0130, 0, 0}},{0, 1, {0x0049, 0, 0}}}; |
45 | | const Mapping mapping_0130[] = {{0, 1, {0x0069, 0, 0}},{0, 1, {0x0130, 0, 0}}}; |
46 | | |
// True when the language of the `aLocale` variable in the expanding scope equals `lang`.
// The argument is parenthesized so that any expression can be passed safely.
#define langIs(lang) (aLocale.Language == (lang))

// Simplified "after i" test: only recognizes the plain letters U+0069 (i) and
// U+006A (j). More complicated cases are not checked here.
#define type_i(ch) ((ch) == 0x0069 || (ch) == 0x006a)
51 | | |
52 | | static bool cased_letter(sal_Unicode ch) |
53 | 10.1k | { |
54 | 10.1k | int msb = ch >> 8; |
55 | 10.1k | int cmi = CaseMappingIndex[msb]; |
56 | 10.1k | if (cmi < 0) |
57 | 427 | return false; |
58 | 9.71k | int cmv_idx = (cmi << 8) + (ch & 0xff); |
59 | 9.71k | return bool(static_cast<MappingType>(CaseMappingValue[cmv_idx].type) & MappingType::CasedLetterMask); |
60 | 10.1k | } |
61 | | |
// Lithuanian: when a capital I or J is lowercased and further accents above follow,
// the dot above has to be made explicit. This macro is the test for such a following
// accent; it accepts U+0300..U+0314, U+033D..U+0344, U+0346 and U+034A..U+034C.
#define accent_above(ch) \
    (((ch) >= 0x0300 && (ch) <= 0x0314) || \
     ((ch) >= 0x033D && (ch) <= 0x0344) || \
     (ch) == 0x0346 || \
     ((ch) >= 0x034A && (ch) <= 0x034C))
65 | | |
66 | | const Mapping& casefolding::getConditionalValue(const sal_Unicode* str, sal_Int32 pos, sal_Int32 len, Locale const & aLocale, MappingType nMappingType) |
67 | 18.4M | { |
68 | 18.4M | switch(str[pos]) { |
69 | 8.68k | case 0x03a3: |
70 | | // final_sigma (not followed by cased and preceded by cased character) |
71 | | // DOES NOT check ignorable sequence yet (more complicated implementation). |
72 | 8.68k | return !(pos < len && cased_letter(str[pos+1])) && (pos > 0 && cased_letter(str[pos-1])) ? |
73 | 7.96k | mapping_03a3[0] : mapping_03a3[1]; |
74 | 15.4k | case 0x0307: |
75 | 15.4k | return (((nMappingType == MappingType::LowerToUpper && langIs("lt")) || |
76 | 15.4k | (nMappingType == MappingType::UpperToLower && (langIs("tr") || langIs("az")))) && |
77 | 0 | (pos > 0 && type_i(str[pos-1]))) ? // after_i |
78 | 15.4k | mapping_0307[0] : mapping_0307[1]; |
79 | 4.96k | case 0x0130: |
80 | 4.96k | return (langIs("tr") || langIs("az")) ? mapping_0130[0] : mapping_0130[1]; |
81 | 18.0M | case 0x0069: |
82 | 18.0M | return (langIs("tr") || langIs("az")) ? mapping_0069[0] : mapping_0069[1]; |
83 | 340k | case 0x0049: return langIs("lt") && pos > len && accent_above(str[pos+1]) ? mapping_0049[0] : |
84 | 340k | (langIs("tr") || langIs("az")) ? mapping_0049[1] : mapping_0049[2]; |
85 | 60.0k | case 0x004a: return langIs("lt") && pos > len && accent_above(str[pos+1]) ? mapping_004a[0] : mapping_004a[1]; |
86 | 460 | case 0x012e: return langIs("lt") && pos > len && accent_above(str[pos+1]) ? mapping_012e[0] : mapping_012e[1]; |
87 | 13.9k | case 0x00cc: return langIs("lt") ? mapping_00cc[0] : mapping_00cc[1]; |
88 | 9.61k | case 0x00cd: return langIs("lt") ? mapping_00cd[0] : mapping_00cd[1]; |
89 | 257 | case 0x0128: return langIs("lt") ? mapping_0128[0] : mapping_0128[1]; |
90 | 18.4M | } |
91 | | // Should not come here |
92 | 0 | throw RuntimeException(); |
93 | 18.4M | } |
94 | | |
95 | | Mapping casefolding::getValue(const sal_Unicode* str, sal_Int32 pos, sal_Int32 len, Locale const & aLocale, MappingType nMappingType) |
96 | 1.30G | { |
97 | 1.30G | if (pos > 0 && rtl::isHighSurrogate(str[pos-1]) && rtl::isLowSurrogate(str[pos])) |
98 | 76.1k | return { 0, 0, { 0, 0, 0 } }; |
99 | | |
100 | 1.30G | Mapping dummy = { 0, 1, { str[pos], 0, 0 } }; |
101 | | |
102 | 1.30G | sal_uInt32 c; |
103 | 1.30G | if (pos + 1 < len && rtl::isHighSurrogate(str[pos]) && rtl::isLowSurrogate(str[pos + 1])) |
104 | 76.1k | c = rtl::combineSurrogates(str[pos], str[pos + 1]); |
105 | 1.30G | else |
106 | 1.30G | c = str[pos]; |
107 | | |
108 | 1.30G | sal_Int16 address = -1; |
109 | 1.30G | if (c < SAL_N_ELEMENTS(CaseMappingIndex) * 256) |
110 | 1.30G | address = CaseMappingIndex[c >> 8]; |
111 | | |
112 | 1.30G | if (address >= 0) { |
113 | 1.29G | address = (address << 8) + (c & 0xFF); |
114 | 1.29G | if (static_cast<MappingType>(CaseMappingValue[address].type) & nMappingType) { |
115 | 223M | MappingType type = static_cast<MappingType>(CaseMappingValue[address].type); |
116 | 223M | if (type & MappingType::NotValue) { |
117 | 18.7M | if (CaseMappingValue[address].value == 0) |
118 | 18.3M | return getConditionalValue(str, pos, len, aLocale, nMappingType); |
119 | 392k | else { |
120 | 392k | for (int map = CaseMappingValue[address].value; |
121 | 492k | map < CaseMappingValue[address].value + MaxCaseMappingExtras; map++) { |
122 | 492k | if (static_cast<MappingType>(CaseMappingExtra[map].type) & nMappingType) { |
123 | 392k | if (static_cast<MappingType>(CaseMappingExtra[map].type) & MappingType::NotValue) |
124 | 93.0k | return getConditionalValue(str, pos, len, aLocale, nMappingType); |
125 | 298k | else |
126 | 298k | return CaseMappingExtra[map]; |
127 | 392k | } |
128 | 492k | } |
129 | | // Should not come here |
130 | 0 | throw RuntimeException(); |
131 | 392k | } |
132 | 18.7M | } |
133 | 204M | else |
134 | 204M | { |
135 | 204M | dummy.map[0] = CaseMappingValue[address].value; |
136 | 204M | return dummy; |
137 | 204M | } |
138 | 223M | } |
139 | 1.29G | } |
140 | | |
141 | | // If the code point is not supported by our case mapping tables, |
142 | | // fallback to ICU functions. |
143 | | // TODO: this does not handle special case mapping as these require |
144 | | // using ustring.h APIs, which work on the whole string not character |
145 | | // by character. |
146 | | // TODO: what is the difference between ToLower and UpperToLower etc.? |
147 | 1.08G | sal_uInt32 value = c; |
148 | 1.08G | switch (nMappingType) |
149 | 1.08G | { |
150 | 16.5M | case MappingType::ToLower: |
151 | 16.5M | case MappingType::UpperToLower: |
152 | 16.5M | value = u_tolower(c); |
153 | 16.5M | break; |
154 | 1.04G | case MappingType::ToUpper: |
155 | 1.04G | case MappingType::LowerToUpper: |
156 | 1.04G | value = u_toupper(c); |
157 | 1.04G | break; |
158 | 0 | case MappingType::ToTitle: |
159 | 0 | value = u_totitle(c); |
160 | 0 | break; |
161 | 0 | case MappingType::SimpleFolding: |
162 | 21.0M | case MappingType::FullFolding: |
163 | 21.0M | value = u_foldCase(c, U_FOLD_CASE_DEFAULT); |
164 | 21.0M | break; |
165 | 0 | default: break; |
166 | 1.08G | } |
167 | | |
168 | 1.08G | dummy.nmap = rtl::splitSurrogates(value, dummy.map); |
169 | | |
170 | 1.08G | return dummy; |
171 | 1.08G | } |
172 | | |
173 | | static bool |
174 | | is_ja_voice_sound_mark(sal_Unicode& current, sal_Unicode next) |
175 | 0 | { |
176 | 0 | if (next != 0x3099 && next != 0x309a) |
177 | 0 | return false; |
178 | 0 | sal_Unicode c = widthfolding::getCompositionChar(current, next); |
179 | 0 | if (c != 0) |
180 | 0 | current = c; |
181 | 0 | return c != 0; |
182 | 0 | } |
183 | | |
184 | | sal_Unicode casefolding::getNextChar(const sal_Unicode *str, sal_Int32& idx, sal_Int32 len, MappingElement& e, Locale const & aLocale, MappingType nMappingType, TransliterationFlags moduleLoaded) |
185 | 25.8M | { |
186 | 25.8M | if( idx >= len ) |
187 | 96 | { |
188 | 96 | e = MappingElement(); |
189 | 96 | return 0; |
190 | 96 | } |
191 | | |
192 | 25.8M | sal_Unicode c; |
193 | | |
194 | 25.8M | if (moduleLoaded & TransliterationFlags::IGNORE_CASE) { |
195 | 25.8M | if( e.current >= e.element.nmap ) { |
196 | 25.8M | e.element = getValue(str, idx++, len, aLocale, nMappingType); |
197 | 25.8M | e.current = 0; |
198 | 25.8M | } |
199 | 25.8M | c = e.element.map[e.current++]; |
200 | 25.8M | } else { |
201 | 0 | c = *(str + idx++); |
202 | 0 | } |
203 | | |
204 | 25.8M | if (moduleLoaded & TransliterationFlags::IGNORE_KANA) { |
205 | 0 | if ((0x3040 <= c && c <= 0x3094) || (0x309d <= c && c <= 0x309f)) |
206 | 0 | c += 0x60; |
207 | 0 | } |
208 | | |
209 | | // composition: KA + voice-mark --> GA. see halfwidthToFullwidth.cxx for detail |
210 | 25.8M | if (moduleLoaded & TransliterationFlags::IGNORE_WIDTH) { |
211 | 0 | static oneToOneMapping& half2fullTable = widthfolding::gethalf2fullTable(); |
212 | 0 | c = half2fullTable[c]; |
213 | 0 | if (0x3040 <= c && c <= 0x30ff && idx < len && |
214 | 0 | is_ja_voice_sound_mark(c, half2fullTable[*(str + idx)])) |
215 | 0 | idx++; |
216 | 0 | } |
217 | | |
218 | 25.8M | return c; |
219 | 25.8M | } |
220 | | |
221 | | } |
222 | | |
223 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |