/src/libreoffice/i18npool/source/transliteration/transliteration_body.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | // Silence spurious Werror=maybe-uninitialized in transliterateImpl emitted at least by GCC 11.2.0 |
20 | | #if defined __GNUC__ && !defined __clang__ |
21 | | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" |
22 | | #endif |
23 | | |
24 | | #include <rtl/ref.hxx> |
25 | | #include <i18nutil/casefolding.hxx> |
26 | | #include <i18nutil/unicode.hxx> |
27 | | #include <com/sun/star/i18n/MultipleCharsOutputException.hpp> |
28 | | #include <com/sun/star/i18n/TransliterationType.hpp> |
29 | | #include <comphelper/processfactory.hxx> |
30 | | #include <comphelper/sequence.hxx> |
31 | | #include <o3tl/temporary.hxx> |
32 | | |
33 | | #include <characterclassificationImpl.hxx> |
34 | | |
35 | | #include <transliteration_body.hxx> |
36 | | #include <memory> |
37 | | #include <numeric> |
38 | | |
39 | | using namespace ::com::sun::star::uno; |
40 | | using namespace ::com::sun::star::i18n; |
41 | | using namespace ::com::sun::star::lang; |
42 | | |
43 | | namespace i18npool { |
44 | | |
45 | | Transliteration_body::Transliteration_body() |
46 | 2.09M | { |
47 | 2.09M | nMappingType = MappingType::NONE; |
48 | 2.09M | transliterationName = "Transliteration_body"; |
49 | 2.09M | implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body"; |
50 | 2.09M | } |
51 | | |
52 | | sal_Int16 SAL_CALL Transliteration_body::getType() |
53 | 0 | { |
54 | 0 | return TransliterationType::ONE_TO_ONE; |
55 | 0 | } |
56 | | |
57 | | sal_Bool SAL_CALL Transliteration_body::equals( |
58 | | const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/, |
59 | | const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/) |
60 | 0 | { |
61 | 0 | throw RuntimeException(); |
62 | 0 | } |
63 | | |
64 | | Sequence< OUString > SAL_CALL |
65 | | Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 ) |
66 | 0 | { |
67 | 0 | return { str1, str2 }; |
68 | 0 | } |
69 | | |
70 | | static MappingType lcl_getMappingTypeForToggleCase( MappingType nMappingType, sal_Unicode cChar ) |
71 | 1.30G | { |
72 | 1.30G | MappingType nRes = nMappingType; |
73 | | |
74 | | // take care of TOGGLE_CASE transliteration: |
75 | | // nMappingType should not be a combination of flags, thuse we decide now |
76 | | // which one to use. |
77 | 1.30G | if (nMappingType == (MappingType::LowerToUpper | MappingType::UpperToLower)) |
78 | 0 | { |
79 | 0 | const sal_Int16 nType = unicode::getUnicodeType( cChar ); |
80 | 0 | if (nType & 0x02 /* lower case*/) |
81 | 0 | nRes = MappingType::LowerToUpper; |
82 | 0 | else |
83 | 0 | { |
84 | | // should also work properly for non-upper characters like white spaces, numbers, ... |
85 | 0 | nRes = MappingType::UpperToLower; |
86 | 0 | } |
87 | 0 | } |
88 | | |
89 | 1.30G | return nRes; |
90 | 1.30G | } |
91 | | |
92 | | OUString |
93 | | Transliteration_body::transliterateImpl( |
94 | | const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, |
95 | | Sequence< sal_Int32 >* pOffset) |
96 | 134M | { |
97 | 134M | const sal_Unicode *in = inStr.getStr() + startPos; |
98 | | |
99 | | // We could assume that most calls result in identical string lengths, |
100 | | // thus using a preallocated OUStringBuffer could be an easy way |
101 | | // to assemble the return string without too much hassle. However, |
102 | | // for single characters the OUStringBuffer::append() method is quite |
103 | | // expensive compared to a simple array operation, so it pays here |
104 | | // to copy the final result instead. |
105 | | |
106 | | // Allocate the max possible buffer. Try to use stack instead of heap, |
107 | | // which would have to be reallocated most times anyways. |
108 | 134M | constexpr sal_Int32 nLocalBuf = 2048; |
109 | 134M | sal_Unicode* out; |
110 | 134M | std::unique_ptr<sal_Unicode[]> pHeapBuf; |
111 | 134M | if (nCount <= nLocalBuf) |
112 | 134M | out = static_cast<sal_Unicode*>(alloca(nCount * NMAPPINGMAX * sizeof(sal_Unicode))); |
113 | 4.18k | else |
114 | 4.18k | { |
115 | 4.18k | pHeapBuf.reset(new sal_Unicode[ nCount * NMAPPINGMAX ]); |
116 | 4.18k | out = pHeapBuf.get(); |
117 | 4.18k | } |
118 | | |
119 | 134M | sal_Int32 j = 0; |
120 | | // Two different blocks to eliminate the if(useOffset) condition inside the loop. |
121 | | // Yes, on massive use even such small things do count. |
122 | 134M | if ( pOffset ) |
123 | 1.43k | { |
124 | 1.43k | sal_Int32* offsetData; |
125 | 1.43k | std::unique_ptr<sal_Int32[]> pOffsetHeapBuf; |
126 | 1.43k | sal_Int32 nOffsetCount = std::max<sal_Int32>(nLocalBuf, nCount); |
127 | 1.43k | if (nOffsetCount <= nLocalBuf) |
128 | 1.43k | offsetData = static_cast<sal_Int32*>(alloca(nOffsetCount * NMAPPINGMAX * sizeof(sal_Int32))); |
129 | 0 | else |
130 | 0 | { |
131 | 0 | pOffsetHeapBuf.reset(new sal_Int32[ nOffsetCount * NMAPPINGMAX ]); |
132 | 0 | offsetData = pOffsetHeapBuf.get(); |
133 | 0 | } |
134 | 1.43k | sal_Int32* offsetDataEnd = offsetData; |
135 | | |
136 | 28.8k | for (sal_Int32 i = 0; i < nCount; i++) |
137 | 27.3k | { |
138 | | // take care of TOGGLE_CASE transliteration: |
139 | 27.3k | MappingType nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); |
140 | | |
141 | 27.3k | const i18nutil::Mapping map = i18nutil::casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); |
142 | 27.3k | std::fill_n(offsetDataEnd, map.nmap, i + startPos); |
143 | 27.3k | offsetDataEnd += map.nmap; |
144 | 27.3k | std::copy_n(map.map, map.nmap, out + j); |
145 | 27.3k | j += map.nmap; |
146 | 27.3k | } |
147 | | |
148 | 1.43k | *pOffset = css::uno::Sequence< sal_Int32 >(offsetData, offsetDataEnd - offsetData); |
149 | 1.43k | } |
150 | 134M | else |
151 | 134M | { |
152 | 1.44G | for ( sal_Int32 i = 0; i < nCount; i++) |
153 | 1.30G | { |
154 | | // take care of TOGGLE_CASE transliteration: |
155 | 1.30G | MappingType nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); |
156 | | |
157 | 1.30G | const i18nutil::Mapping map = i18nutil::casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); |
158 | 1.30G | std::copy_n(map.map, map.nmap, out + j); |
159 | 1.30G | j += map.nmap; |
160 | 1.30G | } |
161 | 134M | } |
162 | | |
163 | 134M | return OUString(out, j); |
164 | 134M | } |
165 | | |
166 | | OUString SAL_CALL |
167 | | Transliteration_body::transliterateChar2String( sal_Unicode inChar ) |
168 | 0 | { |
169 | 0 | const i18nutil::Mapping map = i18nutil::casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); |
170 | 0 | rtl_uString* pStr = rtl_uString_alloc(map.nmap); |
171 | 0 | sal_Unicode* out = pStr->buffer; |
172 | 0 | sal_Int32 i; |
173 | |
|
174 | 0 | for (i = 0; i < map.nmap; i++) |
175 | 0 | out[i] = map.map[i]; |
176 | 0 | out[i] = 0; |
177 | |
|
178 | 0 | return OUString( pStr, SAL_NO_ACQUIRE ); |
179 | 0 | } |
180 | | |
181 | | sal_Unicode SAL_CALL |
182 | | Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) |
183 | 0 | { |
184 | 0 | const i18nutil::Mapping map = i18nutil::casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); |
185 | 0 | if (map.nmap > 1) |
186 | 0 | throw MultipleCharsOutputException(); |
187 | 0 | return map.map[0]; |
188 | 0 | } |
189 | | |
190 | | OUString |
191 | | Transliteration_body::foldingImpl( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, |
192 | | Sequence< sal_Int32 >* pOffset) |
193 | 0 | { |
194 | 0 | return transliterateImpl(inStr, startPos, nCount, pOffset); |
195 | 0 | } |
196 | | |
197 | | Transliteration_casemapping::Transliteration_casemapping() |
198 | 2.09M | { |
199 | 2.09M | nMappingType = MappingType::NONE; |
200 | 2.09M | transliterationName = "casemapping(generic)"; |
201 | 2.09M | implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping"; |
202 | 2.09M | } |
203 | | |
204 | | Transliteration_u2l::Transliteration_u2l() |
205 | 0 | { |
206 | 0 | nMappingType = MappingType::UpperToLower; |
207 | 0 | transliterationName = "upper_to_lower(generic)"; |
208 | 0 | implementationName = "com.sun.star.i18n.Transliteration.UPPERCASE_LOWERCASE"; |
209 | 0 | } |
210 | | |
211 | | Transliteration_l2u::Transliteration_l2u() |
212 | 0 | { |
213 | 0 | nMappingType = MappingType::LowerToUpper; |
214 | 0 | transliterationName = "lower_to_upper(generic)"; |
215 | 0 | implementationName = "com.sun.star.i18n.Transliteration.LOWERCASE_UPPERCASE"; |
216 | 0 | } |
217 | | |
218 | | Transliteration_togglecase::Transliteration_togglecase() |
219 | 0 | { |
220 | | // usually nMappingType must NOT be a combination of different flags here, |
221 | | // but we take care of that problem in Transliteration_body::transliterate above |
222 | | // before that value is used. There we will decide which of both is to be used on |
223 | | // a per character basis. |
224 | 0 | nMappingType = MappingType::LowerToUpper | MappingType::UpperToLower; |
225 | 0 | transliterationName = "toggle(generic)"; |
226 | 0 | implementationName = "com.sun.star.i18n.Transliteration.TOGGLE_CASE"; |
227 | 0 | } |
228 | | |
229 | | Transliteration_titlecase::Transliteration_titlecase() |
230 | 0 | { |
231 | 0 | nMappingType = MappingType::ToTitle; |
232 | 0 | transliterationName = "title(generic)"; |
233 | 0 | implementationName = "com.sun.star.i18n.Transliteration.TITLE_CASE"; |
234 | 0 | } |
235 | | |
236 | | /// @throws RuntimeException |
237 | | static OUString transliterate_titlecase_Impl( |
238 | | std::u16string_view inStr, sal_Int32 startPos, sal_Int32 nCount, |
239 | | const Locale &rLocale, |
240 | | Sequence< sal_Int32 >* pOffset ) |
241 | 0 | { |
242 | 0 | const OUString aText( inStr.substr( startPos, nCount ) ); |
243 | |
|
244 | 0 | OUString aRes; |
245 | 0 | if (!aText.isEmpty()) |
246 | 0 | { |
247 | 0 | const Reference< XComponentContext >& xContext = ::comphelper::getProcessComponentContext(); |
248 | 0 | rtl::Reference< CharacterClassificationImpl > xCharClassImpl( new CharacterClassificationImpl( xContext ) ); |
249 | | |
250 | | // because xCharClassImpl.toTitle does not handle ligatures or Beta but will raise |
251 | | // an exception we need to handle the first chara manually... |
252 | | |
253 | | // we don't want to change surrogates by accident, thuse we use proper code point iteration |
254 | 0 | sal_uInt32 cFirstChar = aText.iterateCodePoints( &o3tl::temporary(sal_Int32(0)) ); |
255 | 0 | OUString aResolvedLigature( &cFirstChar, 1 ); |
256 | | // toUpper can be used to properly resolve ligatures and characters like Beta |
257 | 0 | aResolvedLigature = xCharClassImpl->toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); |
258 | | // since toTitle will leave all-uppercase text unchanged we first need to |
259 | | // use toLower to bring possible 2nd and following chars in lowercase |
260 | 0 | aResolvedLigature = xCharClassImpl->toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); |
261 | 0 | sal_Int32 nResolvedLen = aResolvedLigature.getLength(); |
262 | | |
263 | | // now we can properly use toTitle to get the expected result for the resolved string. |
264 | | // The rest of the text should just become lowercase. |
265 | 0 | aRes = xCharClassImpl->toTitle( aResolvedLigature, 0, nResolvedLen, rLocale ) + |
266 | 0 | xCharClassImpl->toLower( aText, 1, aText.getLength() - 1, rLocale ); |
267 | 0 | if (pOffset) |
268 | 0 | { |
269 | 0 | pOffset->realloc( aRes.getLength() ); |
270 | |
|
271 | 0 | auto [begin, end] = asNonConstRange(*pOffset); |
272 | 0 | sal_Int32* pOffsetInt = std::fill_n(begin, nResolvedLen, 0); |
273 | 0 | std::iota(pOffsetInt, end, 1); |
274 | 0 | } |
275 | 0 | } |
276 | 0 | return aRes; |
277 | 0 | } |
278 | | |
279 | | // this function expects to be called on a word-by-word basis, |
280 | | // namely that startPos points to the first char of the word |
281 | | OUString Transliteration_titlecase::transliterateImpl( |
282 | | const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, |
283 | | Sequence< sal_Int32 >* pOffset ) |
284 | 0 | { |
285 | 0 | return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, pOffset ); |
286 | 0 | } |
287 | | |
288 | | Transliteration_sentencecase::Transliteration_sentencecase() |
289 | 0 | { |
290 | 0 | nMappingType = MappingType::ToTitle; // though only to be applied to the first word... |
291 | 0 | transliterationName = "sentence(generic)"; |
292 | 0 | implementationName = "com.sun.star.i18n.Transliteration.SENTENCE_CASE"; |
293 | 0 | } |
294 | | |
295 | | // this function expects to be called on a sentence-by-sentence basis, |
296 | | // namely that startPos points to the first word (NOT first char!) in the sentence |
297 | | OUString Transliteration_sentencecase::transliterateImpl( |
298 | | const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, |
299 | | Sequence< sal_Int32 >* pOffset ) |
300 | 0 | { |
301 | 0 | return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, pOffset ); |
302 | 0 | } |
303 | | |
304 | | } |
305 | | |
306 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |