/src/libreoffice/sax/source/expatwrap/xml2utf.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | #include <string.h> |
20 | | |
21 | | #include <algorithm> |
22 | | |
23 | | #include <sal/types.h> |
24 | | |
25 | | #include <rtl/textenc.h> |
26 | | #include <rtl/tencinfo.h> |
27 | | #include <com/sun/star/io/NotConnectedException.hpp> |
28 | | #include <com/sun/star/io/XInputStream.hpp> |
29 | | #include <xml2utf.hxx> |
30 | | #include <memory> |
31 | | |
32 | | |
33 | | using namespace ::com::sun::star::uno; |
34 | | using namespace ::com::sun::star::io; |
35 | | |
36 | | |
37 | | namespace sax_expatwrap { |
38 | | |
39 | | sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead ) |
40 | 534k | { |
41 | 534k | if( ! m_in.is() ) { |
42 | 0 | throw NotConnectedException(); |
43 | 0 | } |
44 | 534k | if( ! m_bStarted ) { |
45 | | // it should be possible to find the encoding attribute |
46 | | // within the first 512 bytes == 128 chars in UCS-4 |
47 | 261k | nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead ); |
48 | 261k | } |
49 | | |
50 | 534k | sal_Int32 nRead; |
51 | 534k | Sequence< sal_Int8 > seqStart; |
52 | 534k | while( true ) |
53 | 534k | { |
54 | 534k | nRead = m_in->readSomeBytes( seq , nMaxToRead ); |
55 | | |
56 | 534k | if( nRead + seqStart.getLength()) |
57 | 286k | { |
58 | | // if nRead is 0, the file is already eof. |
59 | 286k | if( ! m_bStarted && nRead ) |
60 | 255k | { |
61 | | // ensure that enough data is available to parse encoding |
62 | 255k | if( seqStart.hasElements() ) |
63 | 101 | { |
64 | | // prefix with what we had so far. |
65 | 101 | sal_Int32 nLength = seq.getLength(); |
66 | 101 | seq.realloc( seqStart.getLength() + nLength ); |
67 | | |
68 | 101 | memmove (seq.getArray() + seqStart.getLength(), |
69 | 101 | seq.getConstArray(), |
70 | 101 | nLength); |
71 | 101 | memcpy (seq.getArray(), |
72 | 101 | seqStart.getConstArray(), |
73 | 101 | seqStart.getLength()); |
74 | 101 | } |
75 | | |
76 | | // autodetection with the first bytes |
77 | 255k | if( ! isEncodingRecognizable( seq ) ) |
78 | 325 | { |
79 | | // remember what we have so far. |
80 | 325 | seqStart = seq; |
81 | | |
82 | | // read more ! |
83 | 325 | continue; |
84 | 325 | } |
85 | 254k | if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) { |
86 | | // initialize decoding |
87 | 209k | initializeDecoding(); |
88 | 209k | } |
89 | 254k | seqStart = Sequence < sal_Int8 > (); |
90 | 254k | } |
91 | | |
92 | | // do the encoding |
93 | 286k | if( m_pText2Unicode && m_pUnicode2Text && |
94 | 41.4k | m_pText2Unicode->canContinue() ) { |
95 | | |
96 | 39.4k | Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq ); |
97 | 39.4k | seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() ); |
98 | 39.4k | } |
99 | | |
100 | 286k | if( ! m_bStarted ) |
101 | 255k | { |
102 | | // it must now be ensured, that no encoding attribute exist anymore |
103 | | // ( otherwise the expat-Parser will crash ) |
104 | | // This must be done after decoding ! |
105 | | // ( e.g. Files decoded in ucs-4 cannot be read properly ) |
106 | 255k | m_bStarted = true; |
107 | 255k | removeEncoding( seq ); |
108 | 255k | } |
109 | 286k | nRead = seq.getLength(); |
110 | 286k | } |
111 | | |
112 | 534k | break; |
113 | 534k | } |
114 | 534k | return nRead; |
115 | 534k | } |
116 | | |
117 | | void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq ) |
118 | 255k | { |
119 | 255k | const sal_Int8 *pSource = seq.getArray(); |
120 | 255k | if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) |
121 | 55.9k | return; |
122 | | |
123 | | // scan for encoding |
124 | 199k | OString str( reinterpret_cast<char const *>(pSource), seq.getLength() ); |
125 | | |
126 | | // cut sequence to first line break |
127 | | // find first line break; |
128 | 199k | int nMax = str.indexOf( 10 ); |
129 | 199k | if( nMax >= 0 ) |
130 | 192k | { |
131 | 192k | str = str.copy( 0 , nMax ); |
132 | 192k | } |
133 | | |
134 | 199k | int nFound = str.indexOf( " encoding" ); |
135 | 199k | if( nFound < 0 ) return; |
136 | | |
137 | 194k | int nStop; |
138 | 194k | int nStart = str.indexOf( "\"" , nFound ); |
139 | 194k | if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) |
140 | 168k | { |
141 | 168k | nStart = str.indexOf( "'" , nFound ); |
142 | 168k | nStop = str.indexOf( "'" , nStart +1 ); |
143 | 168k | } |
144 | 26.2k | else |
145 | 26.2k | { |
146 | 26.2k | nStop = str.indexOf( "\"" , nStart +1); |
147 | 26.2k | } |
148 | | |
149 | 194k | if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) |
150 | 28.6k | { |
151 | | // remove encoding tag from file |
152 | 28.6k | memmove( &( seq.getArray()[nFound] ) , |
153 | 28.6k | &( seq.getArray()[nStop+1]) , |
154 | 28.6k | seq.getLength() - nStop -1); |
155 | 28.6k | seq.realloc( seq.getLength() - ( nStop+1 - nFound ) ); |
156 | 28.6k | } |
157 | 194k | } |
158 | | |
159 | | // Checks, if enough data has been accumulated to recognize the encoding |
160 | | bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq) |
161 | 255k | { |
162 | 255k | const sal_Int8 *pSource = seq.getConstArray(); |
163 | 255k | bool bCheckIfFirstClosingBracketExists = false; |
164 | | |
165 | 255k | if( seq.getLength() < 8 ) { |
166 | | // no recognition possible, when less than 8 bytes are available |
167 | 156 | return false; |
168 | 156 | } |
169 | | |
170 | 255k | if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) { |
171 | | // scan if the <?xml tag finishes within this buffer |
172 | 195k | bCheckIfFirstClosingBracketExists = true; |
173 | 195k | } |
174 | 59.8k | else if( ('<' == pSource[0] || '<' == pSource[2] ) && |
175 | 41.2k | ('?' == pSource[4] || '?' == pSource[6] ) ) |
176 | 535 | { |
177 | | // check for utf-16 |
178 | 535 | bCheckIfFirstClosingBracketExists = true; |
179 | 535 | } |
180 | 59.3k | else if( ( '<' == pSource[1] || '<' == pSource[3] ) && |
181 | 16.2k | ( '?' == pSource[5] || '?' == pSource[7] ) ) |
182 | 114 | { |
183 | | // check for |
184 | 114 | bCheckIfFirstClosingBracketExists = true; |
185 | 114 | } |
186 | | |
187 | 255k | if( bCheckIfFirstClosingBracketExists ) |
188 | 195k | { |
189 | | // whole <?xml tag is valid |
190 | 195k | return std::find(seq.begin(), seq.end(), '>') != seq.end(); |
191 | 195k | } |
192 | | |
193 | | // No <? tag in front, no need for a bigger buffer |
194 | 59.2k | return true; |
195 | 255k | } |
196 | | |
197 | | bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq ) |
198 | 254k | { |
199 | 254k | const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() ); |
200 | 254k | bool bReturn = true; |
201 | | |
202 | 254k | if( seq.getLength() < 4 ) { |
203 | | // no recognition possible, when less than 4 bytes are available |
204 | 0 | return false; |
205 | 0 | } |
206 | | |
207 | | // first level : detect possible file formats |
208 | 254k | if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) { |
209 | | // scan for encoding |
210 | 195k | OString str( reinterpret_cast<const char *>(pSource), seq.getLength() ); |
211 | | |
212 | | // cut sequence to first line break |
213 | | //find first line break; |
214 | 195k | int nMax = str.indexOf( 10 ); |
215 | 195k | if( nMax >= 0 ) |
216 | 191k | { |
217 | 191k | str = str.copy( 0 , nMax ); |
218 | 191k | } |
219 | | |
220 | 195k | int nFound = str.indexOf( " encoding" ); |
221 | 195k | if( nFound >= 0 ) { |
222 | 190k | int nStop; |
223 | 190k | int nStart = str.indexOf( "\"" , nFound ); |
224 | 190k | if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) |
225 | 164k | { |
226 | 164k | nStart = str.indexOf( "'" , nFound ); |
227 | 164k | nStop = str.indexOf( "'" , nStart +1 ); |
228 | 164k | } |
229 | 26.1k | else |
230 | 26.1k | { |
231 | 26.1k | nStop = str.indexOf( "\"" , nStart +1); |
232 | 26.1k | } |
233 | 190k | if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) |
234 | 28.6k | { |
235 | | // encoding found finally |
236 | 28.6k | m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 ); |
237 | 28.6k | } |
238 | 190k | } |
239 | 195k | } |
240 | 59.7k | else if( 0xFE == pSource[0] && |
241 | 544 | 0xFF == pSource[1] ) { |
242 | | // UTF-16 big endian |
243 | | // conversion is done so that encoding information can be easily extracted |
244 | 535 | m_sEncoding = "utf-16"_ostr; |
245 | 535 | } |
246 | 59.2k | else if( 0xFF == pSource[0] && |
247 | 285 | 0xFE == pSource[1] ) { |
248 | | // UTF-16 little endian |
249 | | // conversion is done so that encoding information can be easily extracted |
250 | 275 | m_sEncoding = "utf-16"_ostr; |
251 | 275 | } |
252 | 58.9k | else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) { |
253 | | // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.) |
254 | | // The byte order mark is simply added |
255 | | |
256 | | // simply add the byte order mark ! |
257 | 210 | seq.realloc( seq.getLength() + 2 ); |
258 | 210 | memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); |
259 | 210 | reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE; |
260 | 210 | reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF; |
261 | | |
262 | 210 | m_sEncoding = "utf-16"_ostr; |
263 | 210 | } |
264 | 58.7k | else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) { |
265 | | // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.) |
266 | | // The byte order mark is simply added |
267 | | |
268 | 363 | seq.realloc( seq.getLength() + 2 ); |
269 | 363 | memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); |
270 | 363 | reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF; |
271 | 363 | reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE; |
272 | | |
273 | 363 | m_sEncoding = "utf-16"_ostr; |
274 | 363 | } |
275 | 58.4k | else if( 0xEF == pSource[0] && |
276 | 12.5k | 0xBB == pSource[1] && |
277 | 12.5k | 0xBF == pSource[2] ) |
278 | 12.5k | { |
279 | | // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order |
280 | | // The BOM is removed. |
281 | 12.5k | memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 ); |
282 | 12.5k | seq.realloc( seq.getLength() - 3 ); |
283 | 12.5k | m_sEncoding = "utf-8"_ostr; |
284 | 12.5k | } |
285 | 45.8k | else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) { |
286 | | // UCS-4 big endian |
287 | 6 | m_sEncoding = "ucs-4"_ostr; |
288 | 6 | } |
289 | 45.8k | else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) { |
290 | | // UCS-4 little endian |
291 | 9 | m_sEncoding = "ucs-4"_ostr; |
292 | 9 | } |
293 | | /* TODO: no need to test for the moment since we return sal_False like default case anyway |
294 | | else if( 0x4c == pSource[0] && 0x6f == pSource[1] && |
295 | | 0xa7 == static_cast<unsigned char> (pSource[2]) && |
296 | | 0x94 == static_cast<unsigned char> (pSource[3]) ) { |
297 | | // EBCDIC |
298 | | bReturn = sal_False; // must be extended |
299 | | } |
300 | | */ |
301 | 45.8k | else { |
302 | | // other |
303 | | // UTF8 is directly recognized by the parser. |
304 | 45.8k | bReturn = false; |
305 | 45.8k | } |
306 | | |
307 | 254k | return bReturn; |
308 | 254k | } |
309 | | |
310 | | void XMLFile2UTFConverter::initializeDecoding() |
311 | 209k | { |
312 | | |
313 | 209k | if( !m_sEncoding.isEmpty() ) |
314 | 42.6k | { |
315 | 42.6k | rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() ); |
316 | 42.6k | if( encoding != RTL_TEXTENCODING_UTF8 ) |
317 | 29.9k | { |
318 | 29.9k | m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding ); |
319 | 29.9k | m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 ); |
320 | 29.9k | } |
321 | 42.6k | } |
322 | 209k | } |
323 | | |
324 | | |
325 | | // Text2UnicodeConverter |
326 | | |
327 | | |
328 | | Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding ) |
329 | 29.9k | : m_convText2Unicode(nullptr) |
330 | 29.9k | , m_contextText2Unicode(nullptr) |
331 | 29.9k | { |
332 | 29.9k | rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() ); |
333 | 29.9k | if( RTL_TEXTENCODING_DONTKNOW == encoding ) |
334 | 1.72k | { |
335 | 1.72k | m_bCanContinue = false; |
336 | 1.72k | m_bInitialized = false; |
337 | 1.72k | } |
338 | 28.2k | else |
339 | 28.2k | { |
340 | 28.2k | init( encoding ); |
341 | 28.2k | } |
342 | 29.9k | } |
343 | | |
344 | | Text2UnicodeConverter::~Text2UnicodeConverter() |
345 | 29.9k | { |
346 | 29.9k | if( m_bInitialized ) |
347 | 28.2k | { |
348 | 28.2k | rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode ); |
349 | 28.2k | rtl_destroyUnicodeToTextConverter( m_convText2Unicode ); |
350 | 28.2k | } |
351 | 29.9k | } |
352 | | |
353 | | void Text2UnicodeConverter::init( rtl_TextEncoding encoding ) |
354 | 28.2k | { |
355 | 28.2k | m_bCanContinue = true; |
356 | 28.2k | m_bInitialized = true; |
357 | | |
358 | 28.2k | m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding); |
359 | 28.2k | m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode ); |
360 | 28.2k | } |
361 | | |
362 | | |
363 | | Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText ) |
364 | 39.4k | { |
365 | 39.4k | sal_uInt32 uiInfo; |
366 | 39.4k | sal_Size nSrcCvtBytes = 0; |
367 | 39.4k | sal_Size nTargetCount = 0; |
368 | 39.4k | sal_Size nSourceCount = 0; |
369 | | |
370 | | // the whole source size |
371 | 39.4k | sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength(); |
372 | 39.4k | Sequence<sal_Unicode> seqUnicode ( nSourceSize ); |
373 | | |
374 | 39.4k | const sal_Int8 *pbSource = seqText.getConstArray(); |
375 | 39.4k | std::unique_ptr<sal_Int8[]> pbTempMem; |
376 | | |
377 | 39.4k | if( m_seqSource.hasElements() ) { |
378 | | // put old rest and new byte sequence into one array |
379 | 25 | pbTempMem.reset(new sal_Int8[ nSourceSize ]); |
380 | 25 | memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() ); |
381 | 25 | memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() ); |
382 | 25 | pbSource = pbTempMem.get(); |
383 | | |
384 | | // set to zero again |
385 | 25 | m_seqSource = Sequence< sal_Int8 >(); |
386 | 25 | } |
387 | | |
388 | 39.4k | while( true ) { |
389 | | |
390 | | /* All invalid characters are transformed to the unicode undefined char */ |
391 | 39.4k | nTargetCount += rtl_convertTextToUnicode( |
392 | 39.4k | m_convText2Unicode, |
393 | 39.4k | m_contextText2Unicode, |
394 | 39.4k | reinterpret_cast<const char *>(&( pbSource[nSourceCount] )), |
395 | 39.4k | nSourceSize - nSourceCount , |
396 | 39.4k | &( seqUnicode.getArray()[ nTargetCount ] ), |
397 | 39.4k | seqUnicode.getLength() - nTargetCount, |
398 | 39.4k | RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT | |
399 | 39.4k | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT | |
400 | 39.4k | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT, |
401 | 39.4k | &uiInfo, |
402 | 39.4k | &nSrcCvtBytes ); |
403 | 39.4k | nSourceCount += nSrcCvtBytes; |
404 | | |
405 | 39.4k | if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) { |
406 | | // save necessary bytes for next conversion |
407 | 0 | seqUnicode.realloc( seqUnicode.getLength() * 2 ); |
408 | 0 | continue; |
409 | 0 | } |
410 | 39.4k | break; |
411 | 39.4k | } |
412 | 39.4k | if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ) { |
413 | 840 | m_seqSource.realloc( nSourceSize - nSourceCount ); |
414 | 840 | memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount ); |
415 | 840 | } |
416 | | |
417 | | // set to correct unicode size |
418 | 39.4k | seqUnicode.realloc( nTargetCount ); |
419 | | |
420 | 39.4k | return seqUnicode; |
421 | 39.4k | } |
422 | | |
423 | | |
424 | | // Unicode2TextConverter |
425 | | |
426 | | |
427 | | Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding ) |
428 | 29.9k | { |
429 | 29.9k | m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding ); |
430 | 29.9k | m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text ); |
431 | 29.9k | } |
432 | | |
433 | | |
434 | | Unicode2TextConverter::~Unicode2TextConverter() |
435 | 29.9k | { |
436 | 29.9k | rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text ); |
437 | 29.9k | rtl_destroyUnicodeToTextConverter( m_convUnicode2Text ); |
438 | 29.9k | } |
439 | | |
440 | | |
441 | | Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize) |
442 | 39.4k | { |
443 | 39.4k | std::unique_ptr<sal_Unicode[]> puTempMem; |
444 | | |
445 | 39.4k | if( m_seqSource.hasElements() ) { |
446 | | // For surrogates ! |
447 | | // put old rest and new byte sequence into one array |
448 | | // In general when surrogates are used, they should be rarely |
449 | | // cut off between two convert()-calls. So this code is used |
450 | | // rarely and the extra copy is acceptable. |
451 | 0 | puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]); |
452 | 0 | memcpy( puTempMem.get() , |
453 | 0 | m_seqSource.getConstArray() , |
454 | 0 | m_seqSource.getLength() * sizeof( sal_Unicode ) ); |
455 | 0 | memcpy( |
456 | 0 | &(puTempMem[ m_seqSource.getLength() ]) , |
457 | 0 | puSource , |
458 | 0 | nSourceSize*sizeof( sal_Unicode ) ); |
459 | 0 | puSource = puTempMem.get(); |
460 | 0 | nSourceSize += m_seqSource.getLength(); |
461 | |
|
462 | 0 | m_seqSource = Sequence< sal_Unicode > (); |
463 | 0 | } |
464 | | |
465 | | |
466 | 39.4k | sal_Size nTargetCount = 0; |
467 | 39.4k | sal_Size nSourceCount = 0; |
468 | | |
469 | 39.4k | sal_uInt32 uiInfo; |
470 | 39.4k | sal_Size nSrcCvtChars; |
471 | | |
472 | | // take nSourceSize * 3 as preference |
473 | | // this is an upper boundary for converting to utf8, |
474 | | // which most often used as the target. |
475 | 39.4k | sal_Int32 nSeqSize = nSourceSize * 3; |
476 | | |
477 | 39.4k | Sequence<sal_Int8> seqText( nSeqSize ); |
478 | 39.4k | char *pTarget = reinterpret_cast<char *>(seqText.getArray()); |
479 | 39.4k | while( true ) { |
480 | | |
481 | 39.4k | nTargetCount += rtl_convertUnicodeToText( |
482 | 39.4k | m_convUnicode2Text, |
483 | 39.4k | m_contextUnicode2Text, |
484 | 39.4k | &( puSource[nSourceCount] ), |
485 | 39.4k | nSourceSize - nSourceCount , |
486 | 39.4k | &( pTarget[nTargetCount] ), |
487 | 39.4k | nSeqSize - nTargetCount, |
488 | 39.4k | RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT | |
489 | 39.4k | RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT , |
490 | 39.4k | &uiInfo, |
491 | 39.4k | &nSrcCvtChars); |
492 | 39.4k | nSourceCount += nSrcCvtChars; |
493 | | |
494 | 39.4k | if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) { |
495 | 0 | nSeqSize = nSeqSize *2; |
496 | 0 | seqText.realloc( nSeqSize ); // double array size |
497 | 0 | pTarget = reinterpret_cast<char *>(seqText.getArray()); |
498 | 0 | continue; |
499 | 0 | } |
500 | 39.4k | break; |
501 | 39.4k | } |
502 | | |
503 | | // for surrogates |
504 | 39.4k | if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) { |
505 | 0 | m_seqSource.realloc( nSourceSize - nSourceCount ); |
506 | 0 | memcpy( m_seqSource.getArray() , |
507 | 0 | &(puSource[nSourceCount]), |
508 | 0 | (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) ); |
509 | 0 | } |
510 | | |
511 | | // reduce the size of the buffer (fast, no copy necessary) |
512 | 39.4k | seqText.realloc( nTargetCount ); |
513 | | |
514 | 39.4k | return seqText; |
515 | 39.4k | } |
516 | | |
517 | | } |
518 | | |
519 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |