/src/exiv2/xmpsdk/src/UnicodeConversions.cpp
Line | Count | Source |
1 | | // ================================================================================================= |
2 | | // Copyright 2004-2007 Adobe Systems Incorporated |
3 | | // All Rights Reserved. |
4 | | // |
5 | | // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms |
6 | | // of the Adobe license agreement accompanying it. |
7 | | // ================================================================================================= |
8 | | |
9 | | #include "XMP_Const.h" |
10 | | |
// Failure-reporting hooks used throughout this file. A UnicodeTestBuild maps them onto the standard
// assert and std::logic_error; every other build compiles UC_Assert away and throws XMP_Error.
11 | | #if UnicodeTestBuild |
12 | | #include <cassert> |
13 | | #include <stdexcept> |
14 | | #define UC_Assert assert |
// The second argument (the error id) is ignored in the test build, only the message is carried.
15 | | #define UC_Throw(m,k) throw std::logic_error ( m ) |
16 | | #else |
17 | | #define UC_Assert(cond) /* Nothing for now, should be XMP_Assert. */ |
// Callers pass (message, id); the expansion hands them to XMP_Error in the opposite order.
18 | 0 | #define UC_Throw(msg,id) throw XMP_Error ( id, msg ) |
19 | | #endif |
20 | | |
21 | | #include "UnicodeConversions.hpp" |
22 | | |
23 | | using namespace std; |
24 | | |
25 | | // ================================================================================================= |
26 | | |
27 | | // *** Look into using asm inlines, e.g. count-leading bits for multi-byte UTF-8. |
28 | | |
// Public dispatch pointers for the byte-order specific conversions. Every one starts out null and
// is bound to a native-order or byte-swapping routine by InitializeUnicodeConversions, so they
// must not be called before that function has run.

// Single code point <-> UTF-16.
29 | | CodePoint_to_UTF16_Proc CodePoint_to_UTF16BE = 0; |
30 | | CodePoint_to_UTF16_Proc CodePoint_to_UTF16LE = 0; |
31 | | |
32 | | CodePoint_from_UTF16_Proc CodePoint_from_UTF16BE = 0; |
33 | | CodePoint_from_UTF16_Proc CodePoint_from_UTF16LE = 0; |
34 | | |
// Buffer conversions from UTF-8 to an explicit byte order.
35 | | UTF8_to_UTF16_Proc UTF8_to_UTF16BE = 0; |
36 | | UTF8_to_UTF16_Proc UTF8_to_UTF16LE = 0; |
37 | | UTF8_to_UTF32_Proc UTF8_to_UTF32BE = 0; |
38 | | UTF8_to_UTF32_Proc UTF8_to_UTF32LE = 0; |
39 | | |
// Buffer conversions from an explicit byte order to UTF-8.
40 | | UTF16_to_UTF8_Proc UTF16BE_to_UTF8 = 0; |
41 | | UTF16_to_UTF8_Proc UTF16LE_to_UTF8 = 0; |
42 | | UTF32_to_UTF8_Proc UTF32BE_to_UTF8 = 0; |
43 | | UTF32_to_UTF8_Proc UTF32LE_to_UTF8 = 0; |
44 | | |
// Host byte order variants; these are always bound to the non-swapping ("Nat") routines.
45 | | UTF8_to_UTF16_Proc UTF8_to_UTF16Native = 0; |
46 | | UTF8_to_UTF32_Proc UTF8_to_UTF32Native = 0; |
47 | | UTF16_to_UTF8_Proc UTF16Native_to_UTF8 = 0; |
48 | | UTF32_to_UTF8_Proc UTF32Native_to_UTF8 = 0; |
49 | | |
// UTF-16 -> UTF-32, all four combinations of input and output byte order.
50 | | UTF16_to_UTF32_Proc UTF16BE_to_UTF32BE = 0; |
51 | | UTF16_to_UTF32_Proc UTF16BE_to_UTF32LE = 0; |
52 | | UTF16_to_UTF32_Proc UTF16LE_to_UTF32BE = 0; |
53 | | UTF16_to_UTF32_Proc UTF16LE_to_UTF32LE = 0; |
54 | | |
// UTF-32 -> UTF-16, all four combinations of input and output byte order.
55 | | UTF32_to_UTF16_Proc UTF32BE_to_UTF16BE = 0; |
56 | | UTF32_to_UTF16_Proc UTF32BE_to_UTF16LE = 0; |
57 | | UTF32_to_UTF16_Proc UTF32LE_to_UTF16BE = 0; |
58 | | UTF32_to_UTF16_Proc UTF32LE_to_UTF16LE = 0; |
59 | | |
60 | | // ------------------------------------------------------------------------------------------------- |
61 | | |
// InitializeUnicodeConversions sets this to 0 on a big endian host and to 1 on a little endian
// host; the value counts UTF16 units, not bytes.
62 | | static size_t swap32to16Offset = 0; // Offset to "convert" a swapped UTF32 pointer into a swapped UTF16 pointer. |
63 | | |
64 | | // ------------------------------------------------------------------------------------------------- |
65 | | |
66 | | static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written ); |
67 | | static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written ); |
68 | | |
69 | | static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read ); |
70 | | static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read ); |
71 | | |
72 | | // ------------------------------------------------------------------------------------------------- |
73 | | |
74 | | static void UTF8_to_UTF16Nat ( const UTF8Unit * utf8In, const size_t utf8Len, |
75 | | UTF16Unit * utf16Out, const size_t utf16Len, |
76 | | size_t * utf8Read, size_t * utf16Written ); |
77 | | |
78 | | static void UTF8_to_UTF16Swp ( const UTF8Unit * utf8In, const size_t utf8Len, |
79 | | UTF16Unit * utf16Out, const size_t utf16Len, |
80 | | size_t * utf8Read, size_t * utf16Written ); |
81 | | |
82 | | static void UTF8_to_UTF32Nat ( const UTF8Unit * utf8In, const size_t utf8Len, |
83 | | UTF32Unit * utf32Out, const size_t utf32Len, |
84 | | size_t * utf8Read, size_t * utf32Written ); |
85 | | |
86 | | static void UTF8_to_UTF32Swp ( const UTF8Unit * utf8In, const size_t utf8Len, |
87 | | UTF32Unit * utf32Out, const size_t utf32Len, |
88 | | size_t * utf8Read, size_t * utf32Written ); |
89 | | |
90 | | // ------------------------------------------------------------------------------------------------- |
91 | | |
92 | | static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len, |
93 | | UTF8Unit * utf8Out, const size_t utf8Len, |
94 | | size_t * utf16Read, size_t * utf8Written ); |
95 | | |
96 | | static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len, |
97 | | UTF8Unit * utf8Out, const size_t utf8Len, |
98 | | size_t * utf16Read, size_t * utf8Written ); |
99 | | |
100 | | static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len, |
101 | | UTF8Unit * utf8Out, const size_t utf8Len, |
102 | | size_t * utf32Read, size_t * utf8Written ); |
103 | | |
104 | | static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len, |
105 | | UTF8Unit * utf8Out, const size_t utf8Len, |
106 | | size_t * utf32Read, size_t * utf8Written ); |
107 | | |
108 | | // ------------------------------------------------------------------------------------------------- |
109 | | |
110 | | static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len, |
111 | | UTF32Unit * utf32Out, const size_t utf32Len, |
112 | | size_t * utf16Read, size_t * utf32Written ); |
113 | | |
114 | | static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len, |
115 | | UTF32Unit * utf32Out, const size_t utf32Len, |
116 | | size_t * utf16Read, size_t * utf32Written ); |
117 | | |
118 | | static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len, |
119 | | UTF32Unit * utf32Out, const size_t utf32Len, |
120 | | size_t * utf16Read, size_t * utf32Written ); |
121 | | |
122 | | static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len, |
123 | | UTF32Unit * utf32Out, const size_t utf32Len, |
124 | | size_t * utf16Read, size_t * utf32Written ); |
125 | | |
126 | | // ------------------------------------------------------------------------------------------------- |
127 | | |
128 | | static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len, |
129 | | UTF16Unit * utf16Out, const size_t utf16Len, |
130 | | size_t * utf32Read, size_t * utf16Written ); |
131 | | |
132 | | static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len, |
133 | | UTF16Unit * utf16Out, const size_t utf16Len, |
134 | | size_t * utf32Read, size_t * utf16Written ); |
135 | | |
136 | | static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len, |
137 | | UTF16Unit * utf16Out, const size_t utf16Len, |
138 | | size_t * utf32Read, size_t * utf16Written ); |
139 | | |
140 | | static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len, |
141 | | UTF16Unit * utf16Out, const size_t utf16Len, |
142 | | size_t * utf32Read, size_t * utf16Written ); |
143 | | |
144 | | // ================================================================================================= |
145 | | |
146 | | void InitializeUnicodeConversions() |
147 | 1 | { |
148 | 1 | UC_Assert ( (sizeof(UTF8Unit) == 1) && (sizeof(UTF16Unit) == 2) && (sizeof(UTF32Unit) == 4) ); |
149 | | |
150 | 1 | UTF16Unit u16 = 0x00FF; |
151 | 1 | bool bigEndian = (*((UTF8Unit*)&u16) == 0); |
152 | | |
153 | 1 | UTF8_to_UTF16Native = UTF8_to_UTF16Nat; |
154 | 1 | UTF8_to_UTF32Native = UTF8_to_UTF32Nat; |
155 | 1 | UTF16Native_to_UTF8 = UTF16Nat_to_UTF8; |
156 | 1 | UTF32Native_to_UTF8 = UTF32Nat_to_UTF8; |
157 | | |
158 | 1 | if ( bigEndian ) { |
159 | | |
160 | 0 | swap32to16Offset = 0; |
161 | |
|
162 | 0 | CodePoint_to_UTF16BE = CodePoint_to_UTF16Nat; |
163 | 0 | CodePoint_to_UTF16LE = CodePoint_to_UTF16Swp; |
164 | |
|
165 | 0 | CodePoint_from_UTF16BE = CodePoint_from_UTF16Nat; |
166 | 0 | CodePoint_from_UTF16LE = CodePoint_from_UTF16Swp; |
167 | |
|
168 | 0 | UTF8_to_UTF16BE = UTF8_to_UTF16Nat; |
169 | 0 | UTF8_to_UTF16LE = UTF8_to_UTF16Swp; |
170 | 0 | UTF8_to_UTF32BE = UTF8_to_UTF32Nat; |
171 | 0 | UTF8_to_UTF32LE = UTF8_to_UTF32Swp; |
172 | |
|
173 | 0 | UTF16BE_to_UTF8 = UTF16Nat_to_UTF8; |
174 | 0 | UTF16LE_to_UTF8 = UTF16Swp_to_UTF8; |
175 | 0 | UTF32BE_to_UTF8 = UTF32Nat_to_UTF8; |
176 | 0 | UTF32LE_to_UTF8 = UTF32Swp_to_UTF8; |
177 | |
|
178 | 0 | UTF16BE_to_UTF32BE = UTF16Nat_to_UTF32Nat; |
179 | 0 | UTF16BE_to_UTF32LE = UTF16Nat_to_UTF32Swp; |
180 | 0 | UTF16LE_to_UTF32BE = UTF16Swp_to_UTF32Nat; |
181 | 0 | UTF16LE_to_UTF32LE = UTF16Swp_to_UTF32Swp; |
182 | |
|
183 | 0 | UTF32BE_to_UTF16BE = UTF32Nat_to_UTF16Nat; |
184 | 0 | UTF32BE_to_UTF16LE = UTF32Nat_to_UTF16Swp; |
185 | 0 | UTF32LE_to_UTF16BE = UTF32Swp_to_UTF16Nat; |
186 | 0 | UTF32LE_to_UTF16LE = UTF32Swp_to_UTF16Swp; |
187 | |
|
188 | 1 | } else { |
189 | | |
190 | 1 | swap32to16Offset = 1; // ! Offset in UTF16 units! |
191 | | |
192 | 1 | CodePoint_to_UTF16BE = CodePoint_to_UTF16Swp; |
193 | 1 | CodePoint_to_UTF16LE = CodePoint_to_UTF16Nat; |
194 | | |
195 | 1 | CodePoint_from_UTF16BE = CodePoint_from_UTF16Swp; |
196 | 1 | CodePoint_from_UTF16LE = CodePoint_from_UTF16Nat; |
197 | | |
198 | 1 | UTF8_to_UTF16BE = UTF8_to_UTF16Swp; |
199 | 1 | UTF8_to_UTF16LE = UTF8_to_UTF16Nat; |
200 | 1 | UTF8_to_UTF32BE = UTF8_to_UTF32Swp; |
201 | 1 | UTF8_to_UTF32LE = UTF8_to_UTF32Nat; |
202 | | |
203 | 1 | UTF16BE_to_UTF8 = UTF16Swp_to_UTF8; |
204 | 1 | UTF16LE_to_UTF8 = UTF16Nat_to_UTF8; |
205 | 1 | UTF32BE_to_UTF8 = UTF32Swp_to_UTF8; |
206 | 1 | UTF32LE_to_UTF8 = UTF32Nat_to_UTF8; |
207 | | |
208 | 1 | UTF16BE_to_UTF32BE = UTF16Swp_to_UTF32Swp; |
209 | 1 | UTF16BE_to_UTF32LE = UTF16Swp_to_UTF32Nat; |
210 | 1 | UTF16LE_to_UTF32BE = UTF16Nat_to_UTF32Swp; |
211 | 1 | UTF16LE_to_UTF32LE = UTF16Nat_to_UTF32Nat; |
212 | | |
213 | 1 | UTF32BE_to_UTF16BE = UTF32Swp_to_UTF16Swp; |
214 | 1 | UTF32BE_to_UTF16LE = UTF32Swp_to_UTF16Nat; |
215 | 1 | UTF32LE_to_UTF16BE = UTF32Nat_to_UTF16Swp; |
216 | 1 | UTF32LE_to_UTF16LE = UTF32Nat_to_UTF16Nat; |
217 | | |
218 | 1 | } |
219 | | |
220 | 1 | } // InitializeUnicodeConversions |
221 | | |
222 | | // ================================================================================================= |
223 | | |
224 | | #if XMP_MacBuild && __MWERKS__ |
225 | | |
226 | | #define UTF16InSwap(inPtr) UTF16Unit ( __lhbrx ( (void*)(inPtr), 0 ) ) |
227 | | #define UTF32InSwap(inPtr) UTF32Unit ( __lwbrx ( (void*)(inPtr), 0 ) ) |
228 | | |
229 | | #define UTF16OutSwap(outPtr,value) __sthbrx ( value, (void*)(outPtr), 0 ) |
230 | | #define UTF32OutSwap(outPtr,value) __stwbrx ( value, (void*)(outPtr), 0 ) |
231 | | |
232 | | #else |
233 | | |
234 | | static inline UTF16Unit UTF16InSwap ( const UTF16Unit * inPtr ) |
235 | 0 | { |
236 | 0 | UTF16Unit inUnit = *inPtr; |
237 | 0 | return (inUnit << 8) | (inUnit >> 8); |
238 | 0 | } |
239 | | |
240 | | static inline UTF32Unit UTF32InSwap ( const UTF32Unit * inPtr ) |
241 | 0 | { |
242 | 0 | UTF32Unit inUnit = *inPtr; |
243 | 0 | return (inUnit << 24) | ((inUnit << 8) & 0x00FF0000) | ((inUnit >> 8) & 0x0000FF00) | (inUnit >> 24); |
244 | 0 | } |
245 | | |
246 | | static inline void UTF16OutSwap ( UTF16Unit * outPtr, const UTF16Unit value ) |
247 | 0 | { |
248 | 0 | UTF16Unit outUnit = (value << 8) | (value >> 8); |
249 | 0 | *outPtr = outUnit; |
250 | 0 | } |
251 | | |
252 | | static inline void UTF32OutSwap ( UTF32Unit * outPtr, const UTF32Unit value ) |
253 | 0 | { |
254 | 0 | UTF32Unit outUnit = (value << 24) | ((value << 8) & 0x00FF0000) | ((value >> 8) & 0x0000FF00) | (value >> 24); |
255 | 0 | *outPtr = outUnit; |
256 | 0 | } |
257 | | |
258 | | #endif |
259 | | |
260 | | // ================================================================================================= |
261 | | |
262 | | void SwapUTF16 ( const UTF16Unit * utf16In, UTF16Unit * utf16Out, const size_t utf16Len ) |
263 | 0 | { |
264 | 0 | for ( size_t i = 0; i < utf16Len; ++i ) utf16Out[i] = UTF16InSwap(utf16In+i); |
265 | 0 | } |
266 | | |
267 | 0 | void SwapUTF32 ( const UTF32Unit * utf32In, UTF32Unit * utf32Out, const size_t utf32Len ) { |
268 | 0 | for ( size_t i = 0; i < utf32Len; ++i ) utf32Out[i] = UTF32InSwap(utf32In+i); |
269 | 0 | } |
270 | | |
271 | | // ================================================================================================= |
272 | | |
273 | | extern void ToUTF16 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str, bool bigEndian ) |
274 | 0 | { |
275 | 0 | UTF8_to_UTF16_Proc Converter = UTF8_to_UTF16LE; |
276 | 0 | if ( bigEndian ) Converter = UTF8_to_UTF16BE; |
277 | | |
278 | 0 | enum { kBufferSize = 8*1024 }; |
279 | 0 | UTF16Unit u16Buffer[kBufferSize]; // 16K bytes |
280 | 0 | size_t readCount, writeCount; |
281 | |
|
282 | 0 | utf16Str->erase(); |
283 | 0 | utf16Str->reserve ( 2*utf8Len ); // As good a guess as any. |
284 | | |
285 | 0 | while ( utf8Len > 0 ) { |
286 | 0 | Converter ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount ); |
287 | 0 | if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML ); |
288 | 0 | utf16Str->append ( (const char *)u16Buffer, writeCount*2 ); |
289 | 0 | utf8In += readCount; |
290 | 0 | utf8Len -= readCount; |
291 | 0 | } |
292 | |
|
293 | 0 | } // ToUTF16 |
294 | | |
295 | | // ================================================================================================= |
296 | | |
297 | | extern void ToUTF16Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str ) |
298 | 0 | { |
299 | 0 | enum { kBufferSize = 8*1024 }; |
300 | 0 | UTF16Unit u16Buffer[kBufferSize]; // 16K bytes |
301 | 0 | size_t readCount, writeCount; |
302 | |
|
303 | 0 | utf16Str->erase(); |
304 | 0 | utf16Str->reserve ( 2*utf8Len ); // As good a guess as any. |
305 | | |
306 | 0 | while ( utf8Len > 0 ) { |
307 | 0 | UTF8_to_UTF16Nat ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount ); |
308 | 0 | if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML ); |
309 | 0 | utf16Str->append ( (const char *)u16Buffer, writeCount*2 ); |
310 | 0 | utf8In += readCount; |
311 | 0 | utf8Len -= readCount; |
312 | 0 | } |
313 | |
|
314 | 0 | } // ToUTF16Native |
315 | | |
316 | | // ================================================================================================= |
317 | | |
318 | | extern void ToUTF32 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str, bool bigEndian ) |
319 | 0 | { |
320 | 0 | UTF8_to_UTF32_Proc Converter = UTF8_to_UTF32LE; |
321 | 0 | if ( bigEndian ) Converter = UTF8_to_UTF32BE; |
322 | | |
323 | 0 | enum { kBufferSize = 4*1024 }; |
324 | 0 | UTF32Unit u32Buffer[kBufferSize]; // 16K bytes |
325 | 0 | size_t readCount, writeCount; |
326 | |
|
327 | 0 | utf32Str->erase(); |
328 | 0 | utf32Str->reserve ( 4*utf8Len ); // As good a guess as any. |
329 | | |
330 | 0 | while ( utf8Len > 0 ) { |
331 | 0 | Converter ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount ); |
332 | 0 | if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML ); |
333 | 0 | utf32Str->append ( (const char *)u32Buffer, writeCount*4 ); |
334 | 0 | utf8In += readCount; |
335 | 0 | utf8Len -= readCount; |
336 | 0 | } |
337 | |
|
338 | 0 | } // ToUTF32 |
339 | | |
340 | | // ================================================================================================= |
341 | | |
342 | | extern void ToUTF32Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str ) |
343 | 0 | { |
344 | 0 | enum { kBufferSize = 4*1024 }; |
345 | 0 | UTF32Unit u32Buffer[kBufferSize]; // 16K bytes |
346 | 0 | size_t readCount, writeCount; |
347 | |
|
348 | 0 | utf32Str->erase(); |
349 | 0 | utf32Str->reserve ( 4*utf8Len ); // As good a guess as any. |
350 | | |
351 | 0 | while ( utf8Len > 0 ) { |
352 | 0 | UTF8_to_UTF32Nat ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount ); |
353 | 0 | if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML ); |
354 | 0 | utf32Str->append ( (const char *)u32Buffer, writeCount*4 ); |
355 | 0 | utf8In += readCount; |
356 | 0 | utf8Len -= readCount; |
357 | 0 | } |
358 | |
|
359 | 0 | } // ToUTF32Native |
360 | | |
361 | | // ================================================================================================= |
362 | | |
363 | | extern void FromUTF16 ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str, bool bigEndian ) |
364 | 0 | { |
365 | 0 | UTF16_to_UTF8_Proc Converter = UTF16LE_to_UTF8; |
366 | 0 | if ( bigEndian ) Converter = UTF16BE_to_UTF8; |
367 | | |
368 | 0 | enum { kBufferSize = 16*1024 }; |
369 | 0 | UTF8Unit u8Buffer[kBufferSize]; |
370 | 0 | size_t readCount, writeCount; |
371 | |
|
372 | 0 | utf8Str->erase(); |
373 | 0 | utf8Str->reserve ( 2*utf16Len ); // As good a guess as any. |
374 | | |
375 | 0 | while ( utf16Len > 0 ) { |
376 | 0 | Converter ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount ); |
377 | 0 | if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML ); |
378 | 0 | utf8Str->append ( (const char *)u8Buffer, writeCount ); |
379 | 0 | utf16In += readCount; |
380 | 0 | utf16Len -= readCount; |
381 | 0 | } |
382 | |
|
383 | 0 | } // FromUTF16 |
384 | | |
385 | | // ================================================================================================= |
386 | | |
387 | | extern void FromUTF16Native ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str ) |
388 | 0 | { |
389 | 0 | enum { kBufferSize = 16*1024 }; |
390 | 0 | UTF8Unit u8Buffer[kBufferSize]; |
391 | 0 | size_t readCount, writeCount; |
392 | |
|
393 | 0 | utf8Str->erase(); |
394 | 0 | utf8Str->reserve ( 2*utf16Len ); // As good a guess as any. |
395 | | |
396 | 0 | while ( utf16Len > 0 ) { |
397 | 0 | UTF16Nat_to_UTF8 ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount ); |
398 | 0 | if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML ); |
399 | 0 | utf8Str->append ( (const char *)u8Buffer, writeCount ); |
400 | 0 | utf16In += readCount; |
401 | 0 | utf16Len -= readCount; |
402 | 0 | } |
403 | |
|
404 | 0 | } // FromUTF16Native |
405 | | |
406 | | // ================================================================================================= |
407 | | |
408 | | extern void FromUTF32 ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str, bool bigEndian ) |
409 | 0 | { |
410 | 0 | UTF32_to_UTF8_Proc Converter = UTF32LE_to_UTF8; |
411 | 0 | if ( bigEndian ) Converter = UTF32BE_to_UTF8; |
412 | | |
413 | 0 | enum { kBufferSize = 16*1024 }; |
414 | 0 | UTF8Unit u8Buffer[kBufferSize]; |
415 | 0 | size_t readCount, writeCount; |
416 | |
|
417 | 0 | utf8Str->erase(); |
418 | 0 | utf8Str->reserve ( 2*utf32Len ); // As good a guess as any. |
419 | | |
420 | 0 | while ( utf32Len > 0 ) { |
421 | 0 | Converter ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount ); |
422 | 0 | if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML ); |
423 | 0 | utf8Str->append ( (const char *)u8Buffer, writeCount ); |
424 | 0 | utf32In += readCount; |
425 | 0 | utf32Len -= readCount; |
426 | 0 | } |
427 | |
|
428 | 0 | } // FromUTF32 |
429 | | |
430 | | // ================================================================================================= |
431 | | |
432 | | extern void FromUTF32Native ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str ) |
433 | 0 | { |
434 | 0 | enum { kBufferSize = 16*1024 }; |
435 | 0 | UTF8Unit u8Buffer[kBufferSize]; |
436 | 0 | size_t readCount, writeCount; |
437 | |
|
438 | 0 | utf8Str->erase(); |
439 | 0 | utf8Str->reserve ( 2*utf32Len ); // As good a guess as any. |
440 | | |
441 | 0 | while ( utf32Len > 0 ) { |
442 | 0 | UTF32Nat_to_UTF8 ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount ); |
443 | 0 | if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML ); |
444 | 0 | utf8Str->append ( (const char *)u8Buffer, writeCount ); |
445 | 0 | utf32In += readCount; |
446 | 0 | utf32Len -= readCount; |
447 | 0 | } |
448 | |
|
449 | 0 | } // FromUTF32Native |
450 | | |
451 | | // ================================================================================================= |
452 | | |
453 | | static void CodePoint_to_UTF8_Multi ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written ) |
454 | 0 | { |
455 | 0 | size_t unitCount = 0; |
456 | | |
457 | 0 | if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam ); |
458 | 0 | if ( (0xD800 <= cpIn) && (cpIn <= 0xDFFF) ) UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam ); |
459 | | |
460 | | // Compute the number of bytes using 6 data bits each. Then see if the highest order bits will |
461 | | // fit into the leading byte. Write the UTF-8 sequence if there is enough room. |
462 | | |
463 | 0 | UTF32Unit temp, mask; |
464 | 0 | size_t bytesNeeded = 0; |
465 | 0 | for ( temp = cpIn; temp != 0; temp = temp >> 6 ) ++bytesNeeded; |
466 | | |
467 | 0 | temp = cpIn >> ((bytesNeeded-1)*6); // The highest order data bits. |
468 | 0 | mask = (0x80 >> bytesNeeded) - 1; // Available data bits in the leading byte. |
469 | 0 | if ( temp > mask ) ++bytesNeeded; |
470 | |
|
471 | 0 | if ( bytesNeeded > utf8Len ) goto Done; // Not enough room for the output. |
472 | 0 | unitCount = bytesNeeded; |
473 | | |
474 | 0 | temp = cpIn; |
475 | 0 | for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded ) { |
476 | 0 | utf8Out[bytesNeeded] = 0x80 | UTF8Unit ( temp & 0x3F ); |
477 | 0 | temp = temp >> 6; |
478 | 0 | } |
479 | | |
480 | 0 | mask = ~((1 << (8-unitCount)) - 1); |
481 | 0 | utf8Out[0] = UTF8Unit ( mask | temp ); |
482 | |
|
483 | 0 | Done: |
484 | 0 | *utf8Written = unitCount; |
485 | 0 | return; |
486 | | |
487 | 0 | } // CodePoint_to_UTF8_Multi |
488 | | |
489 | | // ================================================================================================= |
490 | | |
491 | | void CodePoint_to_UTF8 ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written ) |
492 | 0 | { |
493 | 0 | size_t unitCount = 0; |
494 | |
|
495 | 0 | UC_Assert ( (utf8Out != 0) && (utf8Written != 0) ); |
496 | 0 | if ( utf8Len == 0 ) goto Done; |
497 | 0 | if ( cpIn > 0x7F ) goto MultiByte; // ! Force linear execution path for ASCII. |
498 | | |
499 | 0 | if ( utf8Len == 0 ) goto Done; |
500 | 0 | unitCount = 1; |
501 | 0 | *utf8Out = UTF8Unit(cpIn); |
502 | |
|
503 | 0 | Done: |
504 | 0 | *utf8Written = unitCount; |
505 | 0 | return; |
506 | | |
507 | 0 | MultiByte: |
508 | 0 | CodePoint_to_UTF8_Multi( cpIn, utf8Out, utf8Len, utf8Written ); |
509 | 0 | return; |
510 | | |
511 | 0 | } // CodePoint_to_UTF8 |
512 | | |
513 | | // ================================================================================================= |
514 | | |
515 | | static void CodePoint_from_UTF8_Multi ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read ) |
516 | 0 | { |
517 | 0 | UTF8Unit inUnit = *utf8In; |
518 | 0 | size_t unitCount = 0; |
519 | 0 | UTF32Unit cp; // ! Avoid gcc complaints about declarations after goto's. |
520 | 0 | const UTF8Unit * utf8Pos; |
521 | | |
522 | | // ------------------------------------------------------------------------------------- |
523 | | // We've got a multibyte UTF-8 character. The first byte has the number of bytes and the |
524 | | // highest order data bits. The other bytes each add 6 more data bits. |
525 | | |
526 | | #if 0 // This might be a more effcient way to count the bytes. |
527 | | static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; |
528 | | size_t bytesNeeded = kByteCounts [ inUnit >> 4 ]; |
529 | | if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((inUnit & 0x08) != 0)) ) { |
530 | | UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam ); |
531 | | } |
532 | | #endif |
533 | |
|
534 | 0 | size_t bytesNeeded = 0; // Count the leading 1 bits in the first byte. |
535 | 0 | for ( UTF8Unit temp = inUnit; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded; |
536 | | // *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC. |
537 | | |
538 | 0 | if ( (bytesNeeded < 2) || (bytesNeeded > 4) ) UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam ); |
539 | 0 | if ( bytesNeeded > utf8Len ) goto Done; // Not enough input in this buffer. |
540 | 0 | unitCount = bytesNeeded; |
541 | | |
542 | 0 | cp = inUnit & ((1 << (7-unitCount)) - 1); // Isolate the initial data bits in the bottom of cp. |
543 | | |
544 | 0 | utf8Pos = utf8In + 1; // We've absorbed the first byte. |
545 | 0 | for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded, ++utf8Pos ) { |
546 | 0 | inUnit = *utf8Pos; |
547 | 0 | if ( (inUnit & UTF8Unit(0xC0)) != UTF8Unit(0x80) ) UC_Throw ( "Invalid UTF-8 data byte", kXMPErr_BadParam ); |
548 | 0 | cp = (cp << 6) | (inUnit & 0x3F); |
549 | 0 | } |
550 | | |
551 | 0 | if ( cp >= 0xD800 ) { // Skip the next comparisons most of the time. |
552 | 0 | if ( (0xD800 <= cp) && (cp <= 0xDFFF) ) UC_Throw ( "Bad UTF-8 - surrogate code point", kXMPErr_BadParam ); |
553 | 0 | if ( cp > 0x10FFFF ) UC_Throw ( "Bad UTF-8 - out of range", kXMPErr_BadParam ); |
554 | 0 | } |
555 | | |
556 | 0 | *cpOut = cp; // ! Don't put after Done, don't write if no input. |
557 | | |
558 | 0 | Done: |
559 | 0 | *utf8Read = unitCount; |
560 | 0 | return; |
561 | | |
562 | 0 | } // CodePoint_from_UTF8_Multi |
563 | | |
564 | | // ================================================================================================= |
565 | | |
566 | | void CodePoint_from_UTF8 ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read ) |
567 | 0 | { |
568 | 0 | UTF8Unit inUnit; // ! Don't read until we know there is input. |
569 | 0 | size_t unitCount = 0; |
570 | |
|
571 | 0 | UC_Assert ( (utf8In != 0) && (cpOut != 0) && (utf8Read != 0) ); |
572 | 0 | if ( utf8Len == 0 ) goto Done; |
573 | 0 | inUnit = *utf8In; |
574 | 0 | if ( inUnit >= 0x80 ) goto MultiByte; // ! Force linear execution path for ASCII. |
575 | | |
576 | 0 | unitCount = 1; |
577 | 0 | *cpOut = inUnit; // ! Don't put after Done, don't write if no input. |
578 | | |
579 | 0 | Done: |
580 | 0 | *utf8Read = unitCount; |
581 | 0 | return; |
582 | | |
583 | 0 | MultiByte: |
584 | 0 | CodePoint_from_UTF8_Multi ( utf8In, utf8Len, cpOut, utf8Read ); |
585 | 0 | return; |
586 | | |
587 | 0 | } // CodePoint_from_UTF8 |
588 | | |
589 | | // ================================================================================================= |
590 | | |
591 | | static void CodePoint_to_UTF16Nat_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written ) |
592 | 0 | { |
593 | 0 | size_t unitCount = 0; |
594 | 0 | UTF32Unit temp; // ! Avoid gcc complaints about declarations after goto's. |
595 | |
|
596 | 0 | if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam ); |
597 | 0 | if ( utf16Len < 2 ) goto Done; // Not enough room for the output. |
598 | | |
599 | 0 | unitCount = 2; |
600 | 0 | temp = cpIn - 0x10000; |
601 | 0 | utf16Out[0] = 0xD800 | UTF16Unit ( temp >> 10 ); |
602 | 0 | utf16Out[1] = 0xDC00 | UTF16Unit ( temp & 0x3FF ); |
603 | | |
604 | 0 | Done: |
605 | 0 | *utf16Written = unitCount; |
606 | 0 | return; |
607 | | |
608 | 0 | } // CodePoint_to_UTF16Nat_Surrogate |
609 | | |
610 | | // ================================================================================================= |
611 | | |
612 | | static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written ) |
613 | 0 | { |
614 | 0 | size_t unitCount = 0; |
615 | |
|
616 | 0 | UC_Assert ( (utf16Out != 0) && (utf16Written != 0) ); |
617 | 0 | if ( utf16Len == 0 ) goto Done; |
618 | 0 | if ( cpIn >= 0xD800 ) goto CheckSurrogate; // ! Force linear execution path for the BMP. |
619 | | |
620 | 0 | InBMP: |
621 | 0 | unitCount = 1; |
622 | 0 | *utf16Out = UTF16Unit(cpIn); |
623 | | |
624 | 0 | Done: |
625 | 0 | *utf16Written = unitCount; |
626 | 0 | return; |
627 | | |
628 | 0 | CheckSurrogate: |
629 | 0 | if ( cpIn > 0xFFFF ) goto SurrogatePair; |
630 | 0 | if ( cpIn > 0xDFFF ) goto InBMP; |
631 | 0 | UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam ); |
632 | | |
633 | 0 | SurrogatePair: |
634 | 0 | CodePoint_to_UTF16Nat_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written ); |
635 | 0 | return; |
636 | | |
637 | 0 | } // CodePoint_to_UTF16Nat |
638 | | |
639 | | // ================================================================================================= |
640 | | |
641 | | static void CodePoint_from_UTF16Nat_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read ) |
642 | 0 | { |
643 | 0 | UTF16Unit hiUnit = *utf16In; |
644 | 0 | size_t unitCount = 0; |
645 | 0 | UTF16Unit loUnit; // ! Avoid gcc complaints about declarations after goto's. |
646 | 0 | UTF32Unit cp; |
647 | | |
648 | | // ---------------------------------- |
649 | | // We've got a UTF-16 surrogate pair. |
650 | |
|
651 | 0 | if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam ); |
652 | 0 | if ( utf16Len < 2 ) goto Done; // Not enough input in this buffer. |
653 | | |
654 | 0 | loUnit = *(utf16In+1); |
655 | 0 | if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam ); |
656 | | |
657 | 0 | unitCount = 2; |
658 | 0 | cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000; |
659 | |
|
660 | 0 | *cpOut = cp; // ! Don't put after Done, don't write if no input. |
661 | | |
662 | 0 | Done: |
663 | 0 | *utf16Read = unitCount; |
664 | 0 | return; |
665 | | |
666 | 0 | } // CodePoint_from_UTF16Nat_Surrogate |
667 | | |
668 | | // ================================================================================================= |
669 | | |
670 | | static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read ) |
671 | 0 | { |
672 | 0 | UTF16Unit inUnit; // ! Don't read until we know there is input. |
673 | 0 | size_t unitCount = 0; |
674 | |
|
675 | 0 | UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) ); |
676 | 0 | if ( utf16Len == 0 ) goto Done; |
677 | 0 | inUnit = *utf16In; |
678 | 0 | if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair; // ! Force linear execution path for the BMP. |
679 | | |
680 | 0 | unitCount = 1; |
681 | 0 | *cpOut = inUnit; // ! Don't put after Done, don't write if no input. |
682 | | |
683 | 0 | Done: |
684 | 0 | *utf16Read = unitCount; |
685 | 0 | return; |
686 | | |
687 | 0 | SurrogatePair: |
688 | 0 | CodePoint_from_UTF16Nat_Surrogate ( utf16In, utf16Len, cpOut, utf16Read ); |
689 | 0 | return; |
690 | | |
691 | 0 | } // CodePoint_from_UTF16Nat |
692 | | |
693 | | // ================================================================================================= |
694 | | |
695 | | static void UTF8_to_UTF16Nat ( const UTF8Unit * utf8In, const size_t utf8Len, |
696 | | UTF16Unit * utf16Out, const size_t utf16Len, |
697 | | size_t * utf8Read, size_t * utf16Written ) |
698 | 0 | { |
699 | 0 | const UTF8Unit * utf8Pos = utf8In; |
700 | 0 | UTF16Unit * utf16Pos = utf16Out; |
701 | | |
702 | 0 | size_t utf8Left = utf8Len; |
703 | 0 | size_t utf16Left = utf16Len; |
704 | | |
705 | 0 | UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) ); |
706 | | |
707 | 0 | while ( (utf8Left > 0) && (utf16Left > 0) ) { |
708 | | |
709 | | // Do a run of ASCII, it copies 1 input unit into 1 output unit. |
710 | 0 | size_t i, limit = utf8Left; |
711 | 0 | if ( limit > utf16Left ) limit = utf16Left; |
712 | 0 | for ( i = 0; i < limit; ++i ) { |
713 | 0 | UTF8Unit inUnit = *utf8Pos; |
714 | 0 | if ( inUnit > 0x7F ) break; |
715 | 0 | *utf16Pos = inUnit; |
716 | 0 | ++utf8Pos; |
717 | 0 | ++utf16Pos; |
718 | 0 | } |
719 | 0 | utf8Left -= i; |
720 | 0 | utf16Left -= i; |
721 | | |
722 | | // Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units. |
723 | 0 | while ( (utf8Left > 0) && (utf16Left > 0) ) { |
724 | 0 | UTF32Unit cp; |
725 | 0 | size_t len8, len16; |
726 | 0 | UTF8Unit inUnit = *utf8Pos; |
727 | 0 | if ( inUnit <= 0x7F ) break; |
728 | 0 | CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 ); |
729 | 0 | if ( len8 == 0 ) goto Done; // The input buffer ends in the middle of a character. |
730 | 0 | if ( cp <= 0xFFFF ) { |
731 | 0 | *utf16Pos = UTF16Unit(cp); |
732 | 0 | len16 = 1; |
733 | 0 | } else { |
734 | 0 | CodePoint_to_UTF16Nat_Surrogate ( cp, utf16Pos, utf16Left, &len16 ); |
735 | 0 | if ( len16 == 0 ) goto Done; // Not enough room in the output buffer. |
736 | 0 | } |
737 | 0 | utf8Left -= len8; |
738 | 0 | utf8Pos += len8; |
739 | 0 | utf16Left -= len16; |
740 | 0 | utf16Pos += len16; |
741 | 0 | } |
742 | | |
743 | 0 | } |
744 | | |
745 | 0 | Done: // Set the output lengths. |
746 | 0 | *utf8Read = utf8Len - utf8Left; |
747 | 0 | *utf16Written = utf16Len - utf16Left; |
748 | | |
749 | 0 | } // UTF8_to_UTF16Nat |
750 | | |
751 | | // ================================================================================================= |
752 | | |
753 | | static void UTF8_to_UTF32Nat ( const UTF8Unit * utf8In, const size_t utf8Len, |
754 | | UTF32Unit * utf32Out, const size_t utf32Len, |
755 | | size_t * utf8Read, size_t * utf32Written ) |
756 | 0 | { |
757 | 0 | const UTF8Unit * utf8Pos = utf8In; |
758 | 0 | UTF32Unit * utf32Pos = utf32Out; |
759 | | |
760 | 0 | size_t utf8Left = utf8Len; |
761 | 0 | size_t utf32Left = utf32Len; |
762 | | |
763 | 0 | UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) ); |
764 | | |
765 | 0 | while ( (utf8Left > 0) && (utf32Left > 0) ) { |
766 | | |
767 | | // Do a run of ASCII, it copies 1 input unit into 1 output unit. |
768 | 0 | size_t i, limit = utf8Left; |
769 | 0 | if ( limit > utf32Left ) limit = utf32Left; |
770 | 0 | for ( i = 0; i < limit; ++i ) { |
771 | 0 | UTF8Unit inUnit = *utf8Pos; |
772 | 0 | if ( inUnit > 0x7F ) break; |
773 | 0 | *utf32Pos = inUnit; |
774 | 0 | ++utf8Pos; |
775 | 0 | ++utf32Pos; |
776 | 0 | } |
777 | 0 | utf8Left -= i; |
778 | 0 | utf32Left -= i; |
779 | | |
780 | | // Do a run of non-ASCII, it copies variable input into 1 output unit. |
781 | 0 | while ( (utf8Left > 0) && (utf32Left > 0) ) { |
782 | 0 | size_t len; |
783 | 0 | UTF8Unit inUnit = *utf8Pos; |
784 | 0 | if ( inUnit <= 0x7F ) break; |
785 | 0 | CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, utf32Pos, &len ); |
786 | 0 | if ( len == 0 ) goto Done; // The input buffer ends in the middle of a character. |
787 | 0 | utf8Left -= len; |
788 | 0 | utf8Pos += len; |
789 | 0 | utf32Left -= 1; |
790 | 0 | utf32Pos += 1; |
791 | 0 | } |
792 | | |
793 | 0 | } |
794 | | |
795 | 0 | Done: // Set the output lengths. |
796 | 0 | *utf8Read = utf8Len - utf8Left; |
797 | 0 | *utf32Written = utf32Len - utf32Left; |
798 | | |
799 | 0 | } // UTF8_to_UTF32Nat |
800 | | |
801 | | // ================================================================================================= |
802 | | |
803 | | static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len, |
804 | | UTF8Unit * utf8Out, const size_t utf8Len, |
805 | | size_t * utf16Read, size_t * utf8Written ) |
806 | 0 | { |
807 | 0 | const UTF16Unit * utf16Pos = utf16In; |
808 | 0 | UTF8Unit * utf8Pos = utf8Out; |
809 | | |
810 | 0 | size_t utf16Left = utf16Len; |
811 | 0 | size_t utf8Left = utf8Len; |
812 | | |
813 | 0 | UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) ); |
814 | | |
815 | 0 | while ( (utf16Left > 0) && (utf8Left > 0) ) { |
816 | | |
817 | | // Do a run of ASCII, it copies 1 input unit into 1 output unit. |
818 | 0 | size_t i, limit = utf16Left; |
819 | 0 | if ( limit > utf8Left ) limit = utf8Left; |
820 | 0 | for ( i = 0; i < limit; ++i ) { |
821 | 0 | UTF16Unit inUnit = *utf16Pos; |
822 | 0 | if ( inUnit > 0x7F ) break; |
823 | 0 | *utf8Pos = UTF8Unit(inUnit); |
824 | 0 | ++utf16Pos; |
825 | 0 | ++utf8Pos; |
826 | 0 | } |
827 | 0 | utf16Left -= i; |
828 | 0 | utf8Left -= i; |
829 | | |
830 | | // Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units. |
831 | 0 | while ( (utf16Left > 0) && (utf8Left > 0) ) { |
832 | 0 | size_t len8; |
833 | 0 | UTF16Unit inUnit = *utf16Pos; |
834 | 0 | if ( inUnit <= 0x7F ) break; |
835 | 0 | if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break; |
836 | 0 | CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 ); |
837 | 0 | if ( len8 == 0 ) goto Done; // Not enough room in the output buffer. |
838 | 0 | utf16Left -= 1; |
839 | 0 | utf16Pos += 1; |
840 | 0 | utf8Left -= len8; |
841 | 0 | utf8Pos += len8; |
842 | 0 | } |
843 | | |
844 | | // Do a run of surrogate pairs, it copies 2 input units into multiple output units. |
845 | 0 | while ( (utf16Left > 0) && (utf8Left > 0) ) { |
846 | 0 | UTF32Unit cp; |
847 | 0 | size_t len16, len8; |
848 | 0 | UTF16Unit inUnit = *utf16Pos; |
849 | 0 | if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break; |
850 | 0 | CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len16 ); |
851 | 0 | if ( len16 == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair. |
852 | 0 | UC_Assert ( len16 == 2 ); |
853 | 0 | CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 ); |
854 | 0 | if ( len8 == 0 ) goto Done; // Not enough room in the output buffer. |
855 | 0 | utf16Left -= len16; |
856 | 0 | utf16Pos += len16; |
857 | 0 | utf8Left -= len8; |
858 | 0 | utf8Pos += len8; |
859 | 0 | } |
860 | | |
861 | 0 | } |
862 | | |
863 | 0 | Done: // Set the output lengths. |
864 | 0 | *utf16Read = utf16Len - utf16Left; |
865 | 0 | *utf8Written = utf8Len - utf8Left; |
866 | | |
867 | 0 | } // UTF16Nat_to_UTF8 |
868 | | |
869 | | // ================================================================================================= |
870 | | |
871 | | static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len, |
872 | | UTF8Unit * utf8Out, const size_t utf8Len, |
873 | | size_t * utf32Read, size_t * utf8Written ) |
874 | 0 | { |
875 | 0 | const UTF32Unit * utf32Pos = utf32In; |
876 | 0 | UTF8Unit * utf8Pos = utf8Out; |
877 | | |
878 | 0 | size_t utf32Left = utf32Len; |
879 | 0 | size_t utf8Left = utf8Len; |
880 | | |
881 | 0 | UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) ); |
882 | | |
883 | 0 | while ( (utf32Left > 0) && (utf8Left > 0) ) { |
884 | | |
885 | | // Do a run of ASCII, it copies 1 input unit into 1 output unit. |
886 | 0 | size_t i, limit = utf32Left; |
887 | 0 | if ( limit > utf8Left ) limit = utf8Left; |
888 | 0 | for ( i = 0; i < limit; ++i ) { |
889 | 0 | UTF32Unit inUnit = *utf32Pos; |
890 | 0 | if ( inUnit > 0x7F ) break; |
891 | 0 | *utf8Pos = UTF8Unit(inUnit); |
892 | 0 | ++utf32Pos; |
893 | 0 | ++utf8Pos; |
894 | 0 | } |
895 | 0 | utf32Left -= i; |
896 | 0 | utf8Left -= i; |
897 | | |
898 | | // Do a run of non-ASCII, it copies 1 input unit into multiple output units. |
899 | 0 | while ( (utf32Left > 0) && (utf8Left > 0) ) { |
900 | 0 | size_t len; |
901 | 0 | UTF32Unit inUnit = *utf32Pos; |
902 | 0 | if ( inUnit <= 0x7F ) break; |
903 | 0 | CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len ); |
904 | 0 | if ( len == 0 ) goto Done; // Not enough room in the output buffer. |
905 | 0 | utf32Left -= 1; |
906 | 0 | utf32Pos += 1; |
907 | 0 | utf8Left -= len; |
908 | 0 | utf8Pos += len; |
909 | 0 | } |
910 | | |
911 | 0 | } |
912 | | |
913 | 0 | Done: // Set the output lengths. |
914 | 0 | *utf32Read = utf32Len - utf32Left; |
915 | 0 | *utf8Written = utf8Len - utf8Left; |
916 | | |
917 | 0 | } // UTF32Nat_to_UTF8 |
918 | | |
919 | | // ================================================================================================= |
920 | | |
921 | | static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len, |
922 | | UTF32Unit * utf32Out, const size_t utf32Len, |
923 | | size_t * utf16Read, size_t * utf32Written ) |
924 | 0 | { |
925 | 0 | const UTF16Unit * utf16Pos = utf16In; |
926 | 0 | UTF32Unit * utf32Pos = utf32Out; |
927 | | |
928 | 0 | size_t utf16Left = utf16Len; |
929 | 0 | size_t utf32Left = utf32Len; |
930 | | |
931 | 0 | UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) ); |
932 | | |
933 | 0 | while ( (utf16Left > 0) && (utf32Left > 0) ) { |
934 | | |
935 | | // Do a run of BMP, it copies 1 input unit into 1 output unit. |
936 | 0 | size_t i, limit = utf16Left; |
937 | 0 | if ( limit > utf32Left ) limit = utf32Left; |
938 | 0 | for ( i = 0; i < limit; ++i ) { |
939 | 0 | UTF16Unit inUnit = *utf16Pos; |
940 | 0 | if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break; |
941 | 0 | *utf32Pos = inUnit; |
942 | 0 | ++utf16Pos; |
943 | 0 | ++utf32Pos; |
944 | 0 | } |
945 | 0 | utf16Left -= i; |
946 | 0 | utf32Left -= i; |
947 | | |
948 | | // Do a run of surrogate pairs, it copies 2 input units into 1 output unit. |
949 | 0 | while ( (utf16Left > 0) && (utf32Left > 0) ) { |
950 | 0 | size_t len; |
951 | 0 | UTF16Unit inUnit = *utf16Pos; |
952 | 0 | if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break; |
953 | 0 | CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len ); |
954 | 0 | if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair. |
955 | 0 | UC_Assert ( len == 2 ); |
956 | 0 | utf16Left -= len; |
957 | 0 | utf16Pos += len; |
958 | 0 | utf32Left -= 1; |
959 | 0 | utf32Pos += 1; |
960 | 0 | } |
961 | | |
962 | 0 | } |
963 | | |
964 | 0 | Done: // Set the output lengths. |
965 | 0 | *utf16Read = utf16Len - utf16Left; |
966 | 0 | *utf32Written = utf32Len - utf32Left; |
967 | | |
968 | 0 | } // UTF16Nat_to_UTF32Nat |
969 | | |
970 | | // ================================================================================================= |
971 | | |
972 | | static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len, |
973 | | UTF16Unit * utf16Out, const size_t utf16Len, |
974 | | size_t * utf32Read, size_t * utf16Written ) |
975 | 0 | { |
976 | 0 | const UTF32Unit * utf32Pos = utf32In; |
977 | 0 | UTF16Unit * utf16Pos = utf16Out; |
978 | | |
979 | 0 | size_t utf32Left = utf32Len; |
980 | 0 | size_t utf16Left = utf16Len; |
981 | | |
982 | 0 | UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) ); |
983 | | |
984 | 0 | while ( (utf32Left > 0) && (utf16Left > 0) ) { |
985 | | |
986 | | // Do a run of BMP, it copies 1 input unit into 1 output unit. |
987 | 0 | size_t i, limit = utf32Left; |
988 | 0 | if ( limit > utf16Left ) limit = utf16Left; |
989 | 0 | for ( i = 0; i < limit; ++i ) { |
990 | 0 | UTF32Unit inUnit = *utf32Pos; |
991 | 0 | if ( inUnit > 0xFFFF ) break; |
992 | 0 | *utf16Pos = UTF16Unit(inUnit); |
993 | 0 | ++utf32Pos; |
994 | 0 | ++utf16Pos; |
995 | 0 | } |
996 | 0 | utf32Left -= i; |
997 | 0 | utf16Left -= i; |
998 | | |
999 | | // Do a run of non-BMP, it copies 1 input unit into 2 output units. |
1000 | 0 | while ( (utf32Left > 0) && (utf16Left > 0) ) { |
1001 | 0 | size_t len; |
1002 | 0 | UTF32Unit inUnit = *utf32Pos; |
1003 | 0 | if ( inUnit <= 0xFFFF ) break; |
1004 | 0 | CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len ); |
1005 | 0 | if ( len == 0 ) goto Done; // Not enough room in the output buffer. |
1006 | 0 | UC_Assert ( len == 2 ); |
1007 | 0 | utf32Left -= 1; |
1008 | 0 | utf32Pos += 1; |
1009 | 0 | utf16Left -= 2; |
1010 | 0 | utf16Pos += 2; |
1011 | 0 | } |
1012 | | |
1013 | 0 | } |
1014 | | |
1015 | 0 | Done: // Set the output lengths. |
1016 | 0 | *utf32Read = utf32Len - utf32Left; |
1017 | 0 | *utf16Written = utf16Len - utf16Left; |
1018 | | |
1019 | 0 | } // UTF32Nat_to_UTF16Nat |
1020 | | |
1021 | | // ================================================================================================= |
1022 | | |
1023 | | static void CodePoint_to_UTF16Swp_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written ) |
1024 | 0 | { |
1025 | 0 | size_t unitCount = 0; |
1026 | 0 | UTF32Unit temp; // ! Avoid gcc complaints about declarations after goto's. |
1027 | |
|
1028 | 0 | if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam ); |
1029 | 0 | if ( utf16Len < 2 ) goto Done; // Not enough room for the output. |
1030 | | |
1031 | 0 | unitCount = 2; |
1032 | 0 | temp = cpIn - 0x10000; |
1033 | 0 | UTF16OutSwap ( &utf16Out[0], (0xD800 | UTF16Unit ( temp >> 10 )) ); |
1034 | 0 | UTF16OutSwap ( &utf16Out[1], (0xDC00 | UTF16Unit ( temp & 0x3FF)) ); |
1035 | | |
1036 | 0 | Done: |
1037 | 0 | *utf16Written = unitCount; |
1038 | 0 | return; |
1039 | | |
1040 | 0 | } // CodePoint_to_UTF16Swp_Surrogate |
1041 | | |
1042 | | // ================================================================================================= |
1043 | | |
1044 | | static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written ) |
1045 | 0 | { |
1046 | 0 | size_t unitCount = 0; |
1047 | |
|
1048 | 0 | UC_Assert ( (utf16Out != 0) && (utf16Written != 0) ); |
1049 | 0 | if ( utf16Len == 0 ) goto Done; |
1050 | 0 | if ( cpIn >= 0xD800 ) goto CheckSurrogate; // ! Force linear execution path for the BMP. |
1051 | | |
1052 | 0 | InBMP: |
1053 | 0 | unitCount = 1; |
1054 | 0 | UTF16OutSwap ( utf16Out, UTF16Unit(cpIn) ); |
1055 | | |
1056 | 0 | Done: |
1057 | 0 | *utf16Written = unitCount; |
1058 | 0 | return; |
1059 | | |
1060 | 0 | CheckSurrogate: |
1061 | 0 | if ( cpIn > 0xFFFF ) goto SurrogatePair; |
1062 | 0 | if ( cpIn > 0xDFFF ) goto InBMP; |
1063 | 0 | UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam ); |
1064 | | |
1065 | 0 | SurrogatePair: |
1066 | 0 | CodePoint_to_UTF16Swp_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written ); |
1067 | 0 | return; |
1068 | | |
1069 | 0 | } // CodePoint_to_UTF16Swp |
1070 | | |
1071 | | // ================================================================================================= |
1072 | | |
1073 | | static void CodePoint_from_UTF16Swp_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read ) |
1074 | 0 | { |
1075 | 0 | UTF16Unit hiUnit = UTF16InSwap(utf16In); |
1076 | 0 | size_t unitCount = 0; |
1077 | 0 | UTF16Unit loUnit; // ! Avoid gcc complaints about declarations after goto's. |
1078 | 0 | UTF32Unit cp; |
1079 | | |
1080 | | // ---------------------------------- |
1081 | | // We've got a UTF-16 surrogate pair. |
1082 | |
|
1083 | 0 | if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam ); |
1084 | 0 | if ( utf16Len < 2 ) goto Done; // Not enough input in this buffer. |
1085 | | |
1086 | 0 | loUnit = UTF16InSwap(utf16In+1); |
1087 | 0 | if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam ); |
1088 | | |
1089 | 0 | unitCount = 2; |
1090 | 0 | cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000; |
1091 | |
|
1092 | 0 | *cpOut = cp; // ! Don't put after Done, don't write if no input. |
1093 | | |
1094 | 0 | Done: |
1095 | 0 | *utf16Read = unitCount; |
1096 | 0 | return; |
1097 | | |
1098 | 0 | } // CodePoint_from_UTF16Swp_Surrogate |
1099 | | |
1100 | | // ================================================================================================= |
1101 | | |
1102 | | static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read ) |
1103 | 0 | { |
1104 | 0 | UTF16Unit inUnit; // ! Don't read until we know there is input. |
1105 | 0 | size_t unitCount = 0; |
1106 | |
|
1107 | 0 | UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) ); |
1108 | 0 | if ( utf16Len == 0 ) goto Done; |
1109 | 0 | inUnit = UTF16InSwap(utf16In); |
1110 | 0 | if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair; // ! Force linear execution path for the BMP. |
1111 | | |
1112 | 0 | unitCount = 1; |
1113 | 0 | *cpOut = inUnit; // ! Don't put after Done, don't write if no input. |
1114 | | |
1115 | 0 | Done: |
1116 | 0 | *utf16Read = unitCount; |
1117 | 0 | return; |
1118 | | |
1119 | 0 | SurrogatePair: |
1120 | 0 | CodePoint_from_UTF16Swp_Surrogate ( utf16In, utf16Len, cpOut, utf16Read ); |
1121 | 0 | return; |
1122 | | |
1123 | 0 | } // CodePoint_from_UTF16Swp |
1124 | | |
1125 | | // ================================================================================================= |
1126 | | |
1127 | | static void UTF8_to_UTF16Swp ( const UTF8Unit * utf8In, const size_t utf8Len, |
1128 | | UTF16Unit * utf16Out, const size_t utf16Len, |
1129 | | size_t * utf8Read, size_t * utf16Written ) |
1130 | 0 | { |
1131 | 0 | const UTF8Unit * utf8Pos = utf8In; |
1132 | 0 | UTF16Unit * utf16Pos = utf16Out; |
1133 | | |
1134 | 0 | size_t utf8Left = utf8Len; |
1135 | 0 | size_t utf16Left = utf16Len; |
1136 | | |
1137 | 0 | UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) ); |
1138 | | |
1139 | 0 | while ( (utf8Left > 0) && (utf16Left > 0) ) { |
1140 | | |
1141 | | // Do a run of ASCII, it copies 1 input unit into 1 output unit. |
1142 | 0 | size_t i, limit = utf8Left; |
1143 | 0 | if ( limit > utf16Left ) limit = utf16Left; |
1144 | 0 | for ( i = 0; i < limit; ++i ) { |
1145 | 0 | UTF8Unit inUnit = *utf8Pos; |
1146 | 0 | if ( inUnit > 0x7F ) break; |
1147 | 0 | *utf16Pos = UTF16Unit(inUnit) << 8; // Better than: UTF16OutSwap ( utf16Pos, inUnit ); |
1148 | 0 | ++utf8Pos; |
1149 | 0 | ++utf16Pos; |
1150 | 0 | } |
1151 | 0 | utf8Left -= i; |
1152 | 0 | utf16Left -= i; |
1153 | | |
1154 | | // Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units. |
1155 | 0 | while ( (utf8Left > 0) && (utf16Left > 0) ) { |
1156 | 0 | UTF32Unit cp; |
1157 | 0 | size_t len8, len16; |
1158 | 0 | UTF8Unit inUnit = *utf8Pos; |
1159 | 0 | if ( inUnit <= 0x7F ) break; |
1160 | 0 | CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 ); |
1161 | 0 | if ( len8 == 0 ) goto Done; // The input buffer ends in the middle of a character. |
1162 | 0 | if ( cp <= 0xFFFF ) { |
1163 | 0 | UTF16OutSwap ( utf16Pos, UTF16Unit(cp) ); |
1164 | 0 | len16 = 1; |
1165 | 0 | } else { |
1166 | 0 | CodePoint_to_UTF16Swp_Surrogate ( cp, utf16Pos, utf16Left, &len16 ); |
1167 | 0 | if ( len16 == 0 ) goto Done; // Not enough room in the output buffer. |
1168 | 0 | } |
1169 | 0 | utf8Left -= len8; |
1170 | 0 | utf8Pos += len8; |
1171 | 0 | utf16Left -= len16; |
1172 | 0 | utf16Pos += len16; |
1173 | 0 | } |
1174 | | |
1175 | 0 | } |
1176 | | |
1177 | 0 | Done: // Set the output lengths. |
1178 | 0 | *utf8Read = utf8Len - utf8Left; |
1179 | 0 | *utf16Written = utf16Len - utf16Left; |
1180 | | |
1181 | 0 | } // UTF8_to_UTF16Swp |
1182 | | |
1183 | | // ================================================================================================= |
1184 | | |
1185 | | static void UTF8_to_UTF32Swp ( const UTF8Unit * utf8In, const size_t utf8Len, |
1186 | | UTF32Unit * utf32Out, const size_t utf32Len, |
1187 | | size_t * utf8Read, size_t * utf32Written ) |
1188 | 0 | { |
1189 | 0 | const UTF8Unit * utf8Pos = utf8In; |
1190 | 0 | UTF32Unit * utf32Pos = utf32Out; |
1191 | | |
1192 | 0 | size_t utf8Left = utf8Len; |
1193 | 0 | size_t utf32Left = utf32Len; |
1194 | | |
1195 | 0 | UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) ); |
1196 | | |
1197 | 0 | while ( (utf8Left > 0) && (utf32Left > 0) ) { |
1198 | | |
1199 | | // Do a run of ASCII, it copies 1 input unit into 1 output unit. |
1200 | 0 | size_t i, limit = utf8Left; |
1201 | 0 | if ( limit > utf32Left ) limit = utf32Left; |
1202 | 0 | for ( i = 0; i < limit; ++i ) { |
1203 | 0 | UTF8Unit inUnit = *utf8Pos; |
1204 | 0 | if ( inUnit > 0x7F ) break; |
1205 | 0 | *utf32Pos = UTF32Unit(inUnit) << 24; // Better than: UTF32OutSwap ( utf32Pos, inUnit ); |
1206 | 0 | ++utf8Pos; |
1207 | 0 | ++utf32Pos; |
1208 | 0 | } |
1209 | 0 | utf8Left -= i; |
1210 | 0 | utf32Left -= i; |
1211 | | |
1212 | | // Do a run of non-ASCII, it copies variable input into 1 output unit. |
1213 | 0 | while ( (utf8Left > 0) && (utf32Left > 0) ) { |
1214 | 0 | size_t len; |
1215 | 0 | UTF32Unit cp; |
1216 | 0 | UTF8Unit inUnit = *utf8Pos; |
1217 | 0 | if ( inUnit <= 0x7F ) break; |
1218 | 0 | CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len ); |
1219 | 0 | if ( len == 0 ) goto Done; // The input buffer ends in the middle of a character. |
1220 | 0 | UTF32OutSwap ( utf32Pos, cp ); |
1221 | 0 | utf8Left -= len; |
1222 | 0 | utf8Pos += len; |
1223 | 0 | utf32Left -= 1; |
1224 | 0 | utf32Pos += 1; |
1225 | 0 | } |
1226 | | |
1227 | 0 | } |
1228 | | |
1229 | 0 | Done: // Set the output lengths. |
1230 | 0 | *utf8Read = utf8Len - utf8Left; |
1231 | 0 | *utf32Written = utf32Len - utf32Left; |
1232 | | |
1233 | 0 | } // UTF8_to_UTF32Swp |
1234 | | |
1235 | | // ================================================================================================= |
1236 | | |
1237 | | static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len, |
1238 | | UTF8Unit * utf8Out, const size_t utf8Len, |
1239 | | size_t * utf16Read, size_t * utf8Written ) |
1240 | 0 | { |
1241 | 0 | const UTF16Unit * utf16Pos = utf16In; |
1242 | 0 | UTF8Unit * utf8Pos = utf8Out; |
1243 | | |
1244 | 0 | size_t utf16Left = utf16Len; |
1245 | 0 | size_t utf8Left = utf8Len; |
1246 | | |
1247 | 0 | UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) ); |
1248 | | |
1249 | 0 | while ( (utf16Left > 0) && (utf8Left > 0) ) { |
1250 | | |
1251 | | // Do a run of ASCII, it copies 1 input unit into 1 output unit. |
1252 | 0 | size_t i, limit = utf16Left; |
1253 | 0 | if ( limit > utf8Left ) limit = utf8Left; |
1254 | 0 | for ( i = 0; i < limit; ++i ) { |
1255 | 0 | UTF16Unit inUnit = UTF16InSwap(utf16Pos); |
1256 | 0 | if ( inUnit > 0x7F ) break; |
1257 | 0 | *utf8Pos = UTF8Unit(inUnit); |
1258 | 0 | ++utf16Pos; |
1259 | 0 | ++utf8Pos; |
1260 | 0 | } |
1261 | 0 | utf16Left -= i; |
1262 | 0 | utf8Left -= i; |
1263 | | |
1264 | | // Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units. |
1265 | 0 | while ( (utf16Left > 0) && (utf8Left > 0) ) { |
1266 | 0 | size_t len8; |
1267 | 0 | UTF16Unit inUnit = UTF16InSwap(utf16Pos); |
1268 | 0 | if ( inUnit <= 0x7F ) break; |
1269 | 0 | if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break; |
1270 | 0 | CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 ); |
1271 | 0 | if ( len8 == 0 ) goto Done; // Not enough room in the output buffer. |
1272 | 0 | utf16Left -= 1; |
1273 | 0 | utf16Pos += 1; |
1274 | 0 | utf8Left -= len8; |
1275 | 0 | utf8Pos += len8; |
1276 | 0 | } |
1277 | | |
1278 | | // Do a run of surrogate pairs, it copies 2 input units into multiple output units. |
1279 | 0 | while ( (utf16Left > 0) && (utf8Left > 0) ) { |
1280 | 0 | UTF32Unit cp; |
1281 | 0 | size_t len16, len8; |
1282 | 0 | UTF16Unit inUnit = UTF16InSwap(utf16Pos); |
1283 | 0 | if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break; |
1284 | 0 | CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len16 ); |
1285 | 0 | if ( len16 == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair. |
1286 | 0 | UC_Assert ( len16 == 2 ); |
1287 | 0 | CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 ); |
1288 | 0 | if ( len8 == 0 ) goto Done; // Not enough room in the output buffer. |
1289 | 0 | utf16Left -= len16; |
1290 | 0 | utf16Pos += len16; |
1291 | 0 | utf8Left -= len8; |
1292 | 0 | utf8Pos += len8; |
1293 | 0 | } |
1294 | | |
1295 | 0 | } |
1296 | | |
1297 | 0 | Done: // Set the output lengths. |
1298 | 0 | *utf16Read = utf16Len - utf16Left; |
1299 | 0 | *utf8Written = utf8Len - utf8Left; |
1300 | | |
1301 | 0 | } // UTF16Swp_to_UTF8 |
1302 | | |
1303 | | // ================================================================================================= |
1304 | | |
1305 | | static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len, |
1306 | | UTF8Unit * utf8Out, const size_t utf8Len, |
1307 | | size_t * utf32Read, size_t * utf8Written ) |
1308 | 0 | { |
1309 | 0 | const UTF32Unit * utf32Pos = utf32In; |
1310 | 0 | UTF8Unit * utf8Pos = utf8Out; |
1311 | | |
1312 | 0 | size_t utf32Left = utf32Len; |
1313 | 0 | size_t utf8Left = utf8Len; |
1314 | | |
1315 | 0 | UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) ); |
1316 | | |
1317 | 0 | while ( (utf32Left > 0) && (utf8Left > 0) ) { |
1318 | | |
1319 | | // Do a run of ASCII, it copies 1 input unit into 1 output unit. |
1320 | 0 | size_t i, limit = utf32Left; |
1321 | 0 | if ( limit > utf8Left ) limit = utf8Left; |
1322 | 0 | for ( i = 0; i < limit; ++i ) { |
1323 | 0 | UTF32Unit cp = UTF32InSwap(utf32Pos); |
1324 | 0 | if ( cp > 0x7F ) break; |
1325 | 0 | *utf8Pos = UTF8Unit(cp); |
1326 | 0 | ++utf32Pos; |
1327 | 0 | ++utf8Pos; |
1328 | 0 | } |
1329 | 0 | utf32Left -= i; |
1330 | 0 | utf8Left -= i; |
1331 | | |
1332 | | // Do a run of non-ASCII, it copies 1 input unit into multiple output units. |
1333 | 0 | while ( (utf32Left > 0) && (utf8Left > 0) ) { |
1334 | 0 | size_t len; |
1335 | 0 | UTF32Unit cp = UTF32InSwap(utf32Pos); |
1336 | 0 | if ( cp <= 0x7F ) break; |
1337 | 0 | CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len ); |
1338 | 0 | if ( len == 0 ) goto Done; // Not enough room in the output buffer. |
1339 | 0 | utf32Left -= 1; |
1340 | 0 | utf32Pos += 1; |
1341 | 0 | utf8Left -= len; |
1342 | 0 | utf8Pos += len; |
1343 | 0 | } |
1344 | | |
1345 | 0 | } |
1346 | | |
1347 | 0 | Done: // Set the output lengths. |
1348 | 0 | *utf32Read = utf32Len - utf32Left; |
1349 | 0 | *utf8Written = utf8Len - utf8Left; |
1350 | | |
1351 | 0 | } // UTF32Swp_to_UTF8 |
1352 | | |
1353 | | // ================================================================================================= |
1354 | | |
1355 | | static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len, |
1356 | | UTF32Unit * utf32Out, const size_t utf32Len, |
1357 | | size_t * utf16Read, size_t * utf32Written ) |
1358 | 0 | { |
1359 | 0 | const UTF16Unit * utf16Pos = utf16In; |
1360 | 0 | UTF32Unit * utf32Pos = utf32Out; |
1361 | | |
1362 | 0 | size_t utf16Left = utf16Len; |
1363 | 0 | size_t utf32Left = utf32Len; |
1364 | | |
1365 | 0 | UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) ); |
1366 | | |
1367 | 0 | while ( (utf16Left > 0) && (utf32Left > 0) ) { |
1368 | | |
1369 | | // Do a run of BMP, it copies 1 input unit into 1 output unit. |
1370 | 0 | size_t i, limit = utf16Left; |
1371 | 0 | if ( limit > utf32Left ) limit = utf32Left; |
1372 | 0 | for ( i = 0; i < limit; ++i ) { |
1373 | 0 | UTF16Unit inUnit = UTF16InSwap(utf16Pos); |
1374 | 0 | if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break; |
1375 | 0 | *utf32Pos = UTF32Unit(*utf16Pos) << 16; // Better than: UTF32OutSwap ( utf32Pos, inUnit ); |
1376 | 0 | ++utf16Pos; |
1377 | 0 | ++utf32Pos; |
1378 | 0 | } |
1379 | 0 | utf16Left -= i; |
1380 | 0 | utf32Left -= i; |
1381 | | |
1382 | | // Do a run of surrogate pairs, it copies 2 input units into 1 output unit. |
1383 | 0 | while ( (utf16Left > 0) && (utf32Left > 0) ) { |
1384 | 0 | size_t len; |
1385 | 0 | UTF32Unit cp; |
1386 | 0 | UTF16Unit inUnit = UTF16InSwap(utf16Pos); |
1387 | 0 | if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break; |
1388 | 0 | CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len ); |
1389 | 0 | if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair. |
1390 | 0 | UTF32OutSwap ( utf32Pos, cp ); |
1391 | 0 | UC_Assert ( len == 2 ); |
1392 | 0 | utf16Left -= len; |
1393 | 0 | utf16Pos += len; |
1394 | 0 | utf32Left -= 1; |
1395 | 0 | utf32Pos += 1; |
1396 | 0 | } |
1397 | | |
1398 | 0 | } |
1399 | | |
1400 | 0 | Done: // Set the output lengths. |
1401 | 0 | *utf16Read = utf16Len - utf16Left; |
1402 | 0 | *utf32Written = utf32Len - utf32Left; |
1403 | | |
1404 | 0 | } // UTF16Swp_to_UTF32Swp |
1405 | | |
1406 | | // ================================================================================================= |
1407 | | |
1408 | | static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len, |
1409 | | UTF16Unit * utf16Out, const size_t utf16Len, |
1410 | | size_t * utf32Read, size_t * utf16Written ) |
1411 | 0 | { |
1412 | 0 | const UTF32Unit * utf32Pos = utf32In; |
1413 | 0 | UTF16Unit * utf16Pos = utf16Out; |
1414 | | |
1415 | 0 | size_t utf32Left = utf32Len; |
1416 | 0 | size_t utf16Left = utf16Len; |
1417 | | |
1418 | 0 | const size_t k32to16Offset = swap32to16Offset; // ! Make sure compiler treats as an invariant. |
1419 | | |
1420 | 0 | UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) ); |
1421 | | |
1422 | 0 | while ( (utf32Left > 0) && (utf16Left > 0) ) { |
1423 | | |
1424 | | // Do a run of BMP, it copies 1 input unit into 1 output unit. |
1425 | 0 | size_t i, limit = utf32Left; |
1426 | 0 | if ( limit > utf16Left ) limit = utf16Left; |
1427 | 0 | for ( i = 0; i < limit; ++i ) { |
1428 | 0 | UTF32Unit inUnit = UTF32InSwap(utf32Pos); |
1429 | 0 | if ( inUnit > 0xFFFF ) break; |
1430 | 0 | *utf16Pos = *(((UTF16Unit*)utf32Pos) + k32to16Offset); // Better than: UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) ); |
1431 | 0 | ++utf32Pos; |
1432 | 0 | ++utf16Pos; |
1433 | 0 | } |
1434 | 0 | utf32Left -= i; |
1435 | 0 | utf16Left -= i; |
1436 | | |
1437 | | // Do a run of non-BMP, it copies 1 input unit into 2 output units. |
1438 | 0 | while ( (utf32Left > 0) && (utf16Left > 0) ) { |
1439 | 0 | size_t len; |
1440 | 0 | UTF32Unit inUnit = UTF32InSwap(utf32Pos); |
1441 | 0 | if ( inUnit <= 0xFFFF ) break; |
1442 | 0 | CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len ); |
1443 | 0 | if ( len == 0 ) goto Done; // Not enough room in the output buffer. |
1444 | 0 | UC_Assert ( len == 2 ); |
1445 | 0 | utf32Left -= 1; |
1446 | 0 | utf32Pos += 1; |
1447 | 0 | utf16Left -= 2; |
1448 | 0 | utf16Pos += 2; |
1449 | 0 | } |
1450 | | |
1451 | 0 | } |
1452 | | |
1453 | 0 | Done: // Set the output lengths. |
1454 | 0 | *utf32Read = utf32Len - utf32Left; |
1455 | 0 | *utf16Written = utf16Len - utf16Left; |
1456 | | |
1457 | 0 | } // UTF32Swp_to_UTF16Swp |
1458 | | |
1459 | | // ================================================================================================= |
1460 | | |
1461 | | static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len, |
1462 | | UTF32Unit * utf32Out, const size_t utf32Len, |
1463 | | size_t * utf16Read, size_t * utf32Written ) |
1464 | 0 | { |
1465 | 0 | const UTF16Unit * utf16Pos = utf16In; |
1466 | 0 | UTF32Unit * utf32Pos = utf32Out; |
1467 | | |
1468 | 0 | size_t utf16Left = utf16Len; |
1469 | 0 | size_t utf32Left = utf32Len; |
1470 | | |
1471 | 0 | UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) ); |
1472 | | |
1473 | 0 | while ( (utf16Left > 0) && (utf32Left > 0) ) { |
1474 | | |
1475 | | // Do a run of BMP, it copies 1 input unit into 1 output unit. |
1476 | 0 | size_t i, limit = utf16Left; |
1477 | 0 | if ( limit > utf32Left ) limit = utf32Left; |
1478 | 0 | for ( i = 0; i < limit; ++i ) { |
1479 | 0 | UTF16Unit inUnit = *utf16Pos; |
1480 | 0 | if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break; |
1481 | 0 | UTF32OutSwap ( utf32Pos, inUnit ); |
1482 | 0 | ++utf16Pos; |
1483 | 0 | ++utf32Pos; |
1484 | 0 | } |
1485 | 0 | utf16Left -= i; |
1486 | 0 | utf32Left -= i; |
1487 | | |
1488 | | // Do a run of surrogate pairs, it copies 2 input units into 1 output unit. |
1489 | 0 | while ( (utf16Left > 0) && (utf32Left > 0) ) { |
1490 | 0 | size_t len; |
1491 | 0 | UTF32Unit cp; |
1492 | 0 | UTF16Unit inUnit = *utf16Pos; |
1493 | 0 | if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break; |
1494 | 0 | CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len ); |
1495 | 0 | if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair. |
1496 | 0 | UC_Assert ( len == 2 ); |
1497 | 0 | UTF32OutSwap ( utf32Pos, cp ); |
1498 | 0 | utf16Left -= len; |
1499 | 0 | utf16Pos += len; |
1500 | 0 | utf32Left -= 1; |
1501 | 0 | utf32Pos += 1; |
1502 | 0 | } |
1503 | | |
1504 | 0 | } |
1505 | | |
1506 | 0 | Done: // Set the output lengths. |
1507 | 0 | *utf16Read = utf16Len - utf16Left; |
1508 | 0 | *utf32Written = utf32Len - utf32Left; |
1509 | | |
1510 | 0 | } // UTF16Nat_to_UTF32Swp |
1511 | | |
1512 | | // ================================================================================================= |
1513 | | |
1514 | | static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len, |
1515 | | UTF32Unit * utf32Out, const size_t utf32Len, |
1516 | | size_t * utf16Read, size_t * utf32Written ) |
1517 | 0 | { |
1518 | 0 | const UTF16Unit * utf16Pos = utf16In; |
1519 | 0 | UTF32Unit * utf32Pos = utf32Out; |
1520 | | |
1521 | 0 | size_t utf16Left = utf16Len; |
1522 | 0 | size_t utf32Left = utf32Len; |
1523 | | |
1524 | 0 | UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) ); |
1525 | | |
1526 | 0 | while ( (utf16Left > 0) && (utf32Left > 0) ) { |
1527 | | |
1528 | | // Do a run of BMP, it copies 1 input unit into 1 output unit. |
1529 | 0 | size_t i, limit = utf16Left; |
1530 | 0 | if ( limit > utf32Left ) limit = utf32Left; |
1531 | 0 | for ( i = 0; i < limit; ++i ) { |
1532 | 0 | UTF16Unit inUnit = UTF16InSwap(utf16Pos); |
1533 | 0 | if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break; |
1534 | 0 | *utf32Pos = inUnit; |
1535 | 0 | ++utf16Pos; |
1536 | 0 | ++utf32Pos; |
1537 | 0 | } |
1538 | 0 | utf16Left -= i; |
1539 | 0 | utf32Left -= i; |
1540 | | |
1541 | | // Do a run of surrogate pairs, it copies 2 input units into 1 output unit. |
1542 | 0 | while ( (utf16Left > 0) && (utf32Left > 0) ) { |
1543 | 0 | size_t len; |
1544 | 0 | UTF16Unit inUnit = UTF16InSwap(utf16Pos); |
1545 | 0 | if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break; |
1546 | 0 | CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len ); |
1547 | 0 | if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair. |
1548 | 0 | UC_Assert ( len == 2 ); |
1549 | 0 | utf16Left -= len; |
1550 | 0 | utf16Pos += len; |
1551 | 0 | utf32Left -= 1; |
1552 | 0 | utf32Pos += 1; |
1553 | 0 | } |
1554 | | |
1555 | 0 | } |
1556 | | |
1557 | 0 | Done: // Set the output lengths. |
1558 | 0 | *utf16Read = utf16Len - utf16Left; |
1559 | 0 | *utf32Written = utf32Len - utf32Left; |
1560 | | |
1561 | 0 | } // UTF16Swp_to_UTF32Nat |
1562 | | |
1563 | | // ================================================================================================= |
1564 | | |
1565 | | static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len, |
1566 | | UTF16Unit * utf16Out, const size_t utf16Len, |
1567 | | size_t * utf32Read, size_t * utf16Written ) |
1568 | 0 | { |
1569 | 0 | const UTF32Unit * utf32Pos = utf32In; |
1570 | 0 | UTF16Unit * utf16Pos = utf16Out; |
1571 | | |
1572 | 0 | size_t utf32Left = utf32Len; |
1573 | 0 | size_t utf16Left = utf16Len; |
1574 | | |
1575 | 0 | UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) ); |
1576 | | |
1577 | 0 | while ( (utf32Left > 0) && (utf16Left > 0) ) { |
1578 | | |
1579 | | // Do a run of BMP, it copies 1 input unit into 1 output unit. |
1580 | 0 | size_t i, limit = utf32Left; |
1581 | 0 | if ( limit > utf16Left ) limit = utf16Left; |
1582 | 0 | for ( i = 0; i < limit; ++i ) { |
1583 | 0 | UTF32Unit inUnit = *utf32Pos; |
1584 | 0 | if ( inUnit > 0xFFFF ) break; |
1585 | 0 | UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) ); |
1586 | 0 | ++utf32Pos; |
1587 | 0 | ++utf16Pos; |
1588 | 0 | } |
1589 | 0 | utf32Left -= i; |
1590 | 0 | utf16Left -= i; |
1591 | | |
1592 | | // Do a run of non-BMP, it copies 1 input unit into 2 output units. |
1593 | 0 | while ( (utf32Left > 0) && (utf16Left > 0) ) { |
1594 | 0 | size_t len; |
1595 | 0 | UTF32Unit inUnit = *utf32Pos; |
1596 | 0 | if ( inUnit <= 0xFFFF ) break; |
1597 | 0 | CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len ); |
1598 | 0 | if ( len == 0 ) goto Done; // Not enough room in the output buffer. |
1599 | 0 | UC_Assert ( len == 2 ); |
1600 | 0 | utf32Left -= 1; |
1601 | 0 | utf32Pos += 1; |
1602 | 0 | utf16Left -= 2; |
1603 | 0 | utf16Pos += 2; |
1604 | 0 | } |
1605 | | |
1606 | 0 | } |
1607 | | |
1608 | 0 | Done: // Set the output lengths. |
1609 | 0 | *utf32Read = utf32Len - utf32Left; |
1610 | 0 | *utf16Written = utf16Len - utf16Left; |
1611 | | |
1612 | 0 | } // UTF32Nat_to_UTF16Swp |
1613 | | |
1614 | | // ================================================================================================= |
1615 | | |
1616 | | static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len, |
1617 | | UTF16Unit * utf16Out, const size_t utf16Len, |
1618 | | size_t * utf32Read, size_t * utf16Written ) |
1619 | 0 | { |
1620 | 0 | const UTF32Unit * utf32Pos = utf32In; |
1621 | 0 | UTF16Unit * utf16Pos = utf16Out; |
1622 | | |
1623 | 0 | size_t utf32Left = utf32Len; |
1624 | 0 | size_t utf16Left = utf16Len; |
1625 | | |
1626 | 0 | UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) ); |
1627 | | |
1628 | 0 | while ( (utf32Left > 0) && (utf16Left > 0) ) { |
1629 | | |
1630 | | // Do a run of BMP, it copies 1 input unit into 1 output unit. |
1631 | 0 | size_t i, limit = utf32Left; |
1632 | 0 | if ( limit > utf16Left ) limit = utf16Left; |
1633 | 0 | for ( i = 0; i < limit; ++i ) { |
1634 | 0 | UTF32Unit inUnit = UTF32InSwap(utf32Pos); |
1635 | 0 | if ( inUnit > 0xFFFF ) break; |
1636 | 0 | *utf16Pos = UTF16Unit(inUnit); |
1637 | 0 | ++utf32Pos; |
1638 | 0 | ++utf16Pos; |
1639 | 0 | } |
1640 | 0 | utf32Left -= i; |
1641 | 0 | utf16Left -= i; |
1642 | | |
1643 | | // Do a run of non-BMP, it copies 1 input unit into 2 output units. |
1644 | 0 | while ( (utf32Left > 0) && (utf16Left > 0) ) { |
1645 | 0 | size_t len; |
1646 | 0 | UTF32Unit inUnit = UTF32InSwap(utf32Pos); |
1647 | 0 | if ( inUnit <= 0xFFFF ) break; |
1648 | 0 | CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len ); |
1649 | 0 | if ( len == 0 ) goto Done; // Not enough room in the output buffer. |
1650 | 0 | UC_Assert ( len == 2 ); |
1651 | 0 | utf32Left -= 1; |
1652 | 0 | utf32Pos += 1; |
1653 | 0 | utf16Left -= 2; |
1654 | 0 | utf16Pos += 2; |
1655 | 0 | } |
1656 | | |
1657 | 0 | } |
1658 | | |
1659 | 0 | Done: // Set the output lengths. |
1660 | 0 | *utf32Read = utf32Len - utf32Left; |
1661 | 0 | *utf16Written = utf16Len - utf16Left; |
1662 | | |
1663 | 0 | } // UTF32Swp_to_UTF16Nat |
1664 | | |
1665 | | // ================================================================================================= |