Coverage Report

Created: 2026-02-26 07:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/exiv2/xmpsdk/src/UnicodeConversions.cpp
Line
Count
Source
1
// =================================================================================================
2
// Copyright 2004-2007 Adobe Systems Incorporated
3
// All Rights Reserved.
4
//
5
// NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
6
// of the Adobe license agreement accompanying it.
7
// =================================================================================================
8
9
#include "XMP_Const.h"
10
11
#if UnicodeTestBuild
12
  #include <cassert>
13
  #include <stdexcept>
14
  #define UC_Assert assert
15
  #define UC_Throw(m,k) throw std::logic_error ( m )
16
#else
17
  #define UC_Assert(cond) /* Nothing for now, should be XMP_Assert. */
18
0
  #define UC_Throw(msg,id)  throw XMP_Error ( id, msg )
19
#endif
20
21
#include "UnicodeConversions.hpp"
22
23
using namespace std;
24
25
// =================================================================================================
26
27
// *** Look into using asm inlines, e.g. count-leading bits for multi-byte UTF-8.
28
29
CodePoint_to_UTF16_Proc CodePoint_to_UTF16BE = 0;
30
CodePoint_to_UTF16_Proc CodePoint_to_UTF16LE = 0;
31
32
CodePoint_from_UTF16_Proc CodePoint_from_UTF16BE = 0;
33
CodePoint_from_UTF16_Proc CodePoint_from_UTF16LE = 0;
34
35
UTF8_to_UTF16_Proc  UTF8_to_UTF16BE = 0;
36
UTF8_to_UTF16_Proc  UTF8_to_UTF16LE = 0;
37
UTF8_to_UTF32_Proc  UTF8_to_UTF32BE = 0;
38
UTF8_to_UTF32_Proc  UTF8_to_UTF32LE = 0;
39
40
UTF16_to_UTF8_Proc  UTF16BE_to_UTF8 = 0;
41
UTF16_to_UTF8_Proc  UTF16LE_to_UTF8 = 0;
42
UTF32_to_UTF8_Proc  UTF32BE_to_UTF8 = 0;
43
UTF32_to_UTF8_Proc  UTF32LE_to_UTF8 = 0;
44
45
UTF8_to_UTF16_Proc  UTF8_to_UTF16Native = 0;
46
UTF8_to_UTF32_Proc  UTF8_to_UTF32Native = 0;
47
UTF16_to_UTF8_Proc  UTF16Native_to_UTF8 = 0;
48
UTF32_to_UTF8_Proc  UTF32Native_to_UTF8 = 0;
49
50
UTF16_to_UTF32_Proc UTF16BE_to_UTF32BE = 0;
51
UTF16_to_UTF32_Proc UTF16BE_to_UTF32LE = 0;
52
UTF16_to_UTF32_Proc UTF16LE_to_UTF32BE = 0;
53
UTF16_to_UTF32_Proc UTF16LE_to_UTF32LE = 0;
54
55
UTF32_to_UTF16_Proc UTF32BE_to_UTF16BE = 0;
56
UTF32_to_UTF16_Proc UTF32BE_to_UTF16LE = 0;
57
UTF32_to_UTF16_Proc UTF32LE_to_UTF16BE = 0;
58
UTF32_to_UTF16_Proc UTF32LE_to_UTF16LE = 0;
59
60
// -------------------------------------------------------------------------------------------------
61
62
static size_t swap32to16Offset = 0; // Offset to "convert" a swapped UTF32 pointer into a swapped UTF16 pointer.
63
64
// -------------------------------------------------------------------------------------------------
65
66
static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );
67
static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );
68
69
static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );
70
static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );
71
72
// -------------------------------------------------------------------------------------------------
73
74
static void UTF8_to_UTF16Nat ( const UTF8Unit *  utf8In,    const size_t utf8Len,
75
                       UTF16Unit *       utf16Out,  const size_t utf16Len,
76
                       size_t *          utf8Read,  size_t *     utf16Written );
77
78
static void UTF8_to_UTF16Swp ( const UTF8Unit *  utf8In,    const size_t utf8Len,
79
                       UTF16Unit *       utf16Out,  const size_t utf16Len,
80
                       size_t *          utf8Read,  size_t *     utf16Written );
81
82
static void UTF8_to_UTF32Nat ( const UTF8Unit *  utf8In,    const size_t utf8Len,
83
                       UTF32Unit *       utf32Out,  const size_t utf32Len,
84
                       size_t *          utf8Read,  size_t *     utf32Written );
85
86
static void UTF8_to_UTF32Swp ( const UTF8Unit *  utf8In,    const size_t utf8Len,
87
                       UTF32Unit *       utf32Out,  const size_t utf32Len,
88
                       size_t *          utf8Read,  size_t *     utf32Written );
89
90
// -------------------------------------------------------------------------------------------------
91
92
static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
93
                       UTF8Unit *        utf8Out,   const size_t utf8Len,
94
                       size_t *          utf16Read, size_t *     utf8Written );
95
96
static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
97
                       UTF8Unit *        utf8Out,   const size_t utf8Len,
98
                       size_t *          utf16Read, size_t *     utf8Written );
99
100
static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
101
                       UTF8Unit *        utf8Out,   const size_t utf8Len,
102
                       size_t *          utf32Read, size_t *     utf8Written );
103
104
static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
105
                       UTF8Unit *        utf8Out,   const size_t utf8Len,
106
                       size_t *          utf32Read, size_t *     utf8Written );
107
108
// -------------------------------------------------------------------------------------------------
109
110
static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
111
                           UTF32Unit *       utf32Out,  const size_t utf32Len,
112
                           size_t *          utf16Read, size_t *     utf32Written );
113
114
static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
115
                           UTF32Unit *       utf32Out,  const size_t utf32Len,
116
                           size_t *          utf16Read, size_t *     utf32Written );
117
118
static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
119
                           UTF32Unit *       utf32Out,  const size_t utf32Len,
120
                           size_t *          utf16Read, size_t *     utf32Written );
121
122
static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
123
                           UTF32Unit *       utf32Out,  const size_t utf32Len,
124
                           size_t *          utf16Read, size_t *     utf32Written );
125
126
// -------------------------------------------------------------------------------------------------
127
128
static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
129
                           UTF16Unit *       utf16Out,  const size_t utf16Len,
130
                           size_t *          utf32Read, size_t *     utf16Written );
131
132
static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
133
                           UTF16Unit *       utf16Out,  const size_t utf16Len,
134
                           size_t *          utf32Read, size_t *     utf16Written );
135
136
static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
137
                           UTF16Unit *       utf16Out,  const size_t utf16Len,
138
                           size_t *          utf32Read, size_t *     utf16Written );
139
140
static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
141
                           UTF16Unit *       utf16Out,  const size_t utf16Len,
142
                           size_t *          utf32Read, size_t *     utf16Written );
143
144
// =================================================================================================
145
146
void InitializeUnicodeConversions()
147
1
{
148
1
  UC_Assert ( (sizeof(UTF8Unit) == 1) && (sizeof(UTF16Unit) == 2) && (sizeof(UTF32Unit) == 4) ); 
149
150
1
  UTF16Unit u16  = 0x00FF;
151
1
  bool bigEndian = (*((UTF8Unit*)&u16) == 0);
152
153
1
  UTF8_to_UTF16Native = UTF8_to_UTF16Nat;
154
1
  UTF8_to_UTF32Native = UTF8_to_UTF32Nat;
155
1
  UTF16Native_to_UTF8 = UTF16Nat_to_UTF8;
156
1
  UTF32Native_to_UTF8 = UTF32Nat_to_UTF8;
157
  
158
1
  if ( bigEndian ) {
159
  
160
0
    swap32to16Offset = 0;
161
162
0
    CodePoint_to_UTF16BE = CodePoint_to_UTF16Nat;
163
0
    CodePoint_to_UTF16LE = CodePoint_to_UTF16Swp;
164
165
0
    CodePoint_from_UTF16BE = CodePoint_from_UTF16Nat;
166
0
    CodePoint_from_UTF16LE = CodePoint_from_UTF16Swp;
167
168
0
    UTF8_to_UTF16BE = UTF8_to_UTF16Nat;
169
0
    UTF8_to_UTF16LE = UTF8_to_UTF16Swp;
170
0
    UTF8_to_UTF32BE = UTF8_to_UTF32Nat;
171
0
    UTF8_to_UTF32LE = UTF8_to_UTF32Swp;
172
173
0
    UTF16BE_to_UTF8 = UTF16Nat_to_UTF8;
174
0
    UTF16LE_to_UTF8 = UTF16Swp_to_UTF8;
175
0
    UTF32BE_to_UTF8 = UTF32Nat_to_UTF8;
176
0
    UTF32LE_to_UTF8 = UTF32Swp_to_UTF8;
177
178
0
    UTF16BE_to_UTF32BE = UTF16Nat_to_UTF32Nat;
179
0
    UTF16BE_to_UTF32LE = UTF16Nat_to_UTF32Swp;
180
0
    UTF16LE_to_UTF32BE = UTF16Swp_to_UTF32Nat;
181
0
    UTF16LE_to_UTF32LE = UTF16Swp_to_UTF32Swp;
182
183
0
    UTF32BE_to_UTF16BE = UTF32Nat_to_UTF16Nat;
184
0
    UTF32BE_to_UTF16LE = UTF32Nat_to_UTF16Swp;
185
0
    UTF32LE_to_UTF16BE = UTF32Swp_to_UTF16Nat;
186
0
    UTF32LE_to_UTF16LE = UTF32Swp_to_UTF16Swp;
187
188
1
  } else {
189
  
190
1
    swap32to16Offset = 1; // ! Offset in UTF16 units!
191
192
1
    CodePoint_to_UTF16BE = CodePoint_to_UTF16Swp;
193
1
    CodePoint_to_UTF16LE = CodePoint_to_UTF16Nat;
194
195
1
    CodePoint_from_UTF16BE = CodePoint_from_UTF16Swp;
196
1
    CodePoint_from_UTF16LE = CodePoint_from_UTF16Nat;
197
198
1
    UTF8_to_UTF16BE = UTF8_to_UTF16Swp;
199
1
    UTF8_to_UTF16LE = UTF8_to_UTF16Nat;
200
1
    UTF8_to_UTF32BE = UTF8_to_UTF32Swp;
201
1
    UTF8_to_UTF32LE = UTF8_to_UTF32Nat;
202
203
1
    UTF16BE_to_UTF8 = UTF16Swp_to_UTF8;
204
1
    UTF16LE_to_UTF8 = UTF16Nat_to_UTF8;
205
1
    UTF32BE_to_UTF8 = UTF32Swp_to_UTF8;
206
1
    UTF32LE_to_UTF8 = UTF32Nat_to_UTF8;
207
208
1
    UTF16BE_to_UTF32BE = UTF16Swp_to_UTF32Swp;
209
1
    UTF16BE_to_UTF32LE = UTF16Swp_to_UTF32Nat;
210
1
    UTF16LE_to_UTF32BE = UTF16Nat_to_UTF32Swp;
211
1
    UTF16LE_to_UTF32LE = UTF16Nat_to_UTF32Nat;
212
213
1
    UTF32BE_to_UTF16BE = UTF32Swp_to_UTF16Swp;
214
1
    UTF32BE_to_UTF16LE = UTF32Swp_to_UTF16Nat;
215
1
    UTF32LE_to_UTF16BE = UTF32Nat_to_UTF16Swp;
216
1
    UTF32LE_to_UTF16LE = UTF32Nat_to_UTF16Nat;
217
218
1
  }
219
220
1
}  // InitializeUnicodeConversions
221
222
// =================================================================================================
223
224
#if XMP_MacBuild && __MWERKS__ 
225
226
  #define UTF16InSwap(inPtr)  UTF16Unit ( __lhbrx ( (void*)(inPtr), 0 ) )
227
  #define UTF32InSwap(inPtr)  UTF32Unit ( __lwbrx ( (void*)(inPtr), 0 ) )
228
  
229
  #define UTF16OutSwap(outPtr,value)  __sthbrx ( value, (void*)(outPtr), 0 )
230
  #define UTF32OutSwap(outPtr,value)  __stwbrx ( value, (void*)(outPtr), 0 )
231
232
#else
233
234
  static inline UTF16Unit UTF16InSwap ( const UTF16Unit * inPtr )
235
0
  {
236
0
    UTF16Unit inUnit = *inPtr;
237
0
    return (inUnit << 8) | (inUnit >> 8);
238
0
  }
239
240
  static inline UTF32Unit UTF32InSwap ( const UTF32Unit * inPtr )
241
0
  {
242
0
    UTF32Unit inUnit = *inPtr;
243
0
    return (inUnit << 24) | ((inUnit << 8) & 0x00FF0000) | ((inUnit >> 8) & 0x0000FF00) | (inUnit >> 24);
244
0
  }
245
246
  static inline void UTF16OutSwap ( UTF16Unit * outPtr, const UTF16Unit value )
247
0
  {
248
0
    UTF16Unit outUnit = (value << 8) | (value >> 8);
249
0
    *outPtr = outUnit;
250
0
  }
251
252
  static inline void UTF32OutSwap ( UTF32Unit * outPtr, const UTF32Unit value )
253
0
  {
254
0
    UTF32Unit outUnit = (value << 24) | ((value << 8) & 0x00FF0000) | ((value >> 8) & 0x0000FF00) | (value >> 24);
255
0
    *outPtr = outUnit;
256
0
  }
257
258
#endif
259
260
// =================================================================================================
261
262
void SwapUTF16 ( const UTF16Unit * utf16In, UTF16Unit * utf16Out, const size_t utf16Len )
263
0
{
264
0
  for ( size_t i = 0; i < utf16Len; ++i ) utf16Out[i] = UTF16InSwap(utf16In+i);
265
0
}
266
267
0
void SwapUTF32 ( const UTF32Unit * utf32In, UTF32Unit * utf32Out, const size_t utf32Len ) {
268
0
  for ( size_t i = 0; i < utf32Len; ++i ) utf32Out[i] = UTF32InSwap(utf32In+i);
269
0
}
270
271
// =================================================================================================
272
273
extern void ToUTF16 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str, bool bigEndian )
274
0
{
275
0
  UTF8_to_UTF16_Proc Converter = UTF8_to_UTF16LE;
276
0
  if ( bigEndian ) Converter = UTF8_to_UTF16BE;
277
  
278
0
  enum { kBufferSize = 8*1024 };
279
0
  UTF16Unit u16Buffer[kBufferSize]; // 16K bytes
280
0
  size_t readCount, writeCount;
281
282
0
  utf16Str->erase();
283
0
  utf16Str->reserve ( 2*utf8Len );  // As good a guess as any.
284
  
285
0
  while ( utf8Len > 0 ) {
286
0
    Converter ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
287
0
    if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
288
0
    utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
289
0
    utf8In  += readCount;
290
0
    utf8Len -= readCount;
291
0
  }
292
293
0
}  // ToUTF16
294
295
// =================================================================================================
296
297
extern void ToUTF16Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str )
298
0
{
299
0
  enum { kBufferSize = 8*1024 };
300
0
  UTF16Unit u16Buffer[kBufferSize]; // 16K bytes
301
0
  size_t readCount, writeCount;
302
303
0
  utf16Str->erase();
304
0
  utf16Str->reserve ( 2*utf8Len );  // As good a guess as any.
305
  
306
0
  while ( utf8Len > 0 ) {
307
0
    UTF8_to_UTF16Nat ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
308
0
    if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
309
0
    utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
310
0
    utf8In  += readCount;
311
0
    utf8Len -= readCount;
312
0
  }
313
314
0
}  // ToUTF16Native
315
316
// =================================================================================================
317
318
extern void ToUTF32 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str, bool bigEndian )
319
0
{
320
0
  UTF8_to_UTF32_Proc Converter = UTF8_to_UTF32LE;
321
0
  if ( bigEndian ) Converter = UTF8_to_UTF32BE;
322
  
323
0
  enum { kBufferSize = 4*1024 };
324
0
  UTF32Unit u32Buffer[kBufferSize]; // 16K bytes
325
0
  size_t readCount, writeCount;
326
327
0
  utf32Str->erase();
328
0
  utf32Str->reserve ( 4*utf8Len );  // As good a guess as any.
329
  
330
0
  while ( utf8Len > 0 ) {
331
0
    Converter ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
332
0
    if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
333
0
    utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
334
0
    utf8In  += readCount;
335
0
    utf8Len -= readCount;
336
0
  }
337
338
0
}  // ToUTF32
339
340
// =================================================================================================
341
342
extern void ToUTF32Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str )
343
0
{
344
0
  enum { kBufferSize = 4*1024 };
345
0
  UTF32Unit u32Buffer[kBufferSize]; // 16K bytes
346
0
  size_t readCount, writeCount;
347
348
0
  utf32Str->erase();
349
0
  utf32Str->reserve ( 4*utf8Len );  // As good a guess as any.
350
  
351
0
  while ( utf8Len > 0 ) {
352
0
    UTF8_to_UTF32Nat ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
353
0
    if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
354
0
    utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
355
0
    utf8In  += readCount;
356
0
    utf8Len -= readCount;
357
0
  }
358
359
0
}  // ToUTF32Native
360
361
// =================================================================================================
362
363
extern void FromUTF16 ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str, bool bigEndian )
364
0
{
365
0
  UTF16_to_UTF8_Proc Converter = UTF16LE_to_UTF8;
366
0
  if ( bigEndian ) Converter = UTF16BE_to_UTF8;
367
  
368
0
  enum { kBufferSize = 16*1024 };
369
0
  UTF8Unit u8Buffer[kBufferSize];
370
0
  size_t readCount, writeCount;
371
372
0
  utf8Str->erase();
373
0
  utf8Str->reserve ( 2*utf16Len );  // As good a guess as any.
374
  
375
0
  while ( utf16Len > 0 ) {
376
0
    Converter ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
377
0
    if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
378
0
    utf8Str->append ( (const char *)u8Buffer, writeCount );
379
0
    utf16In  += readCount;
380
0
    utf16Len -= readCount;
381
0
  }
382
383
0
}  // FromUTF16
384
385
// =================================================================================================
386
387
extern void FromUTF16Native ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str )
388
0
{
389
0
  enum { kBufferSize = 16*1024 };
390
0
  UTF8Unit u8Buffer[kBufferSize];
391
0
  size_t readCount, writeCount;
392
393
0
  utf8Str->erase();
394
0
  utf8Str->reserve ( 2*utf16Len );  // As good a guess as any.
395
  
396
0
  while ( utf16Len > 0 ) {
397
0
    UTF16Nat_to_UTF8 ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
398
0
    if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
399
0
    utf8Str->append ( (const char *)u8Buffer, writeCount );
400
0
    utf16In  += readCount;
401
0
    utf16Len -= readCount;
402
0
  }
403
404
0
}  // FromUTF16Native
405
406
// =================================================================================================
407
408
extern void FromUTF32 ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str, bool bigEndian )
409
0
{
410
0
  UTF32_to_UTF8_Proc Converter = UTF32LE_to_UTF8;
411
0
  if ( bigEndian ) Converter = UTF32BE_to_UTF8;
412
  
413
0
  enum { kBufferSize = 16*1024 };
414
0
  UTF8Unit u8Buffer[kBufferSize];
415
0
  size_t readCount, writeCount;
416
417
0
  utf8Str->erase();
418
0
  utf8Str->reserve ( 2*utf32Len );  // As good a guess as any.
419
  
420
0
  while ( utf32Len > 0 ) {
421
0
    Converter ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
422
0
    if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
423
0
    utf8Str->append ( (const char *)u8Buffer, writeCount );
424
0
    utf32In  += readCount;
425
0
    utf32Len -= readCount;
426
0
  }
427
428
0
}  // FromUTF32
429
430
// =================================================================================================
431
432
extern void FromUTF32Native ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str )
433
0
{
434
0
  enum { kBufferSize = 16*1024 };
435
0
  UTF8Unit u8Buffer[kBufferSize];
436
0
  size_t readCount, writeCount;
437
438
0
  utf8Str->erase();
439
0
  utf8Str->reserve ( 2*utf32Len );  // As good a guess as any.
440
  
441
0
  while ( utf32Len > 0 ) {
442
0
    UTF32Nat_to_UTF8 ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
443
0
    if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
444
0
    utf8Str->append ( (const char *)u8Buffer, writeCount );
445
0
    utf32In  += readCount;
446
0
    utf32Len -= readCount;
447
0
  }
448
449
0
}  // FromUTF32Native
450
451
// =================================================================================================
452
453
static void CodePoint_to_UTF8_Multi ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
454
0
{
455
0
  size_t unitCount = 0;
456
  
457
0
  if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
458
0
  if ( (0xD800 <= cpIn) && (cpIn <= 0xDFFF) ) UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
459
  
460
  // Compute the number of bytes using 6 data bits each. Then see if the highest order bits will
461
  // fit into the leading byte. Write the UTF-8 sequence if there is enough room.
462
  
463
0
  UTF32Unit temp, mask;
464
0
  size_t bytesNeeded = 0;
465
0
  for ( temp = cpIn; temp != 0; temp = temp >> 6 ) ++bytesNeeded;
466
  
467
0
  temp = cpIn >> ((bytesNeeded-1)*6); // The highest order data bits.
468
0
  mask = (0x80 >> bytesNeeded) - 1; // Available data bits in the leading byte.
469
0
  if ( temp > mask ) ++bytesNeeded;
470
471
0
  if ( bytesNeeded > utf8Len ) goto Done; // Not enough room for the output.
472
0
  unitCount = bytesNeeded;
473
  
474
0
  temp = cpIn;
475
0
  for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded ) {
476
0
    utf8Out[bytesNeeded] = 0x80 | UTF8Unit ( temp & 0x3F );
477
0
    temp = temp >> 6;
478
0
  }
479
  
480
0
  mask = ~((1 << (8-unitCount)) - 1);
481
0
  utf8Out[0] = UTF8Unit ( mask | temp );
482
483
0
Done:
484
0
  *utf8Written = unitCount;
485
0
  return;
486
  
487
0
}  // CodePoint_to_UTF8_Multi
488
489
// =================================================================================================
490
491
void CodePoint_to_UTF8 ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
492
0
{
493
0
  size_t unitCount = 0;
494
495
0
  UC_Assert ( (utf8Out != 0) && (utf8Written != 0) );
496
0
  if ( utf8Len == 0 ) goto Done;
497
0
  if ( cpIn > 0x7F ) goto MultiByte; // ! Force linear execution path for ASCII.
498
  
499
0
  if ( utf8Len == 0 ) goto Done;
500
0
  unitCount = 1;
501
0
  *utf8Out = UTF8Unit(cpIn);
502
503
0
Done:
504
0
  *utf8Written = unitCount;
505
0
  return;
506
  
507
0
MultiByte:
508
0
   CodePoint_to_UTF8_Multi( cpIn, utf8Out, utf8Len, utf8Written );
509
0
   return;
510
  
511
0
}  // CodePoint_to_UTF8
512
513
// =================================================================================================
514
515
static void CodePoint_from_UTF8_Multi ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
516
0
{
517
0
  UTF8Unit  inUnit = *utf8In;
518
0
  size_t    unitCount = 0;
519
0
  UTF32Unit cp; // ! Avoid gcc complaints about declarations after goto's.
520
0
  const UTF8Unit * utf8Pos;
521
522
  // -------------------------------------------------------------------------------------
523
  // We've got a multibyte UTF-8 character. The first byte has the number of bytes and the
524
  // highest order data bits. The other bytes each add 6 more data bits.
525
  
526
  #if 0 // This might be a more effcient way to count the bytes.
527
    static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
528
    size_t bytesNeeded = kByteCounts [ inUnit >> 4 ];
529
    if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((inUnit & 0x08) != 0)) ) {
530
      UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
531
    }
532
  #endif
533
534
0
  size_t bytesNeeded = 0; // Count the leading 1 bits in the first byte.
535
0
  for ( UTF8Unit temp = inUnit; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded;
536
    // *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC.
537
  
538
0
  if ( (bytesNeeded < 2) || (bytesNeeded > 4) ) UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
539
0
  if ( bytesNeeded > utf8Len ) goto Done; // Not enough input in this buffer.
540
0
  unitCount = bytesNeeded;
541
  
542
0
  cp = inUnit & ((1 << (7-unitCount)) - 1); // Isolate the initial data bits in the bottom of cp.
543
  
544
0
  utf8Pos = utf8In + 1; // We've absorbed the first byte.
545
0
  for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded, ++utf8Pos ) {
546
0
    inUnit = *utf8Pos;
547
0
    if ( (inUnit & UTF8Unit(0xC0)) != UTF8Unit(0x80) ) UC_Throw ( "Invalid UTF-8 data byte", kXMPErr_BadParam );
548
0
    cp = (cp << 6) | (inUnit & 0x3F);
549
0
  }
550
  
551
0
  if ( cp >= 0xD800 ) { // Skip the next comparisons most of the time.
552
0
    if ( (0xD800 <= cp) && (cp <= 0xDFFF) ) UC_Throw ( "Bad UTF-8 - surrogate code point", kXMPErr_BadParam );
553
0
    if ( cp > 0x10FFFF ) UC_Throw ( "Bad UTF-8 - out of range", kXMPErr_BadParam );
554
0
  }
555
  
556
0
  *cpOut = cp;  // ! Don't put after Done, don't write if no input.
557
  
558
0
Done: 
559
0
  *utf8Read = unitCount;
560
0
  return;
561
  
562
0
}  // CodePoint_from_UTF8_Multi
563
564
// =================================================================================================
565
566
void CodePoint_from_UTF8 ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
567
0
{
568
0
  UTF8Unit inUnit;  // ! Don't read until we know there is input.
569
0
  size_t unitCount = 0;
570
571
0
  UC_Assert ( (utf8In != 0) && (cpOut != 0) && (utf8Read != 0) );
572
0
  if ( utf8Len == 0 ) goto Done;
573
0
  inUnit = *utf8In;
574
0
  if ( inUnit >= 0x80 ) goto MultiByte; // ! Force linear execution path for ASCII.
575
  
576
0
  unitCount = 1;
577
0
  *cpOut = inUnit;  // ! Don't put after Done, don't write if no input.
578
  
579
0
Done: 
580
0
  *utf8Read = unitCount;
581
0
  return;
582
583
0
MultiByte:
584
0
  CodePoint_from_UTF8_Multi ( utf8In, utf8Len, cpOut, utf8Read );
585
0
  return;
586
  
587
0
}  // CodePoint_from_UTF8
588
589
// =================================================================================================
590
591
static void CodePoint_to_UTF16Nat_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
592
0
{
593
0
  size_t    unitCount = 0;
594
0
  UTF32Unit temp; // ! Avoid gcc complaints about declarations after goto's.
595
596
0
  if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
597
0
  if ( utf16Len < 2 ) goto Done; // Not enough room for the output.
598
599
0
  unitCount = 2;
600
0
  temp = cpIn - 0x10000;
601
0
  utf16Out[0] = 0xD800 | UTF16Unit ( temp >> 10 );
602
0
  utf16Out[1] = 0xDC00 | UTF16Unit ( temp & 0x3FF );
603
  
604
0
Done:
605
0
  *utf16Written = unitCount;
606
0
  return;
607
  
608
0
}  // CodePoint_to_UTF16Nat_Surrogate
609
610
// =================================================================================================
611
612
static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
613
0
{
614
0
  size_t unitCount = 0;
615
616
0
  UC_Assert ( (utf16Out != 0) && (utf16Written != 0) ); 
617
0
  if ( utf16Len == 0 ) goto Done;
618
0
  if ( cpIn >= 0xD800 ) goto CheckSurrogate; // ! Force linear execution path for the BMP.
619
620
0
InBMP:  
621
0
  unitCount = 1;
622
0
  *utf16Out = UTF16Unit(cpIn);
623
  
624
0
Done:
625
0
  *utf16Written = unitCount;
626
0
  return;
627
628
0
CheckSurrogate:
629
0
  if ( cpIn > 0xFFFF ) goto SurrogatePair;
630
0
  if ( cpIn > 0xDFFF ) goto InBMP;
631
0
  UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
632
  
633
0
SurrogatePair:
634
0
  CodePoint_to_UTF16Nat_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
635
0
  return;
636
  
637
0
}  // CodePoint_to_UTF16Nat
638
639
// =================================================================================================
640
641
static void CodePoint_from_UTF16Nat_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
642
0
{
643
0
  UTF16Unit hiUnit = *utf16In;
644
0
  size_t    unitCount = 0;
645
0
  UTF16Unit loUnit; // ! Avoid gcc complaints about declarations after goto's.
646
0
  UTF32Unit cp;
647
648
  // ----------------------------------
649
  // We've got a UTF-16 surrogate pair.
650
651
0
  if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
652
0
  if ( utf16Len < 2 ) goto Done; // Not enough input in this buffer.
653
  
654
0
  loUnit  = *(utf16In+1);
655
0
  if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
656
  
657
0
  unitCount = 2;
658
0
  cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
659
660
0
  *cpOut = cp;  // ! Don't put after Done, don't write if no input.
661
  
662
0
Done:
663
0
  *utf16Read = unitCount;
664
0
  return;
665
  
666
0
}  // CodePoint_from_UTF16Nat_Surrogate
667
668
// =================================================================================================
669
670
static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
671
0
{
672
0
  UTF16Unit inUnit; // ! Don't read until we know there is input.
673
0
  size_t unitCount = 0;
674
675
0
  UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
676
0
  if ( utf16Len == 0 ) goto Done;
677
0
  inUnit = *utf16In;
678
0
  if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair; // ! Force linear execution path for the BMP.
679
680
0
  unitCount = 1;
681
0
  *cpOut = inUnit;  // ! Don't put after Done, don't write if no input.
682
  
683
0
Done:
684
0
  *utf16Read = unitCount;
685
0
  return;
686
687
0
SurrogatePair:
688
0
  CodePoint_from_UTF16Nat_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
689
0
  return;
690
  
691
0
}  // CodePoint_from_UTF16Nat
692
693
// =================================================================================================
694
695
static void UTF8_to_UTF16Nat ( const UTF8Unit * utf8In,   const size_t utf8Len,
696
                       UTF16Unit *      utf16Out, const size_t utf16Len,
697
                       size_t *         utf8Read, size_t *     utf16Written )
698
0
{
699
0
  const UTF8Unit * utf8Pos  = utf8In;
700
0
  UTF16Unit *      utf16Pos = utf16Out;
701
  
702
0
  size_t utf8Left  = utf8Len;
703
0
  size_t utf16Left = utf16Len;
704
  
705
0
  UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
706
  
707
0
  while ( (utf8Left > 0) && (utf16Left > 0) ) {
708
  
709
    // Do a run of ASCII, it copies 1 input unit into 1 output unit.
710
0
    size_t i, limit = utf8Left;
711
0
    if ( limit > utf16Left ) limit = utf16Left;
712
0
    for ( i = 0; i < limit; ++i ) {
713
0
      UTF8Unit inUnit = *utf8Pos;
714
0
      if ( inUnit > 0x7F ) break;
715
0
      *utf16Pos = inUnit;
716
0
      ++utf8Pos;
717
0
      ++utf16Pos;
718
0
    }
719
0
    utf8Left  -= i;
720
0
    utf16Left -= i;
721
    
722
    // Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
723
0
    while ( (utf8Left > 0) && (utf16Left > 0) ) {
724
0
      UTF32Unit cp;
725
0
      size_t len8, len16;
726
0
      UTF8Unit inUnit = *utf8Pos;
727
0
      if ( inUnit <= 0x7F ) break;
728
0
      CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
729
0
      if ( len8 == 0 ) goto Done;   // The input buffer ends in the middle of a character.
730
0
      if ( cp <= 0xFFFF ) {
731
0
        *utf16Pos = UTF16Unit(cp);
732
0
        len16 = 1;
733
0
      } else {
734
0
        CodePoint_to_UTF16Nat_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
735
0
        if ( len16 == 0 ) goto Done; // Not enough room in the output buffer.
736
0
      }
737
0
      utf8Left  -= len8;
738
0
      utf8Pos   += len8;
739
0
      utf16Left -= len16;
740
0
      utf16Pos  += len16;
741
0
    }
742
  
743
0
  }
744
745
0
Done: // Set the output lengths.
746
0
  *utf8Read = utf8Len - utf8Left;
747
0
  *utf16Written = utf16Len - utf16Left;
748
  
749
0
}  // UTF8_to_UTF16Nat
750
751
// =================================================================================================
752
753
static void UTF8_to_UTF32Nat ( const UTF8Unit *  utf8In,   const size_t utf8Len,
754
                       UTF32Unit *       utf32Out, const size_t utf32Len,
755
                       size_t *          utf8Read, size_t *     utf32Written )
756
0
{
757
0
  const UTF8Unit * utf8Pos  = utf8In;
758
0
  UTF32Unit *      utf32Pos = utf32Out;
759
  
760
0
  size_t utf8Left  = utf8Len;
761
0
  size_t utf32Left = utf32Len;
762
  
763
0
  UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
764
  
765
0
  while ( (utf8Left > 0) && (utf32Left > 0) ) {
766
  
767
    // Do a run of ASCII, it copies 1 input unit into 1 output unit.
768
0
    size_t i, limit = utf8Left;
769
0
    if ( limit > utf32Left ) limit = utf32Left;
770
0
    for ( i = 0; i < limit; ++i ) {
771
0
      UTF8Unit inUnit = *utf8Pos;
772
0
      if ( inUnit > 0x7F ) break;
773
0
      *utf32Pos = inUnit;
774
0
      ++utf8Pos;
775
0
      ++utf32Pos;
776
0
    }
777
0
    utf8Left -= i;
778
0
    utf32Left -= i;
779
    
780
    // Do a run of non-ASCII, it copies variable input into 1 output unit.
781
0
    while ( (utf8Left > 0) && (utf32Left > 0) ) {
782
0
      size_t len;
783
0
      UTF8Unit inUnit = *utf8Pos;
784
0
      if ( inUnit <= 0x7F ) break;
785
0
      CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, utf32Pos, &len );
786
0
      if ( len == 0 ) goto Done; // The input buffer ends in the middle of a character.
787
0
      utf8Left  -= len;
788
0
      utf8Pos   += len;
789
0
      utf32Left -= 1;
790
0
      utf32Pos  += 1;
791
0
    }
792
  
793
0
  }
794
  
795
0
Done: // Set the output lengths.
796
0
  *utf8Read = utf8Len - utf8Left;
797
0
  *utf32Written = utf32Len - utf32Left;
798
  
799
0
}  // UTF8_to_UTF32Nat
800
801
// =================================================================================================
802
803
static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
804
                       UTF8Unit *        utf8Out,   const size_t utf8Len,
805
                       size_t *          utf16Read, size_t *     utf8Written )
806
0
{
807
0
  const UTF16Unit * utf16Pos = utf16In;
808
0
  UTF8Unit *        utf8Pos  = utf8Out;
809
  
810
0
  size_t utf16Left = utf16Len;
811
0
  size_t utf8Left  = utf8Len;
812
  
813
0
  UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
814
  
815
0
  while ( (utf16Left > 0) && (utf8Left > 0) ) {
816
  
817
    // Do a run of ASCII, it copies 1 input unit into 1 output unit.
818
0
    size_t i, limit = utf16Left;
819
0
    if ( limit > utf8Left ) limit = utf8Left;
820
0
    for ( i = 0; i < limit; ++i ) {
821
0
      UTF16Unit inUnit = *utf16Pos;
822
0
      if ( inUnit > 0x7F ) break;
823
0
      *utf8Pos = UTF8Unit(inUnit);
824
0
      ++utf16Pos;
825
0
      ++utf8Pos;
826
0
    }
827
0
    utf16Left -= i;
828
0
    utf8Left  -= i;
829
    
830
    // Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
831
0
    while ( (utf16Left > 0) && (utf8Left > 0) ) {
832
0
      size_t len8;
833
0
      UTF16Unit inUnit = *utf16Pos;
834
0
      if ( inUnit <= 0x7F ) break;
835
0
      if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
836
0
      CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
837
0
      if ( len8 == 0 ) goto Done;   // Not enough room in the output buffer.
838
0
      utf16Left -= 1;
839
0
      utf16Pos  += 1;
840
0
      utf8Left  -= len8;
841
0
      utf8Pos   += len8;
842
0
    }
843
    
844
    // Do a run of surrogate pairs, it copies 2 input units into multiple output units.
845
0
    while ( (utf16Left > 0) && (utf8Left > 0) ) {
846
0
      UTF32Unit cp;
847
0
      size_t len16, len8;
848
0
      UTF16Unit inUnit = *utf16Pos;
849
0
      if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
850
0
      CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
851
0
      if ( len16 == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
852
0
      UC_Assert ( len16 == 2 );
853
0
      CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
854
0
      if ( len8 == 0 ) goto Done;   // Not enough room in the output buffer.
855
0
      utf16Left -= len16;
856
0
      utf16Pos  += len16;
857
0
      utf8Left  -= len8;
858
0
      utf8Pos   += len8;
859
0
    }
860
  
861
0
  }
862
  
863
0
Done: // Set the output lengths.
864
0
  *utf16Read = utf16Len - utf16Left;
865
0
  *utf8Written = utf8Len - utf8Left;
866
  
867
0
}  // UTF16Nat_to_UTF8
868
869
// =================================================================================================
870
871
static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
872
                       UTF8Unit *        utf8Out,   const size_t utf8Len,
873
                       size_t *          utf32Read, size_t *     utf8Written )
874
0
{
875
0
  const UTF32Unit * utf32Pos = utf32In;
876
0
  UTF8Unit *        utf8Pos  = utf8Out;
877
  
878
0
  size_t utf32Left = utf32Len;
879
0
  size_t utf8Left  = utf8Len;
880
  
881
0
  UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
882
  
883
0
  while ( (utf32Left > 0) && (utf8Left > 0) ) {
884
  
885
    // Do a run of ASCII, it copies 1 input unit into 1 output unit.
886
0
    size_t i, limit = utf32Left;
887
0
    if ( limit > utf8Left ) limit = utf8Left;
888
0
    for ( i = 0; i < limit; ++i ) {
889
0
      UTF32Unit inUnit = *utf32Pos;
890
0
      if ( inUnit > 0x7F ) break;
891
0
      *utf8Pos = UTF8Unit(inUnit);
892
0
      ++utf32Pos;
893
0
      ++utf8Pos;
894
0
    }
895
0
    utf32Left -= i;
896
0
    utf8Left  -= i;
897
    
898
    // Do a run of non-ASCII, it copies 1 input unit into multiple output units.
899
0
    while ( (utf32Left > 0) && (utf8Left > 0) ) {
900
0
      size_t len;
901
0
      UTF32Unit inUnit = *utf32Pos;
902
0
      if ( inUnit <= 0x7F ) break;
903
0
      CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len );
904
0
      if ( len == 0 ) goto Done; // Not enough room in the output buffer.
905
0
      utf32Left -= 1;
906
0
      utf32Pos  += 1;
907
0
      utf8Left  -= len;
908
0
      utf8Pos   += len;
909
0
    }
910
  
911
0
  }
912
  
913
0
Done: // Set the output lengths.
914
0
  *utf32Read = utf32Len - utf32Left;
915
0
  *utf8Written = utf8Len - utf8Left;
916
  
917
0
}  // UTF32Nat_to_UTF8
918
919
// =================================================================================================
920
921
static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
922
                           UTF32Unit *       utf32Out,  const size_t utf32Len,
923
                           size_t *          utf16Read, size_t *     utf32Written )
924
0
{
925
0
  const UTF16Unit * utf16Pos = utf16In;
926
0
  UTF32Unit *       utf32Pos = utf32Out;
927
  
928
0
  size_t utf16Left = utf16Len;
929
0
  size_t utf32Left = utf32Len;
930
  
931
0
  UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
932
  
933
0
  while ( (utf16Left > 0) && (utf32Left > 0) ) {
934
  
935
    // Do a run of BMP, it copies 1 input unit into 1 output unit.
936
0
    size_t i, limit = utf16Left;
937
0
    if ( limit > utf32Left ) limit = utf32Left;
938
0
    for ( i = 0; i < limit; ++i ) {
939
0
      UTF16Unit inUnit = *utf16Pos;
940
0
      if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
941
0
      *utf32Pos = inUnit;
942
0
      ++utf16Pos;
943
0
      ++utf32Pos;
944
0
    }
945
0
    utf16Left -= i;
946
0
    utf32Left -= i;
947
    
948
    // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
949
0
    while ( (utf16Left > 0) && (utf32Left > 0) ) {
950
0
      size_t len;
951
0
      UTF16Unit inUnit = *utf16Pos;
952
0
      if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
953
0
      CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
954
0
      if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
955
0
      UC_Assert ( len == 2 );
956
0
      utf16Left -= len;
957
0
      utf16Pos  += len;
958
0
      utf32Left -= 1;
959
0
      utf32Pos  += 1;
960
0
    }
961
  
962
0
  }
963
  
964
0
Done: // Set the output lengths.
965
0
  *utf16Read = utf16Len - utf16Left;
966
0
  *utf32Written = utf32Len - utf32Left;
967
  
968
0
}  // UTF16Nat_to_UTF32Nat
969
970
// =================================================================================================
971
972
static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
973
                           UTF16Unit *       utf16Out,  const size_t utf16Len,
974
                           size_t *          utf32Read, size_t *     utf16Written )
975
0
{
976
0
  const UTF32Unit * utf32Pos = utf32In;
977
0
  UTF16Unit *       utf16Pos = utf16Out;
978
  
979
0
  size_t utf32Left = utf32Len;
980
0
  size_t utf16Left = utf16Len;
981
  
982
0
  UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
983
  
984
0
  while ( (utf32Left > 0) && (utf16Left > 0) ) {
985
  
986
    // Do a run of BMP, it copies 1 input unit into 1 output unit.
987
0
    size_t i, limit = utf32Left;
988
0
    if ( limit > utf16Left ) limit = utf16Left;
989
0
    for ( i = 0; i < limit; ++i ) {
990
0
      UTF32Unit inUnit = *utf32Pos;
991
0
      if ( inUnit > 0xFFFF ) break;
992
0
      *utf16Pos = UTF16Unit(inUnit);
993
0
      ++utf32Pos;
994
0
      ++utf16Pos;
995
0
    }
996
0
    utf32Left -= i;
997
0
    utf16Left -= i;
998
    
999
    // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1000
0
    while ( (utf32Left > 0) && (utf16Left > 0) ) {
1001
0
      size_t len;
1002
0
      UTF32Unit inUnit = *utf32Pos;
1003
0
      if ( inUnit <= 0xFFFF ) break;
1004
0
      CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1005
0
      if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1006
0
      UC_Assert ( len == 2 );
1007
0
      utf32Left -= 1;
1008
0
      utf32Pos  += 1;
1009
0
      utf16Left -= 2;
1010
0
      utf16Pos  += 2;
1011
0
    }
1012
  
1013
0
  }
1014
  
1015
0
Done: // Set the output lengths.
1016
0
  *utf32Read = utf32Len - utf32Left;
1017
0
  *utf16Written = utf16Len - utf16Left;
1018
  
1019
0
}  // UTF32Nat_to_UTF16Nat
1020
1021
// =================================================================================================
1022
1023
static void CodePoint_to_UTF16Swp_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1024
0
{
1025
0
  size_t unitCount = 0;
1026
0
  UTF32Unit temp; // ! Avoid gcc complaints about declarations after goto's.
1027
1028
0
  if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
1029
0
  if ( utf16Len < 2 ) goto Done; // Not enough room for the output.
1030
1031
0
  unitCount = 2;
1032
0
  temp = cpIn - 0x10000;
1033
0
  UTF16OutSwap ( &utf16Out[0], (0xD800 | UTF16Unit ( temp >> 10 )) );
1034
0
  UTF16OutSwap ( &utf16Out[1], (0xDC00 | UTF16Unit ( temp & 0x3FF)) );
1035
  
1036
0
Done:
1037
0
  *utf16Written = unitCount;
1038
0
  return;
1039
  
1040
0
}  // CodePoint_to_UTF16Swp_Surrogate
1041
1042
// =================================================================================================
1043
1044
static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1045
0
{
1046
0
  size_t unitCount = 0;
1047
1048
0
  UC_Assert ( (utf16Out != 0) && (utf16Written != 0) ); 
1049
0
  if ( utf16Len == 0 ) goto Done;
1050
0
  if ( cpIn >= 0xD800 ) goto CheckSurrogate; // ! Force linear execution path for the BMP.
1051
1052
0
InBMP:  
1053
0
  unitCount = 1;
1054
0
  UTF16OutSwap ( utf16Out, UTF16Unit(cpIn) );
1055
  
1056
0
Done:
1057
0
  *utf16Written = unitCount;
1058
0
  return;
1059
1060
0
CheckSurrogate:
1061
0
  if ( cpIn > 0xFFFF ) goto SurrogatePair;
1062
0
  if ( cpIn > 0xDFFF ) goto InBMP;
1063
0
  UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
1064
  
1065
0
SurrogatePair:
1066
0
  CodePoint_to_UTF16Swp_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
1067
0
  return;
1068
  
1069
0
}  // CodePoint_to_UTF16Swp
1070
1071
// =================================================================================================
1072
1073
static void CodePoint_from_UTF16Swp_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1074
0
{
1075
0
  UTF16Unit hiUnit = UTF16InSwap(utf16In);
1076
0
  size_t unitCount = 0;
1077
0
  UTF16Unit loUnit; // ! Avoid gcc complaints about declarations after goto's.
1078
0
  UTF32Unit cp;
1079
1080
  // ----------------------------------
1081
  // We've got a UTF-16 surrogate pair.
1082
1083
0
  if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
1084
0
  if ( utf16Len < 2 ) goto Done; // Not enough input in this buffer.
1085
  
1086
0
  loUnit  = UTF16InSwap(utf16In+1);
1087
0
  if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
1088
  
1089
0
  unitCount = 2;
1090
0
  cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
1091
1092
0
  *cpOut = cp;  // ! Don't put after Done, don't write if no input.
1093
  
1094
0
Done:
1095
0
  *utf16Read = unitCount;
1096
0
  return;
1097
  
1098
0
}  // CodePoint_from_UTF16Swp_Surrogate
1099
1100
// =================================================================================================
1101
1102
static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1103
0
{
1104
0
  UTF16Unit inUnit; // ! Don't read until we know there is input.
1105
0
  size_t unitCount = 0;
1106
1107
0
  UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
1108
0
  if ( utf16Len == 0 ) goto Done;
1109
0
  inUnit = UTF16InSwap(utf16In);
1110
0
  if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair; // ! Force linear execution path for the BMP.
1111
1112
0
  unitCount = 1;
1113
0
  *cpOut = inUnit;  // ! Don't put after Done, don't write if no input.
1114
  
1115
0
Done:
1116
0
  *utf16Read = unitCount;
1117
0
  return;
1118
1119
0
SurrogatePair:
1120
0
  CodePoint_from_UTF16Swp_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
1121
0
  return;
1122
  
1123
0
}  // CodePoint_from_UTF16Swp
1124
1125
// =================================================================================================
1126
1127
static void UTF8_to_UTF16Swp ( const UTF8Unit * utf8In,   const size_t utf8Len,
1128
                       UTF16Unit *      utf16Out, const size_t utf16Len,
1129
                       size_t *         utf8Read, size_t *     utf16Written )
1130
0
{
1131
0
  const UTF8Unit * utf8Pos  = utf8In;
1132
0
  UTF16Unit *      utf16Pos = utf16Out;
1133
  
1134
0
  size_t utf8Left  = utf8Len;
1135
0
  size_t utf16Left = utf16Len;
1136
  
1137
0
  UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
1138
  
1139
0
  while ( (utf8Left > 0) && (utf16Left > 0) ) {
1140
  
1141
    // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1142
0
    size_t i, limit = utf8Left;
1143
0
    if ( limit > utf16Left ) limit = utf16Left;
1144
0
    for ( i = 0; i < limit; ++i ) {
1145
0
      UTF8Unit inUnit = *utf8Pos;
1146
0
      if ( inUnit > 0x7F ) break;
1147
0
      *utf16Pos = UTF16Unit(inUnit) << 8; // Better than: UTF16OutSwap ( utf16Pos, inUnit );
1148
0
      ++utf8Pos;
1149
0
      ++utf16Pos;
1150
0
    }
1151
0
    utf8Left  -= i;
1152
0
    utf16Left -= i;
1153
    
1154
    // Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
1155
0
    while ( (utf8Left > 0) && (utf16Left > 0) ) {
1156
0
      UTF32Unit cp;
1157
0
      size_t len8, len16;
1158
0
      UTF8Unit inUnit = *utf8Pos;
1159
0
      if ( inUnit <= 0x7F ) break;
1160
0
      CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
1161
0
      if ( len8 == 0 ) goto Done;   // The input buffer ends in the middle of a character.
1162
0
      if ( cp <= 0xFFFF ) {
1163
0
        UTF16OutSwap ( utf16Pos, UTF16Unit(cp) );
1164
0
        len16 = 1;
1165
0
      } else {
1166
0
        CodePoint_to_UTF16Swp_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
1167
0
        if ( len16 == 0 ) goto Done; // Not enough room in the output buffer.
1168
0
      }
1169
0
      utf8Left  -= len8;
1170
0
      utf8Pos   += len8;
1171
0
      utf16Left -= len16;
1172
0
      utf16Pos  += len16;
1173
0
    }
1174
  
1175
0
  }
1176
1177
0
Done: // Set the output lengths.
1178
0
  *utf8Read = utf8Len - utf8Left;
1179
0
  *utf16Written = utf16Len - utf16Left;
1180
  
1181
0
}  // UTF8_to_UTF16Swp
1182
1183
// =================================================================================================
1184
1185
static void UTF8_to_UTF32Swp ( const UTF8Unit *  utf8In,   const size_t utf8Len,
1186
                       UTF32Unit *       utf32Out, const size_t utf32Len,
1187
                       size_t *          utf8Read, size_t *     utf32Written )
1188
0
{
1189
0
  const UTF8Unit * utf8Pos  = utf8In;
1190
0
  UTF32Unit *      utf32Pos = utf32Out;
1191
  
1192
0
  size_t utf8Left  = utf8Len;
1193
0
  size_t utf32Left = utf32Len;
1194
  
1195
0
  UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
1196
  
1197
0
  while ( (utf8Left > 0) && (utf32Left > 0) ) {
1198
  
1199
    // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1200
0
    size_t i, limit = utf8Left;
1201
0
    if ( limit > utf32Left ) limit = utf32Left;
1202
0
    for ( i = 0; i < limit; ++i ) {
1203
0
      UTF8Unit inUnit = *utf8Pos;
1204
0
      if ( inUnit > 0x7F ) break;
1205
0
      *utf32Pos = UTF32Unit(inUnit) << 24;  // Better than: UTF32OutSwap ( utf32Pos, inUnit );
1206
0
      ++utf8Pos;
1207
0
      ++utf32Pos;
1208
0
    }
1209
0
    utf8Left -= i;
1210
0
    utf32Left -= i;
1211
    
1212
    // Do a run of non-ASCII, it copies variable input into 1 output unit.
1213
0
    while ( (utf8Left > 0) && (utf32Left > 0) ) {
1214
0
      size_t len;
1215
0
      UTF32Unit cp;
1216
0
      UTF8Unit inUnit = *utf8Pos;
1217
0
      if ( inUnit <= 0x7F ) break;
1218
0
      CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len );
1219
0
      if ( len == 0 ) goto Done; // The input buffer ends in the middle of a character.
1220
0
      UTF32OutSwap ( utf32Pos, cp );
1221
0
      utf8Left  -= len;
1222
0
      utf8Pos   += len;
1223
0
      utf32Left -= 1;
1224
0
      utf32Pos  += 1;
1225
0
    }
1226
  
1227
0
  }
1228
  
1229
0
Done: // Set the output lengths.
1230
0
  *utf8Read = utf8Len - utf8Left;
1231
0
  *utf32Written = utf32Len - utf32Left;
1232
  
1233
0
}  // UTF8_to_UTF32Swp
1234
1235
// =================================================================================================
1236
1237
static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
1238
                       UTF8Unit *        utf8Out,   const size_t utf8Len,
1239
                       size_t *          utf16Read, size_t *     utf8Written )
1240
0
{
1241
0
  const UTF16Unit * utf16Pos = utf16In;
1242
0
  UTF8Unit *        utf8Pos  = utf8Out;
1243
  
1244
0
  size_t utf16Left = utf16Len;
1245
0
  size_t utf8Left  = utf8Len;
1246
  
1247
0
  UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
1248
  
1249
0
  while ( (utf16Left > 0) && (utf8Left > 0) ) {
1250
  
1251
    // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1252
0
    size_t i, limit = utf16Left;
1253
0
    if ( limit > utf8Left ) limit = utf8Left;
1254
0
    for ( i = 0; i < limit; ++i ) {
1255
0
      UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1256
0
      if ( inUnit > 0x7F ) break;
1257
0
      *utf8Pos = UTF8Unit(inUnit);
1258
0
      ++utf16Pos;
1259
0
      ++utf8Pos;
1260
0
    }
1261
0
    utf16Left -= i;
1262
0
    utf8Left  -= i;
1263
    
1264
    // Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
1265
0
    while ( (utf16Left > 0) && (utf8Left > 0) ) {
1266
0
      size_t len8;
1267
0
      UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1268
0
      if ( inUnit <= 0x7F ) break;
1269
0
      if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1270
0
      CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
1271
0
      if ( len8 == 0 ) goto Done;   // Not enough room in the output buffer.
1272
0
      utf16Left -= 1;
1273
0
      utf16Pos  += 1;
1274
0
      utf8Left  -= len8;
1275
0
      utf8Pos   += len8;
1276
0
    }
1277
    
1278
    // Do a run of surrogate pairs, it copies 2 input units into multiple output units.
1279
0
    while ( (utf16Left > 0) && (utf8Left > 0) ) {
1280
0
      UTF32Unit cp;
1281
0
      size_t len16, len8;
1282
0
      UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1283
0
      if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1284
0
      CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
1285
0
      if ( len16 == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1286
0
      UC_Assert ( len16 == 2 );
1287
0
      CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
1288
0
      if ( len8 == 0 ) goto Done;   // Not enough room in the output buffer.
1289
0
      utf16Left -= len16;
1290
0
      utf16Pos  += len16;
1291
0
      utf8Left  -= len8;
1292
0
      utf8Pos   += len8;
1293
0
    }
1294
  
1295
0
  }
1296
  
1297
0
Done: // Set the output lengths.
1298
0
  *utf16Read = utf16Len - utf16Left;
1299
0
  *utf8Written = utf8Len - utf8Left;
1300
  
1301
0
}  // UTF16Swp_to_UTF8
1302
1303
// =================================================================================================
1304
1305
static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
1306
                       UTF8Unit *        utf8Out,   const size_t utf8Len,
1307
                       size_t *          utf32Read, size_t *     utf8Written )
1308
0
{
1309
0
  const UTF32Unit * utf32Pos = utf32In;
1310
0
  UTF8Unit *        utf8Pos  = utf8Out;
1311
  
1312
0
  size_t utf32Left = utf32Len;
1313
0
  size_t utf8Left  = utf8Len;
1314
  
1315
0
  UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
1316
  
1317
0
  while ( (utf32Left > 0) && (utf8Left > 0) ) {
1318
  
1319
    // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1320
0
    size_t i, limit = utf32Left;
1321
0
    if ( limit > utf8Left ) limit = utf8Left;
1322
0
    for ( i = 0; i < limit; ++i ) {
1323
0
      UTF32Unit cp = UTF32InSwap(utf32Pos);
1324
0
      if ( cp > 0x7F ) break;
1325
0
      *utf8Pos = UTF8Unit(cp);
1326
0
      ++utf32Pos;
1327
0
      ++utf8Pos;
1328
0
    }
1329
0
    utf32Left -= i;
1330
0
    utf8Left  -= i;
1331
    
1332
    // Do a run of non-ASCII, it copies 1 input unit into multiple output units.
1333
0
    while ( (utf32Left > 0) && (utf8Left > 0) ) {
1334
0
      size_t len;
1335
0
      UTF32Unit cp = UTF32InSwap(utf32Pos);
1336
0
      if ( cp <= 0x7F ) break;
1337
0
      CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len );
1338
0
      if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1339
0
      utf32Left -= 1;
1340
0
      utf32Pos  += 1;
1341
0
      utf8Left  -= len;
1342
0
      utf8Pos   += len;
1343
0
    }
1344
  
1345
0
  }
1346
  
1347
0
Done: // Set the output lengths.
1348
0
  *utf32Read = utf32Len - utf32Left;
1349
0
  *utf8Written = utf8Len - utf8Left;
1350
  
1351
0
}  // UTF32Swp_to_UTF8
1352
1353
// =================================================================================================
1354
1355
static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
1356
                           UTF32Unit *       utf32Out,  const size_t utf32Len,
1357
                           size_t *          utf16Read, size_t *     utf32Written )
1358
0
{
1359
0
  const UTF16Unit * utf16Pos = utf16In;
1360
0
  UTF32Unit *       utf32Pos = utf32Out;
1361
  
1362
0
  size_t utf16Left = utf16Len;
1363
0
  size_t utf32Left = utf32Len;
1364
  
1365
0
  UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1366
  
1367
0
  while ( (utf16Left > 0) && (utf32Left > 0) ) {
1368
  
1369
    // Do a run of BMP, it copies 1 input unit into 1 output unit.
1370
0
    size_t i, limit = utf16Left;
1371
0
    if ( limit > utf32Left ) limit = utf32Left;
1372
0
    for ( i = 0; i < limit; ++i ) {
1373
0
      UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1374
0
      if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1375
0
      *utf32Pos = UTF32Unit(*utf16Pos) << 16; // Better than: UTF32OutSwap ( utf32Pos, inUnit );
1376
0
      ++utf16Pos;
1377
0
      ++utf32Pos;
1378
0
    }
1379
0
    utf16Left -= i;
1380
0
    utf32Left -= i;
1381
    
1382
    // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1383
0
    while ( (utf16Left > 0) && (utf32Left > 0) ) {
1384
0
      size_t len;
1385
0
      UTF32Unit cp;
1386
0
      UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1387
0
      if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1388
0
      CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1389
0
      if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1390
0
      UTF32OutSwap ( utf32Pos, cp );
1391
0
      UC_Assert ( len == 2 );
1392
0
      utf16Left -= len;
1393
0
      utf16Pos  += len;
1394
0
      utf32Left -= 1;
1395
0
      utf32Pos  += 1;
1396
0
    }
1397
  
1398
0
  }
1399
  
1400
0
Done: // Set the output lengths.
1401
0
  *utf16Read = utf16Len - utf16Left;
1402
0
  *utf32Written = utf32Len - utf32Left;
1403
  
1404
0
}  // UTF16Swp_to_UTF32Swp
1405
1406
// =================================================================================================
1407
1408
static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
1409
                           UTF16Unit *       utf16Out,  const size_t utf16Len,
1410
                           size_t *          utf32Read, size_t *     utf16Written )
1411
0
{
1412
0
  const UTF32Unit * utf32Pos = utf32In;
1413
0
  UTF16Unit *       utf16Pos = utf16Out;
1414
  
1415
0
  size_t utf32Left = utf32Len;
1416
0
  size_t utf16Left = utf16Len;
1417
  
1418
0
  const size_t k32to16Offset = swap32to16Offset;  // ! Make sure compiler treats as an invariant.
1419
  
1420
0
  UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1421
  
1422
0
  while ( (utf32Left > 0) && (utf16Left > 0) ) {
1423
  
1424
    // Do a run of BMP, it copies 1 input unit into 1 output unit.
1425
0
    size_t i, limit = utf32Left;
1426
0
    if ( limit > utf16Left ) limit = utf16Left;
1427
0
    for ( i = 0; i < limit; ++i ) {
1428
0
      UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1429
0
      if ( inUnit > 0xFFFF ) break;
1430
0
      *utf16Pos = *(((UTF16Unit*)utf32Pos) + k32to16Offset);  // Better than: UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1431
0
      ++utf32Pos;
1432
0
      ++utf16Pos;
1433
0
    }
1434
0
    utf32Left -= i;
1435
0
    utf16Left -= i;
1436
    
1437
    // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1438
0
    while ( (utf32Left > 0) && (utf16Left > 0) ) {
1439
0
      size_t len;
1440
0
      UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1441
0
      if ( inUnit <= 0xFFFF ) break;
1442
0
      CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1443
0
      if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1444
0
      UC_Assert ( len == 2 );
1445
0
      utf32Left -= 1;
1446
0
      utf32Pos  += 1;
1447
0
      utf16Left -= 2;
1448
0
      utf16Pos  += 2;
1449
0
    }
1450
  
1451
0
  }
1452
  
1453
0
Done: // Set the output lengths.
1454
0
  *utf32Read = utf32Len - utf32Left;
1455
0
  *utf16Written = utf16Len - utf16Left;
1456
  
1457
0
}  // UTF32Swp_to_UTF16Swp
1458
1459
// =================================================================================================
1460
1461
static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
1462
                           UTF32Unit *       utf32Out,  const size_t utf32Len,
1463
                           size_t *          utf16Read, size_t *     utf32Written )
1464
0
{
1465
0
  const UTF16Unit * utf16Pos = utf16In;
1466
0
  UTF32Unit *       utf32Pos = utf32Out;
1467
  
1468
0
  size_t utf16Left = utf16Len;
1469
0
  size_t utf32Left = utf32Len;
1470
  
1471
0
  UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1472
  
1473
0
  while ( (utf16Left > 0) && (utf32Left > 0) ) {
1474
  
1475
    // Do a run of BMP, it copies 1 input unit into 1 output unit.
1476
0
    size_t i, limit = utf16Left;
1477
0
    if ( limit > utf32Left ) limit = utf32Left;
1478
0
    for ( i = 0; i < limit; ++i ) {
1479
0
      UTF16Unit inUnit = *utf16Pos;
1480
0
      if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1481
0
      UTF32OutSwap ( utf32Pos, inUnit );
1482
0
      ++utf16Pos;
1483
0
      ++utf32Pos;
1484
0
    }
1485
0
    utf16Left -= i;
1486
0
    utf32Left -= i;
1487
    
1488
    // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1489
0
    while ( (utf16Left > 0) && (utf32Left > 0) ) {
1490
0
      size_t len;
1491
0
      UTF32Unit cp;
1492
0
      UTF16Unit inUnit = *utf16Pos;
1493
0
      if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1494
0
      CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1495
0
      if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1496
0
      UC_Assert ( len == 2 );
1497
0
      UTF32OutSwap ( utf32Pos, cp );
1498
0
      utf16Left -= len;
1499
0
      utf16Pos  += len;
1500
0
      utf32Left -= 1;
1501
0
      utf32Pos  += 1;
1502
0
    }
1503
  
1504
0
  }
1505
  
1506
0
Done: // Set the output lengths.
1507
0
  *utf16Read = utf16Len - utf16Left;
1508
0
  *utf32Written = utf32Len - utf32Left;
1509
  
1510
0
}  // UTF16Nat_to_UTF32Swp
1511
1512
// =================================================================================================
1513
1514
static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
1515
                           UTF32Unit *       utf32Out,  const size_t utf32Len,
1516
                           size_t *          utf16Read, size_t *     utf32Written )
1517
0
{
1518
0
  const UTF16Unit * utf16Pos = utf16In;
1519
0
  UTF32Unit *       utf32Pos = utf32Out;
1520
  
1521
0
  size_t utf16Left = utf16Len;
1522
0
  size_t utf32Left = utf32Len;
1523
  
1524
0
  UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1525
  
1526
0
  while ( (utf16Left > 0) && (utf32Left > 0) ) {
1527
  
1528
    // Do a run of BMP, it copies 1 input unit into 1 output unit.
1529
0
    size_t i, limit = utf16Left;
1530
0
    if ( limit > utf32Left ) limit = utf32Left;
1531
0
    for ( i = 0; i < limit; ++i ) {
1532
0
      UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1533
0
      if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1534
0
      *utf32Pos = inUnit;
1535
0
      ++utf16Pos;
1536
0
      ++utf32Pos;
1537
0
    }
1538
0
    utf16Left -= i;
1539
0
    utf32Left -= i;
1540
    
1541
    // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1542
0
    while ( (utf16Left > 0) && (utf32Left > 0) ) {
1543
0
      size_t len;
1544
0
      UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1545
0
      if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1546
0
      CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
1547
0
      if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1548
0
      UC_Assert ( len == 2 );
1549
0
      utf16Left -= len;
1550
0
      utf16Pos  += len;
1551
0
      utf32Left -= 1;
1552
0
      utf32Pos  += 1;
1553
0
    }
1554
  
1555
0
  }
1556
  
1557
0
Done: // Set the output lengths.
1558
0
  *utf16Read = utf16Len - utf16Left;
1559
0
  *utf32Written = utf32Len - utf32Left;
1560
  
1561
0
}  // UTF16Swp_to_UTF32Nat
1562
1563
// =================================================================================================
1564
1565
static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
1566
                           UTF16Unit *       utf16Out,  const size_t utf16Len,
1567
                           size_t *          utf32Read, size_t *     utf16Written )
1568
0
{
1569
0
  const UTF32Unit * utf32Pos = utf32In;
1570
0
  UTF16Unit *       utf16Pos = utf16Out;
1571
  
1572
0
  size_t utf32Left = utf32Len;
1573
0
  size_t utf16Left = utf16Len;
1574
  
1575
0
  UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1576
  
1577
0
  while ( (utf32Left > 0) && (utf16Left > 0) ) {
1578
  
1579
    // Do a run of BMP, it copies 1 input unit into 1 output unit.
1580
0
    size_t i, limit = utf32Left;
1581
0
    if ( limit > utf16Left ) limit = utf16Left;
1582
0
    for ( i = 0; i < limit; ++i ) {
1583
0
      UTF32Unit inUnit = *utf32Pos;
1584
0
      if ( inUnit > 0xFFFF ) break;
1585
0
      UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1586
0
      ++utf32Pos;
1587
0
      ++utf16Pos;
1588
0
    }
1589
0
    utf32Left -= i;
1590
0
    utf16Left -= i;
1591
    
1592
    // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1593
0
    while ( (utf32Left > 0) && (utf16Left > 0) ) {
1594
0
      size_t len;
1595
0
      UTF32Unit inUnit = *utf32Pos;
1596
0
      if ( inUnit <= 0xFFFF ) break;
1597
0
      CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1598
0
      if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1599
0
      UC_Assert ( len == 2 );
1600
0
      utf32Left -= 1;
1601
0
      utf32Pos  += 1;
1602
0
      utf16Left -= 2;
1603
0
      utf16Pos  += 2;
1604
0
    }
1605
  
1606
0
  }
1607
  
1608
0
Done: // Set the output lengths.
1609
0
  *utf32Read = utf32Len - utf32Left;
1610
0
  *utf16Written = utf16Len - utf16Left;
1611
  
1612
0
}  // UTF32Nat_to_UTF16Swp
1613
1614
// =================================================================================================
1615
1616
static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
1617
                           UTF16Unit *       utf16Out,  const size_t utf16Len,
1618
                           size_t *          utf32Read, size_t *     utf16Written )
1619
0
{
1620
0
  const UTF32Unit * utf32Pos = utf32In;
1621
0
  UTF16Unit *       utf16Pos = utf16Out;
1622
  
1623
0
  size_t utf32Left = utf32Len;
1624
0
  size_t utf16Left = utf16Len;
1625
  
1626
0
  UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1627
  
1628
0
  while ( (utf32Left > 0) && (utf16Left > 0) ) {
1629
  
1630
    // Do a run of BMP, it copies 1 input unit into 1 output unit.
1631
0
    size_t i, limit = utf32Left;
1632
0
    if ( limit > utf16Left ) limit = utf16Left;
1633
0
    for ( i = 0; i < limit; ++i ) {
1634
0
      UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1635
0
      if ( inUnit > 0xFFFF ) break;
1636
0
      *utf16Pos = UTF16Unit(inUnit);
1637
0
      ++utf32Pos;
1638
0
      ++utf16Pos;
1639
0
    }
1640
0
    utf32Left -= i;
1641
0
    utf16Left -= i;
1642
    
1643
    // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1644
0
    while ( (utf32Left > 0) && (utf16Left > 0) ) {
1645
0
      size_t len;
1646
0
      UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1647
0
      if ( inUnit <= 0xFFFF ) break;
1648
0
      CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1649
0
      if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1650
0
      UC_Assert ( len == 2 );
1651
0
      utf32Left -= 1;
1652
0
      utf32Pos  += 1;
1653
0
      utf16Left -= 2;
1654
0
      utf16Pos  += 2;
1655
0
    }
1656
  
1657
0
  }
1658
  
1659
0
Done: // Set the output lengths.
1660
0
  *utf32Read = utf32Len - utf32Left;
1661
0
  *utf16Written = utf16Len - utf16Left;
1662
  
1663
0
}  // UTF32Swp_to_UTF16Nat
1664
1665
// =================================================================================================