Coverage Report

Created: 2025-12-11 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libcups/cups/transcode.c
Line
Count
Source
1
//
2
// Transcoding support for CUPS.
3
//
4
// Copyright © 2020-2025 by OpenPrinting.
5
// Copyright © 2007-2014 by Apple Inc.
6
// Copyright © 1997-2007 by Easy Software Products.
7
//
8
// Licensed under Apache License v2.0.  See the file "LICENSE" for more
9
// information.
10
//
11
12
#include "cups.h"
13
#include "thread.h"
14
#include "transcode.h"
15
#include "debug-internal.h"
16
#include "string-private.h"
17
#include <limits.h>
18
#include <time.h>
19
#ifdef HAVE_ICONV_H
20
#  include <iconv.h>
21
#endif // HAVE_ICONV_H
22
23
24
//
25
// Local globals...
26
//
27
28
// Encoding name strings, indexed by `cups_encoding_t` value.  Slots without
// an assigned name hold the placeholder "unknown".
//
// NOTE(review): "bg18030" looks like a transposition of "gb18030" - confirm
// against the charset names the iconv implementation accepts before changing.
#define MAP_UNKNOWN_2 "unknown", "unknown"
#define MAP_UNKNOWN_8 MAP_UNKNOWN_2, MAP_UNKNOWN_2, MAP_UNKNOWN_2, MAP_UNKNOWN_2

static const char * const map_encodings[] =
{
  // 0-29
  "us-ascii",
  "iso-8859-1",
  "iso-8859-2",
  "iso-8859-3",
  "iso-8859-4",
  "iso-8859-5",
  "iso-8859-6",
  "iso-8859-7",
  "iso-8859-8",
  "iso-8859-9",
  "iso-8859-10",
  "utf-8",
  "iso-8859-13",
  "iso-8859-14",
  "iso-8859-15",
  "cp874",
  "cp1250",
  "cp1251",
  "cp1252",
  "cp1253",
  "cp1254",
  "cp1255",
  "cp1256",
  "cp1257",
  "cp1258",
  "koi8-r",
  "koi8-u",
  "iso-8859-11",
  "iso-8859-16",
  "mac",

  // 30-63 (34 placeholders)
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_2,

  // 64-69
  "cp932",
  "cp936",
  "cp949",
  "cp950",
  "cp1361",
  "bg18030",

  // 70-127 (58 placeholders)
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_8,
  MAP_UNKNOWN_2,

  // 128-132
  "euc-cn",
  "euc-jp",
  "euc-kr",
  "euc-tw",
  "shift_jisx0213"
};

#undef MAP_UNKNOWN_8
#undef MAP_UNKNOWN_2
98
#ifdef HAVE_ICONV_H
// Which charset the two cached descriptors below were opened for
static cups_encoding_t  map_encoding = CUPS_ENCODING_AUTO;

// Cached descriptor: convert from charset to UTF-8
static iconv_t          map_to_utf8 = (iconv_t)-1;

// Cached descriptor: convert from UTF-8 to charset
static iconv_t          map_from_utf8 = (iconv_t)-1;

// Mutex to control access to the cached maps
static cups_mutex_t     map_mutex = CUPS_MUTEX_INITIALIZER;
#endif // HAVE_ICONV_H
108
109
110
//
111
// Local functions...
112
//
113
114
static void   flush_map(void);  // Close the cached iconv maps and forget the cached encoding
115
116
117
//
118
// 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
119
//
120
121
ssize_t         // O - Number of bytes or `-1` on error
122
cupsCharsetToUTF8(
123
    char                  *dest,  // O - Target string
124
    const char            *src,   // I - Source string
125
    const size_t          maxout, // I - Max output size in bytes
126
    const cups_encoding_t encoding) // I - Encoding
127
0
{
128
0
  char    *destptr;   // Pointer into UTF-8 buffer
129
0
#ifdef HAVE_ICONV_H
130
0
  size_t  srclen,     // Length of source string
131
0
    outBytesLeft;   // Bytes remaining in output buffer
132
0
#endif // HAVE_ICONV_H
133
134
135
  // Check for valid arguments...
136
0
  if (!dest || !src || maxout < 1)
137
0
  {
138
0
    if (dest)
139
0
      *dest = '\0';
140
141
0
    return (-1);
142
0
  }
143
144
  // Handle identity conversions...
145
0
  if (encoding == CUPS_ENCODING_UTF_8 || encoding <= CUPS_ENCODING_US_ASCII || encoding >= CUPS_ENCODING_VBCS_END)
146
0
  {
147
0
    cupsCopyString((char *)dest, src, maxout);
148
0
    return ((ssize_t)strlen((char *)dest));
149
0
  }
150
151
  // Handle ISO-8859-1 to UTF-8 directly...
152
0
  destptr = dest;
153
154
0
  if (encoding == CUPS_ENCODING_ISO8859_1)
155
0
  {
156
0
    int   ch;     // Character from string
157
0
    char  *destend;   // End of UTF-8 buffer
158
159
0
    destend = dest + maxout - 2;
160
161
0
    while (*src && destptr < destend)
162
0
    {
163
0
      ch = *src++ & 255;
164
165
0
      if (ch & 128)
166
0
      {
167
0
  *destptr++ = (char)(0xc0 | (ch >> 6));
168
0
  *destptr++ = (char)(0x80 | (ch & 0x3f));
169
0
      }
170
0
      else
171
0
      {
172
0
  *destptr++ = (char)ch;
173
0
      }
174
0
    }
175
176
0
    *destptr = '\0';
177
178
0
    return ((ssize_t)(destptr - dest));
179
0
  }
180
181
  // Convert input legacy charset to UTF-8...
182
0
#ifdef HAVE_ICONV_H
183
0
  cupsMutexLock(&map_mutex);
184
185
0
  if (map_encoding != encoding)
186
0
  {
187
0
    char  toset[1024];    // Destination character set
188
189
0
    flush_map();
190
191
0
    snprintf(toset, sizeof(toset), "%s//IGNORE", cupsEncodingString(encoding));
192
193
0
    map_encoding  = encoding;
194
0
    map_from_utf8 = iconv_open(cupsEncodingString(encoding), "UTF-8");
195
0
    map_to_utf8   = iconv_open("UTF-8", toset);
196
0
  }
197
198
0
  if (map_to_utf8 != (iconv_t)-1)
199
0
  {
200
0
    char *altdestptr = (char *)dest;  // Silence bogus GCC type-punned
201
202
0
    srclen       = strlen(src);
203
0
    outBytesLeft = maxout - 1;
204
205
0
    iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft);
206
0
    *altdestptr = '\0';
207
208
0
    cupsMutexUnlock(&map_mutex);
209
210
0
    return ((ssize_t)(altdestptr - (char *)dest));
211
0
  }
212
213
0
  cupsMutexUnlock(&map_mutex);
214
0
#endif // HAVE_ICONV_H
215
216
  // No iconv() support, so error out...
217
0
  *destptr = '\0';
218
219
0
  return (-1);
220
0
}
221
222
223
//
224
// 'cupsEncodingString()' - Return the character encoding name string for the
225
//                          given encoding enumeration.
226
//
227
228
const char *        // O - Character encoding string
229
cupsEncodingString(
230
    cups_encoding_t value)    // I - Encoding value
231
0
{
232
0
  if (value < CUPS_ENCODING_US_ASCII || value >= (cups_encoding_t)(sizeof(map_encodings) / sizeof(map_encodings[0])))
233
0
    return (map_encodings[0]);
234
0
  else
235
0
    return (map_encodings[value]);
236
0
}
237
238
239
//
240
// 'cupsEncodingValue()' - Return the encoding enumeration value for a given
241
//                         character encoding name string.
242
//
243
244
cups_encoding_t       // O - Encoding value
245
cupsEncodingValue(const char *s)  // I - Character encoding string
246
0
{
247
0
  if (s)
248
0
  {
249
0
    size_t  i;      // Looping var
250
251
0
    for (i = 0; i < (sizeof(map_encodings) / sizeof(map_encodings[0])); i ++)
252
0
    {
253
0
      if (!_cups_strcasecmp(s, map_encodings[i]))
254
0
        return ((cups_encoding_t)i);
255
0
    }
256
0
  }
257
258
0
  return (CUPS_ENCODING_US_ASCII);
259
0
}
260
261
262
//
263
// 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
264
//
265
266
ssize_t         // O - Number of bytes or `-1` on error
267
cupsUTF8ToCharset(
268
    char                  *dest,  // O - Target string
269
    const char            *src,   // I - Source string
270
    const size_t          maxout, // I - Max output in bytes
271
    const cups_encoding_t encoding) // I - Encoding
272
0
{
273
0
  char    *destptr;   // Pointer into destination
274
0
#ifdef HAVE_ICONV_H
275
0
  size_t  srclen,     // Length of source string
276
0
    outBytesLeft;   // Bytes remaining in output buffer
277
0
#endif // HAVE_ICONV_H
278
279
280
  // Check for valid arguments...
281
0
  if (!dest || !src || maxout < 1)
282
0
  {
283
0
    if (dest)
284
0
      *dest = '\0';
285
286
0
    return (-1);
287
0
  }
288
289
  // Handle identity conversions...
290
0
  if (encoding == CUPS_ENCODING_UTF_8 || encoding >= CUPS_ENCODING_VBCS_END)
291
0
  {
292
0
    cupsCopyString(dest, (char *)src, maxout);
293
0
    return ((ssize_t)strlen(dest));
294
0
  }
295
296
 /*
297
  * Handle UTF-8 to ISO-8859-1 directly...
298
  */
299
300
0
  destptr = dest;
301
302
0
  if (encoding == CUPS_ENCODING_ISO8859_1 || encoding <= CUPS_ENCODING_US_ASCII)
303
0
  {
304
0
    int   ch,     // Character from string
305
0
    maxch;      // Maximum character for charset
306
0
    char  *destend;   // End of ISO-8859-1 buffer
307
308
0
    maxch   = encoding == CUPS_ENCODING_ISO8859_1 ? 256 : 128;
309
0
    destend = dest + maxout - 1;
310
311
0
    while (*src && destptr < destend)
312
0
    {
313
0
      ch = *src++;
314
315
0
      if ((ch & 0xe0) == 0xc0 && (*src & 0xc0) == 0x80)
316
0
      {
317
        // 2-byte UTF-8
318
0
  ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
319
320
0
  if (ch < maxch)
321
0
          *destptr++ = (char)ch;
322
0
  else
323
0
          *destptr++ = '?';
324
0
      }
325
0
      else if ((ch & 0xf0) == 0xe0 || (ch & 0xf8) == 0xf0)
326
0
      {
327
        // 3-byte or 4-byte UTF-8
328
0
        *destptr++ = '?';
329
0
      }
330
0
      else if (!(ch & 0x80))
331
0
      {
332
        // ASCII
333
0
  *destptr++ = (char)ch;
334
0
      }
335
0
    }
336
337
0
    *destptr = '\0';
338
339
0
    return ((ssize_t)(destptr - dest));
340
0
  }
341
342
0
#ifdef HAVE_ICONV_H
343
  // Convert input UTF-8 to legacy charset...
344
0
  cupsMutexLock(&map_mutex);
345
346
0
  if (map_encoding != encoding)
347
0
  {
348
0
    char  toset[1024];    // Destination character set
349
350
0
    flush_map();
351
352
0
    snprintf(toset, sizeof(toset), "%s//IGNORE", cupsEncodingString(encoding));
353
354
0
    map_encoding  = encoding;
355
0
    map_from_utf8 = iconv_open(cupsEncodingString(encoding), "UTF-8");
356
0
    map_to_utf8   = iconv_open("UTF-8", toset);
357
0
  }
358
359
0
  if (map_from_utf8 != (iconv_t)-1)
360
0
  {
361
0
    char *altsrc = (char *)src;   // Silence bogus GCC type-punned
362
363
0
    srclen       = strlen((char *)src);
364
0
    outBytesLeft = maxout - 1;
365
366
0
    iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft);
367
0
    *destptr = '\0';
368
369
0
    cupsMutexUnlock(&map_mutex);
370
371
0
    return ((ssize_t)(destptr - dest));
372
0
  }
373
374
0
  cupsMutexUnlock(&map_mutex);
375
0
#endif // HAVE_ICONV_H
376
377
  // No iconv() support, so error out...
378
0
  *destptr = '\0';
379
380
0
  return (-1);
381
0
}
382
383
384
//
385
// 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
386
//
387
// This function converts a UTF-8 (8-bit encoding of Unicode) `nul`-terminated
388
// C string to a UTF-32 (32-bit encoding of Unicode) string.
389
//
390
391
ssize_t         // O - Number of words or `-1` on error
392
cupsUTF8ToUTF32(
393
    cups_utf32_t *dest,     // O - Target string
394
    const char   *src,      // I - Source string
395
    const size_t maxout)    // I - Max output in words
396
0
{
397
0
  size_t  i;      // Looping variable
398
0
  int   ch,     // Character value
399
0
    next;     // Next character value
400
0
  cups_utf32_t  ch32;     // UTF-32 character value
401
402
403
  // Check for valid arguments and clear output...
404
0
  if (dest)
405
0
    *dest = 0;
406
407
0
  if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
408
0
    return (-1);
409
410
  // Convert input UTF-8 to output UTF-32...
411
0
  for (i = maxout - 1; *src && i > 0; i --)
412
0
  {
413
0
    ch = *src++;
414
415
    // Convert UTF-8 character(s) to UTF-32 character...
416
0
    if (!(ch & 0x80))
417
0
    {
418
      // One-octet UTF-8 <= 127 (US-ASCII)...
419
0
      *dest++ = (cups_utf32_t)ch;
420
0
      continue;
421
0
    }
422
0
    else if ((ch & 0xe0) == 0xc0)
423
0
    {
424
      // Two-octet UTF-8 <= 2047 (Latin-x)...
425
0
      next = *src++;
426
0
      if ((next & 0xc0) != 0x80)
427
0
  return (-1);
428
429
0
      ch32 = (cups_utf32_t)((ch & 0x1f) << 6) | (cups_utf32_t)(next & 0x3f);
430
431
      // Check for non-shortest form (invalid UTF-8)...
432
0
      if (ch32 < 0x80)
433
0
  return (-1);
434
435
0
      *dest++ = ch32;
436
0
    }
437
0
    else if ((ch & 0xf0) == 0xe0)
438
0
    {
439
      // Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
440
0
      next = *src++;
441
0
      if ((next & 0xc0) != 0x80)
442
0
  return (-1);
443
444
0
      ch32 = (cups_utf32_t)((ch & 0x0f) << 6) | (cups_utf32_t)(next & 0x3f);
445
446
0
      next = *src++;
447
0
      if ((next & 0xc0) != 0x80)
448
0
  return (-1);
449
450
0
      ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
451
452
      // Check for non-shortest form (invalid UTF-8)...
453
0
      if (ch32 < 0x800)
454
0
  return (-1);
455
456
0
      *dest++ = ch32;
457
0
    }
458
0
    else if ((ch & 0xf8) == 0xf0)
459
0
    {
460
      // Four-octet UTF-8...
461
0
      next = *src++;
462
0
      if ((next & 0xc0) != 0x80)
463
0
  return (-1);
464
465
0
      ch32 = (cups_utf32_t)((ch & 0x07) << 6) | (cups_utf32_t)(next & 0x3f);
466
467
0
      next = *src++;
468
0
      if ((next & 0xc0) != 0x80)
469
0
  return (-1);
470
471
0
      ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
472
473
0
      next = *src++;
474
0
      if ((next & 0xc0) != 0x80)
475
0
  return (-1);
476
477
0
      ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
478
479
      // Check for non-shortest form (invalid UTF-8)...
480
0
      if (ch32 < 0x10000)
481
0
  return (-1);
482
483
0
      *dest++ = ch32;
484
0
    }
485
0
    else
486
0
    {
487
      // More than 4-octet (invalid UTF-8 sequence)...
488
0
      return (-1);
489
0
    }
490
491
    // Check for UTF-16 surrogate (illegal UTF-8)...
492
0
    if (ch32 >= 0xd800 && ch32 <= 0xdfff)
493
0
      return (-1);
494
0
  }
495
496
0
  *dest = 0;
497
498
0
  return ((ssize_t)(maxout - 1 - i));
499
0
}
500
501
502
//
503
// 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
504
//
505
// This function converts a UTF-32 (32-bit encoding of Unicode) string to a
506
// UTF-8 (8-bit encoding of Unicode) `nul`-terminated C string.
507
//
508
509
ssize_t         // O - Number of bytes or `-1` on error
510
cupsUTF32ToUTF8(
511
    char               *dest,   // O - Target string
512
    const cups_utf32_t *src,    // I - Source string
513
    const size_t       maxout)    // I - Max output in bytes
514
0
{
515
0
  char    *start;     // Start of destination string
516
0
  size_t  i;      // Looping variable
517
0
  int   swap;     // Byte-swap input to output
518
0
  cups_utf32_t  ch;     // Character value
519
520
521
  // Check for valid arguments and clear output...
522
0
  if (dest)
523
0
    *dest = '\0';
524
525
0
  if (!dest || !src || maxout < 1)
526
0
    return (-1);
527
528
  // Check for leading BOM in UTF-32 and inverted BOM...
529
0
  start = dest;
530
0
  swap  = *src == 0xfffe0000;
531
532
0
  if (*src == 0xfffe0000 || *src == 0xfeff)
533
0
    src ++;
534
535
  // Convert input UTF-32 to output UTF-8...
536
0
  for (i = maxout - 1; *src && i > 0;)
537
0
  {
538
0
    ch = *src++;
539
540
    // Byte swap input UTF-32 if necessary (only byte-swapping 24 of 32 bits)
541
0
    if (swap)
542
0
      ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
543
544
    // Check for beyond Plane 16 (invalid UTF-32)...
545
0
    if (ch > 0x10ffff)
546
0
      return (-1);
547
548
    // Convert UTF-32 character to UTF-8 character(s)...
549
0
    if (ch < 0x80)
550
0
    {
551
      // One-octet UTF-8 <= 127 (US-ASCII)...
552
0
      *dest++ = (char)ch;
553
0
      i --;
554
0
    }
555
0
    else if (ch < 0x800)
556
0
    {
557
      // Two-octet UTF-8 <= 2047 (Latin-x)...
558
0
      if (i < 2)
559
0
        return (-1);
560
561
0
      *dest++ = (char)(0xc0 | ((ch >> 6) & 0x1f));
562
0
      *dest++ = (char)(0x80 | (ch & 0x3f));
563
0
      i -= 2;
564
0
    }
565
0
    else if (ch < 0x10000)
566
0
    {
567
      // Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
568
0
      if (i < 3)
569
0
        return (-1);
570
571
0
      *dest++ = (char)(0xe0 | ((ch >> 12) & 0x0f));
572
0
      *dest++ = (char)(0x80 | ((ch >> 6) & 0x3f));
573
0
      *dest++ = (char)(0x80 | (ch & 0x3f));
574
0
      i -= 3;
575
0
    }
576
0
    else
577
0
    {
578
      // Four-octet UTF-8...
579
0
      if (i < 4)
580
0
        return (-1);
581
582
0
      *dest++ = (char)(0xf0 | ((ch >> 18) & 0x07));
583
0
      *dest++ = (char)(0x80 | ((ch >> 12) & 0x3f));
584
0
      *dest++ = (char)(0x80 | ((ch >> 6) & 0x3f));
585
0
      *dest++ = (char)(0x80 | (ch & 0x3f));
586
0
      i -= 4;
587
0
    }
588
0
  }
589
590
0
  *dest = '\0';
591
592
0
  return ((ssize_t)(dest - start));
593
0
}
594
595
596
//
597
// 'flush_map()' - Flush all character set maps out of cache.
//
// Closes whichever cached iconv descriptors are open and marks the cache as
// holding no encoding.  Does nothing when iconv support is not compiled in.
// The callers visible in this file hold "map_mutex" around the call.
//

static void
flush_map(void)
{
#ifdef HAVE_ICONV_H
  iconv_t       *cached[2];             // Cached descriptors, in closing order
  size_t        n;                      // Looping var


  cached[0] = &map_from_utf8;
  cached[1] = &map_to_utf8;

  for (n = 0; n < sizeof(cached) / sizeof(cached[0]); n ++)
  {
    // Skip descriptors that were never opened or are already closed
    if (*cached[n] == (iconv_t)-1)
      continue;

    iconv_close(*cached[n]);
    *cached[n] = (iconv_t)-1;
  }

  map_encoding = CUPS_ENCODING_AUTO;
#endif // HAVE_ICONV_H
}