Coverage Report

Created: 2025-10-10 06:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libcups/cups/transcode.c
Line
Count
Source
1
//
2
// Transcoding support for CUPS.
3
//
4
// Copyright © 2022 by OpenPrinting.
5
// Copyright © 2007-2014 by Apple Inc.
6
// Copyright © 1997-2007 by Easy Software Products.
7
//
8
// Licensed under Apache License v2.0.  See the file "LICENSE" for more
9
// information.
10
//
11
12
#include "cups.h"
13
#include "thread.h"
14
#include "transcode.h"
15
#include "debug-internal.h"
16
#include "string-private.h"
17
#include <limits.h>
18
#include <time.h>
19
#ifdef HAVE_ICONV_H
20
#  include <iconv.h>
21
#endif // HAVE_ICONV_H
22
23
24
//
25
// Local globals...
26
//
27
28
// Character set names indexed by encoding value ('cupsEncodingString()' looks
// names up by index and 'cupsEncodingValue()' searches them).  Slots with no
// name assigned hold "unknown" so that the named entries keep their positions.
static const char * const map_encodings[] =
{         // Encoding strings
  // 0-29: US-ASCII, UTF-8, and single-byte character sets
  "us-ascii",   "iso-8859-1",
  "iso-8859-2",   "iso-8859-3",
  "iso-8859-4",   "iso-8859-5",
  "iso-8859-6",   "iso-8859-7",
  "iso-8859-8",   "iso-8859-9",
  "iso-8859-10",  "utf-8",
  "iso-8859-13",  "iso-8859-14",
  "iso-8859-15",  "cp874",
  "cp1250",   "cp1251",
  "cp1252",   "cp1253",
  "cp1254",   "cp1255",
  "cp1256",   "cp1257",
  "cp1258",   "koi8-r",
  "koi8-u",   "iso-8859-11",
  "iso-8859-16",  "mac",

  // 30-63: unassigned
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",

  // 64-69: double-byte code pages; the last name was previously misspelled
  // "bg18030", which is not a charset name iconv_open() can resolve
  "cp932",    "cp936",
  "cp949",    "cp950",
  "cp1361",   "gb18030",

  // 70-127: unassigned
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",
  "unknown",    "unknown",

  // 128-132: EUC and Shift-JIS variants
  "euc-cn",   "euc-jp",
  "euc-kr",   "euc-tw",
  "shift_jisx0213"
};
98
#ifdef HAVE_ICONV_H
99
static cups_mutex_t map_mutex = CUPS_MUTEX_INITIALIZER;
100
          // Mutex held while the three cache variables below are read or changed
101
static iconv_t    map_from_utf8 = (iconv_t)-1;
102
          // Cached descriptor converting UTF-8 to map_encoding, (iconv_t)-1 when not open
103
static iconv_t    map_to_utf8 = (iconv_t)-1;
104
          // Cached descriptor converting map_encoding to UTF-8, (iconv_t)-1 when not open
105
static cups_encoding_t  map_encoding = CUPS_ENCODING_AUTO;
106
          // Charset the descriptors were opened for, CUPS_ENCODING_AUTO after a flush
107
#endif // HAVE_ICONV_H
108
109
110
//
111
// Local functions...
112
//
113
114
static void   flush_map(void);  // Close the cached iconv descriptors and mark the cache empty
115
116
117
//
118
// 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
119
//
120
121
ssize_t         // O - Number of bytes or `-1` on error
122
cupsCharsetToUTF8(
123
    char                  *dest,  // O - Target string
124
    const char            *src,   // I - Source string
125
    const size_t          maxout, // I - Max output size in bytes
126
    const cups_encoding_t encoding) // I - Encoding
127
0
{
128
0
  char    *destptr;   // Pointer into UTF-8 buffer
129
0
#ifdef HAVE_ICONV_H
130
0
  size_t  srclen,     // Length of source string
131
0
    outBytesLeft;   // Bytes remaining in output buffer
132
0
#endif // HAVE_ICONV_H
133
134
135
  // Check for valid arguments...
136
0
  if (!dest || !src || maxout < 1)
137
0
  {
138
0
    if (dest)
139
0
      *dest = '\0';
140
141
0
    return (-1);
142
0
  }
143
144
  // Handle identity conversions...
145
0
  if (encoding == CUPS_ENCODING_UTF_8 || encoding <= CUPS_ENCODING_US_ASCII || encoding >= CUPS_ENCODING_VBCS_END)
146
0
  {
147
0
    cupsCopyString((char *)dest, src, maxout);
148
0
    return ((ssize_t)strlen((char *)dest));
149
0
  }
150
151
  // Handle ISO-8859-1 to UTF-8 directly...
152
0
  destptr = dest;
153
154
0
  if (encoding == CUPS_ENCODING_ISO8859_1)
155
0
  {
156
0
    int   ch;     // Character from string
157
0
    char  *destend;   // End of UTF-8 buffer
158
159
160
0
    destend = dest + maxout - 2;
161
162
0
    while (*src && destptr < destend)
163
0
    {
164
0
      ch = *src++ & 255;
165
166
0
      if (ch & 128)
167
0
      {
168
0
  *destptr++ = (char)(0xc0 | (ch >> 6));
169
0
  *destptr++ = (char)(0x80 | (ch & 0x3f));
170
0
      }
171
0
      else
172
0
  *destptr++ = (char)ch;
173
0
    }
174
175
0
    *destptr = '\0';
176
177
0
    return ((ssize_t)(destptr - dest));
178
0
  }
179
180
  // Convert input legacy charset to UTF-8...
181
0
#ifdef HAVE_ICONV_H
182
0
  cupsMutexLock(&map_mutex);
183
184
0
  if (map_encoding != encoding)
185
0
  {
186
0
    char  toset[1024];    // Destination character set
187
188
0
    flush_map();
189
190
0
    snprintf(toset, sizeof(toset), "%s//IGNORE", cupsEncodingString(encoding));
191
192
0
    map_encoding  = encoding;
193
0
    map_from_utf8 = iconv_open(cupsEncodingString(encoding), "UTF-8");
194
0
    map_to_utf8   = iconv_open("UTF-8", toset);
195
0
  }
196
197
0
  if (map_to_utf8 != (iconv_t)-1)
198
0
  {
199
0
    char *altdestptr = (char *)dest;  // Silence bogus GCC type-punned
200
201
0
    srclen       = strlen(src);
202
0
    outBytesLeft = maxout - 1;
203
204
0
    iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft);
205
0
    *altdestptr = '\0';
206
207
0
    cupsMutexUnlock(&map_mutex);
208
209
0
    return ((ssize_t)(altdestptr - (char *)dest));
210
0
  }
211
212
0
  cupsMutexUnlock(&map_mutex);
213
0
#endif // HAVE_ICONV_H
214
215
  // No iconv() support, so error out...
216
0
  *destptr = '\0';
217
218
0
  return (-1);
219
0
}
220
221
222
//
223
// 'cupsEncodingString()' - Return the character encoding name string for the
224
//                          given encoding enumeration.
225
//
226
227
const char *        // O - Character encoding string
228
cupsEncodingString(
229
    cups_encoding_t value)    // I - Encoding value
230
0
{
231
0
  if (value < CUPS_ENCODING_US_ASCII || value >= (cups_encoding_t)(sizeof(map_encodings) / sizeof(map_encodings[0])))
232
0
    return (map_encodings[0]);
233
0
  else
234
0
    return (map_encodings[value]);
235
0
}
236
237
238
//
239
// 'cupsEncodingValue()' - Return the encoding enumeration value for a given
240
//                         character encoding name string.
241
//
242
243
cups_encoding_t       // O - Encoding value
244
cupsEncodingValue(const char *s)  // I - Character encoding string
245
0
{
246
0
  if (s)
247
0
  {
248
0
    size_t  i;      // Looping var
249
250
0
    for (i = 0; i < (sizeof(map_encodings) / sizeof(map_encodings[0])); i ++)
251
0
    {
252
0
      if (!_cups_strcasecmp(s, map_encodings[i]))
253
0
        return ((cups_encoding_t)i);
254
0
    }
255
0
  }
256
257
0
  return (CUPS_ENCODING_US_ASCII);
258
0
}
259
260
261
//
262
// 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
263
//
264
265
ssize_t         // O - Number of bytes or `-1` on error
266
cupsUTF8ToCharset(
267
    char                  *dest,  // O - Target string
268
    const char            *src,   // I - Source string
269
    const size_t          maxout, // I - Max output in bytes
270
    const cups_encoding_t encoding) // I - Encoding
271
0
{
272
0
  char    *destptr;   // Pointer into destination
273
0
#ifdef HAVE_ICONV_H
274
0
  size_t  srclen,     // Length of source string
275
0
    outBytesLeft;   // Bytes remaining in output buffer
276
0
#endif // HAVE_ICONV_H
277
278
279
  // Check for valid arguments...
280
0
  if (!dest || !src || maxout < 1)
281
0
  {
282
0
    if (dest)
283
0
      *dest = '\0';
284
285
0
    return (-1);
286
0
  }
287
288
  // Handle identity conversions...
289
0
  if (encoding == CUPS_ENCODING_UTF_8 || encoding >= CUPS_ENCODING_VBCS_END)
290
0
  {
291
0
    cupsCopyString(dest, (char *)src, maxout);
292
0
    return ((ssize_t)strlen(dest));
293
0
  }
294
295
 /*
296
  * Handle UTF-8 to ISO-8859-1 directly...
297
  */
298
299
0
  destptr = dest;
300
301
0
  if (encoding == CUPS_ENCODING_ISO8859_1 || encoding <= CUPS_ENCODING_US_ASCII)
302
0
  {
303
0
    int   ch,     // Character from string
304
0
    maxch;      // Maximum character for charset
305
0
    char  *destend;   // End of ISO-8859-1 buffer
306
307
0
    maxch   = encoding == CUPS_ENCODING_ISO8859_1 ? 256 : 128;
308
0
    destend = dest + maxout - 1;
309
310
0
    while (*src && destptr < destend)
311
0
    {
312
0
      ch = *src++;
313
314
0
      if ((ch & 0xe0) == 0xc0)
315
0
      {
316
0
  ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
317
318
0
  if (ch < maxch)
319
0
          *destptr++ = (char)ch;
320
0
  else
321
0
          *destptr++ = '?';
322
0
      }
323
0
      else if ((ch & 0xf0) == 0xe0 || (ch & 0xf8) == 0xf0)
324
0
      {
325
0
        *destptr++ = '?';
326
0
      }
327
0
      else if (!(ch & 0x80))
328
0
      {
329
0
  *destptr++ = (char)ch;
330
0
      }
331
0
    }
332
333
0
    *destptr = '\0';
334
335
0
    return ((ssize_t)(destptr - dest));
336
0
  }
337
338
0
#ifdef HAVE_ICONV_H
339
  // Convert input UTF-8 to legacy charset...
340
0
  cupsMutexLock(&map_mutex);
341
342
0
  if (map_encoding != encoding)
343
0
  {
344
0
    char  toset[1024];    // Destination character set
345
346
0
    flush_map();
347
348
0
    snprintf(toset, sizeof(toset), "%s//IGNORE", cupsEncodingString(encoding));
349
350
0
    map_encoding  = encoding;
351
0
    map_from_utf8 = iconv_open(cupsEncodingString(encoding), "UTF-8");
352
0
    map_to_utf8   = iconv_open("UTF-8", toset);
353
0
  }
354
355
0
  if (map_from_utf8 != (iconv_t)-1)
356
0
  {
357
0
    char *altsrc = (char *)src;   // Silence bogus GCC type-punned
358
359
0
    srclen       = strlen((char *)src);
360
0
    outBytesLeft = maxout - 1;
361
362
0
    iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft);
363
0
    *destptr = '\0';
364
365
0
    cupsMutexUnlock(&map_mutex);
366
367
0
    return ((ssize_t)(destptr - dest));
368
0
  }
369
370
0
  cupsMutexUnlock(&map_mutex);
371
0
#endif // HAVE_ICONV_H
372
373
  // No iconv() support, so error out...
374
0
  *destptr = '\0';
375
376
0
  return (-1);
377
0
}
378
379
380
//
381
// 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
382
//
383
// This function converts a UTF-8 (8-bit encoding of Unicode) `nul`-terminated
384
// C string to a UTF-32 (32-bit encoding of Unicode) string.
385
//
386
387
ssize_t         // O - Number of words or `-1` on error
388
cupsUTF8ToUTF32(
389
    cups_utf32_t *dest,     // O - Target string
390
    const char   *src,      // I - Source string
391
    const size_t maxout)    // I - Max output in words
392
0
{
393
0
  size_t  i;      // Looping variable
394
0
  int   ch,     // Character value
395
0
    next;     // Next character value
396
0
  cups_utf32_t  ch32;     // UTF-32 character value
397
398
399
  // Check for valid arguments and clear output...
400
0
  if (dest)
401
0
    *dest = 0;
402
403
0
  if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
404
0
    return (-1);
405
406
  // Convert input UTF-8 to output UTF-32...
407
0
  for (i = maxout - 1; *src && i > 0; i --)
408
0
  {
409
0
    ch = *src++;
410
411
    // Convert UTF-8 character(s) to UTF-32 character...
412
0
    if (!(ch & 0x80))
413
0
    {
414
      // One-octet UTF-8 <= 127 (US-ASCII)...
415
0
      *dest++ = (cups_utf32_t)ch;
416
0
      continue;
417
0
    }
418
0
    else if ((ch & 0xe0) == 0xc0)
419
0
    {
420
      // Two-octet UTF-8 <= 2047 (Latin-x)...
421
0
      next = *src++;
422
0
      if ((next & 0xc0) != 0x80)
423
0
  return (-1);
424
425
0
      ch32 = (cups_utf32_t)((ch & 0x1f) << 6) | (cups_utf32_t)(next & 0x3f);
426
427
      // Check for non-shortest form (invalid UTF-8)...
428
0
      if (ch32 < 0x80)
429
0
  return (-1);
430
431
0
      *dest++ = ch32;
432
0
    }
433
0
    else if ((ch & 0xf0) == 0xe0)
434
0
    {
435
      // Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
436
0
      next = *src++;
437
0
      if ((next & 0xc0) != 0x80)
438
0
  return (-1);
439
440
0
      ch32 = (cups_utf32_t)((ch & 0x0f) << 6) | (cups_utf32_t)(next & 0x3f);
441
442
0
      next = *src++;
443
0
      if ((next & 0xc0) != 0x80)
444
0
  return (-1);
445
446
0
      ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
447
448
      // Check for non-shortest form (invalid UTF-8)...
449
0
      if (ch32 < 0x800)
450
0
  return (-1);
451
452
0
      *dest++ = ch32;
453
0
    }
454
0
    else if ((ch & 0xf8) == 0xf0)
455
0
    {
456
      // Four-octet UTF-8...
457
0
      next = *src++;
458
0
      if ((next & 0xc0) != 0x80)
459
0
  return (-1);
460
461
0
      ch32 = (cups_utf32_t)((ch & 0x07) << 6) | (cups_utf32_t)(next & 0x3f);
462
463
0
      next = *src++;
464
0
      if ((next & 0xc0) != 0x80)
465
0
  return (-1);
466
467
0
      ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
468
469
0
      next = *src++;
470
0
      if ((next & 0xc0) != 0x80)
471
0
  return (-1);
472
473
0
      ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
474
475
      // Check for non-shortest form (invalid UTF-8)...
476
0
      if (ch32 < 0x10000)
477
0
  return (-1);
478
479
0
      *dest++ = ch32;
480
0
    }
481
0
    else
482
0
    {
483
      // More than 4-octet (invalid UTF-8 sequence)...
484
0
      return (-1);
485
0
    }
486
487
    // Check for UTF-16 surrogate (illegal UTF-8)...
488
0
    if (ch32 >= 0xd800 && ch32 <= 0xdfff)
489
0
      return (-1);
490
0
  }
491
492
0
  *dest = 0;
493
494
0
  return ((ssize_t)(maxout - 1 - i));
495
0
}
496
497
498
//
499
// 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
500
//
501
// This function converts a UTF-32 (32-bit encoding of Unicode) string to a
502
// UTF-8 (8-bit encoding of Unicode) `nul`-terminated C string.
503
//
504
505
ssize_t         // O - Number of bytes or `-1` on error
506
cupsUTF32ToUTF8(
507
    char               *dest,   // O - Target string
508
    const cups_utf32_t *src,    // I - Source string
509
    const size_t       maxout)    // I - Max output in bytes
510
0
{
511
0
  char    *start;     // Start of destination string
512
0
  size_t  i;      // Looping variable
513
0
  int   swap;     // Byte-swap input to output
514
0
  cups_utf32_t  ch;     // Character value
515
516
517
  // Check for valid arguments and clear output...
518
0
  if (dest)
519
0
    *dest = '\0';
520
521
0
  if (!dest || !src || maxout < 1)
522
0
    return (-1);
523
524
  // Check for leading BOM in UTF-32 and inverted BOM...
525
0
  start = dest;
526
0
  swap  = *src == 0xfffe0000;
527
528
0
  if (*src == 0xfffe0000 || *src == 0xfeff)
529
0
    src ++;
530
531
  // Convert input UTF-32 to output UTF-8...
532
0
  for (i = maxout - 1; *src && i > 0;)
533
0
  {
534
0
    ch = *src++;
535
536
    // Byte swap input UTF-32 if necessary (only byte-swapping 24 of 32 bits)
537
0
    if (swap)
538
0
      ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
539
540
    // Check for beyond Plane 16 (invalid UTF-32)...
541
0
    if (ch > 0x10ffff)
542
0
      return (-1);
543
544
    // Convert UTF-32 character to UTF-8 character(s)...
545
0
    if (ch < 0x80)
546
0
    {
547
      // One-octet UTF-8 <= 127 (US-ASCII)...
548
0
      *dest++ = (char)ch;
549
0
      i --;
550
0
    }
551
0
    else if (ch < 0x800)
552
0
    {
553
      // Two-octet UTF-8 <= 2047 (Latin-x)...
554
0
      if (i < 2)
555
0
        return (-1);
556
557
0
      *dest++ = (char)(0xc0 | ((ch >> 6) & 0x1f));
558
0
      *dest++ = (char)(0x80 | (ch & 0x3f));
559
0
      i -= 2;
560
0
    }
561
0
    else if (ch < 0x10000)
562
0
    {
563
      // Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
564
0
      if (i < 3)
565
0
        return (-1);
566
567
0
      *dest++ = (char)(0xe0 | ((ch >> 12) & 0x0f));
568
0
      *dest++ = (char)(0x80 | ((ch >> 6) & 0x3f));
569
0
      *dest++ = (char)(0x80 | (ch & 0x3f));
570
0
      i -= 3;
571
0
    }
572
0
    else
573
0
    {
574
      // Four-octet UTF-8...
575
0
      if (i < 4)
576
0
        return (-1);
577
578
0
      *dest++ = (char)(0xf0 | ((ch >> 18) & 0x07));
579
0
      *dest++ = (char)(0x80 | ((ch >> 12) & 0x3f));
580
0
      *dest++ = (char)(0x80 | ((ch >> 6) & 0x3f));
581
0
      *dest++ = (char)(0x80 | (ch & 0x3f));
582
0
      i -= 4;
583
0
    }
584
0
  }
585
586
0
  *dest = '\0';
587
588
0
  return ((ssize_t)(dest - start));
589
0
}
590
591
592
//
// 'flush_map()' - Flush all character set maps out of cache.
//
// Closes whichever cached iconv descriptors are open and marks the cache as
// holding no character set.  Does nothing when built without iconv support.
//

static void
flush_map(void)
{
#ifdef HAVE_ICONV_H
  // No character set is cached once the descriptors are gone...
  map_encoding = CUPS_ENCODING_AUTO;

  // Release the charset -> UTF-8 descriptor...
  if (map_to_utf8 != (iconv_t)-1)
  {
    iconv_close(map_to_utf8);
    map_to_utf8 = (iconv_t)-1;
  }

  // Release the UTF-8 -> charset descriptor...
  if (map_from_utf8 != (iconv_t)-1)
  {
    iconv_close(map_from_utf8);
    map_from_utf8 = (iconv_t)-1;
  }
#endif // HAVE_ICONV_H
}