Coverage Report

Created: 2025-11-16 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/MapServer/src/maptclutf.c
Line
Count
Source
1
/******************************************************************************
2
 * $Id$
3
 *
4
 * Project:  MapServer
5
 * Purpose:  Implementation of msUTF8ToUniChar()
6
 * Author:   Daniel Morissette, Thomas Bonfort
7
 *
8
 * Note:
9
 * The source code of Tcl_UtfToUniChar() was borrowed from tclUtf.c
10
 * from the Tcl/Tk project.
11
 *
12
 * Website: http://www.tcl.tk/software/tcltk/
13
 * Source download: http://prdownloads.sourceforge.net/tcl/tcl8.4.15-src.tar.gz
14
 *
15
 * See copyright and license terms below the standard MapServer license.
16
 *
17
 ******************************************************************************
18
 * Copyright (c) 1996-2007 Regents of the University of Minnesota.
19
 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
20
 *
21
 * Permission is hereby granted, free of charge, to any person obtaining a
22
 * copy of this software and associated documentation files (the "Software"),
23
 * to deal in the Software without restriction, including without limitation
24
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
25
 * and/or sell copies of the Software, and to permit persons to whom the
26
 * Software is furnished to do so, subject to the following conditions:
27
 *
28
 * The above copyright notice and this permission notice shall be included in
29
 * all copies of this Software or works derived from this Software.
30
 *
31
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
32
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
34
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
36
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
37
 * DEALINGS IN THE SOFTWARE.
38
 ****************************************************************************/
39
40
/*
41
 * tclUtf.c --
42
 *
43
 * Routines for manipulating UTF-8 strings.
44
 *
45
 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
46
 *
47
 * This software is copyrighted by the Regents of the University of
48
 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
49
 * Corporation and other parties.  The following terms apply to all files
50
 * associated with the software unless explicitly disclaimed in
51
 * individual files.
52
 *
53
 * The authors hereby grant permission to use, copy, modify, distribute,
54
 * and license this software and its documentation for any purpose, provided
55
 * that existing copyright notices are retained in all copies and that this
56
 * notice is included verbatim in any distributions. No written agreement,
57
 * license, or royalty fee is required for any of the authorized uses.
58
 * Modifications to this software may be copyrighted by their authors
59
 * and need not follow the licensing terms described here, provided that
60
 * the new terms are clearly indicated on the first page of each file where
61
 * they apply.
62
 *
63
 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
64
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
65
 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
66
 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
67
 * POSSIBILITY OF SUCH DAMAGE.
68
 *
69
 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
70
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
71
 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
72
 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
73
 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
74
 * MODIFICATIONS.
75
 *
76
 * GOVERNMENT USE: If you are acquiring this software on behalf of the
77
 * U.S. government, the Government shall have only "Restricted Rights"
78
 * in the software and related documentation as defined in the Federal
79
 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
80
 * are acquiring the software on behalf of the Department of Defense, the
81
 * software shall be classified as "Commercial Computer Software" and the
82
 * Government shall have only "Restricted Rights" as defined in Clause
83
 * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
84
 * authors grant the U.S. Government and others acting in its behalf
85
 * permission to use and distribute the software in accordance with the
86
 * terms specified in this license.
87
 */
88
89
#include "mapserver.h"
90
91
/* The source code of Tcl_UtfToUniChar() was borrowed from tclUtf.c
92
 * from the Tcl/Tk project:
93
 * Website:
94
 *   http://www.tcl.tk/software/tcltk/
95
 * Source download:
96
 *   http://prdownloads.sourceforge.net/tcl/tcl8.4.15-src.tar.gz
97
 * Original License info follows below.
98
 */
99
100
/*
101
 * tclUtf.c --
102
 *
103
 *  Routines for manipulating UTF-8 strings.
104
 *
105
 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
106
 *
107
 * See the file "license.terms" for information on usage and redistribution of
108
 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
109
 *
110
 * Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp
111
 */
112
113
/******************* Tcl license.terms *********************
114
115
This software is copyrighted by the Regents of the University of
116
California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
117
Corporation and other parties.  The following terms apply to all files
118
associated with the software unless explicitly disclaimed in
119
individual files.
120
121
The authors hereby grant permission to use, copy, modify, distribute,
122
and license this software and its documentation for any purpose, provided
123
that existing copyright notices are retained in all copies and that this
124
notice is included verbatim in any distributions. No written agreement,
125
license, or royalty fee is required for any of the authorized uses.
126
Modifications to this software may be copyrighted by their authors
127
and need not follow the licensing terms described here, provided that
128
the new terms are clearly indicated on the first page of each file where
129
they apply.
130
131
IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
132
FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
133
ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
134
DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
135
POSSIBILITY OF SUCH DAMAGE.
136
137
THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
138
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
139
FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
140
IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
141
NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
142
MODIFICATIONS.
143
144
GOVERNMENT USE: If you are acquiring this software on behalf of the
145
U.S. government, the Government shall have only "Restricted Rights"
146
in the software and related documentation as defined in the Federal
147
Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
148
are acquiring the software on behalf of the Department of Defense, the
149
software shall be classified as "Commercial Computer Software" and the
150
Government shall have only "Restricted Rights" as defined in Clause
151
252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
152
authors grant the U.S. Government and others acting in its behalf
153
permission to use and distribute the software in accordance with the
154
terms specified in this license.
155
156
***********************************************************/
157
158
#define TCL_UTF_MAX 6

/*
 * totalBytes[b] is the length, in bytes, of the UTF-8 sequence that a lead
 * byte with value b announces.  Bytes 0x00-0xBF (ASCII and bare trail
 * bytes) map to 1.  Lead bytes for sequences longer than TCL_UTF_MAX
 * allows are also mapped to 1.
 *
 * Each row below covers sixteen consecutive byte values.
 */

static const unsigned char totalBytes[256] = {
    /* 0x00 - 0x7F: single-byte characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: trail bytes, counted as one byte each */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 - 0xDF: two-byte lead bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte lead bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* 0xF0 - 0xF7 */
#if TCL_UTF_MAX > 3
    4, 4, 4, 4, 4, 4, 4, 4,
#else
    1, 1, 1, 1, 1, 1, 1, 1,
#endif
/* 0xF8 - 0xFB */
#if TCL_UTF_MAX > 4
    5, 5, 5, 5,
#else
    1, 1, 1, 1,
#endif
/* 0xFC - 0xFF */
#if TCL_UTF_MAX > 5
    6, 6, 6, 6
#else
    1, 1, 1, 1
#endif
};
192
193
/*
 *---------------------------------------------------------------------------
 *
 * ms_Tcl_UtfToUniChar --
 *
 *  Decode the character found at the start of a UTF-8 byte string
 *  (adapted from Tcl_UtfToUniChar() in Tcl's tclUtf.c).
 *
 *  Decoding is lenient: whenever the first byte is not a lead byte
 *  followed by the expected number of trail bytes (10xxxxxx), the first
 *  byte alone is returned as the character and one byte is consumed.
 *  This covers '\0', bare trail bytes 0x80-0xBF and truncated sequences.
 *
 *  Trail bytes are inspected one at a time and inspection stops at the
 *  first byte that is not a trail byte, so a '\0' terminated buffer is
 *  never read past its terminator.  For buffers that are not '\0'
 *  terminated the caller must guarantee that enough bytes are available.
 *
 * Results:
 *  *chPtr receives the decoded character; the return value is the
 *  number of bytes of str that were consumed (always at least 1).
 *
 * Side effects:
 *  None.
 *
 *---------------------------------------------------------------------------
 */

int ms_Tcl_UtfToUniChar(register const char *str,
                        register unsigned int *chPtr) {
  const unsigned char *bytes = (const unsigned char *)str;
  const int lead = bytes[0];

  if (lead >= 0xC0 && lead < 0xE0) {
    /* 110xxxxx 10xxxxxx */
    if ((bytes[1] & 0xC0) == 0x80) {
      *chPtr = ((lead & 0x1F) << 6) | (bytes[1] & 0x3F);
      return 2;
    }
  } else if (lead >= 0xE0 && lead < 0xF0) {
    /* 1110xxxx 10xxxxxx 10xxxxxx; bytes[2] is only read when bytes[1]
     * is a trail byte. */
    if ((bytes[1] & 0xC0) == 0x80 && (bytes[2] & 0xC0) == 0x80) {
      *chPtr = ((lead & 0x0F) << 12) | ((bytes[1] & 0x3F) << 6) |
               (bytes[2] & 0x3F);
      return 3;
    }
  }
#if TCL_UTF_MAX > 3
  else if (lead >= 0xF0) {
    /* Longer sequences: length comes from the lookup table. */
    const int total = totalBytes[lead];

    if (total > 1) {
      /* Payload bits of the lead byte shrink as the sequence grows. */
      int value = lead & (0x3F >> (total - 1));
      int pos = 1;

      while (pos < total && (bytes[pos] & 0xC0) == 0x80) {
        value = (value << 6) | (bytes[pos] & 0x3F);
        pos++;
      }
      if (pos == total) {
        *chPtr = value;
        return total;
      }
      /* A non-trail byte interrupted the sequence: fall through. */
    }
  }
#endif

  /*
   * Everything else (bytes below 0xC0, and lead bytes not followed by
   * the required trail bytes) represents itself.
   */
  *chPtr = lead;
  return 1;
}
298
299
/* msUTF8ToUniChar()
 *
 *  Extract the Unicode character found at the start of str.
 *
 *  When str starts with '&', msGetUnicodeEntity() is given the first
 *  chance to interpret it as an HTML entity (e.g. &#123; or &#x12a;).
 *  A positive return from it is passed on as the number of bytes
 *  consumed.  In every other case the bytes are decoded as UTF-8 by
 *  ms_Tcl_UtfToUniChar(), which converts bad UTF-8 sequences to valid
 *  Unicode characters so that processing can continue.
 *
 *  The caller must ensure that the source buffer is long enough that
 *  decoding does not run off its end while looking for trail bytes.
 *  A '\0' terminated buffer satisfies this.
 *
 * Results:
 *  *chPtr is filled with the Unicode character value, and the return
 *  value is the number of bytes of str that were consumed.
 */
int msUTF8ToUniChar(const char *str,     /* The UTF-8 string. */
                    unsigned int *chPtr) /* Filled with the Unicode Char
                                          * represented by the UTF-8 string. */
{
  if (*str == '&') {
    /* Possible HTML entity: a positive length means it was recognised. */
    const int entityLength = msGetUnicodeEntity(str, chPtr);
    if (entityLength > 0) {
      return entityLength;
    }
  }

  /* Not an entity: decode as plain UTF-8. */
  return ms_Tcl_UtfToUniChar(str, chPtr);
}