/src/MapServer/src/maptclutf.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * $Id$ |
3 | | * |
4 | | * Project: MapServer |
5 | | * Purpose: Implementation of msUTF8ToUniChar() |
6 | | * Author: Daniel Morissette, Thomas Bonfort |
7 | | * |
8 | | * Note: |
9 | | * The source code of Tcl_UtfToUniChar() was borrowed from tclUtf.c |
10 | | * from the Tcl/Tk project. |
11 | | * |
12 | | * Website: http://www.tcl.tk/software/tcltk/ |
13 | | * Source download: http://prdownloads.sourceforge.net/tcl/tcl8.4.15-src.tar.gz |
14 | | * |
15 | | * See copyright and license terms below the standard MapServer license. |
16 | | * |
17 | | ****************************************************************************** |
18 | | * Copyright (c) 1996-2007 Regents of the University of Minnesota. |
19 | | * Copyright (c) 1997-1998 Sun Microsystems, Inc. |
20 | | * |
21 | | * Permission is hereby granted, free of charge, to any person obtaining a |
22 | | * copy of this software and associated documentation files (the "Software"), |
23 | | * to deal in the Software without restriction, including without limitation |
24 | | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
25 | | * and/or sell copies of the Software, and to permit persons to whom the |
26 | | * Software is furnished to do so, subject to the following conditions: |
27 | | * |
28 | | * The above copyright notice and this permission notice shall be included in |
29 | | * all copies of this Software or works derived from this Software. |
30 | | * |
31 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
32 | | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
33 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
34 | | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
35 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
36 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
37 | | * DEALINGS IN THE SOFTWARE. |
38 | | ****************************************************************************/ |
39 | | |
40 | | /* |
41 | | * tclUtf.c -- |
42 | | * |
43 | | * Routines for manipulating UTF-8 strings. |
44 | | * |
45 | | * Copyright (c) 1997-1998 Sun Microsystems, Inc. |
46 | | * |
47 | | * This software is copyrighted by the Regents of the University of |
48 | | * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState |
49 | | * Corporation and other parties. The following terms apply to all files |
50 | | * associated with the software unless explicitly disclaimed in |
51 | | * individual files. |
52 | | * |
53 | | * The authors hereby grant permission to use, copy, modify, distribute, |
54 | | * and license this software and its documentation for any purpose, provided |
55 | | * that existing copyright notices are retained in all copies and that this |
56 | | * notice is included verbatim in any distributions. No written agreement, |
57 | | * license, or royalty fee is required for any of the authorized uses. |
58 | | * Modifications to this software may be copyrighted by their authors |
59 | | * and need not follow the licensing terms described here, provided that |
60 | | * the new terms are clearly indicated on the first page of each file where |
61 | | * they apply. |
62 | | * |
63 | | * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY |
64 | | * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
65 | | * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY |
66 | | * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE |
67 | | * POSSIBILITY OF SUCH DAMAGE. |
68 | | * |
69 | | * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, |
70 | | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, |
71 | | * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE |
72 | | * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE |
73 | | * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR |
74 | | * MODIFICATIONS. |
75 | | * |
76 | | * GOVERNMENT USE: If you are acquiring this software on behalf of the |
77 | | * U.S. government, the Government shall have only "Restricted Rights" |
78 | | * in the software and related documentation as defined in the Federal |
79 | | * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you |
80 | | * are acquiring the software on behalf of the Department of Defense, the |
81 | | * software shall be classified as "Commercial Computer Software" and the |
82 | | * Government shall have only "Restricted Rights" as defined in Clause |
83 | | * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the |
84 | | * authors grant the U.S. Government and others acting in its behalf |
85 | | * permission to use and distribute the software in accordance with the |
86 | | * terms specified in this license. |
87 | | */ |
88 | | |
89 | | #include "mapserver.h" |
90 | | |
91 | | /* The source code of Tcl_UtfToUniChar() was borrowed from tclUtf.c |
92 | | * from the Tcl/Tk project: |
93 | | * Website: |
94 | | * http://www.tcl.tk/software/tcltk/ |
95 | | * Source download: |
96 | | * http://prdownloads.sourceforge.net/tcl/tcl8.4.15-src.tar.gz |
97 | | * Original License info follows below. |
98 | | */ |
99 | | |
100 | | /* |
101 | | * tclUtf.c -- |
102 | | * |
103 | | * Routines for manipulating UTF-8 strings. |
104 | | * |
105 | | * Copyright (c) 1997-1998 Sun Microsystems, Inc. |
106 | | * |
107 | | * See the file "license.terms" for information on usage and redistribution of |
108 | | * this file, and for a DISCLAIMER OF ALL WARRANTIES. |
109 | | * |
110 | | * Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp |
111 | | */ |
112 | | |
113 | | /******************* Tcl license.terms ********************* |
114 | | |
115 | | This software is copyrighted by the Regents of the University of |
116 | | California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState |
117 | | Corporation and other parties. The following terms apply to all files |
118 | | associated with the software unless explicitly disclaimed in |
119 | | individual files. |
120 | | |
121 | | The authors hereby grant permission to use, copy, modify, distribute, |
122 | | and license this software and its documentation for any purpose, provided |
123 | | that existing copyright notices are retained in all copies and that this |
124 | | notice is included verbatim in any distributions. No written agreement, |
125 | | license, or royalty fee is required for any of the authorized uses. |
126 | | Modifications to this software may be copyrighted by their authors |
127 | | and need not follow the licensing terms described here, provided that |
128 | | the new terms are clearly indicated on the first page of each file where |
129 | | they apply. |
130 | | |
131 | | IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY |
132 | | FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
133 | | ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY |
134 | | DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE |
135 | | POSSIBILITY OF SUCH DAMAGE. |
136 | | |
137 | | THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, |
138 | | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, |
139 | | FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE |
140 | | IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE |
141 | | NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR |
142 | | MODIFICATIONS. |
143 | | |
144 | | GOVERNMENT USE: If you are acquiring this software on behalf of the |
145 | | U.S. government, the Government shall have only "Restricted Rights" |
146 | | in the software and related documentation as defined in the Federal |
147 | | Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you |
148 | | are acquiring the software on behalf of the Department of Defense, the |
149 | | software shall be classified as "Commercial Computer Software" and the |
150 | | Government shall have only "Restricted Rights" as defined in Clause |
151 | | 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the |
152 | | authors grant the U.S. Government and others acting in its behalf |
153 | | permission to use and distribute the software in accordance with the |
154 | | terms specified in this license. |
155 | | |
156 | | ***********************************************************/ |
157 | | |
#define TCL_UTF_MAX 6

/*
 * totalBytes[b] is the number of bytes in the UTF-8 sequence introduced by
 * lead byte b.  Bytes that cannot start a multi-byte sequence (ASCII and
 * naked trail bytes) map to 1.  Sequence lengths above TCL_UTF_MAX are
 * compiled out and their lead bytes also map to 1.
 */

static const unsigned char totalBytes[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: trail bytes, counted as one byte on their own */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 - 0xDF: two-byte lead bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte lead bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* 0xF0 - 0xF7: four-byte lead bytes */
#if TCL_UTF_MAX > 3
    4, 4, 4, 4, 4, 4, 4, 4,
#else
    1, 1, 1, 1, 1, 1, 1, 1,
#endif
/* 0xF8 - 0xFB: five-byte lead bytes */
#if TCL_UTF_MAX > 4
    5, 5, 5, 5,
#else
    1, 1, 1, 1,
#endif
/* 0xFC - 0xFF: six-byte lead bytes */
#if TCL_UTF_MAX > 5
    6, 6, 6, 6
#else
    1, 1, 1, 1
#endif
};
192 | | |
/*
 *---------------------------------------------------------------------------
 *
 * ms_Tcl_UtfToUniChar --
 *
 *      Decode the first character of a UTF-8 string (adapted from Tcl's
 *      Tcl_UtfToUniChar()).  A malformed sequence is not an error: the
 *      offending lead byte is returned as its own value and exactly one
 *      byte is consumed, so the caller can keep scanning.
 *
 *      Up to two bytes after the lead byte are examined for 2- and 3-byte
 *      sequences, and up to (length - 1) bytes for longer ones.  Scanning
 *      stops at the first byte that is not a trail byte, so a '\0'
 *      terminated buffer is never overrun; for any other buffer the caller
 *      must guarantee that enough bytes remain.
 *
 * Results:
 *      *chPtr receives the decoded value; the return value is the number
 *      of bytes of str that were consumed (always at least 1).
 *
 * Side effects:
 *      None.
 *
 *---------------------------------------------------------------------------
 */

int ms_Tcl_UtfToUniChar(register const char *str,
                        register unsigned int *chPtr) {
  /* Work on unsigned bytes so that values >= 0x80 compare as expected. */
  const unsigned char *src = (const unsigned char *)str;
  const int lead = src[0];

  /*
   * 0x00 - 0xBF: ASCII, '\0' and naked trail bytes all stand for
   * themselves.
   */
  if (lead < 0xC0) {
    *chPtr = lead;
    return 1;
  }

  /* 0xC0 - 0xDF: two-byte sequence, needs one trail byte. */
  if (lead < 0xE0) {
    if ((src[1] & 0xC0) != 0x80) {
      /* Lead byte without its trail byte represents itself. */
      *chPtr = lead;
      return 1;
    }
    *chPtr = ((lead & 0x1F) << 6) | (src[1] & 0x3F);
    return 2;
  }

  /* 0xE0 - 0xEF: three-byte sequence, needs two trail bytes. */
  if (lead < 0xF0) {
    if ((src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80) {
      /* Lead byte without both trail bytes represents itself. */
      *chPtr = lead;
      return 1;
    }
    *chPtr = ((lead & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F);
    return 3;
  }

#if TCL_UTF_MAX > 3
  {
    /* 0xF0 and above: length comes from the lookup table. */
    const int total = totalBytes[lead];

    if (total > 1) {
      /* Payload bits of the lead byte shrink as the sequence grows. */
      int value = lead & (0x3F >> (total - 1));
      int i;

      for (i = 1; i < total; i++) {
        if ((src[i] & 0xC0) != 0x80) {
          /* Truncated sequence: the lead byte represents itself. */
          *chPtr = lead;
          return 1;
        }
        value = (value << 6) | (src[i] & 0x3F);
      }
      *chPtr = value;
      return total;
    }
  }
#endif

  /* Lead byte whose sequence length is not supported in this build. */
  *chPtr = lead;
  return 1;
}
298 | | |
/* msUTF8ToUniChar()
 *
 * Extract the Unicode character at the start of a UTF-8 string.
 *
 * A string that starts with '&' is first offered to msGetUnicodeEntity();
 * when that call reports a positive length, that length is returned as-is.
 * In every other case the bytes are decoded by ms_Tcl_UtfToUniChar(), which
 * turns bad UTF-8 sequences into valid Unicode values so that processing
 * can continue.
 *
 * The caller must ensure that the source buffer is long enough that
 * decoding does not run off its end while looking for trail bytes.  This
 * cannot happen when the buffer is '\0' terminated.
 *
 * Results:
 *   *chPtr is filled with the Unicode character value, and the return value
 *   is the number of bytes of str that were consumed.
 */
int msUTF8ToUniChar(const char *str,     /* The UTF-8 string. */
                    unsigned int *chPtr) /* Filled with the Unicode Char
                                          * represented by the UTF-8 string. */
{
  /* Check if the string is an HTML entity (e.g. &#123; or &#x12A;). */
  if (*str == '&') {
    const int entityLength = msGetUnicodeEntity(str, chPtr);
    if (entityLength > 0) {
      return entityLength;
    }
  }

  /* Not an entity (or not a recognised one): decode as plain UTF-8. */
  return ms_Tcl_UtfToUniChar(str, chPtr);
}