/src/postgres/src/backend/utils/mb/conv.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * Utility functions for conversion procs. |
4 | | * |
5 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
6 | | * Portions Copyright (c) 1994, Regents of the University of California |
7 | | * |
8 | | * IDENTIFICATION |
9 | | * src/backend/utils/mb/conv.c |
10 | | * |
11 | | *------------------------------------------------------------------------- |
12 | | */ |
13 | | #include "postgres.h" |
14 | | #include "mb/pg_wchar.h" |
15 | | |
16 | | |
17 | | /* |
18 | | * local2local: a generic single byte charset encoding |
19 | | * conversion between two ASCII-superset encodings. |
20 | | * |
21 | | * l points to the source string of length len |
22 | | * p is the output area (must be large enough!) |
23 | | * src_encoding is the PG identifier for the source encoding |
24 | | * dest_encoding is the PG identifier for the target encoding |
25 | | * tab holds conversion entries for the source charset |
26 | | * starting from 128 (0x80). each entry in the table holds the corresponding |
27 | | * code point for the target charset, or 0 if there is no equivalent code. |
28 | | * |
29 | | * Returns the number of input bytes consumed. If noError is true, this can |
30 | | * be less than 'len'. |
31 | | */ |
32 | | int |
33 | | local2local(const unsigned char *l, |
34 | | unsigned char *p, |
35 | | int len, |
36 | | int src_encoding, |
37 | | int dest_encoding, |
38 | | const unsigned char *tab, |
39 | | bool noError) |
40 | 0 | { |
41 | 0 | const unsigned char *start = l; |
42 | 0 | unsigned char c1, |
43 | 0 | c2; |
44 | |
|
45 | 0 | while (len > 0) |
46 | 0 | { |
47 | 0 | c1 = *l; |
48 | 0 | if (c1 == 0) |
49 | 0 | { |
50 | 0 | if (noError) |
51 | 0 | break; |
52 | 0 | report_invalid_encoding(src_encoding, (const char *) l, len); |
53 | 0 | } |
54 | 0 | if (!IS_HIGHBIT_SET(c1)) |
55 | 0 | *p++ = c1; |
56 | 0 | else |
57 | 0 | { |
58 | 0 | c2 = tab[c1 - HIGHBIT]; |
59 | 0 | if (c2) |
60 | 0 | *p++ = c2; |
61 | 0 | else |
62 | 0 | { |
63 | 0 | if (noError) |
64 | 0 | break; |
65 | 0 | report_untranslatable_char(src_encoding, dest_encoding, |
66 | 0 | (const char *) l, len); |
67 | 0 | } |
68 | 0 | } |
69 | 0 | l++; |
70 | 0 | len--; |
71 | 0 | } |
72 | 0 | *p = '\0'; |
73 | |
|
74 | 0 | return l - start; |
75 | 0 | } |
76 | | |
/*
 * LATINn ---> MIC, for charsets whose local codes map directly to MIC.
 *
 * l			source string, 'len' bytes long
 * p			output area; the caller must make it large enough
 * lc			mule character set id for the local encoding
 * encoding		PG identifier for the local encoding
 *
 * Every input byte is copied to the output; a byte with the high bit set
 * is preceded by 'lc'.  A zero byte in the input is reported as invalid
 * encoding, or, when noError is true, ends the conversion at that byte.
 *
 * The output is always null-terminated.  Returns the number of input bytes
 * consumed; if noError is true, this can be less than 'len'.
 */
int
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding, bool noError)
{
	const unsigned char *const begin = l;

	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
		{
			if (noError)
				break;
			report_invalid_encoding(encoding, (const char *) l, len);
		}

		/* non-ASCII bytes get the charset id as a leading byte */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';

	return l - begin;
}
114 | | |
115 | | /* |
116 | | * MIC ---> LATINn when the charset's local codes map directly to MIC |
117 | | * |
118 | | * mic points to the source string of length len |
119 | | * p is the output area (must be large enough!) |
120 | | * lc is the mule character set id for the local encoding |
121 | | * encoding is the PG identifier for the local encoding |
122 | | * |
123 | | * Returns the number of input bytes consumed. If noError is true, this can |
124 | | * be less than 'len'. |
125 | | */ |
126 | | int |
127 | | mic2latin(const unsigned char *mic, unsigned char *p, int len, |
128 | | int lc, int encoding, bool noError) |
129 | 0 | { |
130 | 0 | const unsigned char *start = mic; |
131 | 0 | int c1; |
132 | |
|
133 | 0 | while (len > 0) |
134 | 0 | { |
135 | 0 | c1 = *mic; |
136 | 0 | if (c1 == 0) |
137 | 0 | { |
138 | 0 | if (noError) |
139 | 0 | break; |
140 | 0 | report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); |
141 | 0 | } |
142 | 0 | if (!IS_HIGHBIT_SET(c1)) |
143 | 0 | { |
144 | | /* easy for ASCII */ |
145 | 0 | *p++ = c1; |
146 | 0 | mic++; |
147 | 0 | len--; |
148 | 0 | } |
149 | 0 | else |
150 | 0 | { |
151 | 0 | int l = pg_mule_mblen(mic); |
152 | |
|
153 | 0 | if (len < l) |
154 | 0 | { |
155 | 0 | if (noError) |
156 | 0 | break; |
157 | 0 | report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, |
158 | 0 | len); |
159 | 0 | } |
160 | 0 | if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1])) |
161 | 0 | { |
162 | 0 | if (noError) |
163 | 0 | break; |
164 | 0 | report_untranslatable_char(PG_MULE_INTERNAL, encoding, |
165 | 0 | (const char *) mic, len); |
166 | 0 | } |
167 | 0 | *p++ = mic[1]; |
168 | 0 | mic += 2; |
169 | 0 | len -= 2; |
170 | 0 | } |
171 | 0 | } |
172 | 0 | *p = '\0'; |
173 | |
|
174 | 0 | return mic - start; |
175 | 0 | } |
176 | | |
177 | | |
178 | | /* |
179 | | * latin2mic_with_table: a generic single byte charset encoding |
180 | | * conversion from a local charset to the mule internal code. |
181 | | * |
182 | | * l points to the source string of length len |
183 | | * p is the output area (must be large enough!) |
184 | | * lc is the mule character set id for the local encoding |
185 | | * encoding is the PG identifier for the local encoding |
186 | | * tab holds conversion entries for the local charset |
187 | | * starting from 128 (0x80). each entry in the table holds the corresponding |
188 | | * code point for the mule encoding, or 0 if there is no equivalent code. |
189 | | * |
190 | | * Returns the number of input bytes consumed. If noError is true, this can |
191 | | * be less than 'len'. |
192 | | */ |
193 | | int |
194 | | latin2mic_with_table(const unsigned char *l, |
195 | | unsigned char *p, |
196 | | int len, |
197 | | int lc, |
198 | | int encoding, |
199 | | const unsigned char *tab, |
200 | | bool noError) |
201 | 0 | { |
202 | 0 | const unsigned char *start = l; |
203 | 0 | unsigned char c1, |
204 | 0 | c2; |
205 | |
|
206 | 0 | while (len > 0) |
207 | 0 | { |
208 | 0 | c1 = *l; |
209 | 0 | if (c1 == 0) |
210 | 0 | { |
211 | 0 | if (noError) |
212 | 0 | break; |
213 | 0 | report_invalid_encoding(encoding, (const char *) l, len); |
214 | 0 | } |
215 | 0 | if (!IS_HIGHBIT_SET(c1)) |
216 | 0 | *p++ = c1; |
217 | 0 | else |
218 | 0 | { |
219 | 0 | c2 = tab[c1 - HIGHBIT]; |
220 | 0 | if (c2) |
221 | 0 | { |
222 | 0 | *p++ = lc; |
223 | 0 | *p++ = c2; |
224 | 0 | } |
225 | 0 | else |
226 | 0 | { |
227 | 0 | if (noError) |
228 | 0 | break; |
229 | 0 | report_untranslatable_char(encoding, PG_MULE_INTERNAL, |
230 | 0 | (const char *) l, len); |
231 | 0 | } |
232 | 0 | } |
233 | 0 | l++; |
234 | 0 | len--; |
235 | 0 | } |
236 | 0 | *p = '\0'; |
237 | |
|
238 | 0 | return l - start; |
239 | 0 | } |
240 | | |
241 | | /* |
242 | | * mic2latin_with_table: a generic single byte charset encoding |
243 | | * conversion from the mule internal code to a local charset. |
244 | | * |
245 | | * mic points to the source string of length len |
246 | | * p is the output area (must be large enough!) |
247 | | * lc is the mule character set id for the local encoding |
248 | | * encoding is the PG identifier for the local encoding |
249 | | * tab holds conversion entries for the mule internal code's second byte, |
250 | | * starting from 128 (0x80). each entry in the table holds the corresponding |
251 | | * code point for the local charset, or 0 if there is no equivalent code. |
252 | | * |
253 | | * Returns the number of input bytes consumed. If noError is true, this can |
254 | | * be less than 'len'. |
255 | | */ |
256 | | int |
257 | | mic2latin_with_table(const unsigned char *mic, |
258 | | unsigned char *p, |
259 | | int len, |
260 | | int lc, |
261 | | int encoding, |
262 | | const unsigned char *tab, |
263 | | bool noError) |
264 | 0 | { |
265 | 0 | const unsigned char *start = mic; |
266 | 0 | unsigned char c1, |
267 | 0 | c2; |
268 | |
|
269 | 0 | while (len > 0) |
270 | 0 | { |
271 | 0 | c1 = *mic; |
272 | 0 | if (c1 == 0) |
273 | 0 | { |
274 | 0 | if (noError) |
275 | 0 | break; |
276 | 0 | report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); |
277 | 0 | } |
278 | 0 | if (!IS_HIGHBIT_SET(c1)) |
279 | 0 | { |
280 | | /* easy for ASCII */ |
281 | 0 | *p++ = c1; |
282 | 0 | mic++; |
283 | 0 | len--; |
284 | 0 | } |
285 | 0 | else |
286 | 0 | { |
287 | 0 | int l = pg_mule_mblen(mic); |
288 | |
|
289 | 0 | if (len < l) |
290 | 0 | { |
291 | 0 | if (noError) |
292 | 0 | break; |
293 | 0 | report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, |
294 | 0 | len); |
295 | 0 | } |
296 | 0 | if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) || |
297 | 0 | (c2 = tab[mic[1] - HIGHBIT]) == 0) |
298 | 0 | { |
299 | 0 | if (noError) |
300 | 0 | break; |
301 | 0 | report_untranslatable_char(PG_MULE_INTERNAL, encoding, |
302 | 0 | (const char *) mic, len); |
303 | 0 | break; /* keep compiler quiet */ |
304 | 0 | } |
305 | 0 | *p++ = c2; |
306 | 0 | mic += 2; |
307 | 0 | len -= 2; |
308 | 0 | } |
309 | 0 | } |
310 | 0 | *p = '\0'; |
311 | |
|
312 | 0 | return mic - start; |
313 | 0 | } |
314 | | |
315 | | /* |
316 | | * comparison routine for bsearch() |
317 | | * this routine is intended for combined UTF8 -> local code |
318 | | */ |
319 | | static int |
320 | | compare3(const void *p1, const void *p2) |
321 | 0 | { |
322 | 0 | uint32 s1, |
323 | 0 | s2, |
324 | 0 | d1, |
325 | 0 | d2; |
326 | |
|
327 | 0 | s1 = *(const uint32 *) p1; |
328 | 0 | s2 = *((const uint32 *) p1 + 1); |
329 | 0 | d1 = ((const pg_utf_to_local_combined *) p2)->utf1; |
330 | 0 | d2 = ((const pg_utf_to_local_combined *) p2)->utf2; |
331 | 0 | return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1); |
332 | 0 | } |
333 | | |
334 | | /* |
335 | | * comparison routine for bsearch() |
336 | | * this routine is intended for local code -> combined UTF8 |
337 | | */ |
338 | | static int |
339 | | compare4(const void *p1, const void *p2) |
340 | 0 | { |
341 | 0 | uint32 v1, |
342 | 0 | v2; |
343 | |
|
344 | 0 | v1 = *(const uint32 *) p1; |
345 | 0 | v2 = ((const pg_local_to_utf_combined *) p2)->code; |
346 | 0 | return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); |
347 | 0 | } |
348 | | |
349 | | /* |
350 | | * store 32bit character representation into multibyte stream |
351 | | */ |
352 | | static inline unsigned char * |
353 | | store_coded_char(unsigned char *dest, uint32 code) |
354 | 0 | { |
355 | 0 | if (code & 0xff000000) |
356 | 0 | *dest++ = code >> 24; |
357 | 0 | if (code & 0x00ff0000) |
358 | 0 | *dest++ = code >> 16; |
359 | 0 | if (code & 0x0000ff00) |
360 | 0 | *dest++ = code >> 8; |
361 | 0 | if (code & 0x000000ff) |
362 | 0 | *dest++ = code; |
363 | 0 | return dest; |
364 | 0 | } |
365 | | |
366 | | /* |
367 | | * Convert a character using a conversion radix tree. |
368 | | * |
369 | | * 'l' is the length of the input character in bytes, and b1-b4 are |
370 | | * the input character's bytes. |
371 | | */ |
372 | | static inline uint32 |
373 | | pg_mb_radix_conv(const pg_mb_radix_tree *rt, |
374 | | int l, |
375 | | unsigned char b1, |
376 | | unsigned char b2, |
377 | | unsigned char b3, |
378 | | unsigned char b4) |
379 | 0 | { |
380 | 0 | if (l == 4) |
381 | 0 | { |
382 | | /* 4-byte code */ |
383 | | |
384 | | /* check code validity */ |
385 | 0 | if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper || |
386 | 0 | b2 < rt->b4_2_lower || b2 > rt->b4_2_upper || |
387 | 0 | b3 < rt->b4_3_lower || b3 > rt->b4_3_upper || |
388 | 0 | b4 < rt->b4_4_lower || b4 > rt->b4_4_upper) |
389 | 0 | return 0; |
390 | | |
391 | | /* perform lookup */ |
392 | 0 | if (rt->chars32) |
393 | 0 | { |
394 | 0 | uint32 idx = rt->b4root; |
395 | |
|
396 | 0 | idx = rt->chars32[b1 + idx - rt->b4_1_lower]; |
397 | 0 | idx = rt->chars32[b2 + idx - rt->b4_2_lower]; |
398 | 0 | idx = rt->chars32[b3 + idx - rt->b4_3_lower]; |
399 | 0 | return rt->chars32[b4 + idx - rt->b4_4_lower]; |
400 | 0 | } |
401 | 0 | else |
402 | 0 | { |
403 | 0 | uint16 idx = rt->b4root; |
404 | |
|
405 | 0 | idx = rt->chars16[b1 + idx - rt->b4_1_lower]; |
406 | 0 | idx = rt->chars16[b2 + idx - rt->b4_2_lower]; |
407 | 0 | idx = rt->chars16[b3 + idx - rt->b4_3_lower]; |
408 | 0 | return rt->chars16[b4 + idx - rt->b4_4_lower]; |
409 | 0 | } |
410 | 0 | } |
411 | 0 | else if (l == 3) |
412 | 0 | { |
413 | | /* 3-byte code */ |
414 | | |
415 | | /* check code validity */ |
416 | 0 | if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper || |
417 | 0 | b3 < rt->b3_2_lower || b3 > rt->b3_2_upper || |
418 | 0 | b4 < rt->b3_3_lower || b4 > rt->b3_3_upper) |
419 | 0 | return 0; |
420 | | |
421 | | /* perform lookup */ |
422 | 0 | if (rt->chars32) |
423 | 0 | { |
424 | 0 | uint32 idx = rt->b3root; |
425 | |
|
426 | 0 | idx = rt->chars32[b2 + idx - rt->b3_1_lower]; |
427 | 0 | idx = rt->chars32[b3 + idx - rt->b3_2_lower]; |
428 | 0 | return rt->chars32[b4 + idx - rt->b3_3_lower]; |
429 | 0 | } |
430 | 0 | else |
431 | 0 | { |
432 | 0 | uint16 idx = rt->b3root; |
433 | |
|
434 | 0 | idx = rt->chars16[b2 + idx - rt->b3_1_lower]; |
435 | 0 | idx = rt->chars16[b3 + idx - rt->b3_2_lower]; |
436 | 0 | return rt->chars16[b4 + idx - rt->b3_3_lower]; |
437 | 0 | } |
438 | 0 | } |
439 | 0 | else if (l == 2) |
440 | 0 | { |
441 | | /* 2-byte code */ |
442 | | |
443 | | /* check code validity - first byte */ |
444 | 0 | if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper || |
445 | 0 | b4 < rt->b2_2_lower || b4 > rt->b2_2_upper) |
446 | 0 | return 0; |
447 | | |
448 | | /* perform lookup */ |
449 | 0 | if (rt->chars32) |
450 | 0 | { |
451 | 0 | uint32 idx = rt->b2root; |
452 | |
|
453 | 0 | idx = rt->chars32[b3 + idx - rt->b2_1_lower]; |
454 | 0 | return rt->chars32[b4 + idx - rt->b2_2_lower]; |
455 | 0 | } |
456 | 0 | else |
457 | 0 | { |
458 | 0 | uint16 idx = rt->b2root; |
459 | |
|
460 | 0 | idx = rt->chars16[b3 + idx - rt->b2_1_lower]; |
461 | 0 | return rt->chars16[b4 + idx - rt->b2_2_lower]; |
462 | 0 | } |
463 | 0 | } |
464 | 0 | else if (l == 1) |
465 | 0 | { |
466 | | /* 1-byte code */ |
467 | | |
468 | | /* check code validity - first byte */ |
469 | 0 | if (b4 < rt->b1_lower || b4 > rt->b1_upper) |
470 | 0 | return 0; |
471 | | |
472 | | /* perform lookup */ |
473 | 0 | if (rt->chars32) |
474 | 0 | return rt->chars32[b4 + rt->b1root - rt->b1_lower]; |
475 | 0 | else |
476 | 0 | return rt->chars16[b4 + rt->b1root - rt->b1_lower]; |
477 | 0 | } |
478 | 0 | return 0; /* shouldn't happen */ |
479 | 0 | } |
480 | | |
481 | | /* |
482 | | * UTF8 ---> local code |
483 | | * |
484 | | * utf: input string in UTF8 encoding (need not be null-terminated) |
485 | | * len: length of input string (in bytes) |
486 | | * iso: pointer to the output area (must be large enough!) |
487 | | (output string will be null-terminated) |
488 | | * map: conversion map for single characters |
489 | | * cmap: conversion map for combined characters |
490 | | * (optional, pass NULL if none) |
491 | | * cmapsize: number of entries in the conversion map for combined characters |
492 | | * (optional, pass 0 if none) |
493 | | * conv_func: algorithmic encoding conversion function |
494 | | * (optional, pass NULL if none) |
495 | | * encoding: PG identifier for the local encoding |
496 | | * |
497 | | * For each character, the cmap (if provided) is consulted first; if no match, |
498 | | * the map is consulted next; if still no match, the conv_func (if provided) |
499 | | * is applied. An error is raised if no match is found. |
500 | | * |
501 | | * See pg_wchar.h for more details about the data structures used here. |
502 | | * |
503 | | * Returns the number of input bytes consumed. If noError is true, this can |
504 | | * be less than 'len'. |
505 | | */ |
506 | | int |
507 | | UtfToLocal(const unsigned char *utf, int len, |
508 | | unsigned char *iso, |
509 | | const pg_mb_radix_tree *map, |
510 | | const pg_utf_to_local_combined *cmap, int cmapsize, |
511 | | utf_local_conversion_func conv_func, |
512 | | int encoding, bool noError) |
513 | 0 | { |
514 | 0 | uint32 iutf; |
515 | 0 | int l; |
516 | 0 | const pg_utf_to_local_combined *cp; |
517 | 0 | const unsigned char *start = utf; |
518 | |
|
519 | 0 | if (!PG_VALID_ENCODING(encoding)) |
520 | 0 | ereport(ERROR, |
521 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
522 | 0 | errmsg("invalid encoding number: %d", encoding))); |
523 | | |
524 | 0 | for (; len > 0; len -= l) |
525 | 0 | { |
526 | 0 | unsigned char b1 = 0; |
527 | 0 | unsigned char b2 = 0; |
528 | 0 | unsigned char b3 = 0; |
529 | 0 | unsigned char b4 = 0; |
530 | | |
531 | | /* "break" cases all represent errors */ |
532 | 0 | if (*utf == '\0') |
533 | 0 | break; |
534 | | |
535 | 0 | l = pg_utf_mblen(utf); |
536 | 0 | if (len < l) |
537 | 0 | break; |
538 | | |
539 | 0 | if (!pg_utf8_islegal(utf, l)) |
540 | 0 | break; |
541 | | |
542 | 0 | if (l == 1) |
543 | 0 | { |
544 | | /* ASCII case is easy, assume it's one-to-one conversion */ |
545 | 0 | *iso++ = *utf++; |
546 | 0 | continue; |
547 | 0 | } |
548 | | |
549 | | /* collect coded char of length l */ |
550 | 0 | if (l == 2) |
551 | 0 | { |
552 | 0 | b3 = *utf++; |
553 | 0 | b4 = *utf++; |
554 | 0 | } |
555 | 0 | else if (l == 3) |
556 | 0 | { |
557 | 0 | b2 = *utf++; |
558 | 0 | b3 = *utf++; |
559 | 0 | b4 = *utf++; |
560 | 0 | } |
561 | 0 | else if (l == 4) |
562 | 0 | { |
563 | 0 | b1 = *utf++; |
564 | 0 | b2 = *utf++; |
565 | 0 | b3 = *utf++; |
566 | 0 | b4 = *utf++; |
567 | 0 | } |
568 | 0 | else |
569 | 0 | { |
570 | 0 | elog(ERROR, "unsupported character length %d", l); |
571 | 0 | iutf = 0; /* keep compiler quiet */ |
572 | 0 | } |
573 | 0 | iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4); |
574 | | |
575 | | /* First, try with combined map if possible */ |
576 | 0 | if (cmap && len > l) |
577 | 0 | { |
578 | 0 | const unsigned char *utf_save = utf; |
579 | 0 | int len_save = len; |
580 | 0 | int l_save = l; |
581 | | |
582 | | /* collect next character, same as above */ |
583 | 0 | len -= l; |
584 | |
|
585 | 0 | l = pg_utf_mblen(utf); |
586 | 0 | if (len < l) |
587 | 0 | { |
588 | | /* need more data to decide if this is a combined char */ |
589 | 0 | utf -= l_save; |
590 | 0 | break; |
591 | 0 | } |
592 | | |
593 | 0 | if (!pg_utf8_islegal(utf, l)) |
594 | 0 | { |
595 | 0 | if (!noError) |
596 | 0 | report_invalid_encoding(PG_UTF8, (const char *) utf, len); |
597 | 0 | utf -= l_save; |
598 | 0 | break; |
599 | 0 | } |
600 | | |
601 | | /* We assume ASCII character cannot be in combined map */ |
602 | 0 | if (l > 1) |
603 | 0 | { |
604 | 0 | uint32 iutf2; |
605 | 0 | uint32 cutf[2]; |
606 | |
|
607 | 0 | if (l == 2) |
608 | 0 | { |
609 | 0 | iutf2 = *utf++ << 8; |
610 | 0 | iutf2 |= *utf++; |
611 | 0 | } |
612 | 0 | else if (l == 3) |
613 | 0 | { |
614 | 0 | iutf2 = *utf++ << 16; |
615 | 0 | iutf2 |= *utf++ << 8; |
616 | 0 | iutf2 |= *utf++; |
617 | 0 | } |
618 | 0 | else if (l == 4) |
619 | 0 | { |
620 | 0 | iutf2 = *utf++ << 24; |
621 | 0 | iutf2 |= *utf++ << 16; |
622 | 0 | iutf2 |= *utf++ << 8; |
623 | 0 | iutf2 |= *utf++; |
624 | 0 | } |
625 | 0 | else |
626 | 0 | { |
627 | 0 | elog(ERROR, "unsupported character length %d", l); |
628 | 0 | iutf2 = 0; /* keep compiler quiet */ |
629 | 0 | } |
630 | | |
631 | 0 | cutf[0] = iutf; |
632 | 0 | cutf[1] = iutf2; |
633 | |
|
634 | 0 | cp = bsearch(cutf, cmap, cmapsize, |
635 | 0 | sizeof(pg_utf_to_local_combined), compare3); |
636 | |
|
637 | 0 | if (cp) |
638 | 0 | { |
639 | 0 | iso = store_coded_char(iso, cp->code); |
640 | 0 | continue; |
641 | 0 | } |
642 | 0 | } |
643 | | |
644 | | /* fail, so back up to reprocess second character next time */ |
645 | 0 | utf = utf_save; |
646 | 0 | len = len_save; |
647 | 0 | l = l_save; |
648 | 0 | } |
649 | | |
650 | | /* Now check ordinary map */ |
651 | 0 | if (map) |
652 | 0 | { |
653 | 0 | uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); |
654 | |
|
655 | 0 | if (converted) |
656 | 0 | { |
657 | 0 | iso = store_coded_char(iso, converted); |
658 | 0 | continue; |
659 | 0 | } |
660 | 0 | } |
661 | | |
662 | | /* if there's a conversion function, try that */ |
663 | 0 | if (conv_func) |
664 | 0 | { |
665 | 0 | uint32 converted = (*conv_func) (iutf); |
666 | |
|
667 | 0 | if (converted) |
668 | 0 | { |
669 | 0 | iso = store_coded_char(iso, converted); |
670 | 0 | continue; |
671 | 0 | } |
672 | 0 | } |
673 | | |
674 | | /* failed to translate this character */ |
675 | 0 | utf -= l; |
676 | 0 | if (noError) |
677 | 0 | break; |
678 | 0 | report_untranslatable_char(PG_UTF8, encoding, |
679 | 0 | (const char *) utf, len); |
680 | 0 | } |
681 | | |
682 | | /* if we broke out of loop early, must be invalid input */ |
683 | 0 | if (len > 0 && !noError) |
684 | 0 | report_invalid_encoding(PG_UTF8, (const char *) utf, len); |
685 | |
|
686 | 0 | *iso = '\0'; |
687 | |
|
688 | 0 | return utf - start; |
689 | 0 | } |
690 | | |
691 | | /* |
692 | | * local code ---> UTF8 |
693 | | * |
694 | | * iso: input string in local encoding (need not be null-terminated) |
695 | | * len: length of input string (in bytes) |
696 | | * utf: pointer to the output area (must be large enough!) |
697 | | (output string will be null-terminated) |
698 | | * map: conversion map for single characters |
699 | | * cmap: conversion map for combined characters |
700 | | * (optional, pass NULL if none) |
701 | | * cmapsize: number of entries in the conversion map for combined characters |
702 | | * (optional, pass 0 if none) |
703 | | * conv_func: algorithmic encoding conversion function |
704 | | * (optional, pass NULL if none) |
705 | | * encoding: PG identifier for the local encoding |
706 | | * |
707 | | * For each character, the map is consulted first; if no match, the cmap |
708 | | * (if provided) is consulted next; if still no match, the conv_func |
709 | | * (if provided) is applied. An error is raised if no match is found. |
710 | | * |
711 | | * See pg_wchar.h for more details about the data structures used here. |
712 | | * |
713 | | * Returns the number of input bytes consumed. If noError is true, this can |
714 | | * be less than 'len'. |
715 | | */ |
716 | | int |
717 | | LocalToUtf(const unsigned char *iso, int len, |
718 | | unsigned char *utf, |
719 | | const pg_mb_radix_tree *map, |
720 | | const pg_local_to_utf_combined *cmap, int cmapsize, |
721 | | utf_local_conversion_func conv_func, |
722 | | int encoding, |
723 | | bool noError) |
724 | 0 | { |
725 | 0 | uint32 iiso; |
726 | 0 | int l; |
727 | 0 | const pg_local_to_utf_combined *cp; |
728 | 0 | const unsigned char *start = iso; |
729 | |
|
730 | 0 | if (!PG_VALID_ENCODING(encoding)) |
731 | 0 | ereport(ERROR, |
732 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
733 | 0 | errmsg("invalid encoding number: %d", encoding))); |
734 | | |
735 | 0 | for (; len > 0; len -= l) |
736 | 0 | { |
737 | 0 | unsigned char b1 = 0; |
738 | 0 | unsigned char b2 = 0; |
739 | 0 | unsigned char b3 = 0; |
740 | 0 | unsigned char b4 = 0; |
741 | | |
742 | | /* "break" cases all represent errors */ |
743 | 0 | if (*iso == '\0') |
744 | 0 | break; |
745 | | |
746 | 0 | if (!IS_HIGHBIT_SET(*iso)) |
747 | 0 | { |
748 | | /* ASCII case is easy, assume it's one-to-one conversion */ |
749 | 0 | *utf++ = *iso++; |
750 | 0 | l = 1; |
751 | 0 | continue; |
752 | 0 | } |
753 | | |
754 | 0 | l = pg_encoding_verifymbchar(encoding, (const char *) iso, len); |
755 | 0 | if (l < 0) |
756 | 0 | break; |
757 | | |
758 | | /* collect coded char of length l */ |
759 | 0 | if (l == 1) |
760 | 0 | b4 = *iso++; |
761 | 0 | else if (l == 2) |
762 | 0 | { |
763 | 0 | b3 = *iso++; |
764 | 0 | b4 = *iso++; |
765 | 0 | } |
766 | 0 | else if (l == 3) |
767 | 0 | { |
768 | 0 | b2 = *iso++; |
769 | 0 | b3 = *iso++; |
770 | 0 | b4 = *iso++; |
771 | 0 | } |
772 | 0 | else if (l == 4) |
773 | 0 | { |
774 | 0 | b1 = *iso++; |
775 | 0 | b2 = *iso++; |
776 | 0 | b3 = *iso++; |
777 | 0 | b4 = *iso++; |
778 | 0 | } |
779 | 0 | else |
780 | 0 | { |
781 | 0 | elog(ERROR, "unsupported character length %d", l); |
782 | 0 | iiso = 0; /* keep compiler quiet */ |
783 | 0 | } |
784 | 0 | iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4); |
785 | |
|
786 | 0 | if (map) |
787 | 0 | { |
788 | 0 | uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); |
789 | |
|
790 | 0 | if (converted) |
791 | 0 | { |
792 | 0 | utf = store_coded_char(utf, converted); |
793 | 0 | continue; |
794 | 0 | } |
795 | | |
796 | | /* If there's a combined character map, try that */ |
797 | 0 | if (cmap) |
798 | 0 | { |
799 | 0 | cp = bsearch(&iiso, cmap, cmapsize, |
800 | 0 | sizeof(pg_local_to_utf_combined), compare4); |
801 | |
|
802 | 0 | if (cp) |
803 | 0 | { |
804 | 0 | utf = store_coded_char(utf, cp->utf1); |
805 | 0 | utf = store_coded_char(utf, cp->utf2); |
806 | 0 | continue; |
807 | 0 | } |
808 | 0 | } |
809 | 0 | } |
810 | | |
811 | | /* if there's a conversion function, try that */ |
812 | 0 | if (conv_func) |
813 | 0 | { |
814 | 0 | uint32 converted = (*conv_func) (iiso); |
815 | |
|
816 | 0 | if (converted) |
817 | 0 | { |
818 | 0 | utf = store_coded_char(utf, converted); |
819 | 0 | continue; |
820 | 0 | } |
821 | 0 | } |
822 | | |
823 | | /* failed to translate this character */ |
824 | 0 | iso -= l; |
825 | 0 | if (noError) |
826 | 0 | break; |
827 | 0 | report_untranslatable_char(encoding, PG_UTF8, |
828 | 0 | (const char *) iso, len); |
829 | 0 | } |
830 | | |
831 | | /* if we broke out of loop early, must be invalid input */ |
832 | 0 | if (len > 0 && !noError) |
833 | 0 | report_invalid_encoding(encoding, (const char *) iso, len); |
834 | |
|
835 | 0 | *utf = '\0'; |
836 | |
|
837 | 0 | return iso - start; |
838 | 0 | } |