/src/netcdf-c/libdispatch/dutf8.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2018, University Corporation for Atmospheric Research |
3 | | * See netcdf/COPYRIGHT file for copying and redistribution conditions. |
4 | | */ |
5 | | |
6 | | #include "config.h" |
7 | | #ifdef HAVE_STDLIB_H |
8 | | #include <stdlib.h> |
9 | | #endif |
10 | | #ifdef HAVE_STRING_H |
11 | | #include <string.h> |
12 | | #endif |
13 | | #include "netcdf.h" |
14 | | #include "ncutf8.h" |
15 | | #include "utf8proc.h" |
16 | | |
17 | | /* Provide a wrapper around whatever utf8 library we use. */ |
18 | | |
19 | | /* |
20 | | * Check validity of a UTF8 encoded null-terminated byte string. |
21 | | * Return codes: |
22 | | * NC_NOERR -- string is valid utf8 |
23 | | * NC_ENOMEM -- out of memory |
24 | | * NC_EINVAL -- invalid argument or internal error |
25 | | * NC_EBADNAME-- not valid utf8 |
26 | | */ |
27 | | |
28 | | int nc_utf8_validate(const unsigned char* name) |
29 | 0 | { |
30 | 0 | int ncstat = NC_NOERR; |
31 | 0 | const nc_utf8proc_uint8_t *str; |
32 | 0 | nc_utf8proc_ssize_t nchars = -1; |
33 | 0 | nc_utf8proc_int32_t codepoint; |
34 | 0 | nc_utf8proc_ssize_t count; |
35 | |
|
36 | 0 | str = (const nc_utf8proc_uint8_t*)name; |
37 | 0 | while(*str) { |
38 | 0 | count = nc_utf8proc_iterate(str,nchars,&codepoint); |
39 | 0 | if(count < 0) { |
40 | 0 | switch (count) { |
41 | 0 | case UTF8PROC_ERROR_NOMEM: |
42 | 0 | case UTF8PROC_ERROR_OVERFLOW: |
43 | 0 | ncstat = NC_ENOMEM; |
44 | 0 | break; |
45 | 0 | case UTF8PROC_ERROR_INVALIDOPTS: |
46 | 0 | ncstat = NC_EINVAL; |
47 | 0 | break; |
48 | 0 | case UTF8PROC_ERROR_INVALIDUTF8: |
49 | 0 | case UTF8PROC_ERROR_NOTASSIGNED: |
50 | 0 | default: |
51 | 0 | ncstat = NC_EBADNAME; |
52 | 0 | break; |
53 | 0 | } |
54 | 0 | goto done; |
55 | 0 | } else { /* move to next char */ |
56 | 0 | str += count; |
57 | 0 | } |
58 | 0 | } |
59 | 0 | done: |
60 | 0 | return ncstat; |
61 | 0 | } |
62 | | |
63 | | /* |
64 | | * Returns a pointer to newly allocated memory of a |
65 | | * normalized version of the null-terminated string 'str'. |
66 | | * Normalized string is returned in normalp argument; |
67 | | * caller must free. |
68 | | * Return codes: |
69 | | * NC_NOERR -- success |
70 | | * NC_ENOMEM -- out of memory |
71 | | * NC_EINVAL -- illegal argument or internal error |
72 | | * NC_EBADNAME -- other failure |
73 | | */ |
74 | | int |
75 | | nc_utf8_normalize(const unsigned char* utf8, unsigned char** normalp) |
76 | 0 | { |
77 | 0 | int ncstat = NC_NOERR; |
78 | 0 | const nc_utf8proc_uint8_t* str = (const nc_utf8proc_uint8_t*)utf8; |
79 | 0 | nc_utf8proc_uint8_t* retval = NULL; |
80 | 0 | nc_utf8proc_ssize_t count; |
81 | 0 | count = nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE); |
82 | 0 | if(count < 0) {/* error */ |
83 | 0 | switch (count) { |
84 | 0 | case UTF8PROC_ERROR_NOMEM: |
85 | 0 | case UTF8PROC_ERROR_OVERFLOW: |
86 | 0 | ncstat = NC_ENOMEM; |
87 | 0 | break; |
88 | 0 | case UTF8PROC_ERROR_INVALIDOPTS: |
89 | 0 | ncstat = NC_EINVAL; |
90 | 0 | break; |
91 | 0 | case UTF8PROC_ERROR_INVALIDUTF8: |
92 | 0 | case UTF8PROC_ERROR_NOTASSIGNED: |
93 | 0 | default: |
94 | 0 | ncstat = NC_EBADNAME; |
95 | 0 | break; |
96 | 0 | } |
97 | 0 | goto done; |
98 | 0 | } else |
99 | 0 | if(normalp) *normalp = (unsigned char*)retval; |
100 | 0 | done: |
101 | 0 | return ncstat; |
102 | 0 | } |
103 | | |
104 | | /* |
105 | | * Convert a normalized utf8 string to utf16. This is approximate |
106 | | * because it just does the truncation version of conversion for |
107 | | * each 32-bit codepoint to get the corresponding utf16. |
108 | | * Return codes: |
109 | | * NC_NOERR -- success |
110 | | * NC_ENOMEM -- out of memory |
111 | | * NC_EINVAL -- invalid argument or internal error |
112 | | * NC_EBADNAME-- not valid utf16 |
113 | | */ |
114 | | |
115 | | int nc_utf8_to_utf16(const unsigned char* s8, unsigned short** utf16p, size_t* len16p) |
116 | 0 | { |
117 | 0 | int ncstat = NC_NOERR; |
118 | 0 | const nc_utf8proc_uint8_t *str; |
119 | 0 | nc_utf8proc_ssize_t nchars = -1; |
120 | 0 | nc_utf8proc_int32_t codepoint; |
121 | 0 | nc_utf8proc_ssize_t count; |
122 | 0 | size_t len8, len16; |
123 | 0 | unsigned short* utf16; |
124 | 0 | unsigned short* p16; |
125 | |
|
126 | 0 | len8 = strlen((char*)s8); |
127 | 0 | utf16 = (unsigned short*)malloc(sizeof(unsigned short)*(len8+1)); |
128 | 0 | if(utf16 == NULL) { |
129 | 0 | ncstat = NC_ENOMEM; |
130 | 0 | goto done; |
131 | 0 | } |
132 | 0 | str = (const nc_utf8proc_uint8_t*)s8; |
133 | | /* Walk the string and convert each codepoint */ |
134 | 0 | p16 = utf16; |
135 | 0 | len16 = 0; |
136 | 0 | while(*str) { |
137 | 0 | count = nc_utf8proc_iterate(str,nchars,&codepoint); |
138 | 0 | if(count < 0) { |
139 | 0 | switch (count) { |
140 | 0 | case UTF8PROC_ERROR_NOMEM: |
141 | 0 | case UTF8PROC_ERROR_OVERFLOW: |
142 | 0 | ncstat = NC_ENOMEM; |
143 | 0 | break; |
144 | 0 | case UTF8PROC_ERROR_INVALIDOPTS: |
145 | 0 | ncstat = NC_EINVAL; |
146 | 0 | break; |
147 | 0 | case UTF8PROC_ERROR_INVALIDUTF8: |
148 | 0 | case UTF8PROC_ERROR_NOTASSIGNED: |
149 | 0 | default: |
150 | 0 | ncstat = NC_EBADNAME; |
151 | 0 | break; |
152 | 0 | } |
153 | 0 | goto done; |
154 | 0 | } else { /* move to next char */ |
155 | | /* Complain if top 16 bits not zero */ |
156 | 0 | if((codepoint & 0xFFFF0000) != 0) { |
157 | 0 | ncstat = NC_EBADNAME; |
158 | 0 | goto done; |
159 | 0 | } |
160 | | /* Truncate codepoint to 16 bits and store */ |
161 | 0 | *p16++ = (unsigned short)(codepoint & 0x0000FFFF); |
162 | 0 | str += count; |
163 | 0 | len16++; |
164 | 0 | } |
165 | 0 | } |
166 | 0 | *p16++ = (unsigned short)0; |
167 | 0 | if(utf16p) |
168 | 0 | *utf16p = utf16; |
169 | 0 | else |
170 | 0 | free(utf16); |
171 | |
|
172 | 0 | if(len16p) *len16p = len16; |
173 | 0 | done: |
174 | 0 | if(ncstat) free(utf16); |
175 | 0 | return ncstat; |
176 | 0 | } |