/src/netcdf-c/libdispatch/dutf8.c
| Line | Count | Source | 
| 1 |  | /* | 
| 2 |  |  *  Copyright 2018, University Corporation for Atmospheric Research | 
| 3 |  |  *      See netcdf/COPYRIGHT file for copying and redistribution conditions. | 
| 4 |  |  */ | 
| 5 |  |  | 
| 6 |  | #include "config.h" | 
| 7 |  | #ifdef HAVE_STDLIB_H | 
| 8 |  | #include <stdlib.h> | 
| 9 |  | #endif | 
| 10 |  | #ifdef HAVE_STRING_H | 
| 11 |  | #include <string.h> | 
| 12 |  | #endif | 
| 13 |  | #include "netcdf.h" | 
| 14 |  | #include "ncutf8.h" | 
| 15 |  | #include "utf8proc.h" | 
| 16 |  |  | 
| 17 |  | /* Provide a wrapper around whatever utf8 library we use. */ | 
| 18 |  |  | 
| 19 |  | /* | 
| 20 |  |  * Check validity of a UTF8 encoded null-terminated byte string. | 
| 21 |  |  * Return codes: | 
| 22 |  |  * NC_NOERR -- string is valid utf8 | 
| 23 |  |  * NC_ENOMEM -- out of memory | 
| 24 |  |  * NC_EINVAL -- invalid argument or internal error | 
| 25 |  |  * NC_EBADNAME-- not valid utf8 | 
| 26 |  |  */ | 
| 27 |  |  | 
| 28 |  | int nc_utf8_validate(const unsigned char* name) | 
| 29 | 0 | { | 
| 30 | 0 |     int ncstat = NC_NOERR; | 
| 31 | 0 |     const nc_utf8proc_uint8_t *str; | 
| 32 | 0 |     nc_utf8proc_ssize_t nchars = -1; | 
| 33 | 0 |     nc_utf8proc_int32_t codepoint; | 
| 34 | 0 |     nc_utf8proc_ssize_t count; | 
| 35 |  | 
 | 
| 36 | 0 |     str = (const nc_utf8proc_uint8_t*)name; | 
| 37 | 0 |     while(*str) { | 
| 38 | 0 |         count = nc_utf8proc_iterate(str,nchars,&codepoint); | 
| 39 | 0 |   if(count < 0) { | 
| 40 | 0 |       switch (count) { | 
| 41 | 0 |       case UTF8PROC_ERROR_NOMEM: | 
| 42 | 0 |       case UTF8PROC_ERROR_OVERFLOW: | 
| 43 | 0 |     ncstat = NC_ENOMEM; | 
| 44 | 0 |     break; | 
| 45 | 0 |       case UTF8PROC_ERROR_INVALIDOPTS: | 
| 46 | 0 |     ncstat = NC_EINVAL; | 
| 47 | 0 |     break; | 
| 48 | 0 |       case UTF8PROC_ERROR_INVALIDUTF8: | 
| 49 | 0 |       case UTF8PROC_ERROR_NOTASSIGNED: | 
| 50 | 0 |       default: | 
| 51 | 0 |     ncstat = NC_EBADNAME; | 
| 52 | 0 |     break; | 
| 53 | 0 |       } | 
| 54 | 0 |       goto done; | 
| 55 | 0 |   } else { /* move to next char */ | 
| 56 | 0 |       str += count; | 
| 57 | 0 |   } | 
| 58 | 0 |     } | 
| 59 | 0 | done: | 
| 60 | 0 |     return ncstat; | 
| 61 | 0 | } | 
| 62 |  |  | 
| 63 |  | /* | 
| 64 |  |  * Returns a pointer to newly allocated memory of a | 
| 65 |  |  * normalized version of the null-terminated string 'str'. | 
| 66 |  |  * Normalized string is returned in normalp argument; | 
| 67 |  |  * caller must free. | 
| 68 |  |  * Return codes: | 
| 69 |  |  * NC_NOERR -- success | 
| 70 |  |  * NC_ENOMEM -- out of memory | 
| 71 |  |  * NC_EINVAL -- illegal argument or internal error | 
| 72 |  |  * NC_EBADNAME -- other failure | 
| 73 |  |  */ | 
| 74 |  | int | 
| 75 |  | nc_utf8_normalize(const unsigned char* utf8, unsigned char** normalp) | 
| 76 | 0 | { | 
| 77 | 0 |     int ncstat = NC_NOERR; | 
| 78 | 0 |     const nc_utf8proc_uint8_t* str = (const nc_utf8proc_uint8_t*)utf8; | 
| 79 | 0 |     nc_utf8proc_uint8_t* retval = NULL; | 
| 80 | 0 |     nc_utf8proc_ssize_t count; | 
| 81 | 0 |     count = nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE); | 
| 82 | 0 |     if(count < 0) {/* error */ | 
| 83 | 0 |   switch (count) { | 
| 84 | 0 |   case UTF8PROC_ERROR_NOMEM: | 
| 85 | 0 |   case UTF8PROC_ERROR_OVERFLOW: | 
| 86 | 0 |   ncstat = NC_ENOMEM; | 
| 87 | 0 |       break; | 
| 88 | 0 |   case UTF8PROC_ERROR_INVALIDOPTS: | 
| 89 | 0 |       ncstat = NC_EINVAL; | 
| 90 | 0 |       break; | 
| 91 | 0 |   case UTF8PROC_ERROR_INVALIDUTF8: | 
| 92 | 0 |   case UTF8PROC_ERROR_NOTASSIGNED: | 
| 93 | 0 |   default: | 
| 94 | 0 |       ncstat = NC_EBADNAME; | 
| 95 | 0 |       break; | 
| 96 | 0 |   } | 
| 97 | 0 |   goto done; | 
| 98 | 0 |     } else | 
| 99 | 0 |   if(normalp) *normalp = (unsigned char*)retval; | 
| 100 | 0 | done: | 
| 101 | 0 |     return ncstat; | 
| 102 | 0 | } | 
| 103 |  |  | 
| 104 |  | /* | 
| 105 |  |  * Convert a normalized utf8 string to utf16. This is approximate | 
| 106 |  |  * because it just does the truncation version of conversion for | 
| 107 |  |  * each 32-bit codepoint to get the corresponding utf16. | 
| 108 |  |  * Return codes: | 
| 109 |  |  * NC_NOERR -- success | 
| 110 |  |  * NC_ENOMEM -- out of memory | 
| 111 |  |  * NC_EINVAL -- invalid argument or internal error | 
| 112 |  |  * NC_EBADNAME-- not valid utf16 | 
| 113 |  |  */ | 
| 114 |  |  | 
| 115 |  | int nc_utf8_to_utf16(const unsigned char* s8, unsigned short** utf16p, size_t* len16p) | 
| 116 | 0 | { | 
| 117 | 0 |     int ncstat = NC_NOERR; | 
| 118 | 0 |     const nc_utf8proc_uint8_t *str; | 
| 119 | 0 |     nc_utf8proc_ssize_t nchars = -1; | 
| 120 | 0 |     nc_utf8proc_int32_t codepoint; | 
| 121 | 0 |     nc_utf8proc_ssize_t count; | 
| 122 | 0 |     size_t len8, len16; | 
| 123 | 0 |     unsigned short* utf16; | 
| 124 | 0 |     unsigned short* p16; | 
| 125 |  | 
 | 
| 126 | 0 |     len8 = strlen((char*)s8); | 
| 127 | 0 |     utf16 = (unsigned short*)malloc(sizeof(unsigned short)*(len8+1)); | 
| 128 | 0 |     if(utf16 == NULL) { | 
| 129 | 0 |       ncstat = NC_ENOMEM; | 
| 130 | 0 |       goto done; | 
| 131 | 0 |     } | 
| 132 | 0 |     str = (const nc_utf8proc_uint8_t*)s8; | 
| 133 |  |     /* Walk the string and convert each codepoint */ | 
| 134 | 0 |     p16 = utf16; | 
| 135 | 0 |     len16 = 0; | 
| 136 | 0 |     while(*str) { | 
| 137 | 0 |       count = nc_utf8proc_iterate(str,nchars,&codepoint); | 
| 138 | 0 |       if(count < 0) { | 
| 139 | 0 |       switch (count) { | 
| 140 | 0 |       case UTF8PROC_ERROR_NOMEM: | 
| 141 | 0 |       case UTF8PROC_ERROR_OVERFLOW: | 
| 142 | 0 |           ncstat = NC_ENOMEM; | 
| 143 | 0 |           break; | 
| 144 | 0 |       case UTF8PROC_ERROR_INVALIDOPTS: | 
| 145 | 0 |           ncstat = NC_EINVAL; | 
| 146 | 0 |           break; | 
| 147 | 0 |       case UTF8PROC_ERROR_INVALIDUTF8: | 
| 148 | 0 |       case UTF8PROC_ERROR_NOTASSIGNED: | 
| 149 | 0 |       default: | 
| 150 | 0 |           ncstat = NC_EBADNAME; | 
| 151 | 0 |           break; | 
| 152 | 0 |       } | 
| 153 | 0 |       goto done; | 
| 154 | 0 |       } else { /* move to next char */ | 
| 155 |  |       /* Complain if top 16 bits not zero */ | 
| 156 | 0 |       if((codepoint & (nc_utf8proc_int32_t)0xFFFF0000) != 0) { | 
| 157 | 0 |             ncstat = NC_EBADNAME; | 
| 158 | 0 |             goto done; | 
| 159 | 0 |       } | 
| 160 |  |       /* Truncate codepoint to 16 bits and store */ | 
| 161 | 0 |       *p16++ = (unsigned short)(codepoint & 0x0000FFFF); | 
| 162 | 0 |       str += count; | 
| 163 | 0 |       len16++; | 
| 164 | 0 |       } | 
| 165 | 0 |     } | 
| 166 | 0 |     *p16++ = (unsigned short)0; | 
| 167 | 0 |     if(utf16p) | 
| 168 | 0 |       *utf16p = utf16; | 
| 169 | 0 |     else | 
| 170 | 0 |       free(utf16); | 
| 171 |  | 
 | 
| 172 | 0 |     if(len16p) *len16p = len16; | 
| 173 | 0 |  done: | 
| 174 | 0 |     if(ncstat) free(utf16); | 
| 175 | 0 |     return ncstat; | 
| 176 | 0 | } |