Coverage Report

Created: 2023-05-28 06:42

/src/netcdf-c/libdispatch/dutf8.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright 2018, University Corporation for Atmospheric Research
3
 *      See netcdf/COPYRIGHT file for copying and redistribution conditions.
4
 */
5
6
#include "config.h"
7
#ifdef HAVE_STDLIB_H
8
#include <stdlib.h>
9
#endif
10
#ifdef HAVE_STRING_H
11
#include <string.h>
12
#endif
13
#include "netcdf.h"
14
#include "ncutf8.h"
15
#include "utf8proc.h"
16
17
/* Provide a wrapper around whatever utf8 library we use. */
18
19
/*
20
 * Check validity of a UTF8 encoded null-terminated byte string.
21
 * Return codes:
22
 * NC_NOERR -- string is valid utf8
23
 * NC_ENOMEM -- out of memory
24
 * NC_EINVAL -- invalid argument or internal error
25
 * NC_EBADNAME-- not valid utf8
26
 */
27
28
int nc_utf8_validate(const unsigned char* name)
29
0
{
30
0
    int ncstat = NC_NOERR;
31
0
    const nc_utf8proc_uint8_t *str;
32
0
    nc_utf8proc_ssize_t nchars = -1;
33
0
    nc_utf8proc_int32_t codepoint;
34
0
    nc_utf8proc_ssize_t count;
35
36
0
    str = (const nc_utf8proc_uint8_t*)name;
37
0
    while(*str) {
38
0
        count = nc_utf8proc_iterate(str,nchars,&codepoint);
39
0
  if(count < 0) {
40
0
      switch (count) {
41
0
      case UTF8PROC_ERROR_NOMEM:
42
0
      case UTF8PROC_ERROR_OVERFLOW:
43
0
    ncstat = NC_ENOMEM;
44
0
    break;
45
0
      case UTF8PROC_ERROR_INVALIDOPTS:
46
0
    ncstat = NC_EINVAL;
47
0
    break;
48
0
      case UTF8PROC_ERROR_INVALIDUTF8:
49
0
      case UTF8PROC_ERROR_NOTASSIGNED:
50
0
      default:
51
0
    ncstat = NC_EBADNAME;
52
0
    break;
53
0
      }
54
0
      goto done;
55
0
  } else { /* move to next char */
56
0
      str += count;
57
0
  }
58
0
    }
59
0
done:
60
0
    return ncstat;
61
0
}
62
63
/*
64
 * Returns a pointer to newly allocated memory of a
65
 * normalized version of the null-terminated string 'str'.
66
 * Normalized string is returned in normalp argument;
67
 * caller must free.
68
 * Return codes:
69
 * NC_NOERR -- success
70
 * NC_ENOMEM -- out of memory
71
 * NC_EINVAL -- illegal argument or internal error
72
 * NC_EBADNAME -- other failure
73
 */
74
int
75
nc_utf8_normalize(const unsigned char* utf8, unsigned char** normalp)
76
0
{
77
0
    int ncstat = NC_NOERR;
78
0
    const nc_utf8proc_uint8_t* str = (const nc_utf8proc_uint8_t*)utf8;
79
0
    nc_utf8proc_uint8_t* retval = NULL;
80
0
    nc_utf8proc_ssize_t count;
81
0
    count = nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
82
0
    if(count < 0) {/* error */
83
0
  switch (count) {
84
0
  case UTF8PROC_ERROR_NOMEM:
85
0
  case UTF8PROC_ERROR_OVERFLOW:
86
0
  ncstat = NC_ENOMEM;
87
0
      break;
88
0
  case UTF8PROC_ERROR_INVALIDOPTS:
89
0
      ncstat = NC_EINVAL;
90
0
      break;
91
0
  case UTF8PROC_ERROR_INVALIDUTF8:
92
0
  case UTF8PROC_ERROR_NOTASSIGNED:
93
0
  default:
94
0
      ncstat = NC_EBADNAME;
95
0
      break;
96
0
  }
97
0
  goto done;
98
0
    } else
99
0
  if(normalp) *normalp = (unsigned char*)retval;
100
0
done:
101
0
    return ncstat;
102
0
}
103
104
/*
105
 * Convert a normalized utf8 string to utf16. This is approximate
106
 * because it just does the truncation version of conversion for
107
 * each 32-bit codepoint to get the corresponding utf16.
108
 * Return codes:
109
 * NC_NOERR -- success
110
 * NC_ENOMEM -- out of memory
111
 * NC_EINVAL -- invalid argument or internal error
112
 * NC_EBADNAME-- not valid utf16
113
 */
114
115
int nc_utf8_to_utf16(const unsigned char* s8, unsigned short** utf16p, size_t* len16p)
116
0
{
117
0
    int ncstat = NC_NOERR;
118
0
    const nc_utf8proc_uint8_t *str;
119
0
    nc_utf8proc_ssize_t nchars = -1;
120
0
    nc_utf8proc_int32_t codepoint;
121
0
    nc_utf8proc_ssize_t count;
122
0
    size_t len8, len16;
123
0
    unsigned short* utf16;
124
0
    unsigned short* p16;
125
126
0
    len8 = strlen((char*)s8);
127
0
    utf16 = (unsigned short*)malloc(sizeof(unsigned short)*(len8+1));
128
0
    if(utf16 == NULL) {
129
0
      ncstat = NC_ENOMEM;
130
0
      goto done;
131
0
    }
132
0
    str = (const nc_utf8proc_uint8_t*)s8;
133
    /* Walk the string and convert each codepoint */
134
0
    p16 = utf16;
135
0
    len16 = 0;
136
0
    while(*str) {
137
0
      count = nc_utf8proc_iterate(str,nchars,&codepoint);
138
0
      if(count < 0) {
139
0
      switch (count) {
140
0
      case UTF8PROC_ERROR_NOMEM:
141
0
      case UTF8PROC_ERROR_OVERFLOW:
142
0
          ncstat = NC_ENOMEM;
143
0
          break;
144
0
      case UTF8PROC_ERROR_INVALIDOPTS:
145
0
          ncstat = NC_EINVAL;
146
0
          break;
147
0
      case UTF8PROC_ERROR_INVALIDUTF8:
148
0
      case UTF8PROC_ERROR_NOTASSIGNED:
149
0
      default:
150
0
          ncstat = NC_EBADNAME;
151
0
          break;
152
0
      }
153
0
      goto done;
154
0
      } else { /* move to next char */
155
      /* Complain if top 16 bits not zero */
156
0
      if((codepoint & 0xFFFF0000) != 0) {
157
0
            ncstat = NC_EBADNAME;
158
0
            goto done;
159
0
      }
160
      /* Truncate codepoint to 16 bits and store */
161
0
      *p16++ = (unsigned short)(codepoint & 0x0000FFFF);
162
0
      str += count;
163
0
      len16++;
164
0
      }
165
0
    }
166
0
    *p16++ = (unsigned short)0;
167
0
    if(utf16p)
168
0
      *utf16p = utf16;
169
0
    else
170
0
      free(utf16);
171
172
0
    if(len16p) *len16p = len16;
173
0
 done:
174
0
    if(ncstat) free(utf16);
175
0
    return ncstat;
176
0
}