/src/netcdf-c/libdispatch/dutf8.c

Source
/*
 *  Copyright 2018, University Corporation for Atmospheric Research
 *      See netcdf/COPYRIGHT file for copying and redistribution conditions.
 */

#include "config.h"
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "netcdf.h"
#include "ncutf8.h"
#include "utf8proc.h"

/* Provide a wrapper around whatever utf8 library we use. */

/*
 * Check validity of a UTF8 encoded null-terminated byte string.
 * Return codes:
 * NC_NOERR -- string is valid utf8
 * NC_ENOMEM -- out of memory
 * NC_EINVAL -- invalid argument or internal error
 * NC_EBADNAME-- not valid utf8
 */

int nc_utf8_validate(const unsigned char* name)
{
    int ncstat = NC_NOERR;
    const nc_utf8proc_uint8_t *str;
    nc_utf8proc_ssize_t nchars = -1;
    nc_utf8proc_int32_t codepoint;
    nc_utf8proc_ssize_t count;

    str = (const nc_utf8proc_uint8_t*)name;
    while(*str) {
        count = nc_utf8proc_iterate(str,nchars,&codepoint);
  if(count < 0) {
      switch (count) {
      case UTF8PROC_ERROR_NOMEM:
      case UTF8PROC_ERROR_OVERFLOW:
    ncstat = NC_ENOMEM;
    break;
      case UTF8PROC_ERROR_INVALIDOPTS:
    ncstat = NC_EINVAL;
    break;
      case UTF8PROC_ERROR_INVALIDUTF8:
      case UTF8PROC_ERROR_NOTASSIGNED:
      default:
    ncstat = NC_EBADNAME;
    break;
      }
      goto done;
  } else { /* move to next char */
      str += count;
  }
    }
done:
    return ncstat;
}

/*
 * Produce a normalized copy of the null-terminated string 'utf8'.
 * The mapping is requested with the flags
 * UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE.
 * On success the newly allocated result is stored in *normalp;
 * caller must free. If normalp is NULL there is nobody to take
 * ownership, so the result is released here instead of being leaked.
 * On failure *normalp is left untouched.
 * Return codes:
 * NC_NOERR -- success
 * NC_ENOMEM -- out of memory
 * NC_EINVAL -- illegal argument (including a NULL 'utf8') or internal error
 * NC_EBADNAME -- other failure
 */
int
nc_utf8_normalize(const unsigned char* utf8, unsigned char** normalp)
{
    int ncstat = NC_NOERR;
    const nc_utf8proc_uint8_t* str = (const nc_utf8proc_uint8_t*)utf8;
    nc_utf8proc_uint8_t* retval = NULL;
    nc_utf8proc_ssize_t count;

    /* A NULL input cannot be scanned for its terminator */
    if(str == NULL) {
        ncstat = NC_EINVAL;
        goto done;
    }

    count = nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
    if(count < 0) {/* error */
        switch (count) {
        case UTF8PROC_ERROR_NOMEM:
        case UTF8PROC_ERROR_OVERFLOW:
            ncstat = NC_ENOMEM;
            break;
        case UTF8PROC_ERROR_INVALIDOPTS:
            ncstat = NC_EINVAL;
            break;
        case UTF8PROC_ERROR_INVALIDUTF8:
        case UTF8PROC_ERROR_NOTASSIGNED:
        default:
            ncstat = NC_EBADNAME;
            break;
        }
        goto done;
    }

    if(normalp) {
        /* ownership of the buffer passes to the caller */
        *normalp = (unsigned char*)retval;
    } else {
        /* no receiver for the result: release it rather than leak it */
        free(retval);
    }
done:
    return ncstat;
}

/*
 * Convert a normalized, null-terminated utf8 string to an array of
 * 16-bit units. Each decoded codepoint is stored as exactly one
 * unsigned short; no surrogate pairs are produced, so a codepoint
 * that does not fit in 16 bits is rejected with NC_EBADNAME rather
 * than being truncated.
 * On success:
 *   - if utf16p is non-NULL, *utf16p receives a newly allocated,
 *     zero-terminated array which the caller must free; if utf16p
 *     is NULL the array is released before returning.
 *   - if len16p is non-NULL, *len16p receives the number of 16-bit
 *     units stored, not counting the terminator.
 * On failure neither *utf16p nor *len16p is modified.
 * Return codes:
 * NC_NOERR -- success
 * NC_ENOMEM -- out of memory
 * NC_EINVAL -- invalid argument (including a NULL 's8') or internal error
 * NC_EBADNAME-- not valid utf8, or a codepoint not representable in 16 bits
 */

int nc_utf8_to_utf16(const unsigned char* s8, unsigned short** utf16p, size_t* len16p)
{
    int ncstat = NC_NOERR;
    const nc_utf8proc_uint8_t *str;
    nc_utf8proc_ssize_t nchars = -1;
    nc_utf8proc_int32_t codepoint;
    nc_utf8proc_ssize_t count;
    size_t len8;
    size_t len16 = 0;
    unsigned short* utf16 = NULL; /* owned here until handed to the caller */

    /* strlen() on a NULL pointer is undefined; report it instead */
    if(s8 == NULL) {
        ncstat = NC_EINVAL;
        goto done;
    }

    /* One output unit per input byte plus a terminator; this is enough
       provided the decoder consumes at least one byte per codepoint
       (NOTE(review): relies on nc_utf8proc_iterate never returning 0
       for a non-empty string -- confirm against utf8proc). */
    len8 = strlen((const char*)s8);
    utf16 = (unsigned short*)malloc(sizeof(unsigned short)*(len8+1));
    if(utf16 == NULL) {
        ncstat = NC_ENOMEM;
        goto done;
    }

    /* Walk the string and convert each codepoint */
    str = (const nc_utf8proc_uint8_t*)s8;
    while(*str) {
        count = nc_utf8proc_iterate(str,nchars,&codepoint);
        if(count < 0) {
            switch (count) {
            case UTF8PROC_ERROR_NOMEM:
            case UTF8PROC_ERROR_OVERFLOW:
                ncstat = NC_ENOMEM;
                break;
            case UTF8PROC_ERROR_INVALIDOPTS:
                ncstat = NC_EINVAL;
                break;
            case UTF8PROC_ERROR_INVALIDUTF8:
            case UTF8PROC_ERROR_NOTASSIGNED:
            default:
                ncstat = NC_EBADNAME;
                break;
            }
            goto done;
        }
        /* Complain if the codepoint needs more than 16 bits
           (same set of values as "top 16 bits not zero") */
        if(codepoint < 0 || codepoint > 0xFFFF) {
            ncstat = NC_EBADNAME;
            goto done;
        }
        utf16[len16++] = (unsigned short)codepoint;
        str += count; /* move to next char */
    }
    utf16[len16] = (unsigned short)0;

    if(len16p) *len16p = len16;
    if(utf16p) {
        *utf16p = utf16;
        utf16 = NULL; /* ownership transferred; keep the cleanup below from freeing it */
    }
 done:
    /* Releases the buffer on every error path and when the caller
       did not ask for it; free(NULL) is a no-op otherwise. */
    free(utf16);
    return ncstat;
}

Line	Count	Source
1		/*
2		* Copyright 2018, University Corporation for Atmospheric Research
3		* See netcdf/COPYRIGHT file for copying and redistribution conditions.
4		*/
5
6		#include "config.h"
7		#ifdef HAVE_STDLIB_H
8		#include <stdlib.h>
9		#endif
10		#ifdef HAVE_STRING_H
11		#include <string.h>
12		#endif
13		#include "netcdf.h"
14		#include "ncutf8.h"
15		#include "utf8proc.h"
16
17		/* Provide a wrapper around whatever utf8 library we use. */
18
19		/*
20		* Check validity of a UTF8 encoded null-terminated byte string.
21		* Return codes:
22		* NC_NOERR -- string is valid utf8
23		* NC_ENOMEM -- out of memory
24		* NC_EINVAL -- invalid argument or internal error
25		* NC_EBADNAME-- not valid utf8
26		*/
27
28		int nc_utf8_validate(const unsigned char* name)
29	0	{
30	0	int ncstat = NC_NOERR;
31	0	const nc_utf8proc_uint8_t *str;
32	0	nc_utf8proc_ssize_t nchars = -1;
33	0	nc_utf8proc_int32_t codepoint;
34	0	nc_utf8proc_ssize_t count;
35
36	0	str = (const nc_utf8proc_uint8_t*)name;
37	0	while(*str) {
38	0	count = nc_utf8proc_iterate(str,nchars,&codepoint);
39	0	if(count < 0) {
40	0	switch (count) {
41	0	case UTF8PROC_ERROR_NOMEM:
42	0	case UTF8PROC_ERROR_OVERFLOW:
43	0	ncstat = NC_ENOMEM;
44	0	break;
45	0	case UTF8PROC_ERROR_INVALIDOPTS:
46	0	ncstat = NC_EINVAL;
47	0	break;
48	0	case UTF8PROC_ERROR_INVALIDUTF8:
49	0	case UTF8PROC_ERROR_NOTASSIGNED:
50	0	default:
51	0	ncstat = NC_EBADNAME;
52	0	break;
53	0	}
54	0	goto done;
55	0	} else { /* move to next char */
56	0	str += count;
57	0	}
58	0	}
59	0	done:
60	0	return ncstat;
61	0	}
62
63		/*
64		* Returns a pointer to newly allocated memory of a
65		* normalized version of the null-terminated string 'str'.
66		* Normalized string is returned in normalp argument;
67		* caller must free.
68		* Return codes:
69		* NC_NOERR -- success
70		* NC_ENOMEM -- out of memory
71		* NC_EINVAL -- illegal argument or internal error
72		* NC_EBADNAME -- other failure
73		*/
74		int
75		nc_utf8_normalize(const unsigned char* utf8, unsigned char** normalp)
76	0	{
77	0	int ncstat = NC_NOERR;
78	0	const nc_utf8proc_uint8_t* str = (const nc_utf8proc_uint8_t*)utf8;
79	0	nc_utf8proc_uint8_t* retval = NULL;
80	0	nc_utf8proc_ssize_t count;
81	0	count = nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
82	0	if(count < 0) {/* error */
83	0	switch (count) {
84	0	case UTF8PROC_ERROR_NOMEM:
85	0	case UTF8PROC_ERROR_OVERFLOW:
86	0	ncstat = NC_ENOMEM;
87	0	break;
88	0	case UTF8PROC_ERROR_INVALIDOPTS:
89	0	ncstat = NC_EINVAL;
90	0	break;
91	0	case UTF8PROC_ERROR_INVALIDUTF8:
92	0	case UTF8PROC_ERROR_NOTASSIGNED:
93	0	default:
94	0	ncstat = NC_EBADNAME;
95	0	break;
96	0	}
97	0	goto done;
98	0	} else
99	0	if(normalp) *normalp = (unsigned char*)retval;
100	0	done:
101	0	return ncstat;
102	0	}
103
104		/*
105		* Convert a normalized utf8 string to utf16. This is approximate
106		* because it just does the truncation version of conversion for
107		* each 32-bit codepoint to get the corresponding utf16.
108		* Return codes:
109		* NC_NOERR -- success
110		* NC_ENOMEM -- out of memory
111		* NC_EINVAL -- invalid argument or internal error
112		* NC_EBADNAME-- not valid utf16
113		*/
114
115		int nc_utf8_to_utf16(const unsigned char* s8, unsigned short** utf16p, size_t* len16p)
116	0	{
117	0	int ncstat = NC_NOERR;
118	0	const nc_utf8proc_uint8_t *str;
119	0	nc_utf8proc_ssize_t nchars = -1;
120	0	nc_utf8proc_int32_t codepoint;
121	0	nc_utf8proc_ssize_t count;
122	0	size_t len8, len16;
123	0	unsigned short* utf16;
124	0	unsigned short* p16;
125
126	0	len8 = strlen((char*)s8);
127	0	utf16 = (unsigned short*)malloc(sizeof(unsigned short)*(len8+1));
128	0	if(utf16 == NULL) {
129	0	ncstat = NC_ENOMEM;
130	0	goto done;
131	0	}
132	0	str = (const nc_utf8proc_uint8_t*)s8;
133		/* Walk the string and convert each codepoint */
134	0	p16 = utf16;
135	0	len16 = 0;
136	0	while(*str) {
137	0	count = nc_utf8proc_iterate(str,nchars,&codepoint);
138	0	if(count < 0) {
139	0	switch (count) {
140	0	case UTF8PROC_ERROR_NOMEM:
141	0	case UTF8PROC_ERROR_OVERFLOW:
142	0	ncstat = NC_ENOMEM;
143	0	break;
144	0	case UTF8PROC_ERROR_INVALIDOPTS:
145	0	ncstat = NC_EINVAL;
146	0	break;
147	0	case UTF8PROC_ERROR_INVALIDUTF8:
148	0	case UTF8PROC_ERROR_NOTASSIGNED:
149	0	default:
150	0	ncstat = NC_EBADNAME;
151	0	break;
152	0	}
153	0	goto done;
154	0	} else { /* move to next char */
155		/* Complain if top 16 bits not zero */
156	0	if((codepoint & (nc_utf8proc_int32_t)0xFFFF0000) != 0) {
157	0	ncstat = NC_EBADNAME;
158	0	goto done;
159	0	}
160		/* Truncate codepoint to 16 bits and store */
161	0	*p16++ = (unsigned short)(codepoint & 0x0000FFFF);
162	0	str += count;
163	0	len16++;
164	0	}
165	0	}
166	0	*p16++ = (unsigned short)0;
167	0	if(utf16p)
168	0	*utf16p = utf16;
169	0	else
170	0	free(utf16);
171
172	0	if(len16p) *len16p = len16;
173	0	done:
174	0	if(ncstat) free(utf16);
175	0	return ncstat;
176	0	}

Coverage Report

Created: 2025-10-28 07:06