/src/libgit2/src/util/utf8.c

Source
/*
 * Copyright (C) the libgit2 contributors. All rights reserved.
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */

#include "utf8.h"

#include "git2_util.h"

/*
 * git_utf8_iterate is taken from the utf8proc project,
 * http://www.public-software-group.org/utf8proc
 *
 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * Lookup table indexed by the first byte of a UTF-8 sequence.  Each entry
 * is the total number of bytes in the sequence that this lead byte
 * introduces, or 0 when the byte cannot begin a sequence.  Each row below
 * covers 16 consecutive byte values.
 */
static const uint8_t utf8proc_utf8class[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 - 0x0F: single byte */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 - 0x2F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 - 0x3F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 - 0x4F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 - 0x5F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 - 0x6F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 - 0x7F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F: not a lead byte */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0 - 0xCF: 2-byte sequence */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0 - 0xDF */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0 - 0xEF: 3-byte sequence */
  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0  /* 0xF0 - 0xF7: 4-byte sequence; 0xF8 - 0xFF: invalid */
};

/*
 * Determine the byte length of the UTF-8 sequence that starts at `str`.
 *
 * The expected length is looked up from the lead byte in
 * utf8proc_utf8class, and every following byte of the sequence must have
 * the bit pattern 10xxxxxx.  Only this byte structure is examined; the
 * decoded value is not range-checked here.
 *
 * When `str_len` is non-zero the whole sequence must fit within `str_len`
 * bytes.  A `str_len` of zero skips that bound check; `str[0]` is read in
 * either case.
 *
 * Returns the sequence length (1 to 4), or -1 when the lead byte cannot
 * start a sequence, the sequence does not fit in `str_len`, or one of the
 * following bytes is not a continuation byte.
 */
static int utf8_charlen(const uint8_t *str, size_t str_len)
{
  const uint8_t seq_len = utf8proc_utf8class[str[0]];
  size_t pos = 1;

  /* A table entry of zero marks a byte that cannot begin a sequence. */
  if (seq_len == 0)
    return -1;

  /* The bound is only enforced when a non-zero length was supplied. */
  if (str_len != 0 && str_len < seq_len)
    return -1;

  /* Check continuation bytes in order, stopping at the first bad one. */
  while (pos < seq_len) {
    if ((str[pos] & 0xC0) != 0x80)
      return -1;
    pos++;
  }

  return (int)seq_len;
}

/*
 * Decode the single UTF-8 sequence found at the start of `_str`.
 *
 * `*out` is first reset to 0 and is only overwritten when decoding
 * succeeds.  `_str` and `str_len` are handed to utf8_charlen() to obtain
 * the sequence length and validate its byte structure.
 *
 * Besides structurally malformed input, the following decoded values are
 * rejected:
 *  - overlong encodings (a value that fits in a shorter sequence),
 *  - 0xD800 - 0xDFFF (the UTF-16 surrogate range),
 *  - 0xFDD0 - 0xFDEF,
 *  - values of 0x110000 and above,
 *  - any value whose low 16 bits are 0xFFFE or 0xFFFF.
 *
 * Returns the number of bytes consumed (1 to 4) on success, -1 on failure.
 */
int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len)
{
  const uint8_t *bytes = (const uint8_t *)_str;
  uint32_t codepoint;
  int nbytes;

  *out = 0;

  nbytes = utf8_charlen(bytes, str_len);
  if (nbytes < 0)
    return -1;

  if (nbytes == 1) {
    codepoint = bytes[0];
  } else if (nbytes == 2) {
    codepoint = ((bytes[0] & 0x1F) << 6) | (bytes[1] & 0x3F);

    /* Overlong: the value would have fit in one byte. */
    if (codepoint < 0x80)
      return -1;
  } else if (nbytes == 3) {
    codepoint = ((bytes[0] & 0x0F) << 12) |
                ((bytes[1] & 0x3F) << 6) |
                (bytes[2] & 0x3F);

    /* Overlong: the value would have fit in two bytes. */
    if (codepoint < 0x800)
      return -1;
    if (codepoint >= 0xD800 && codepoint < 0xE000)
      return -1;
    if (codepoint >= 0xFDD0 && codepoint < 0xFDF0)
      return -1;
  } else if (nbytes == 4) {
    codepoint = ((bytes[0] & 0x07) << 18) |
                ((bytes[1] & 0x3F) << 12) |
                ((bytes[2] & 0x3F) << 6) |
                (bytes[3] & 0x3F);

    /* Overlong, or beyond the accepted upper limit. */
    if (codepoint < 0x10000 || codepoint >= 0x110000)
      return -1;
  } else {
    /* Any other length is treated as an error. */
    return -1;
  }

  /* Applies to every sequence length: low 16 bits of 0xFFFE / 0xFFFF. */
  if ((codepoint & 0xFFFF) >= 0xFFFE)
    return -1;

  *out = codepoint;
  return nbytes;
}

/*
 * Count the characters in the first `str_len` bytes of `_str`.
 *
 * The buffer is walked one sequence at a time using utf8_charlen().  When
 * utf8_charlen() reports an error at the current position, that single
 * byte is counted as one character and the walk resumes at the next byte.
 *
 * Returns the number of characters counted; 0 when `str_len` is 0.
 */
size_t git_utf8_char_length(const char *_str, size_t str_len)
{
  const uint8_t *cursor = (const uint8_t *)_str;
  size_t remaining = str_len;
  size_t chars;

  for (chars = 0; remaining > 0; chars++) {
    int step = utf8_charlen(cursor, remaining);

    /* Unparseable byte: consume just that byte. */
    if (step < 0)
      step = 1;

    cursor += step;
    remaining -= step;
  }

  return chars;
}

/*
 * Measure the leading portion of `_str` (at most `str_len` bytes) that
 * utf8_charlen() accepts as a run of complete sequences.
 *
 * Scanning stops at the first position where utf8_charlen() reports an
 * error, or when the end of the buffer is reached.
 *
 * Returns the number of bytes in that leading portion; this equals
 * `str_len` when no error was encountered.
 */
size_t git_utf8_valid_buf_length(const char *_str, size_t str_len)
{
  const uint8_t *bytes = (const uint8_t *)_str;
  size_t valid = 0;
  int step;

  while (valid < str_len &&
         (step = utf8_charlen(bytes + valid, str_len - valid)) >= 0) {
    valid += step;
  }

  return valid;
}

Line	Count	Source
1		/*
2		* Copyright (C) the libgit2 contributors. All rights reserved.
3		*
4		* This file is part of libgit2, distributed under the GNU GPL v2 with
5		* a Linking Exception. For full terms see the included COPYING file.
6		*/
7
8		#include "utf8.h"
9
10		#include "git2_util.h"
11
12		/*
13		* git_utf8_iterate is taken from the utf8proc project,
14		* http://www.public-software-group.org/utf8proc
15		*
16		* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
17		*
18		* Permission is hereby granted, free of charge, to any person obtaining a
19		* copy of this software and associated documentation files (the ""Software""),
20		* to deal in the Software without restriction, including without limitation
21		* the rights to use, copy, modify, merge, publish, distribute, sublicense,
22		* and/or sell copies of the Software, and to permit persons to whom the
23		* Software is furnished to do so, subject to the following conditions:
24		*
25		* The above copyright notice and this permission notice shall be included in
26		* all copies or substantial portions of the Software.
27		*
28		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31		* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33		* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
34		* DEALINGS IN THE SOFTWARE.
35		*/
36
37		static const uint8_t utf8proc_utf8class[256] = {
38		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
43		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
44		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
45		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
46		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
48		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
49		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
51		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
52		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
53		4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
54		};
55
56		static int utf8_charlen(const uint8_t *str, size_t str_len)
57	0	{
58	0	uint8_t length;
59	0	size_t i;
60
61	0	length = utf8proc_utf8class[str[0]];
62	0	if (!length)
63	0	return -1;
64
65	0	if (str_len > 0 && length > str_len)
66	0	return -1;
67
68	0	for (i = 1; i < length; i++) {
69	0	if ((str[i] & 0xC0) != 0x80)
70	0	return -1;
71	0	}
72
73	0	return (int)length;
74	0	}
75
76		int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len)
77	0	{
78	0	const uint8_t *str = (const uint8_t *)_str;
79	0	uint32_t uc = 0;
80	0	int length;
81
82	0	*out = 0;
83
84	0	if ((length = utf8_charlen(str, str_len)) < 0)
85	0	return -1;
86
87	0	switch (length) {
88	0	case 1:
89	0	uc = str[0];
90	0	break;
91	0	case 2:
92	0	uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
93	0	if (uc < 0x80) uc = -1;
94	0	break;
95	0	case 3:
96	0	uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
97	0	+ (str[2] & 0x3F);
98	0	if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
99	0	(uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
100	0	break;
101	0	case 4:
102	0	uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
103	0	+ ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
104	0	if (uc < 0x10000 || uc >= 0x110000) uc = -1;
105	0	break;
106	0	default:
107	0	return -1;
108	0	}
109
110	0	if ((uc & 0xFFFF) >= 0xFFFE)
111	0	return -1;
112
113	0	*out = uc;
114	0	return length;
115	0	}
116
117		size_t git_utf8_char_length(const char *_str, size_t str_len)
118	0	{
119	0	const uint8_t *str = (const uint8_t *)_str;
120	0	size_t offset = 0, count = 0;
121
122	0	while (offset < str_len) {
123	0	int length = utf8_charlen(str + offset, str_len - offset);
124
125	0	if (length < 0)
126	0	length = 1;
127
128	0	offset += length;
129	0	count++;
130	0	}
131
132	0	return count;
133	0	}
134
135		size_t git_utf8_valid_buf_length(const char *_str, size_t str_len)
136	0	{
137	0	const uint8_t *str = (const uint8_t *)_str;
138	0	size_t offset = 0;
139
140	0	while (offset < str_len) {
141	0	int length = utf8_charlen(str + offset, str_len - offset);
142
143	0	if (length < 0)
144	0	break;
145
146	0	offset += length;
147	0	}
148
149	0	return offset;
150	0	}

Coverage Report

Created: 2026-01-10 06:55