/src/libgit2/src/util/utf8.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (C) the libgit2 contributors. All rights reserved. |
3 | | * |
4 | | * This file is part of libgit2, distributed under the GNU GPL v2 with |
5 | | * a Linking Exception. For full terms see the included COPYING file. |
6 | | */ |
7 | | |
8 | | #include "utf8.h" |
9 | | |
10 | | #include "git2_util.h" |
11 | | |
12 | | /* |
13 | | * git_utf8_iterate is taken from the utf8proc project, |
14 | | * http://www.public-software-group.org/utf8proc |
15 | | * |
16 | | * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany |
17 | | * |
18 | | * Permission is hereby granted, free of charge, to any person obtaining a |
19 | | * copy of this software and associated documentation files (the "Software"), |
20 | | * to deal in the Software without restriction, including without limitation |
21 | | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
22 | | * and/or sell copies of the Software, and to permit persons to whom the |
23 | | * Software is furnished to do so, subject to the following conditions: |
24 | | * |
25 | | * The above copyright notice and this permission notice shall be included in |
26 | | * all copies or substantial portions of the Software. |
27 | | * |
28 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
29 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
30 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
31 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
32 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
33 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
34 | | * DEALINGS IN THE SOFTWARE. |
35 | | */ |
36 | | |
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence. Each entry
 * is the total number of bytes in the sequence that byte introduces, or
 * 0 when the byte cannot start a sequence (continuation bytes 0x80-0xBF
 * and the unused lead bytes 0xF8-0xFF).
 */
static const uint8_t utf8proc_utf8class[256] = {
	/* 0x00-0x7F: single-byte (ASCII) */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */

	/* 0x80-0xBF: continuation bytes, never a valid start */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */

	/* 0xC0-0xDF: lead byte of a two-byte sequence */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */

	/* 0xE0-0xEF: lead byte of a three-byte sequence */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */

	/* 0xF0-0xF7: lead byte of a four-byte sequence; 0xF8-0xFF: invalid */
	4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0  /* 0xF0-0xFF */
};
55 | | |
56 | | static int utf8_charlen(const uint8_t *str, size_t str_len) |
57 | 0 | { |
58 | 0 | uint8_t length; |
59 | 0 | size_t i; |
60 | |
|
61 | 0 | length = utf8proc_utf8class[str[0]]; |
62 | 0 | if (!length) |
63 | 0 | return -1; |
64 | | |
65 | 0 | if (str_len > 0 && length > str_len) |
66 | 0 | return -1; |
67 | | |
68 | 0 | for (i = 1; i < length; i++) { |
69 | 0 | if ((str[i] & 0xC0) != 0x80) |
70 | 0 | return -1; |
71 | 0 | } |
72 | | |
73 | 0 | return (int)length; |
74 | 0 | } |
75 | | |
/*
 * Decode the single UTF-8 encoded code point at the start of `_str`.
 *
 * `*out` is always written: it is zeroed on entry and receives the
 * decoded code point only on success. `str_len` is passed through to
 * utf8_charlen for the bounds check.
 *
 * Returns the number of bytes consumed (1-4) on success, or -1 when the
 * sequence is structurally invalid, is an overlong encoding, decodes to
 * a value in 0xD800-0xDFFF or 0xFDD0-0xFDEF, lies above 0x10FFFF, or has
 * low 16 bits equal to 0xFFFE or 0xFFFF.
 */
int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	uint32_t codepoint = 0;
	int nbytes;

	*out = 0;

	nbytes = utf8_charlen(bytes, str_len);
	if (nbytes < 0)
		return -1;

	switch (nbytes) {
	case 1:
		codepoint = bytes[0];
		break;

	case 2:
		codepoint = ((uint32_t)(bytes[0] & 0x1F) << 6) |
			(uint32_t)(bytes[1] & 0x3F);

		/* Overlong: the value would have fit in one byte. */
		if (codepoint < 0x80)
			return -1;
		break;

	case 3:
		codepoint = ((uint32_t)(bytes[0] & 0x0F) << 12) |
			((uint32_t)(bytes[1] & 0x3F) << 6) |
			(uint32_t)(bytes[2] & 0x3F);

		/* Overlong: the value would have fit in two bytes. */
		if (codepoint < 0x800)
			return -1;

		/* Surrogate range. */
		if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
			return -1;

		/* Noncharacter block U+FDD0..U+FDEF. */
		if (codepoint >= 0xFDD0 && codepoint <= 0xFDEF)
			return -1;
		break;

	case 4:
		codepoint = ((uint32_t)(bytes[0] & 0x07) << 18) |
			((uint32_t)(bytes[1] & 0x3F) << 12) |
			((uint32_t)(bytes[2] & 0x3F) << 6) |
			(uint32_t)(bytes[3] & 0x3F);

		/* Overlong, or beyond the last code point U+10FFFF. */
		if (codepoint < 0x10000 || codepoint > 0x10FFFF)
			return -1;
		break;

	default:
		return -1;
	}

	/* Reject values whose low 16 bits are 0xFFFE or 0xFFFF. */
	if ((codepoint & 0xFFFF) >= 0xFFFE)
		return -1;

	*out = codepoint;
	return nbytes;
}
116 | | |
/*
 * Count the characters in the first `str_len` bytes of `_str`.
 *
 * Each sequence accepted by utf8_charlen counts as one character. A byte
 * at which utf8_charlen fails is counted as a single one-byte character,
 * so scanning always advances and malformed input still yields a count.
 *
 * Returns the number of characters counted; 0 when `str_len` is 0.
 */
size_t git_utf8_char_length(const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	size_t pos, nchars = 0;

	for (pos = 0; pos < str_len; nchars++) {
		int step = utf8_charlen(bytes + pos, str_len - pos);

		/* Treat an invalid sequence as one byte-sized character. */
		pos += (step < 0) ? 1 : (size_t)step;
	}

	return nchars;
}
134 | | |
/*
 * Measure the leading portion of `_str` that consists solely of
 * sequences accepted by utf8_charlen.
 *
 * Scanning stops at the first position where utf8_charlen reports an
 * error, or once `str_len` bytes have been consumed.
 *
 * Returns the length in bytes of that leading portion; this equals
 * `str_len` when every sequence in the buffer was accepted.
 */
size_t git_utf8_valid_buf_length(const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	size_t consumed = 0;
	int step;

	for (; consumed < str_len; consumed += (size_t)step) {
		step = utf8_charlen(bytes + consumed, str_len - consumed);

		/* First bad sequence ends the valid prefix. */
		if (step < 0)
			return consumed;
	}

	return consumed;
}
150 | 0 | } |