Coverage Report

Created: 2025-07-01 06:50

/src/openvswitch/lib/unicode.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2009, 2010 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
19
#include "unicode.h"
20
21
#include <inttypes.h>
22
23
#include "openvswitch/dynamic-string.h"
24
#include "util.h"
25
26
/* Combines UTF-16 leading surrogate 'leading' and trailing surrogate
 * 'trailing' into the Unicode code point that the pair encodes, and returns
 * that code point.
 *
 * No range checking is done: if either argument is not a surrogate of the
 * proper kind, the result is meaningless. */
int
utf16_decode_surrogate_pair(int leading, int trailing)
{
    /*
     * Bit layout:
     *
     *    leading surrogate:         110110wwwwxxxxxx
     *   trailing surrogate:         110111xxxxxxxxxx
     *           code point: 000uuuuuxxxxxxxxxxxxxxxx
     *
     * where uuuuu = wwww + 1.
     */
    int plane = ((leading >> 6) & 0xf) + 1;     /* uuuuu */
    int upper_x = leading & 0x3f;               /* Top 6 bits of x. */
    int lower_x = trailing & 0x3ff;             /* Bottom 10 bits of x. */
    int code_point;

    code_point = plane << 16;
    code_point |= upper_x << 10;
    code_point |= lower_x;
    return code_point;
}
44
45
/* Counts and returns the number of Unicode characters in the null-terminated
 * UTF-8 string 's_'.
 *
 * The string is not validated: every byte that is not a continuation byte is
 * simply counted as the start of one character. */
size_t
utf8_length(const char *s_)
{
    const uint8_t *p = (const uint8_t *) s_;
    size_t n_chars = 0;

    while (*p != '\0') {
        /* A byte whose two most-significant bits are 2#10 continues the
         * previous character.  Any other pattern (2#00, 2#01, 2#11) begins a
         * new one. */
        if ((*p & 0xc0) != 0x80) {
            n_chars++;
        }
        p++;
    }
    return n_chars;
}
60
61
static char *
62
invalid_utf8_sequence(const uint8_t *s, int n, size_t *lengthp)
63
0
{
64
0
    struct ds msg;
65
0
    int i;
66
67
0
    if (lengthp) {
68
0
        *lengthp = 0;
69
0
    }
70
71
0
    ds_init(&msg);
72
0
    ds_put_cstr(&msg, "invalid UTF-8 sequence");
73
0
    for (i = 0; i < n; i++) {
74
0
        ds_put_format(&msg, " 0x%02"PRIx8, s[i]);
75
0
    }
76
0
    return ds_steal_cstr(&msg);
77
0
}
78
79
/* Dimensions of the 'octets' table in struct utf8_sequence. */
enum {
    UTF8_SEQ_N_SLOTS = 5,       /* Octet positions per pattern. */
    UTF8_SEQ_N_BOUNDS = 2       /* Lower and upper bound per position. */
};

/* One pattern of acceptable byte values for an encoded UTF-8 character.
 *
 * octets[i][0] and octets[i][1] are the inclusive lower and upper bounds on
 * the value of the byte at position 'i' within the character.  A position
 * whose lower bound is 0 marks the end of the pattern. */
struct utf8_sequence {
    uint8_t octets[UTF8_SEQ_N_SLOTS][UTF8_SEQ_N_BOUNDS];
};
82
83
static const struct utf8_sequence *
84
lookup_utf8_sequence(uint8_t c)
85
0
{
86
0
    static const struct utf8_sequence seqs[] = {
87
0
        { { { 0x01, 0x7f },
88
0
            { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } } },
89
90
0
        { { { 0xc2, 0xdf }, { 0x80, 0xbf },
91
0
            { 0, 0 }, { 0, 0 }, { 0, 0 } } },
92
93
0
        { { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf },
94
0
            {0,0}, {0, 0 } } },
95
96
0
        { { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf },
97
0
            { 0, 0 }, { 0, 0 } } },
98
99
0
        { { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf },
100
0
            { 0, 0 }, { 0, 0 } } },
101
102
0
        { { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf },
103
0
            { 0, 0 }, { 0, 0 } } },
104
105
0
        { { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
106
0
            { 0, 0 } } },
107
108
0
        { { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
109
0
            { 0, 0 } } },
110
111
0
        { { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf },
112
0
            { 0, 0 } } },
113
0
    };
114
115
0
    size_t i;
116
117
0
    for (i = 0; i < ARRAY_SIZE(seqs); i++) {
118
0
        const uint8_t *o = seqs[i].octets[0];
119
0
        if (c >= o[0] && c <= o[1]) {
120
0
            return &seqs[i];
121
0
        }
122
0
    }
123
0
    return NULL;
124
0
}
125
126
/* Checks that 's' is a valid, null-terminated UTF-8 string.  If so, returns a
127
 * null pointer and sets '*lengthp' to the number of Unicode characters in
128
 * 's'.  If not, returns an error message that the caller must free and sets
129
 * '*lengthp' to 0.
130
 *
131
 * 'lengthp' may be NULL if the length is not needed. */
132
char *
133
utf8_validate(const char *s_, size_t *lengthp)
134
0
{
135
0
    size_t length = 0;
136
0
    const uint8_t *s;
137
138
0
    for (s = (const uint8_t *) s_; *s != '\0'; ) {
139
0
        length++;
140
0
        if (s[0] < 0x80) {
141
0
            s++;
142
0
        } else {
143
0
            const struct utf8_sequence *seq;
144
0
            int i;
145
146
0
            seq = lookup_utf8_sequence(s[0]);
147
0
            if (!seq) {
148
0
                return invalid_utf8_sequence(s, 1, lengthp);
149
0
            }
150
151
0
            for (i = 1; seq->octets[i][0]; i++) {
152
0
                const uint8_t *o = seq->octets[i];
153
0
                if (s[i] < o[0] || s[i] > o[1]) {
154
0
                    return invalid_utf8_sequence(s, i + 1, lengthp);
155
0
                }
156
0
            }
157
0
            s += i;
158
0
        }
159
0
    }
160
0
    if (lengthp) {
161
0
        *lengthp = length;
162
0
    }
163
0
    return NULL;
164
0
}