Coverage Report

Created: 2026-02-26 07:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/common/unicode.cpp
Line
Count
Source
1
#include "unicode.h"
2
3
// implementation adopted from src/unicode.cpp
4
5
0
// Returns the number of bytes a UTF-8 sequence occupies, judged solely by the
// top four bits of its first byte:
//   0x0-0xB -> 1 (ASCII; continuation bytes 0x8-0xB are also reported as 1)
//   0xC-0xD -> 2
//   0xE     -> 3
//   0xF     -> 4
// No validation of the byte is performed.
size_t utf8_sequence_length(unsigned char first_byte) {
    const unsigned lead_nibble = first_byte >> 4;

    if (lead_nibble < 0xC) {
        return 1;
    }
    if (lead_nibble < 0xE) {
        return 2;
    }
    return lead_nibble == 0xE ? 3 : 4;
}
10
11
0
// Decodes the UTF-8 encoded code point that starts at input[offset].
//
// Result status:
//   SUCCESS    - carries the decoded code point and the number of bytes it used (1-4).
//   INCOMPLETE - offset is at or past the end of input, or input ends in the middle
//                of a sequence whose bytes seen so far are all well-formed.
//   INVALID    - the first byte cannot start a sequence (stray continuation byte or
//                11111xxx), a following byte is not a continuation byte, or the
//                decoded value is not a Unicode scalar value in shortest form
//                (overlong encoding, UTF-16 surrogate, or above U+10FFFF).
//
// A malformed continuation byte that is already present is reported as INVALID
// even when the sequence is also truncated, because no amount of additional
// input could make such a sequence valid.
utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
    if (offset >= input.size()) {
        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
    }

    // Work on unsigned bytes so masks and shifts never see sign-extended values.
    const unsigned char lead = static_cast<unsigned char>(input[offset]);

    // ASCII fast path
    if (lead < 0x80) {
        return utf8_parse_result(utf8_parse_result::SUCCESS, static_cast<uint32_t>(lead), 1);
    }

    size_t   length        = 0; // total bytes in the sequence, including the lead byte
    uint32_t codepoint     = 0; // payload bits accumulated so far
    uint32_t min_codepoint = 0; // smallest value that legitimately needs `length` bytes

    if ((lead & 0xe0) == 0xc0) {
        // 2-byte sequence: 110xxxxx 10xxxxxx
        length        = 2;
        codepoint     = lead & 0x1f;
        min_codepoint = 0x80;
    } else if ((lead & 0xf0) == 0xe0) {
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        length        = 3;
        codepoint     = lead & 0x0f;
        min_codepoint = 0x800;
    } else if ((lead & 0xf8) == 0xf0) {
        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        length        = 4;
        codepoint     = lead & 0x07;
        min_codepoint = 0x10000;
    } else {
        // Invalid first byte: continuation byte (10xxxxxx) or 11111xxx
        return utf8_parse_result(utf8_parse_result::INVALID);
    }

    // offset < input.size() was established above, so this cannot underflow.
    const size_t available = input.size() - offset;

    for (size_t i = 1; i < length; ++i) {
        if (i >= available) {
            // Everything seen so far is well-formed; the rest has not arrived yet.
            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
        }
        const unsigned char next = static_cast<unsigned char>(input[offset + i]);
        if ((next & 0xc0) != 0x80) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        codepoint = (codepoint << 6) | (next & 0x3f);
    }

    // Reject values that are not Unicode scalar values in shortest form.
    const bool overlong     = codepoint < min_codepoint;
    const bool surrogate    = codepoint >= 0xD800 && codepoint <= 0xDFFF;
    const bool out_of_range = codepoint > 0x10FFFF;
    if (overlong || surrogate || out_of_range) {
        return utf8_parse_result(utf8_parse_result::INVALID);
    }

    return utf8_parse_result(utf8_parse_result::SUCCESS, codepoint, length);
}