Coverage Report

Created: 2026-03-21 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/common/unicode.cpp
Line
Count
Source
1
#include "unicode.h"
2
3
#include <algorithm>
4
#include <cassert>
5
#include <stdexcept>
6
#include <string>
7
#include <vector>
8
9
// implementation adopted from src/unicode.cpp
10
11
0
// Returns the byte length of a UTF-8 sequence as implied by the top four bits
// of its first byte:
//   0x0-0xB -> 1 (ASCII, and also continuation bytes 10xxxxxx)
//   0xC-0xD -> 2
//   0xE     -> 3
//   0xF     -> 4 (no distinction is made for 0xF8..0xFF)
// No validation is performed here; the result is purely a table lookup.
size_t common_utf8_sequence_length(unsigned char first_byte) {
    switch (first_byte >> 4) {
        case 0xF:
            return 4;
        case 0xE:
            return 3;
        case 0xC:
        case 0xD:
            return 2;
        default:
            return 1;
    }
}
16
17
0
// Decodes the UTF-8 sequence that starts at input[offset].
//
// Result status:
//   SUCCESS    - carries the decoded value and the sequence length (1-4).
//   INCOMPLETE - offset is at/after the end of input, or the lead byte
//                announces more bytes than remain in input.
//   INVALID    - the lead byte is a continuation byte (10xxxxxx) or 11111xxx,
//                or one of the trailing bytes is not of the form 10xxxxxx.
//
// The availability check runs before the trailing bytes are inspected, so a
// truncated sequence is reported as INCOMPLETE even if a byte that is present
// is malformed. Overlong encodings, surrogates and values above 0x10FFFF are
// not rejected by this function.
utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset) {
    if (offset >= input.size()) {
        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
    }

    const unsigned char lead = static_cast<unsigned char>(input[offset]);

    // Single-byte (ASCII) sequences need no further work.
    if (lead < 0x80) {
        return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1);
    }

    // Classify the lead byte: how many continuation bytes follow, and which
    // payload bits the lead byte itself contributes.
    int n_trailing = 0;
    int cpt        = 0;
    if ((lead & 0xe0) == 0xc0) {            // 110xxxxx
        n_trailing = 1;
        cpt        = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {     // 1110xxxx
        n_trailing = 2;
        cpt        = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {     // 11110xxx
        n_trailing = 3;
        cpt        = lead & 0x07;
    } else {
        // Either a stray continuation byte (10xxxxxx) or 11111xxx.
        return utf8_parse_result(utf8_parse_result::INVALID);
    }

    // The whole sequence must be available before it is validated.
    if (offset + n_trailing >= input.size()) {
        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
    }

    // Every trailing byte must look like 10xxxxxx.
    for (int k = 1; k <= n_trailing; ++k) {
        if ((input[offset + k] & 0xc0) != 0x80) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
    }

    // Append six payload bits per trailing byte.
    for (int k = 1; k <= n_trailing; ++k) {
        cpt = (cpt << 6) | (input[offset + k] & 0x3f);
    }

    return utf8_parse_result(utf8_parse_result::SUCCESS, cpt, n_trailing + 1);
}
71
72
0
// Reports whether `s` ends on a UTF-8 sequence boundary, i.e. whether the last
// sequence started in `s` has all the bytes its lead byte announces.
//
// Only the final (up to) four bytes are inspected: the function walks backwards
// to the nearest byte that is not a continuation byte (10xxxxxx), derives the
// expected sequence length from it, and checks that at least that many bytes
// are present from there to the end of the string.
//
// Returns true for an empty string. Returns false when the inspected bytes are
// all continuation bytes. This is a truncation check, not a validity check:
// e.g. an ASCII byte followed by a stray continuation byte counts as complete.
//
// All index arithmetic is done in size_t so very large strings are handled
// without narrowing the length to int.
bool common_utf8_is_complete(const std::string & s) {
    if (s.empty()) {
        return true;
    }

    const size_t len    = s.size();
    const size_t window = std::min<size_t>(4, len);

    for (size_t back = 1; back <= window; ++back) {
        const unsigned char c = static_cast<unsigned char>(s[len - back]);
        if ((c & 0xC0) == 0x80) {
            continue; // continuation byte: keep looking for the lead byte
        }
        // `c` starts the last sequence; `back` bytes of it are present.
        const size_t expected = (c >= 0xF0) ? 4 : (c >= 0xE0) ? 3 : (c >= 0xC0) ? 2 : 1;
        return back >= expected;
    }

    // No lead byte within the last four bytes.
    return false;
}
85
86
0
std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
87
0
    std::string result;
88
0
    for (size_t i = 0; i < cps.size(); ++i) {
89
0
        result.append(common_unicode_cpt_to_utf8(cps[i]));
90
0
    }
91
0
    return result;
92
0
}
93
94
0
// Encodes a single codepoint as UTF-8.
//
//   cpt <= 0x7f      -> 1 byte
//   cpt <= 0x7ff     -> 2 bytes
//   cpt <= 0xffff    -> 3 bytes
//   cpt <= 0x10ffff  -> 4 bytes
//
// Throws std::invalid_argument("invalid codepoint") for anything larger.
// Values in the 3-byte range are encoded as-is; no surrogate filtering is done.
std::string common_unicode_cpt_to_utf8(uint32_t cpt) {
    // Builds a continuation byte (10xxxxxx) from the six bits at `shift`.
    const auto tail = [cpt](unsigned shift) {
        return static_cast<char>(0x80 | ((cpt >> shift) & 0x3f));
    };

    std::string encoded;

    if (cpt <= 0x7f) {
        encoded += static_cast<char>(cpt);
    } else if (cpt <= 0x7ff) {
        encoded += static_cast<char>(0xc0 | ((cpt >> 6) & 0x1f));
        encoded += tail(0);
    } else if (cpt <= 0xffff) {
        encoded += static_cast<char>(0xe0 | ((cpt >> 12) & 0x0f));
        encoded += tail(6);
        encoded += tail(0);
    } else if (cpt <= 0x10ffff) {
        encoded += static_cast<char>(0xf0 | ((cpt >> 18) & 0x07));
        encoded += tail(12);
        encoded += tail(6);
        encoded += tail(0);
    } else {
        throw std::invalid_argument("invalid codepoint");
    }

    return encoded;
}
122
123
124