Coverage Report

Created: 2026-03-07 06:35

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/common/unicode.cpp
Line
Count
Source
1
#include "unicode.h"
2
#include <cassert>
3
#include <stdexcept>
4
#include <vector>
5
#include <string>
6
7
// implementation adopted from src/unicode.cpp
8
9
0
// Returns the length in bytes of a UTF-8 sequence as announced by the top four
// bits of its first byte: 1100/1101 -> 2, 1110 -> 3, 1111 -> 4, anything else -> 1.
// No validation is done here; a continuation byte (10xxxxxx) also yields 1.
size_t common_utf8_sequence_length(unsigned char first_byte) {
    const unsigned high_nibble = first_byte >> 4;

    if (high_nibble < 0xc) {
        return 1;
    }
    if (high_nibble < 0xe) {
        return 2;
    }
    return high_nibble == 0xe ? 3 : 4;
}
14
15
0
// Decodes the single UTF-8 sequence that starts at input[offset].
//
// Result status:
//   INCOMPLETE - offset is at or past the end of input, or the lead byte announces
//                more bytes than input still holds (checked before the trailing
//                bytes are inspected)
//   INVALID    - the lead byte is a continuation byte (10xxxxxx) or 0xf8..0xff,
//                a trailing byte is not a continuation byte, the sequence is an
//                overlong encoding, or the decoded value is above 0x10ffff
//   SUCCESS    - constructed together with the decoded value and the number of
//                bytes in the sequence (1..4)
//
// Values in the surrogate range 0xd800..0xdfff are not rejected here.
utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset) {
    if (offset >= input.size()) {
        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
    }

    // read bytes as unsigned values so the masks and shifts below never operate on a negative char
    const auto byte_at = [&](size_t i) -> uint32_t {
        return static_cast<unsigned char>(input[offset + i]);
    };
    const auto is_continuation = [&](size_t i) {
        return (byte_at(i) & 0xc0) == 0x80;
    };

    const size_t   available = input.size() - offset;
    const uint32_t lead      = byte_at(0);

    // ASCII fast path: 0xxxxxxx
    if (lead < 0x80) {
        return utf8_parse_result(utf8_parse_result::SUCCESS, lead, 1);
    }

    // Invalid: continuation byte (10xxxxxx) as first byte
    if (lead < 0xc0) {
        return utf8_parse_result(utf8_parse_result::INVALID);
    }

    // 2-byte sequence: 110xxxxx 10xxxxxx
    if (lead < 0xe0) {
        if (available < 2) {
            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
        }
        if (!is_continuation(1)) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        const uint32_t cpt = ((lead & 0x1f) << 6) | (byte_at(1) & 0x3f);
        // overlong: the value would have fit in a single byte (lead 0xc0 / 0xc1)
        if (cpt < 0x80) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        return utf8_parse_result(utf8_parse_result::SUCCESS, cpt, 2);
    }

    // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
    if (lead < 0xf0) {
        if (available < 3) {
            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
        }
        if (!is_continuation(1) || !is_continuation(2)) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        const uint32_t cpt = ((lead & 0x0f) << 12) | ((byte_at(1) & 0x3f) << 6) | (byte_at(2) & 0x3f);
        // overlong: the value would have fit in two bytes
        if (cpt < 0x800) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        return utf8_parse_result(utf8_parse_result::SUCCESS, cpt, 3);
    }

    // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (lead < 0xf8) {
        if (available < 4) {
            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
        }
        if (!is_continuation(1) || !is_continuation(2) || !is_continuation(3)) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        const uint32_t cpt = ((lead & 0x07) << 18) | ((byte_at(1) & 0x3f) << 12) | ((byte_at(2) & 0x3f) << 6) | (byte_at(3) & 0x3f);
        // overlong (fits in three bytes) or beyond the last code point 0x10ffff
        if (cpt < 0x10000 || cpt > 0x10ffff) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        return utf8_parse_result(utf8_parse_result::SUCCESS, cpt, 4);
    }

    // Invalid first byte: 0xf8..0xff never starts a sequence
    return utf8_parse_result(utf8_parse_result::INVALID);
}
69
70
0
std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
71
0
    std::string result;
72
0
    for (size_t i = 0; i < cps.size(); ++i) {
73
0
        result.append(common_unicode_cpt_to_utf8(cps[i]));
74
0
    }
75
0
    return result;
76
0
}
77
78
0
// Encodes one code point as a UTF-8 byte string:
//   0x00    .. 0x7f     -> 1 byte
//   0x80    .. 0x7ff    -> 2 bytes
//   0x800   .. 0xffff   -> 3 bytes
//   0x10000 .. 0x10ffff -> 4 bytes
// Throws std::invalid_argument for anything above 0x10ffff.
std::string common_unicode_cpt_to_utf8(uint32_t cpt) {
    // continuation byte (10xxxxxx) carrying the six bits of cpt that start at bit `shift`
    const auto tail = [cpt](unsigned shift) {
        return static_cast<char>(0x80 | ((cpt >> shift) & 0x3f));
    };

    std::string utf8;

    if (cpt < 0x80) {
        utf8 += static_cast<char>(cpt);
    } else if (cpt < 0x800) {
        utf8 += static_cast<char>(0xc0 | ((cpt >> 6) & 0x1f));
        utf8 += tail(0);
    } else if (cpt < 0x10000) {
        utf8 += static_cast<char>(0xe0 | ((cpt >> 12) & 0x0f));
        utf8 += tail(6);
        utf8 += tail(0);
    } else if (cpt < 0x110000) {
        utf8 += static_cast<char>(0xf0 | ((cpt >> 18) & 0x07));
        utf8 += tail(12);
        utf8 += tail(6);
        utf8 += tail(0);
    } else {
        throw std::invalid_argument("invalid codepoint");
    }

    return utf8;
}
106
107
108