/src/llama.cpp/common/unicode.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include <cstdint> |
4 | | #include <string_view> |
5 | | |
6 | | // UTF-8 parsing utilities for streaming-aware unicode support |
7 | | |
8 | | struct utf8_parse_result { | // Result of decoding one UTF-8 sequence: a status plus the codepoint and byte count
9 | | uint32_t codepoint; // Decoded Unicode codepoint; only valid if status == SUCCESS (the constructor defaults it to 0) |
10 | | size_t bytes_consumed; // How many bytes this codepoint uses (1-4); the constructor defaults it to 0 when no count is given |
11 | | enum status { SUCCESS, INCOMPLETE, INVALID } status; | // Declares the enum and a member of the same name, hence 'enum status' in the constructor. NOTE(review): presumably INCOMPLETE = input ended mid-sequence and INVALID = malformed bytes -- confirm in the parser definition
12 | | |
13 | | utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0) | // Not explicit: a bare status value converts implicitly, leaving codepoint and bytes_consumed at 0
14 | 0 | : codepoint(cp), bytes_consumed(bytes), status(s) {} |
15 | | }; |
16 | | |
17 | | // Determine the expected total length in bytes of a UTF-8 sequence, judged from its first (lead) byte alone |
18 | | // Returns 0 for invalid first bytes; NOTE(review): valid results are presumably 1-4 -- confirm in the definition |
19 | | size_t utf8_sequence_length(unsigned char first_byte); |
20 | | |
21 | | // Parse a single UTF-8 codepoint from input and report the outcome as a utf8_parse_result (check its status before using codepoint). NOTE(review): offset is presumably the byte index in input where decoding starts -- confirm in the definition |
22 | | utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset); |