/src/llama.cpp/common/unicode.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include <cstdint> |
4 | | #include <string_view> |
5 | | #include <vector> |
6 | | #include <string> |
7 | | |
8 | | // UTF-8 parsing utilities for streaming-aware unicode support |
9 | | |
// Result of attempting to parse one UTF-8 codepoint.
struct utf8_parse_result {
    // Decoded codepoint; only meaningful when status == SUCCESS.
    uint32_t codepoint;

    // Number of bytes this codepoint occupies (1-4).
    size_t bytes_consumed;

    // Outcome of the parse attempt.
    enum status {
        SUCCESS,
        INCOMPLETE,
        INVALID
    } status;

    // Builds a result from a status. The codepoint and the byte count both
    // default to 0 when they are not supplied.
    utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0)
        : codepoint(cp)
        , bytes_consumed(bytes)
        , status(s) {
    }
};
18 | | |
// Maps the first byte of a UTF-8 sequence to the number of bytes the whole
// sequence is expected to occupy.
//   first_byte: leading byte of the sequence
//   returns:    expected sequence length, or 0 when first_byte is not a valid
//               first byte
size_t common_utf8_sequence_length(unsigned char first_byte);
22 | | |
// Reports whether the string `s` ends with a complete UTF-8 sequence.
// NOTE(review): the result for an empty string is not visible from this
// header - confirm against the implementation.
bool common_utf8_is_complete(const std::string & s);
25 | | |
26 | | // Parse a single UTF-8 codepoint from input |
27 | | utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset); |
28 | | |
// Presumably encodes every codepoint in `cps` as UTF-8 and returns the
// resulting bytes as one string - the implementation is not visible from this
// header; TODO confirm.
std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps);
// Presumably returns the UTF-8 encoding of the single codepoint `cpt` - the
// implementation is not visible from this header; TODO confirm (including how
// out-of-range values are handled).
std::string common_unicode_cpt_to_utf8(uint32_t cpt);