/src/llama.cpp/src/llama-vocab.cpp
Line | Count | Source |
1 | | #include "llama-vocab.h" |
2 | | |
3 | | #include "ggml.h" |
4 | | #include "gguf.h" |
5 | | #include "llama-impl.h" |
6 | | #include "llama-model-loader.h" |
7 | | |
8 | | #include "unicode.h" |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cassert> |
12 | | #include <cctype> |
13 | | #include <cfloat> |
14 | | #include <cmath> |
15 | | #include <cstdarg> |
16 | | #include <cstring> |
17 | | #include <forward_list> |
18 | | #include <limits> |
19 | | #include <map> |
20 | | #include <queue> |
21 | | #include <set> |
22 | | #include <unordered_map> |
23 | | |
24 | | // |
25 | | // helpers |
26 | | // |
27 | | |
28 | | struct naive_trie { |
29 | 0 | naive_trie() : has_value(false), value(0) { |
30 | 0 | } |
31 | 0 | void insert(const char * key, size_t len, int32_t value = 0) { |
32 | 0 | if (len == 0) { |
33 | 0 | this->has_value = true; |
34 | 0 | this->value = value; |
35 | 0 | return; |
36 | 0 | } |
37 | 0 | char c = key[0]; |
38 | 0 | auto res = children.find(c); |
39 | 0 | if (res != children.end()) { |
40 | 0 | res->second.insert(key + 1, len - 1, value); |
41 | 0 | } else { |
42 | 0 | auto res = children.insert(std::make_pair(c, naive_trie())); |
43 | 0 | res.first->second.insert(key + 1, len - 1, value); |
44 | 0 | } |
45 | 0 | } |
46 | 0 | std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const { |
47 | 0 | if (len == 0 || offset == len) { |
48 | 0 | return std::make_pair(key, offset); |
49 | 0 | } |
50 | 0 | char c = key[offset]; |
51 | 0 | auto res = children.find(c); |
52 | 0 | if (res != children.end()) { |
53 | 0 | return res->second.get_longest_prefix(key, len, offset + 1); |
54 | 0 | } |
55 | | |
56 | 0 | return std::make_pair(key, offset); |
57 | 0 | } |
58 | 0 | const struct naive_trie * traverse(const char c) const { |
59 | 0 | auto res = children.find(c); |
60 | 0 | if (res != children.end()) { |
61 | 0 | return &res->second; |
62 | 0 | } |
63 | | |
64 | 0 | return NULL; |
65 | 0 | } |
66 | | std::map<char, struct naive_trie> children; |
67 | | bool has_value; |
68 | | llama_token value; |
69 | | }; |
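// Usage sketch (editorial addition, not part of the original file): exercise
// the trie with a tiny made-up key set; the token ids 100-102 are arbitrary.
static void naive_trie_demo() {
    naive_trie trie;
    trie.insert("a",   1, 100);
    trie.insert("ab",  2, 101);
    trie.insert("abc", 3, 102);

    // the deepest path matching "abz" is "ab", so the returned offset is 2
    auto res = trie.get_longest_prefix("abz", 3);
    GGML_ASSERT(res.second == 2);

    // traverse() descends a single edge and returns NULL on a miss
    GGML_ASSERT(trie.traverse('a') != NULL);
    GGML_ASSERT(trie.traverse('z') == NULL);
}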
70 | | |
71 | | // |
72 | | // tokenizers |
73 | | // |
74 | | |
75 | | struct llm_tokenizer { |
76 | 0 | llm_tokenizer() {} |
77 | 0 | virtual ~llm_tokenizer() = default; |
78 | | }; |
79 | | |
80 | | struct llm_symbol { |
81 | | using index = int; |
82 | | index prev; |
83 | | index next; |
84 | | const char * text; |
85 | | size_t n; |
86 | | }; |
87 | | |
88 | | static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable"); |
89 | | |
90 | | // |
91 | | // SPM tokenizer |
92 | | // original implementation: |
93 | | // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 |
94 | | // |
95 | | |
96 | | struct llm_bigram_spm { |
97 | | struct comparator { |
98 | 0 | bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) { |
99 | 0 | return (l.score < r.score) || (l.score == r.score && l.left > r.left); |
100 | 0 | } |
101 | | }; |
102 | | using queue_storage = std::vector<llm_bigram_spm>; |
103 | | using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>; |
104 | | llm_symbol::index left; |
105 | | llm_symbol::index right; |
106 | | float score; |
107 | | size_t size; |
108 | | }; |
109 | | |
110 | | struct llm_tokenizer_spm : llm_tokenizer { |
111 | 0 | llm_tokenizer_spm(const llama_vocab & /*vocab*/) {} |
112 | | }; |
113 | | |
114 | | struct llm_tokenizer_spm_session { |
115 | 0 | llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {} |
116 | | |
117 | 0 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
118 | | // split string into utf8 chars |
119 | 0 | int index = 0; |
120 | 0 | size_t offs = 0; |
121 | 0 | while (offs < text.size()) { |
122 | 0 | llm_symbol sym; |
123 | 0 | size_t len = unicode_len_utf8(text[offs]); |
124 | 0 | sym.text = text.c_str() + offs; |
125 | 0 | sym.n = std::min(len, text.size() - offs); |
126 | 0 | offs += sym.n; |
127 | 0 | sym.prev = index - 1; |
128 | 0 | sym.next = offs == text.size() ? -1 : index + 1; |
129 | 0 | index++; |
130 | 0 | symbols.emplace_back(sym); |
131 | 0 | } |
132 | | |
133 | | // seed the work queue with all possible 2-character tokens. |
134 | 0 | for (int i = 1; i < (int) symbols.size(); ++i) { |
135 | 0 | try_add_bigram(i - 1, i); |
136 | 0 | } |
137 | | |
138 | | // keep substituting the highest frequency pairs for as long as we can. |
139 | 0 | while (!work_queue.empty()) { |
140 | 0 | auto bigram = work_queue.top(); |
141 | 0 | work_queue.pop(); |
142 | |
143 | 0 | auto & left_sym = symbols[bigram.left]; |
144 | 0 | auto & right_sym = symbols[bigram.right]; |
145 | | |
146 | | // if one of the symbols already got merged, skip it. |
147 | 0 | if (left_sym.n == 0 || right_sym.n == 0 || |
148 | 0 | left_sym.n + right_sym.n != bigram.size) { |
149 | 0 | continue; |
150 | 0 | } |
151 | | |
152 | | // merge the right sym into the left one |
153 | 0 | left_sym.n += right_sym.n; |
154 | 0 | right_sym.n = 0; |
155 | | |
156 | | //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size); |
157 | | |
158 | | // remove the right sym from the chain |
159 | 0 | left_sym.next = right_sym.next; |
160 | 0 | if (right_sym.next >= 0) { |
161 | 0 | symbols[right_sym.next].prev = bigram.left; |
162 | 0 | } |
163 | | |
164 | | // find more substitutions |
165 | 0 | try_add_bigram(left_sym.prev, bigram.left); |
166 | 0 | try_add_bigram(bigram.left, left_sym.next); |
167 | 0 | } |
168 | |
169 | 0 | for (int i = 0; i != -1; i = symbols[i].next) { |
170 | 0 | auto & symbol = symbols[i]; |
171 | 0 | resegment(symbol, output); |
172 | 0 | } |
173 | 0 | } |
174 | | |
175 | | private: |
176 | 0 | void resegment(llm_symbol & symbol, std::vector<llama_token> & output) { |
177 | 0 | auto text = std::string(symbol.text, symbol.n); |
178 | 0 | auto token = vocab.text_to_token(text); |
179 | | |
180 | | // Do we need to support is_unused? |
181 | 0 | if (token != LLAMA_TOKEN_NULL) { |
182 | 0 | output.push_back(token); |
183 | 0 | return; |
184 | 0 | } |
185 | | |
186 | 0 | const auto p = rev_merge.find(text); |
187 | |
188 | 0 | if (p == rev_merge.end()) { |
189 | | // output any symbols that did not form tokens as bytes. |
190 | 0 | output.reserve(output.size() + symbol.n); |
191 | 0 | for (int j = 0; j < (int)symbol.n; ++j) { |
192 | 0 | llama_token id = vocab.byte_to_token(symbol.text[j]); |
193 | 0 | output.push_back(id); |
194 | 0 | } |
195 | 0 | return; |
196 | 0 | } |
197 | | |
198 | 0 | resegment(symbols[p->second.first], output); |
199 | 0 | resegment(symbols[p->second.second], output); |
200 | 0 | } |
201 | | |
202 | 0 | void try_add_bigram(int left, int right) { |
203 | 0 | if (left == -1 || right == -1) { |
204 | 0 | return; |
205 | 0 | } |
206 | 0 | const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n); |
207 | 0 | auto token = vocab.text_to_token(text); |
208 | |
209 | 0 | if (token == LLAMA_TOKEN_NULL) { |
210 | 0 | return; |
211 | 0 | } |
212 | | |
213 | 0 | if (static_cast<uint32_t>(token) >= vocab.n_tokens()) { |
214 | 0 | return; |
215 | 0 | } |
216 | | |
217 | 0 | const auto & tok_data = vocab.get_token_data(token); |
218 | |
219 | 0 | llm_bigram_spm bigram; |
220 | 0 | bigram.left = left; |
221 | 0 | bigram.right = right; |
222 | 0 | bigram.score = tok_data.score; |
223 | 0 | bigram.size = text.size(); |
224 | |
225 | 0 | work_queue.push(bigram); |
226 | | |
227 | | // Do we need to support is_unused? |
228 | 0 | rev_merge[text] = std::make_pair(left, right); |
229 | 0 | } |
230 | | |
231 | | const llama_vocab & vocab; |
232 | | // currently unused |
233 | | // const llm_tokenizer_spm * spm_tokenizer; |
234 | | |
235 | | std::vector<llm_symbol> symbols; |
236 | | llm_bigram_spm::queue work_queue; |
237 | | std::map<std::string, std::pair<int, int>> rev_merge; |
238 | | }; |
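// Worked example with made-up scores: for the input "hug" and a vocab holding
// "h", "u", "g", "hu" (score -1.0) and "ug" (score -0.5), the initial chain is
// [h][u][g] with candidate bigrams "hu" and "ug". "ug" scores higher, so the
// chain becomes [h][ug]; "hug" is not in the vocab, so merging stops and the
// ids of "h" and "ug" are emitted. A driving sketch (`vocab` stands for any
// loaded SPM-type vocabulary):
static void spm_tokenize_demo(const llama_vocab & vocab, std::vector<llama_token> & out) {
    llm_tokenizer_spm_session session(vocab);
    session.tokenize("Hello world", out); // byte tokens are the fallback for unmatched substrings
}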
239 | | |
240 | | // |
241 | | // BPE tokenizer |
242 | | // adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] |
243 | | // tried to simplify unicode stuff, so most likely does not work 100% correctly! |
244 | | // |
245 | | |
246 | | // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused |
247 | | |
248 | | template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>> |
249 | | class llama_priority_queue : public std::priority_queue<T, Container, Compare> { |
250 | | public: |
251 | | using std::priority_queue<T, Container, Compare>::priority_queue; |
252 | | |
253 | 0 | T pop_move() { |
254 | 0 | T item = std::move(this->c.front()); |
255 | 0 | std::pop_heap(this->c.begin(), this->c.end(), this->comp); |
256 | 0 | this->c.pop_back(); |
257 | 0 | return item; |
258 | 0 | } |
259 | | |
260 | | void pop() = delete; |
261 | | }; |
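// Editorial note: the base class pop() discards the top element without
// returning it, so retrieving it normally costs a copy of top(). pop_move()
// instead moves the element out of the underlying container before
// re-heapifying, which avoids copying the std::string carried by each
// llm_bigram_bpe below, e.g.:
//
//     llm_bigram_bpe::queue queue;
//     // ... push candidate bigrams ...
//     llm_bigram_bpe best = queue.pop_move(); // moved out, not copied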
262 | | |
263 | | struct llm_bigram_bpe { |
264 | | struct comparator { |
265 | 0 | bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const { |
266 | 0 | return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); |
267 | 0 | } |
268 | | }; |
269 | | |
270 | | using queue_storage = std::vector<llm_bigram_bpe>; |
271 | | using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>; |
272 | | llm_symbol::index left; |
273 | | llm_symbol::index right; |
274 | | std::string text; |
275 | | int rank; |
276 | | size_t size; |
277 | | }; |
278 | | |
279 | | struct llm_tokenizer_bpe : llm_tokenizer { |
280 | 0 | llm_tokenizer_bpe(const llama_vocab & vocab) { |
281 | 0 | GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE); |
282 | 0 | switch (vocab.get_pre_type()) { |
283 | 0 | case LLAMA_VOCAB_PRE_TYPE_LLAMA3: |
284 | 0 | regex_exprs = { |
285 | | // original regex from tokenizer.json |
286 | | //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
287 | | |
288 | | // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 |
289 | 0 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
290 | 0 | }; |
291 | 0 | break; |
292 | 0 | case LLAMA_VOCAB_PRE_TYPE_DBRX: |
293 | 0 | case LLAMA_VOCAB_PRE_TYPE_SMAUG: |
294 | 0 | regex_exprs = { |
295 | | // same as llama3 |
296 | 0 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
297 | 0 | }; |
298 | 0 | break; |
299 | 0 | case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: |
300 | 0 | regex_exprs = { |
301 | 0 | "[\r\n]", |
302 | 0 | "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", |
303 | 0 | "\\s?[!-/:-~!-/:-~‘-‟ -。]+", |
304 | 0 | "\\s+$", |
305 | 0 | "[一-龥ࠀ-一가-]+", |
306 | 0 | "\\p{N}+", |
307 | 0 | }; |
308 | 0 | break; |
309 | 0 | case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM: |
310 | 0 | case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE: |
311 | 0 | regex_exprs = { |
312 | 0 | "\\p{N}{1,3}", |
313 | 0 | "[一-龥-ゟ゠-ヿ]+", |
314 | 0 | "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+", |
315 | 0 | }; |
316 | 0 | break; |
317 | 0 | case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: |
318 | 0 | regex_exprs = { |
319 | 0 | "[\r\n]", |
320 | 0 | "\\s?\\p{L}+", |
321 | 0 | "\\s?\\p{P}+", |
322 | 0 | "[一-龥ࠀ-一가-]+", |
323 | 0 | "\\p{N}", |
324 | 0 | }; |
325 | 0 | break; |
326 | 0 | case LLAMA_VOCAB_PRE_TYPE_FALCON: |
327 | 0 | regex_exprs = { |
328 | 0 | "[\\p{P}\\$\\+<=>\\^~\\|`]+", |
329 | 0 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", |
330 | 0 | "[0-9][0-9][0-9]", |
331 | 0 | }; |
332 | 0 | break; |
333 | 0 | case LLAMA_VOCAB_PRE_TYPE_STARCODER: |
334 | 0 | case LLAMA_VOCAB_PRE_TYPE_REFACT: |
335 | 0 | case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: |
336 | 0 | case LLAMA_VOCAB_PRE_TYPE_SMOLLM: |
337 | 0 | case LLAMA_VOCAB_PRE_TYPE_CODESHELL: |
338 | 0 | case LLAMA_VOCAB_PRE_TYPE_EXAONE: |
339 | 0 | case LLAMA_VOCAB_PRE_TYPE_MINERVA: |
340 | 0 | regex_exprs = { |
341 | 0 | "\\p{N}", |
342 | 0 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", |
343 | 0 | }; |
344 | 0 | break; |
345 | 0 | case LLAMA_VOCAB_PRE_TYPE_GPT2: |
346 | 0 | case LLAMA_VOCAB_PRE_TYPE_MPT: |
347 | 0 | case LLAMA_VOCAB_PRE_TYPE_OLMO: |
348 | 0 | case LLAMA_VOCAB_PRE_TYPE_JAIS: |
349 | 0 | case LLAMA_VOCAB_PRE_TYPE_TRILLION: |
350 | 0 | case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING: |
351 | 0 | regex_exprs = { |
352 | 0 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", |
353 | 0 | }; |
354 | 0 | break; |
355 | 0 | case LLAMA_VOCAB_PRE_TYPE_STABLELM2: |
356 | 0 | case LLAMA_VOCAB_PRE_TYPE_QWEN2: |
357 | 0 | case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: |
358 | 0 | regex_exprs = { |
359 | | // original regex from tokenizer.json |
360 | | // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" |
361 | 0 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
362 | 0 | }; |
363 | 0 | break; |
364 | 0 | case LLAMA_VOCAB_PRE_TYPE_PORO: |
365 | 0 | case LLAMA_VOCAB_PRE_TYPE_BLOOM: |
366 | 0 | case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: |
367 | 0 | regex_exprs = { |
368 | 0 | " ?[^(\\s|.,!?…。,、।۔،)]+", |
369 | 0 | }; |
370 | 0 | break; |
371 | 0 | case LLAMA_VOCAB_PRE_TYPE_CHATGLM4: |
372 | 0 | regex_exprs = { |
373 | 0 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
374 | 0 | }; |
375 | 0 | break; |
376 | 0 | case LLAMA_VOCAB_PRE_TYPE_VIKING: |
377 | 0 | regex_exprs = { |
378 | 0 | " ?[^(\\s|.,!?…。,、।۔،)]+", |
379 | 0 | "\\p{N}", |
380 | 0 | }; |
381 | 0 | break; |
382 | 0 | case LLAMA_VOCAB_PRE_TYPE_TEKKEN: |
383 | | // original regex from tokenizer.json |
384 | | // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" |
385 | 0 | regex_exprs = { |
386 | 0 | "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
387 | 0 | }; |
388 | 0 | break; |
389 | 0 | case LLAMA_VOCAB_PRE_TYPE_CHAMELEON: |
390 | | // Note: in theory, the special token (sentinel and image token) regex_exprs below |
391 | | // are unnecessary, as they are split in `tokenizer_st_partition` anyway. |
392 | | // However, since the upstream pre-tokenizer uses them, they are also |
393 | | // included here (see https://huggingface.co/facebook/chameleon-7b). |
394 | 0 | regex_exprs = { |
395 | 0 | "<sentinel:[0-9]+>", // Sentinel tokens |
396 | 0 | "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens |
397 | 0 | "([\\t\\n]| | )", // directly from tokenizer.json |
398 | 0 | "\\p{N}", // Individual digits |
399 | 0 | "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated |
400 | 0 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", |
401 | 0 | }; |
402 | 0 | break; |
403 | 0 | case LLAMA_VOCAB_PRE_TYPE_GPT4O: |
404 | 0 | case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2: |
405 | 0 | regex_exprs = { |
406 | | // original regex from tokenizer.json |
407 | | // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
408 | 0 | "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
409 | 0 | }; |
410 | 0 | break; |
411 | 0 | case LLAMA_VOCAB_PRE_TYPE_KIMI_K2: |
412 | 0 | regex_exprs = { |
413 | | // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp |
414 | | // The custom handler implements all K2 patterns with proper Han character exclusion |
415 | 0 | "\\p{Han}+", |
416 | 0 | }; |
417 | 0 | break; |
418 | 0 | case LLAMA_VOCAB_PRE_TYPE_SUPERBPE: |
419 | 0 | regex_exprs = { |
420 | 0 | "\\p{N}+", |
421 | 0 | "(?=(\\d{3})+(?!\\d))", |
422 | 0 | }; |
423 | 0 | break; |
424 | 0 | case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE: |
425 | 0 | regex_exprs = { |
426 | | // original regex from tokenizer.json |
427 | | // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+" |
428 | | // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?) |
429 | 0 | "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", |
430 | 0 | }; |
431 | 0 | break; |
432 | 0 | case LLAMA_VOCAB_PRE_TYPE_SEED_CODER: |
433 | 0 | regex_exprs = { |
434 | | // original regex from tokenizer.json |
435 | | // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+" |
436 | 0 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
437 | 0 | }; |
438 | 0 | break; |
439 | 0 | case LLAMA_VOCAB_PRE_TYPE_GROK_2: |
440 | 0 | regex_exprs = { |
441 | | // original regex from tokenizer.json |
442 | | // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" |
443 | 0 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
444 | 0 | }; |
445 | 0 | break; |
446 | 0 | case LLAMA_VOCAB_PRE_TYPE_AFMOE: |
447 | 0 | regex_exprs = { |
448 | | // Digit handling - uses custom implementation in unicode.cpp |
449 | | // Groups digits with leading 1-2 based on total length modulo 3 |
450 | 0 | "\\p{AFMoE_digits}", |
451 | | // CJK and Asian scripts (using direct Unicode literals) |
452 | 0 | "[一-鿿㐀-䶿豈--ゟ゠-ヿ・-゚⼀-เ--ក-က-႟ꩠ-ꩿꧠ-가-ᄀ-ᇿ]+", |
453 | | // Main BPE pattern |
454 | 0 | "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
455 | 0 | }; |
456 | 0 | break; |
457 | 0 | default: |
458 | | // default regex for BPE tokenization pre-processing |
459 | 0 | regex_exprs = { |
460 | 0 | "[\\p{P}\\$\\+<=>\\^~\\|]+", |
461 | 0 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", |
462 | 0 | "\\p{N}+", |
463 | 0 | "[0-9][0-9][0-9]", |
464 | 0 | }; |
465 | 0 | break; |
466 | 0 | } |
467 | 0 | } |
468 | | |
469 | | std::vector<std::string> regex_exprs; |
470 | | }; |
471 | | |
472 | | struct llm_tokenizer_bpe_session { |
473 | 0 | llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} |
474 | | |
475 | 0 | static void append(const llama_token token_id, std::vector<llama_token> & output) { |
476 | 0 | output.push_back(token_id); |
477 | 0 | } |
478 | | |
479 | 0 | bool append_bos(std::vector<llama_token> & output) const { |
480 | 0 | if (vocab.get_add_bos()) { |
481 | 0 | GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL); |
482 | 0 | output.push_back(vocab.token_bos()); |
483 | 0 | return true; |
484 | 0 | } |
485 | 0 | return false; |
486 | 0 | } |
487 | | |
488 | 0 | bool append_eos(std::vector<llama_token> & output) const { |
489 | 0 | if (vocab.get_add_eos()) { |
490 | 0 | GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL); |
491 | 0 | output.push_back(vocab.token_eos()); |
492 | 0 | return true; |
493 | 0 | } |
494 | 0 | return false; |
495 | 0 | } |
496 | | |
497 | 0 | void check_double_bos_eos(const std::vector<llama_token> & output) const { |
498 | 0 | if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) { |
499 | 0 | LLAMA_LOG_WARN( |
500 | 0 | "%s: Added a BOS token to the prompt as specified by the model but the prompt " |
501 | 0 | "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " |
502 | 0 | "Are you sure this is what you want?\n", __FUNCTION__); |
503 | 0 | } |
504 | 0 | if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) { |
505 | 0 | LLAMA_LOG_WARN( |
506 | 0 | "%s: Added a EOS token to the prompt as specified by the model but the prompt " |
507 | 0 | "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. " |
508 | 0 | "Are you sure this is what you want?\n", __FUNCTION__); |
509 | 0 | } |
510 | 0 | } |
511 | | |
512 | 0 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
513 | 0 | int final_prev_index = -1; |
514 | 0 | const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs); |
515 | |
516 | 0 | symbols_final.clear(); |
517 | |
518 | 0 | for (const auto & word : word_collection) { |
519 | 0 | work_queue = llm_bigram_bpe::queue(); |
520 | 0 | symbols.clear(); |
521 | |
522 | 0 | int index = 0; |
523 | 0 | size_t offset = 0; |
524 | | |
525 | | //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { |
526 | 0 | if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) { |
527 | 0 | symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()}); |
528 | 0 | offset = word.size(); |
529 | 0 | } |
530 | |
531 | 0 | while (offset < word.size()) { |
532 | 0 | llm_symbol sym; |
533 | 0 | size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset])); |
534 | 0 | sym.text = word.c_str() + offset; |
535 | 0 | sym.n = char_len; |
536 | 0 | offset += sym.n; |
537 | 0 | sym.prev = index - 1; |
538 | 0 | sym.next = offset == word.size() ? -1 : index + 1; |
539 | 0 | index++; |
540 | 0 | symbols.emplace_back(sym); |
541 | 0 | } |
542 | 0 | for (int i = 1; i < (int) symbols.size(); ++i) { |
543 | 0 | add_new_bigram(i - 1, i); |
544 | 0 | } |
545 | | |
546 | | // build token(s) |
547 | 0 | while (!work_queue.empty()) { |
548 | 0 | auto bigram = work_queue.pop_move(); |
549 | |
550 | 0 | auto & left_symbol = symbols[bigram.left]; |
551 | 0 | auto & right_symbol = symbols[bigram.right]; |
552 | |
553 | 0 | if (left_symbol.n == 0 || right_symbol.n == 0) { |
554 | 0 | continue; |
555 | 0 | } |
556 | 0 | std::string left_token = std::string(left_symbol.text, left_symbol.n); |
557 | 0 | std::string right_token = std::string(right_symbol.text, right_symbol.n); |
558 | 0 | if (left_token + right_token != bigram.text) { |
559 | 0 | continue; // Skip this bigram if it's outdated |
560 | 0 | } |
561 | | |
562 | | // merge the right sym into the left one |
563 | 0 | left_symbol.n += right_symbol.n; |
564 | 0 | right_symbol.n = 0; |
565 | | |
566 | | // remove the right sym from the chain |
567 | 0 | left_symbol.next = right_symbol.next; |
568 | 0 | if (right_symbol.next >= 0) { |
569 | 0 | symbols[right_symbol.next].prev = bigram.left; |
570 | 0 | } |
571 | |
572 | 0 | add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol |
573 | 0 | add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol |
574 | 0 | } |
575 | | |
576 | | // add the finished tokens to the final list keeping correct order for next and prev |
577 | 0 | for (auto & sym : symbols) { |
578 | 0 | if (sym.n > 0) { |
579 | 0 | sym.prev = final_prev_index; |
580 | 0 | sym.next = -1; |
581 | 0 | if (final_prev_index != -1) { |
582 | 0 | symbols_final[final_prev_index].next = symbols_final.size(); |
583 | 0 | } |
584 | 0 | symbols_final.emplace_back(sym); |
585 | 0 | final_prev_index = symbols_final.size() - 1; |
586 | 0 | } |
587 | 0 | } |
588 | 0 | } |
589 | |
590 | 0 | symbols = symbols_final; |
591 | |
592 | 0 | if (!symbols.empty()) { |
593 | 0 | for (int i = 0; i != -1; i = symbols[i].next) { |
594 | 0 | auto & symbol = symbols[i]; |
595 | 0 | if (symbol.n == 0) { |
596 | 0 | continue; |
597 | 0 | } |
598 | | |
599 | 0 | const std::string str = std::string(symbol.text, symbol.n); |
600 | 0 | const auto token = vocab.text_to_token(str); |
601 | |
602 | 0 | if (token == LLAMA_TOKEN_NULL) { |
603 | 0 | for (auto j = str.begin(); j != str.end(); ++j) { |
604 | 0 | std::string byte_str(1, *j); |
605 | 0 | auto token_multibyte = vocab.text_to_token(byte_str); |
606 | 0 | if (token_multibyte != LLAMA_TOKEN_NULL) { |
607 | 0 | output.push_back(token_multibyte); |
608 | 0 | } |
609 | 0 | } |
610 | 0 | } else { |
611 | 0 | output.push_back(token); |
612 | 0 | } |
613 | 0 | } |
614 | 0 | } |
615 | 0 | } |
616 | | |
617 | | private: |
618 | 0 | void add_new_bigram(int left, int right) { |
619 | 0 | if (left == -1 || right == -1) { |
620 | 0 | return; |
621 | 0 | } |
622 | 0 | std::string left_token = std::string(symbols[left].text, symbols[left].n); |
623 | 0 | std::string right_token = std::string(symbols[right].text, symbols[right].n); |
624 | |
625 | 0 | int rank_found = -1; |
626 | |
627 | 0 | rank_found = vocab.find_bpe_rank(left_token, right_token); |
628 | |
629 | 0 | if (rank_found < 0) { |
630 | 0 | return; |
631 | 0 | } |
632 | | |
633 | 0 | llm_bigram_bpe bigram; |
634 | |
635 | 0 | bigram.left = left; |
636 | 0 | bigram.right = right; |
637 | 0 | bigram.text = left_token + right_token; |
638 | 0 | bigram.size = left_token.size() + right_token.size(); |
639 | 0 | bigram.rank = rank_found; |
640 | |
641 | 0 | work_queue.push(bigram); |
642 | 0 | } |
643 | | |
644 | | const llama_vocab & vocab; |
645 | | const llm_tokenizer_bpe & tokenizer; |
646 | | |
647 | | std::vector<llm_symbol> symbols; |
648 | | std::vector<llm_symbol> symbols_final; |
649 | | llm_bigram_bpe::queue work_queue; |
650 | | }; |
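// Sketch of a full BPE pass (editorial illustration; `vocab` stands for a
// loaded BPE-type vocabulary):
static void bpe_tokenize_demo(const llama_vocab & vocab, std::vector<llama_token> & out) {
    llm_tokenizer_bpe tokenizer(vocab); // selects regex_exprs from the pre-type
    llm_tokenizer_bpe_session session(vocab, tokenizer);

    session.append_bos(out);
    session.tokenize("Hello world", out); // regex pre-split, then rank-ordered merges
    session.append_eos(out);
    session.check_double_bos_eos(out);
}
// Unlike SPM, merge order is driven by merge rank (lower = learned earlier),
// not token score: with made-up ranks ("l","o") = 1 and ("lo","w") = 2, the
// word "lower" merges [l][o][w][e][r] -> [lo][w][e][r] -> [low][e][r].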
651 | | |
652 | | // |
653 | | // WPM tokenizer |
654 | | // |
655 | | |
656 | | struct llm_tokenizer_wpm : llm_tokenizer { |
657 | 0 | llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {} |
658 | | }; |
659 | | |
660 | | struct llm_tokenizer_wpm_session { |
661 | 0 | llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {} |
662 | | |
663 | 0 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
664 | | // normalize and split by whitespace |
665 | 0 | std::vector<std::string> words = preprocess(text); |
666 | | // bos token prepended already |
667 | | |
668 | | // find the longest tokens that form the words |
669 | 0 | for (const std::string & word : words) { |
670 | | // skip empty words |
671 | 0 | if (word.size() == 0) { |
672 | 0 | continue; |
673 | 0 | } |
674 | | |
675 | | // prepend phantom space |
676 | 0 | const std::string word1 = "\xe2\x96\x81" + word; |
677 | 0 | const int n = word1.size(); |
678 | |
679 | 0 | const size_t current_tokens = output.size(); |
680 | | |
681 | | // we're at the start of a new word |
682 | | // move through character position in word |
683 | 0 | for (int i = 0; i < n; ++i) { |
684 | | // loop through possible match length |
685 | 0 | bool match = false; |
686 | 0 | for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) { |
687 | 0 | auto id = vocab.text_to_token(word1.substr(i, j - i)); |
688 | 0 | if (id != LLAMA_TOKEN_NULL) { |
689 | 0 | output.push_back(id); |
690 | 0 | match = true; |
691 | 0 | i = j - 1; |
692 | 0 | break; |
693 | 0 | } |
694 | 0 | } |
695 | |
696 | 0 | if (!match) { // discard all |
697 | 0 | output.resize(current_tokens); |
698 | 0 | break; // and discard next tokens |
699 | 0 | } |
700 | 0 | } |
701 | | |
702 | | // we didn't find any matches for this word |
703 | 0 | if (current_tokens == output.size()) { |
704 | 0 | output.push_back(vocab.token_unk()); |
705 | 0 | } |
706 | 0 | } |
707 | 0 | } |
708 | | |
709 | | // TODO: reduce string copies by using cpts_offs array |
710 | 0 | static std::vector<std::string> preprocess(const std::string & text) { |
711 | 0 | const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); |
712 | 0 | std::vector<std::string> words(1, ""); |
713 | |
714 | 0 | for (const uint32_t cpt : cpts_nfd) { |
715 | 0 | const auto flags = unicode_cpt_flags_from_cpt(cpt); |
716 | |
717 | 0 | if (flags.is_whitespace) { |
718 | 0 | if (words.back().size()) { // finish previous word if any |
719 | 0 | words.emplace_back(); |
720 | 0 | } |
721 | 0 | continue; |
722 | 0 | } |
723 | | |
724 | 0 | assert (!flags.is_separator); |
725 | 0 | if (cpt == 0 || cpt == 0xFFFD || flags.is_control) { |
726 | 0 | continue; |
727 | 0 | } |
728 | | |
729 | 0 | const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt)); |
730 | 0 | if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) { |
731 | 0 | if (words.back().size()) { // finish previous word if any |
732 | 0 | words.emplace_back(); |
733 | 0 | } |
734 | 0 | words.back() = s; // single char word |
735 | 0 | words.emplace_back(); // start a new word |
736 | 0 | } else { |
737 | 0 | words.back() += s; // append char to word |
738 | 0 | } |
739 | 0 | } |
740 | |
741 | 0 | if (!words.back().size()) { |
742 | 0 | words.pop_back(); |
743 | 0 | } |
744 | |
745 | 0 | return words; |
746 | 0 | } |
747 | | |
748 | 0 | static bool is_chinese_char(uint32_t cpt) { |
749 | 0 | return |
750 | 0 | (cpt >= 0x04E00 && cpt <= 0x09FFF) || |
751 | 0 | (cpt >= 0x03400 && cpt <= 0x04DBF) || |
752 | 0 | (cpt >= 0x20000 && cpt <= 0x2A6DF) || |
753 | 0 | (cpt >= 0x2A700 && cpt <= 0x2B73F) || |
754 | 0 | (cpt >= 0x2B740 && cpt <= 0x2B81F) || |
755 | 0 | (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920 |
756 | 0 | (cpt >= 0x0F900 && cpt <= 0x0FAFF) || |
757 | 0 | (cpt >= 0x2F800 && cpt <= 0x2FA1F); |
758 | | //(cpt >= 0x3000 && cpt <= 0x303F) || |
759 | | //(cpt >= 0xFF00 && cpt <= 0xFFEF); |
760 | 0 | } |
761 | | |
762 | | private: |
763 | | const llama_vocab & vocab; |
764 | | // currently unused |
765 | | // const llm_tokenizer_wpm * wpm_tokenizer; |
766 | | }; |
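// preprocess() does not depend on the vocab, so its effect can be shown
// directly (expected split per the rules above; editorial illustration):
static void wpm_preprocess_demo() {
    // lowercased and NFD-normalized; punctuation and CJK characters become
    // single-character words, whitespace delimits the rest:
    // "Hello, 世界!" -> { "hello", ",", "世", "界", "!" }
    const std::vector<std::string> words = llm_tokenizer_wpm_session::preprocess("Hello, 世界!");
    GGML_ASSERT(words.size() == 5);
}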
767 | | |
768 | | // |
769 | | // UGM tokenizer |
770 | | // |
771 | | |
772 | | struct llm_tokenizer_ugm : llm_tokenizer { |
773 | 0 | llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) { |
774 | 0 | if (precompiled_charsmap.size() > 0) { |
775 | 0 | size_t charsmap_offset = 0; |
776 | | |
777 | | // The first four bytes of precompiled_charsmap contain the length of the binary |
778 | | // blob containing XOR-compressed compact double array (XCDA) entries |
779 | 0 | uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0]; |
780 | 0 | charsmap_offset += sizeof(xcda_blob_size); |
781 | 0 | if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) { |
782 | 0 | throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); |
783 | 0 | } |
784 | | |
785 | | // Next xcda_blob_size bytes contain entries of XOR-compressed compact |
786 | | // double array (XCDA). Each entry is bit-packed into a 32-bit integer. |
787 | 0 | xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset]; |
788 | 0 | xcda_array_size = xcda_blob_size / sizeof(uint32_t); |
789 | 0 | charsmap_offset += xcda_blob_size; |
790 | | |
791 | | // Remaining bytes of precompiled charsmap contain null-terminated |
792 | | // replacement strings for prefixes matched by the XCDA. |
793 | 0 | prefix_replacements = &precompiled_charsmap[charsmap_offset]; |
794 | 0 | prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset; |
795 | 0 | } |
796 | | |
797 | 0 | for (uint32_t id = 0; id < vocab.n_tokens(); ++id) { |
798 | 0 | const auto & token_data = vocab.get_token_data(id); |
799 | |
800 | 0 | if (vocab.is_normal(id)) { |
801 | 0 | min_score = std::min<float>(min_score, token_data.score); |
802 | 0 | max_score = std::max<float>(max_score, token_data.score); |
803 | 0 | } |
804 | |
805 | 0 | if (vocab.is_normal(id) || |
806 | 0 | vocab.is_user_defined(id) || |
807 | 0 | vocab.is_unused(id)) { |
808 | 0 | token_matcher.insert(token_data.text.data(), token_data.text.size(), id); |
809 | 0 | } |
810 | |
811 | 0 | if (vocab.is_user_defined(id)) { |
812 | 0 | user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size()); |
813 | 0 | } |
814 | 0 | } |
815 | |
816 | 0 | unknown_token_score = min_score - unknown_token_score_penalty; |
817 | 0 | } |
818 | | |
819 | | // escaped space symbol - U+2581 (Lower One Eighth Block) |
820 | | const std::string escaped_space = "\xE2\x96\x81"; |
821 | | |
822 | | const char * prefix_replacements = NULL; |
823 | | size_t prefix_replacements_size = 0; |
824 | | |
825 | | const uint32_t * xcda_array = NULL; |
826 | | size_t xcda_array_size = 0; |
827 | | |
828 | | struct naive_trie user_defined_token_matcher; |
829 | | |
830 | | float min_score = FLT_MAX; |
831 | | float max_score = -FLT_MAX; |
832 | | |
833 | | float unknown_token_score_penalty = 10.0; |
834 | | float unknown_token_score; |
835 | | |
836 | | struct naive_trie token_matcher; |
837 | | }; |
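// Byte layout of `precompiled_charsmap` as parsed by the constructor above
// (editorial summary):
//
//   [0, 4)                     uint32_t xcda_blob_size
//   [4, 4 + xcda_blob_size)    XCDA blob: xcda_blob_size / 4 packed uint32_t entries
//   [4 + xcda_blob_size, end)  NUL-terminated replacement strings; leaf values
//                              in the XCDA are byte offsets into this region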
838 | | |
839 | | struct llm_tokenizer_ugm_session { |
840 | 0 | llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} |
841 | | |
842 | | /* This implementation is based on SentencePiece optimized Viterbi algorithm for |
843 | | * unigram language models. The general idea is to: |
844 | | * - move along the input sequence in steps of one UTF code point, |
845 | | * - at each step find all possible tokenizations of the prefix by |
846 | | * traversing the tokens trie, |
847 | | * - for each tokenization store the best one so far (by higher score) |
848 | | * - use the position in sequence after given token as an index to store |
849 | | * results |
850 | | * - if there was no valid tokenization of the current UTF code point |
851 | | * then use unknown token with additional score penalty |
852 | | * After processing the whole sequence we backtrack from the end to get |
853 | | * the best tokenization. |
854 | | */ |
855 | 0 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
856 | | // get current size of output (for reversal later) |
857 | 0 | size_t output_size = output.size(); |
858 | | |
859 | | // normalize the input first |
860 | 0 | std::string normalized; |
861 | 0 | normalize(text, &normalized); |
862 | 0 | size_t input_len = normalized.size(); |
863 | 0 | if (input_len == 0) { |
864 | 0 | return; |
865 | 0 | } |
866 | | |
867 | | // initialize score_sum to -DBL_MAX so it will always be lower than sums of token scores |
868 | 0 | std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX}); |
869 | | // at the beginning tokenization score is zero |
870 | 0 | tokenization_results[0] = { vocab.token_unk(), 0, 0 }; |
871 | |
872 | 0 | for (size_t input_offset = 0; input_offset < input_len;) { |
873 | 0 | size_t prefix_offset = input_offset; |
874 | | // calculate how many code units are in the currently processed UTF code point |
875 | 0 | size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset); |
876 | | |
877 | | // traverse the token matcher trie to find a matching token |
878 | 0 | bool single_codepoint_token_found = false; |
879 | 0 | const struct best_tokenization & current_best = tokenization_results[input_offset]; |
880 | 0 | const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]); |
881 | |
882 | 0 | while (prefix_offset <= input_len && node != NULL) { |
883 | | // check if we found valid token in prefix |
884 | 0 | if (node->has_value) { |
885 | | // check if it corresponds to the whole UTF code point |
886 | 0 | if (prefix_offset - input_offset == n_utf8_code_units) { |
887 | 0 | single_codepoint_token_found = true; |
888 | 0 | } |
889 | 0 | llama_token token_id = node->value; |
890 | 0 | const auto & token_data = vocab.get_token_data(token_id); |
891 | | |
892 | | // we set the user-defined token scores to 0 to make them more likely to be selected |
893 | | // (normal token scores are log probabilities, so they are negative) |
894 | | // score type is double here to make tokenization results exactly |
895 | | // the same as in the HF tokenizer using SentencePiece |
896 | 0 | const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score; |
897 | 0 | const double challenger_score = current_best.score_sum + token_score; |
898 | 0 | struct best_tokenization & current_champ = tokenization_results[prefix_offset]; |
899 | 0 | if (challenger_score > current_champ.score_sum) { |
900 | 0 | struct best_tokenization challenger = { token_id, input_offset, challenger_score }; |
901 | 0 | current_champ = challenger; |
902 | 0 | } |
903 | 0 | } |
904 | 0 | node = node->traverse(normalized[prefix_offset++]); |
905 | 0 | } |
906 | | |
907 | | // if we didn't find a valid token corresponding to the whole UTF code point |
908 | | // then use unknown token as the tokenization of this UTF code point |
909 | 0 | if (!single_codepoint_token_found) { |
910 | 0 | const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score; |
911 | 0 | prefix_offset = input_offset + n_utf8_code_units; |
912 | 0 | struct best_tokenization & current_champ = tokenization_results[prefix_offset]; |
913 | 0 | if (challenger_score > current_champ.score_sum) { |
914 | 0 | struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score }; |
915 | 0 | current_champ = challenger; |
916 | 0 | } |
917 | 0 | } |
918 | | |
919 | | // move to the next UTF code point |
920 | 0 | input_offset += n_utf8_code_units; |
921 | 0 | } |
922 | | |
923 | | // now backtrack from the end to gather token ids of the best tokenization |
924 | | // merge sequences of consecutive unknown tokens into single unknown tokens |
925 | 0 | bool is_prev_unknown = false; |
926 | 0 | for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) { |
927 | 0 | bool is_unknown = tokenization.token_id == vocab.token_unk(); |
928 | 0 | if (!(is_prev_unknown && is_unknown)) { |
929 | 0 | output.push_back(tokenization.token_id); |
930 | 0 | } |
931 | 0 | if (tokenization.input_offset == 0) { |
932 | 0 | break; |
933 | 0 | } |
934 | 0 | is_prev_unknown = is_unknown; |
935 | 0 | } |
936 | | |
937 | | // reverse the output since we added tokens starting from the end of the input |
938 | 0 | std::reverse(output.begin() + output_size, output.end()); |
939 | 0 | } |
940 | | |
941 | | private: |
942 | | |
943 | | // helper structure for returning normalization results |
944 | | struct normalization_result { |
945 | | const char * normalized; |
946 | | size_t normalized_len; |
947 | | size_t consumed_input; |
948 | | }; |
949 | | |
950 | 0 | void normalize(const std::string& input, std::string * normalized) { |
951 | 0 | normalized->clear(); |
952 | 0 | normalized->reserve(input.size() * 3); |
953 | |
954 | 0 | const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " "; |
955 | |
956 | 0 | const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix(); |
957 | 0 | const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix(); |
958 | 0 | const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces(); |
959 | |
960 | 0 | bool is_space_prepended = false; |
961 | 0 | bool processing_non_ws = false; |
962 | |
963 | 0 | size_t input_len = input.size(); |
964 | |
965 | 0 | for (size_t input_offset = 0; input_offset < input_len; ) { |
966 | 0 | auto norm_res = normalize_prefix(input, input_offset); |
967 | 0 | for (size_t i = 0; i < norm_res.normalized_len; i++) { |
968 | 0 | char c = norm_res.normalized[i]; |
969 | 0 | if (c != ' ') { |
970 | 0 | if (!processing_non_ws) { |
971 | 0 | processing_non_ws = true; |
972 | 0 | if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) { |
973 | 0 | normalized->append(space); |
974 | 0 | is_space_prepended = true; |
975 | 0 | } |
976 | 0 | } |
977 | 0 | normalized->push_back(c); |
978 | 0 | } else { |
979 | 0 | if (processing_non_ws) { |
980 | 0 | processing_non_ws = false; |
981 | 0 | } |
982 | 0 | if (!shall_merge_spaces) { |
983 | 0 | normalized->append(space); |
984 | 0 | } |
985 | 0 | } |
986 | 0 | } |
987 | |
988 | 0 | input_offset += norm_res.consumed_input; |
989 | 0 | } |
990 | |
991 | 0 | if (shall_append_space) { |
992 | 0 | normalized->append(space); |
993 | 0 | } |
994 | 0 | } |
995 | | |
996 | | /* |
997 | | * This structure is a view wrapper for XOR-compressed double array (XCDA) |
998 | | * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries. |
999 | | * Each bit-packed entry contains: |
1000 | | * - BASE array value in bits 10-30 (shifted left by 8 when bit 9 is set) |
1001 | | * - LCHECK array value in bits 0-7 |
1002 | | * - LEAF array value in bit 8 |
1003 | | * Entries containing indexes of replacement sequences have bit 31 set |
1004 | | */ |
1005 | | struct xcda_array_view { |
1006 | | public: |
1007 | 0 | xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) { |
1008 | 0 | } |
1009 | 0 | uint32_t get_base(size_t index) { |
1010 | 0 | uint32_t packed_node = get_node(index); |
1011 | 0 | return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6); |
1012 | 0 | } |
1013 | 0 | uint32_t get_lcheck(size_t index) { |
1014 | 0 | uint32_t packed_node = get_node(index); |
1015 | 0 | return packed_node & ((1U << 31) | 0xff); |
1016 | 0 | } |
1017 | 0 | bool get_leaf(size_t index) { |
1018 | 0 | uint32_t packed_node = get_node(index); |
1019 | 0 | return (packed_node >> 8) & 1; |
1020 | 0 | } |
1021 | 0 | uint32_t get_value(size_t index) { |
1022 | 0 | uint32_t packed_node = get_node(index); |
1023 | 0 | return packed_node & ((1U << 31) - 1); |
1024 | 0 | } |
1025 | | private: |
1026 | 0 | uint32_t get_node(size_t index) { |
1027 | 0 | if (index >= xcda_array_size) { |
1028 | 0 | throw std::runtime_error("Index out of array bounds in XCDA array!"); |
1029 | 0 | } |
1030 | 0 | return xcda_array[index]; |
1031 | 0 | } |
1032 | | const uint32_t * xcda_array; |
1033 | | size_t xcda_array_size; |
1034 | | }; |
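// Worked example with a made-up entry: packed_node = 0x1761
//   get_lcheck() -> 0x61 ('a')                            (bits 0-7)
//   get_leaf()   -> 1                                     (bit 8)
//   get_base()   -> (0x1761 >> 10) << 8 = 5 << 8 = 0x500  (bit 9 set, so shifted)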
1035 | | |
1036 | | // this structure stores the best tokenization so far at input_offset |
1037 | | struct best_tokenization { |
1038 | | llama_token token_id; |
1039 | | size_t input_offset; |
1040 | | double score_sum; |
1041 | | }; |
1042 | | |
1043 | 0 | struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) { |
1044 | 0 | if (input_offset == input.size()) { |
1045 | 0 | return { &input[input_offset], 0, 0 }; |
1046 | 0 | } |
1047 | | |
1048 | | // if input prefix matches some user-defined token return this token as normalization result |
1049 | 0 | auto user_defined_token_match = |
1050 | 0 | tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset); |
1051 | 0 | if (user_defined_token_match.second > 0) { |
1052 | 0 | return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second }; |
1053 | 0 | } |
1054 | | |
1055 | 0 | size_t longest_prefix_length = 0; |
1056 | 0 | size_t longest_prefix_offset = 0; |
1057 | |
1058 | 0 | if (tokenizer.xcda_array_size > 0) { |
1059 | 0 | struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size); |
1060 | | |
1061 | | // Find the longest normalized sequence matching the input prefix by walking |
1062 | | // the XOR-compressed compact double array (XCDA) starting from the root node |
1063 | | // We find the index of the next node by calculating BASE[s] ^ c where s is |
1064 | | // the index of the previous node and c is a numerical character value |
1065 | 0 | uint32_t node_index = 0; |
1066 | | // get BASE of the root node |
1067 | 0 | node_index = xcda_view.get_base(node_index); |
1068 | 0 | for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) { |
1069 | 0 | unsigned char c = input[prefix_offset]; |
1070 | 0 | if (c == 0) { |
1071 | 0 | break; |
1072 | 0 | } |
1073 | 0 | node_index ^= c; |
1074 | | // if value of LCHECK is not c it means that this is not a child of |
1075 | | // the previous node, so we stop matching |
1076 | 0 | if (xcda_view.get_lcheck(node_index) != c) { |
1077 | 0 | break; |
1078 | 0 | } |
1079 | 0 | bool is_leaf = xcda_view.get_leaf(node_index); |
1080 | | // get BASE of the current node |
1081 | 0 | node_index ^= xcda_view.get_base(node_index); |
1082 | | // if LEAF of the current node is true, it means that its BASE points to the node |
1083 | | // containing index of replacement sequence for currently matched input prefix |
1084 | 0 | if (is_leaf) |
1085 | 0 | { |
1086 | 0 | longest_prefix_length = prefix_offset - input_offset + 1; |
1087 | | // get index of replacement sequence for currently matched input prefix |
1088 | 0 | longest_prefix_offset = xcda_view.get_value(node_index); |
1089 | 0 | } |
1090 | 0 | } |
1091 | 0 | } |
1092 | |
1093 | 0 | if (longest_prefix_length > 0) { |
1094 | | // we have a match, so return the replacement sequence |
1095 | 0 | if (longest_prefix_offset >= tokenizer.prefix_replacements_size) { |
1096 | 0 | throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); |
1097 | 0 | } |
1098 | 0 | const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset]; |
1099 | 0 | return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length }; |
1100 | 0 | } |
1101 | | |
1102 | | // check if the input prefix contains a valid sequence of UTF-8 code units |
1103 | 0 | try { |
1104 | | // if yes, return this sequence unmodified |
1105 | 0 | size_t prefix_offset = input_offset; |
1106 | 0 | unicode_cpt_from_utf8(input, prefix_offset); |
1107 | 0 | return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset }; |
1108 | 0 | } catch (std::invalid_argument & /*ex*/) { |
1109 | | // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER |
1110 | 0 | return { "\xEF\xBF\xBD", 3, 1 }; |
1111 | 0 | } |
1112 | 0 | } |
1113 | | |
1114 | | const llama_vocab & vocab; |
1115 | | const llm_tokenizer_ugm & tokenizer; |
1116 | | }; |
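// Worked example of the Viterbi pass above (made-up vocab and scores):
// input "abc" with tokens "a" (-1.0), "b" (-2.0), "c" (-2.0), "ab" (-2.5)
// and "bc" (-3.0); best[i] denotes the best score over the first i bytes:
//   best[1] = best[0] + score("a") = -1.0                             via "a"
//   best[2] = max(best[1] + score("b"), best[0] + score("ab")) = -2.5 via "ab"
//   best[3] = max(best[2] + score("c"), best[1] + score("bc")) = -4.0 via "bc"
// Backtracking through the stored input offsets collects ["bc", "a"], which
// tokenize() then reverses into the final order ["a", "bc"].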
1117 | | |
1118 | | // |
1119 | | // RWKV tokenizer |
1120 | | // |
1121 | | |
1122 | 0 | static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) { |
1123 | 0 | std::vector<uint8_t> output; |
1124 | 0 | output.reserve(escaped.size()); |
1125 | | |
1126 | | // Parser state |
1127 | 0 | bool escaping = false; |
1128 | 0 | uint8_t hex_remaining = 0; |
1129 | 0 | uint8_t hex_acc = 0; |
1130 | | |
1131 | | // Step through characters, performing parsing |
1132 | 0 | for (const char & c : escaped) { |
1133 | | // If we're parsing a hex code, interpret the next character |
1134 | 0 | if (hex_remaining != 0) { |
1135 | 0 | uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0'); |
1136 | 0 | hex_acc = (hex_acc << 4) + value; |
1137 | |
1138 | 0 | hex_remaining -= 1; |
1139 | 0 | if (hex_remaining == 0) { |
1140 | 0 | output.push_back(hex_acc); |
1141 | 0 | hex_acc = 0; |
1142 | 0 | } |
1143 | |
1144 | 0 | continue; |
1145 | 0 | } |
1146 | | |
1147 | | // If we got an escape character, interpret it |
1148 | 0 | if (escaping) { |
1149 | 0 | if (c == 't') { |
1150 | 0 | output.push_back('\t'); |
1151 | 0 | } else if (c == 'n') { |
1152 | 0 | output.push_back('\n'); |
1153 | 0 | } else if (c == 'r') { |
1154 | 0 | output.push_back('\r'); |
1155 | 0 | } else if (c == 'x') { |
1156 | 0 | hex_remaining = 2; |
1157 | 0 | } else { |
1158 | 0 | output.push_back(c); |
1159 | 0 | } |
1160 | |
1161 | 0 | escaping = false; |
1162 | 0 | continue; |
1163 | 0 | } |
1164 | | |
1165 | 0 | if (c == '\\') { |
1166 | 0 | escaping = true; |
1167 | 0 | continue; |
1168 | 0 | } |
1169 | | |
1170 | 0 | output.push_back(c); |
1171 | 0 | } |
1172 | |
1173 | 0 | return output; |
1174 | 0 | } |
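// Decoding sketch (editorial illustration): the escaped entry "\\n\\x41B",
// i.e. the seven characters \ n \ x 4 1 B, unescapes to three raw bytes:
static void rwkv_unescape_demo() {
    const std::vector<uint8_t> bytes = llama_unescape_rwkv_token("\\n\\x41B");
    GGML_ASSERT(bytes.size() == 3 && bytes[0] == '\n' && bytes[1] == 0x41 && bytes[2] == 'B');
}
// note: the hex branch above assumes lowercase a-f (and digits), so "\x4A"
// would decode incorrectly; the RWKV vocab files are expected to use
// lowercase hex escapes.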
1175 | | |
1176 | | struct llm_tokenizer_rwkv : llm_tokenizer { |
1177 | 0 | llm_tokenizer_rwkv(const llama_vocab & vocab) { |
1178 | | // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens. |
1179 | | // For now, we decode the vocab here into the lookup we'll use for tokenization. |
1180 | | |
1181 | | // build trie |
1182 | 0 | for (uint32_t id = 0; id < vocab.n_tokens(); ++id) { |
1183 | 0 | const auto & data = vocab.get_token_data(id); |
1184 | 0 | const auto text = llama_unescape_rwkv_token(data.text); |
1185 | 0 | token_matcher.insert((const char *) text.data(), text.size(), id); |
1186 | 0 | } |
1187 | 0 | } |
1188 | | |
1189 | | struct naive_trie token_matcher; |
1190 | | }; |
1191 | | |
1192 | | struct llm_tokenizer_rwkv_session { |
1193 | 0 | llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} |
1194 | | |
1195 | 0 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
1196 | 0 | uint32_t position = 0; |
1197 | 0 | while (position < text.size()) { |
1198 | 0 | const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]); |
1199 | 0 | if (node == NULL) { |
1200 | | // no matching token found, add unknown token |
1201 | 0 | output.push_back(vocab.token_unk()); |
1202 | 0 | position += 1; |
1203 | 0 | continue; |
1204 | 0 | } |
1205 | | |
1206 | | // traverse the trie to find the longest matching token |
1207 | 0 | uint32_t token_id = 0; |
1208 | 0 | uint32_t token_length = 0; |
1209 | 0 | while (node != NULL) { |
1210 | 0 | if (node->has_value) { |
1211 | 0 | token_id = node->value; |
1212 | 0 | token_length = position + 1; |
1213 | 0 | } |
1214 | 0 | node = node->traverse(text[++position]); |
1215 | 0 | } |
1216 | | |
1217 | | // add the longest matching token |
1218 | 0 | output.push_back(token_id); |
1219 | 0 | position = token_length; |
1220 | 0 | } |
1221 | 0 | } |
1222 | | |
1223 | | private: |
1224 | | const llama_vocab & vocab; |
1225 | | const llm_tokenizer_rwkv & tokenizer; |
1226 | | }; |
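// Greedy longest-match sketch (hypothetical trie contents): with "a", "ab"
// and "abc" inserted, the input "abd" walks the trie through 'a' and 'b',
// fails at 'd', emits the id recorded for "ab" (the deepest node with
// has_value set), and resumes scanning at position 2.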
1227 | | |
1228 | | struct llm_tokenizer_plamo2 : llm_tokenizer { |
1229 | 0 | llm_tokenizer_plamo2(const llama_vocab & vocab) { |
1230 | 0 | build(vocab); |
1231 | 0 | } |
1232 | | |
1233 | 0 | void build(const llama_vocab & vocab) { |
1234 | | // Reset internal structures |
1235 | 0 | tokens_.clear(); |
1236 | 0 | bytes_.assign(256, 0); |
1237 | 0 | to_suffix_id_.clear(); |
1238 | 0 | table_.clear(); |
1239 | | |
1240 | | // Build token list and byte mapping |
1241 | 0 | std::unordered_map<std::string, float> suffix_to_score; |
1242 | 0 | std::unordered_map<std::string, llama_token> token_to_id; |
1243 | |
1244 | 0 | for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) { |
1245 | 0 | const auto & entry = vocab.get_token_data(token_id); |
1246 | 0 | tokens_.push_back(entry.text); |
1247 | 0 | token_to_id[entry.text] = static_cast<llama_token>(token_id); |
1248 | | |
1249 | | // Handle byte tokens |
1250 | 0 | if (vocab.is_byte(token_id)) { |
1251 | 0 | if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') { |
1252 | 0 | std::string hex_str = entry.text.substr(3, 2); |
1253 | 0 | int byte_val = std::stoi(hex_str, nullptr, 16); |
1254 | 0 | bytes_[byte_val] = static_cast<llama_token>(token_id); |
1255 | 0 | } |
1256 | 0 | continue; |
1257 | 0 | } |
1258 | | |
1259 | | // Add token and all its suffixes to suffix_to_score |
1260 | 0 | suffix_to_score[entry.text] = entry.score; |
1261 | | |
1262 | | // Extract suffixes character by character (UTF-8 aware) |
1263 | 0 | std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text); |
1264 | 0 | for (size_t i = 1; i < cpts.size(); ++i) { |
1265 | 0 | std::string suffix; |
1266 | 0 | for (size_t j = i; j < cpts.size(); ++j) { |
1267 | 0 | suffix += unicode_cpt_to_utf8(cpts[j]); |
1268 | 0 | } |
1269 | 0 | if (suffix_to_score.find(suffix) == suffix_to_score.end()) { |
1270 | 0 | suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN(); |
1271 | 0 | } |
1272 | 0 | } |
1273 | 0 | } |
1274 | | |
1275 | | // Check that all byte tokens are set |
1276 | 0 | for (int i = 0; i < 256; ++i) { |
1277 | 0 | if (bytes_[i] == 0) { |
1278 | 0 | throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set"); |
1279 | 0 | } |
1280 | 0 | } |
1281 | | |
1282 | | // Build suffix list in lexicographical order of reversed strings |
1283 | 0 | std::vector<std::string> suffixes; |
1284 | 0 | suffixes.reserve(suffix_to_score.size() + 1); |
1285 | 0 | for (const auto & pair : suffix_to_score) { |
1286 | 0 | suffixes.push_back(pair.first); |
1287 | 0 | } |
1288 | 0 | suffixes.push_back(""); // Empty suffix |
1289 | |
1290 | 0 | std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) { |
1291 | 0 | std::string rev_a(a.rbegin(), a.rend()); |
1292 | 0 | std::string rev_b(b.rbegin(), b.rend()); |
1293 | 0 | return rev_a < rev_b; |
1294 | 0 | }); |
1295 | | |
1296 | | // Build suffix_to_id and to_suffix_id_ |
1297 | 0 | std::unordered_map<std::string, int32_t> suffix_to_id; |
1298 | 0 | int32_t num_pieces = 0; |
1299 | |
1300 | 0 | for (const auto & suffix : suffixes) { |
1301 | 0 | suffix_to_id[suffix] = num_pieces; |
1302 | 0 | if (!suffix.empty()) { |
1303 | 0 | std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix); |
1304 | |
1305 | 0 | std::string remaining; |
1306 | 0 | for (size_t i = 1; i < cpts.size(); ++i) { |
1307 | 0 | remaining += unicode_cpt_to_utf8(cpts[i]); |
1308 | 0 | } |
1309 | |
1310 | 0 | int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining]; |
1311 | 0 | to_suffix_id_[piece_code] = num_pieces; |
1312 | | |
1313 | | // Count number of pieces for this suffix |
1314 | 0 | int32_t pieces_for_suffix = 1; // sentinel row |
1315 | 0 | for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) { |
1316 | 0 | std::string piece; |
1317 | 0 | for (int32_t i = 0; i < piece_length; ++i) { |
1318 | 0 | piece += unicode_cpt_to_utf8(cpts[i]); |
1319 | 0 | } |
1320 | 0 | if (suffix_to_score.find(piece) != suffix_to_score.end()) { |
1321 | 0 | pieces_for_suffix++; |
1322 | 0 | } |
1323 | 0 | } |
1324 | 0 | num_pieces += pieces_for_suffix; |
1325 | 0 | } else { |
1326 | 0 | num_pieces++; // Empty suffix contributes one piece (sentinel row) |
1327 | 0 | } |
1328 | 0 | } |
1329 | | |
1330 | | // Build flattened table |
1331 | 0 | table_.resize(num_pieces, std::vector<int32_t>(4, 0)); |
1332 | 0 | int32_t table_idx = 0; |
1333 | |
1334 | 0 | for (const auto & suffix : suffixes) { |
1335 | | // Add all prefixes of the suffix to the table (in decreasing order of length) |
1336 | 0 | std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix); |
1337 | 0 | for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) { |
1338 | 0 | std::string piece; |
1339 | 0 | for (int32_t i = 0; i < piece_length; ++i) { |
1340 | 0 | piece += unicode_cpt_to_utf8(cpts[i]); |
1341 | 0 | } |
1342 | |
|
1343 | 0 | auto score_it = suffix_to_score.find(piece); |
1344 | 0 | if (score_it == suffix_to_score.end()) { |
1345 | 0 | continue; |
1346 | 0 | } |
1347 | | |
1348 | 0 | table_[table_idx][TABLE_PIECE_LENGTH] = piece_length; |
1349 | 0 | auto token_it = token_to_id.find(piece); |
1350 | 0 | table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1; |
1351 | |
|
1352 | 0 | float score = score_it->second; |
1353 | 0 | table_[table_idx][TABLE_SCORE] = std::isfinite(score) ? |
1354 | 0 | static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE; |
1355 | 0 | table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece]; |
1356 | |
1357 | 0 | table_idx++; |
1358 | 0 | } |
1359 | | |
1360 | | // Add sentinel row |
1361 | 0 | table_[table_idx][TABLE_PIECE_LENGTH] = 1; |
1362 | 0 | table_[table_idx][TABLE_TOKEN_ID] = -1; |
1363 | 0 | table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE; |
1364 | 0 | table_idx++; |
1365 | 0 | } |
1366 | 0 | } |
1367 | | |
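The to_suffix_id_ map built above is keyed by a 64-bit "piece code" that packs the first code point of a suffix into the high 32 bits and the ID of the remaining suffix into the low 32 bits, so one hash lookup resolves a (code point, suffix) transition. A minimal sketch of the packing (assuming non-negative suffix IDs, as produced above):

    #include <cstdint>

    // Mirrors (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining] above.
    static int64_t make_piece_code(uint32_t first_cpt, int32_t suffix_id) {
        return (static_cast<int64_t>(first_cpt) << 32) | static_cast<uint32_t>(suffix_id);
    }
    // The two halves are recovered with (code >> 32) and (code & 0xFFFFFFFF).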
1368 | 0 | std::vector<llama_token> encode(const std::string & text) const { |
1369 | 0 | std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text); |
1370 | | // Skip the first code point if it is a BOM (Byte Order Mark) |
1371 | 0 | if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) { |
1372 | 0 | unicode_data.erase(unicode_data.begin()); |
1373 | 0 | } |
1374 | |
1375 | 0 | if (unicode_data.empty()) { |
1376 | 0 | return {}; |
1377 | 0 | } |
1378 | | |
1379 | 0 | const size_t data_len = unicode_data.size(); |
1380 | | |
1381 | | // Initialize scores array (dynamic programming) |
1382 | 0 | std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60); |
1383 | 0 | scores[data_len] = 0; |
1384 | | |
1385 | | // Path array to track best tokenization |
1386 | 0 | std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0)); |
1387 | |
1388 | 0 | int32_t suffix_id = 0; |
1389 | | |
1390 | | // Process from end to beginning |
1391 | 0 | for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) { |
1392 | 0 | uint32_t c = unicode_data[i]; |
1393 | | |
1394 | | // Find next suffix ID |
1395 | 0 | for (size_t p = suffix_id; p < table_.size(); ++p) { |
1396 | 0 | int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID]; |
1397 | 0 | auto it = to_suffix_id_.find(piece_code); |
1398 | 0 | suffix_id = (it != to_suffix_id_.end()) ? it->second : 0; |
1399 | |
1400 | 0 | if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) { |
1401 | 0 | break; |
1402 | 0 | } |
1403 | 0 | } |
1404 | | |
1405 | | // Update best path |
1406 | 0 | for (size_t p = suffix_id; p < table_.size(); ++p) { |
1407 | 0 | int32_t score = table_[p][TABLE_SCORE]; |
1408 | 0 | if (score > INVALID_SCORE) { |
1409 | 0 | int32_t piece_length = table_[p][TABLE_PIECE_LENGTH]; |
1410 | 0 | int64_t s = scores[i + piece_length] - score; |
1411 | |
1412 | 0 | if (s < scores[i]) { |
1413 | 0 | scores[i] = s; |
1414 | 0 | path[i][PATH_TOKEN_LENGTH] = piece_length; |
1415 | 0 | path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID]; |
1416 | 0 | path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1; |
1417 | |
1418 | 0 | if (score == UNKNOWN_SCORE) { |
1419 | | // Add UTF-8 byte count |
1420 | 0 | path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000); |
1421 | 0 | } |
1422 | 0 | } |
1423 | 0 | } |
1424 | |
1425 | 0 | if (score == UNKNOWN_SCORE) { |
1426 | 0 | break; |
1427 | 0 | } |
1428 | 0 | } |
1429 | 0 | } |
1430 | | |
1431 | | // Decode the best path |
1432 | 0 | std::vector<llama_token> token_ids; |
1433 | 0 | token_ids.reserve(path[0][PATH_NUM_TOKENS]); |
1434 | |
1435 | 0 | int pos = 0; |
1436 | 0 | while (pos < static_cast<int>(data_len)) { |
1437 | 0 | if (path[pos][PATH_TOKEN_ID] >= 0) { |
1438 | 0 | token_ids.push_back(path[pos][PATH_TOKEN_ID]); |
1439 | 0 | } else { |
1440 | | // Fall back to byte tokens |
1441 | 0 | uint32_t c = unicode_data[pos]; |
1442 | 0 | int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000); |
1443 | |
1444 | 0 | for (int i = 0; i < s; ++i) { |
1445 | 0 | uint8_t b; |
1446 | 0 | if (s == 1) { |
1447 | 0 | b = c; |
1448 | 0 | } else { |
1449 | 0 | if (i == 0) { |
1450 | 0 | b = (0xF00 >> s) & 0xFF; |
1451 | 0 | } else { |
1452 | 0 | b = 0x80; |
1453 | 0 | } |
1454 | 0 | } |
1455 | 0 | token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]); |
1456 | 0 | } |
1457 | 0 | } |
1458 | |
1459 | 0 | assert(path[pos][PATH_TOKEN_LENGTH] > 0); |
1460 | 0 | pos += path[pos][PATH_TOKEN_LENGTH]; |
1461 | 0 | } |
1462 | |
1463 | 0 | return token_ids; |
1464 | 0 | } |
1465 | | private: |
1466 | | // Constants for table structure |
1467 | | static constexpr int32_t TABLE_PIECE_LENGTH = 0; |
1468 | | static constexpr int32_t TABLE_TOKEN_ID = 1; |
1469 | | static constexpr int32_t TABLE_SCORE = 2; |
1470 | | static constexpr int32_t TABLE_PIECE_ID = 3; |
1471 | | |
1472 | | // Constants for path array |
1473 | | static constexpr int32_t PATH_TOKEN_LENGTH = 0; |
1474 | | static constexpr int32_t PATH_TOKEN_ID = 1; |
1475 | | static constexpr int32_t PATH_NUM_TOKENS = 2; |
1476 | | |
1477 | | // Score constants |
1478 | | static constexpr int32_t INVALID_SCORE = -20000000; |
1479 | | static constexpr int32_t UNKNOWN_SCORE = -10000000; |
1480 | | |
1481 | | // List of tokens in the vocabulary |
1482 | | std::vector<std::string> tokens_; |
1483 | | |
1484 | | // Mapping from byte code point to token ID (for byte fallback) |
1485 | | std::vector<llama_token> bytes_; |
1486 | | |
1487 | | // Mapping from piece code to suffix ID |
1488 | | std::unordered_map<int64_t, int32_t> to_suffix_id_; |
1489 | | |
1490 | | // Flattened table representing the Trie structure |
1491 | | // Each row contains: [piece_length, token_id, score, piece_id] |
1492 | | std::vector<std::vector<int32_t>> table_; |
1493 | | }; |
1494 | | |
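encode() above is a right-to-left Viterbi pass: scores[i] holds the best cost of tokenizing the suffix starting at code point i, path[i] records which piece achieves it, and byte tokens serve as the fallback. A simplified standalone sketch of the same recurrence over a plain piece-score map (hypothetical piece_score table, operating on bytes; the real code walks the flattened trie table instead of calling substr):

    #include <string>
    #include <unordered_map>
    #include <vector>

    // Unigram-style DP: scan right to left, try every known piece starting at i,
    // keep the cheapest total. A higher piece score means a cheaper path, matching
    // scores[i + piece_length] - score in the implementation above.
    static std::vector<std::string> dp_segment(
            const std::string & text,
            const std::unordered_map<std::string, float> & piece_score,
            size_t max_piece_len) {
        const size_t n = text.size();
        std::vector<double> best(n + 1, 1e30);
        std::vector<size_t> take(n + 1, 1); // piece length chosen at position i
        best[n] = 0.0;
        for (size_t i = n; i-- > 0; ) {
            for (size_t len = 1; len <= max_piece_len && i + len <= n; ++len) {
                auto it = piece_score.find(text.substr(i, len));
                if (it == piece_score.end()) {
                    continue;
                }
                const double s = best[i + len] - it->second;
                if (s < best[i]) {
                    best[i] = s;
                    take[i] = len;
                }
            }
        }
        std::vector<std::string> out;
        for (size_t i = 0; i < n; i += take[i]) {
            out.push_back(text.substr(i, take[i])); // unmatched bytes fall out as length-1 pieces
        }
        return out;
    }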
1495 | | struct llm_tokenizer_plamo2_session { |
1496 | 0 | llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {} |
1497 | | |
1498 | 0 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
1499 | 0 | std::vector<llama_token> tokens = tokenizer.encode(text); |
1500 | 0 | output.insert(output.end(), tokens.begin(), tokens.end()); |
1501 | 0 | } |
1502 | | |
1503 | | private: |
1504 | | const llm_tokenizer_plamo2 & tokenizer; |
1505 | | }; |
1506 | | |
1507 | | // |
1508 | | // impl |
1509 | | // |
1510 | | |
1511 | | typedef enum FRAGMENT_BUFFER_VARIANT_TYPE { |
1512 | | FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, |
1513 | | FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT |
1514 | | } FRAGMENT_BUFFER_VARIANT_TYPE; |
1515 | | |
1516 | | struct fragment_buffer_variant { |
1517 | | fragment_buffer_variant(llama_token _token) |
1518 | | : |
1519 | 0 | type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN), |
1520 | 0 | token(_token), |
1521 | 0 | raw_text(_dummy), |
1522 | 0 | offset(0), |
1523 | 0 | length(0) {} |
1524 | | |
1525 | | fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) |
1526 | | : |
1527 | 0 | type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), |
1528 | 0 | token((llama_token) - 1), |
1529 | 0 | raw_text(_raw_text), |
1530 | 0 | offset(_offset), |
1531 | 0 |         length(_length) {
1532 | 0 | GGML_ASSERT(_offset >= 0); |
1533 | 0 | GGML_ASSERT(_length >= 1); |
1534 | 0 | GGML_ASSERT(offset + length <= raw_text.length()); |
1535 | 0 | } |
1536 | | |
1537 | | const FRAGMENT_BUFFER_VARIANT_TYPE type; |
1538 | | const llama_token token; |
1539 | | const std::string _dummy; |
1540 | | const std::string & raw_text; |
1541 | | const uint64_t offset; |
1542 | | const uint64_t length; |
1543 | | }; |
1544 | | |
1545 | | struct llama_vocab::impl { |
1546 | | uint32_t n_token_types = 0; // for BERT-style token types |
1547 | | |
1548 | | std::string tokenizer_model; |
1549 | | std::string tokenizer_pre; |
1550 | | |
1551 | | enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; |
1552 | | enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
1553 | | |
1554 | | int max_token_len = 0; // used for optimizing longest token search |
1555 | | |
1556 | | // default LLaMA special tokens |
1557 | | // TODO: should we set all of these to LLAMA_TOKEN_NULL? |
1558 | | llama_token special_bos_id = 1; |
1559 | | llama_token special_eos_id = 2; |
1560 | | llama_token special_eot_id = LLAMA_TOKEN_NULL; |
1561 | | llama_token special_eom_id = LLAMA_TOKEN_NULL; |
1562 | | llama_token special_unk_id = 0; |
1563 | | llama_token special_sep_id = LLAMA_TOKEN_NULL; |
1564 | | llama_token special_pad_id = LLAMA_TOKEN_NULL; |
1565 | | llama_token special_mask_id = LLAMA_TOKEN_NULL; |
1566 | | |
1567 | | llama_token linefeed_id = 13; |
1568 | | |
1569 | | // fim tokens |
1570 | | llama_token special_fim_pre_id = LLAMA_TOKEN_NULL; |
1571 | | llama_token special_fim_suf_id = LLAMA_TOKEN_NULL; |
1572 | | llama_token special_fim_mid_id = LLAMA_TOKEN_NULL; |
1573 | | llama_token special_fim_pad_id = LLAMA_TOKEN_NULL; |
1574 | | llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo |
1575 | | llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator |
1576 | | |
1577 | | // tokenizer flags |
1578 | | bool add_space_prefix = false; |
1579 | | bool add_bos = false; |
1580 | | bool add_eos = false; |
1581 | | bool add_sep = false; |
1582 | | bool ignore_merges = false; |
1583 | | bool clean_spaces = false; // clean_up_tokenization_spaces |
1584 | | bool remove_extra_whitespaces = false; |
1585 | | bool escape_whitespaces = true; |
1586 | | bool treat_whitespace_as_suffix = false; |
1587 | | |
1588 | | std::unordered_map<std::string, llama_token> token_to_id; |
1589 | | std::vector<token_data> id_to_token; |
1590 | | |
1591 | | std::vector<llama_token> cache_special_tokens; |
1592 | | std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true); |
1593 | | struct pair_hash { |
1594 | 0 | size_t operator()(const std::pair<std::string, std::string> & p) const { |
1595 | 0 |             return std::hash<std::string>{}(p.first) ^  // combine the two string hashes
1596 | 0 | (std::hash<std::string>{}(p.second) << 1); |
1597 | 0 | } |
1598 | | }; |
1599 | | std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks; |
1600 | | |
1601 | | // set of all tokens that cause "end of generation" |
1602 | | std::set<llama_token> special_eog_ids; |
1603 | | |
1604 | | std::unique_ptr<llm_tokenizer> tokenizer; |
1605 | | |
1606 | | std::vector<char> precompiled_charsmap; |
1607 | | |
1608 | 862 | impl(const llama_vocab & vocab) : vocab(vocab) { |
1609 | 862 | } |
1610 | | |
1611 | 807 | ~impl() = default; |
1612 | | |
1613 | | void load(llama_model_loader & ml, const LLM_KV & kv); |
1614 | | |
1615 | | enum llama_vocab_type get_type() const; |
1616 | | |
1617 | | std::string type_name() const; |
1618 | | |
1619 | | bool is_normal (llama_token id) const; |
1620 | | bool is_unknown (llama_token id) const; |
1621 | | bool is_control (llama_token id) const; |
1622 | | bool is_byte (llama_token id) const; |
1623 | | bool is_user_defined(llama_token id) const; |
1624 | | bool is_unused (llama_token id) const; |
1625 | | bool is_eog (llama_token id) const; |
1626 | | |
1627 | | uint8_t token_to_byte(llama_token id) const; |
1628 | | |
1629 | | llama_token_attr token_get_attr(llama_token id) const; |
1630 | | |
1631 | | void init_tokenizer(enum llama_vocab_type type); |
1632 | | |
1633 | | void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const; |
1634 | | |
1635 | | std::string token_to_piece_for_cache( |
1636 | | llama_token token, |
1637 | | bool special) const; |
1638 | | |
1639 | | |
1640 | | std::vector<llama_token> tokenize( |
1641 | | const std::string & raw_text, |
1642 | | bool add_special, |
1643 | | bool parse_special = false) const; |
1644 | | |
1645 | | int32_t tokenize( |
1646 | | const char * text, |
1647 | | int32_t text_len, |
1648 | | llama_token * tokens, |
1649 | | int32_t n_tokens_max, |
1650 | | bool add_special, |
1651 | | bool parse_special) const; |
1652 | | |
1653 | | // does not write null-terminator to buf |
1654 | | int32_t token_to_piece( |
1655 | | llama_token token, |
1656 | | char * buf, |
1657 | | int32_t length, |
1658 | | int32_t lstrip, |
1659 | | bool special) const; |
1660 | | |
1661 | | // use cached data |
1662 | | const std::string & token_to_piece(llama_token token) const; |
1663 | | |
1664 | | int32_t detokenize( |
1665 | | const llama_token * tokens, |
1666 | | int32_t n_tokens, |
1667 | | char * text, |
1668 | | int32_t text_len_max, |
1669 | | bool remove_special, |
1670 | | bool unparse_special) const; |
1671 | | |
1672 | | std::string detokenize( |
1673 | | const std::vector<llama_token> & tokens, |
1674 | | bool special) const; |
1675 | | |
1676 | | void print_info() const; |
1677 | | |
1678 | | private: |
1679 | | const llama_vocab & vocab; |
1680 | | }; |
1681 | | |
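bpe_ranks keys merge pairs with the xor-shift combiner above (h1 ^ (h2 << 1)); it is not a strong hash, but it is adequate for a lookup table of distinct merge pairs. A small usage sketch with made-up merges (illustrative data, not real model merges):

    #include <string>
    #include <unordered_map>
    #include <utility>

    // Same combiner as pair_hash above.
    struct merge_pair_hash {
        size_t operator()(const std::pair<std::string, std::string> & p) const {
            return std::hash<std::string>{}(p.first) ^ (std::hash<std::string>{}(p.second) << 1);
        }
    };

    // Lower rank = merge learned earlier = tried first during BPE.
    static int merge_rank(
            const std::unordered_map<std::pair<std::string, std::string>, int, merge_pair_hash> & ranks,
            const std::string & a, const std::string & b) {
        auto it = ranks.find(std::make_pair(a, b));
        return it != ranks.end() ? it->second : -1; // -1: this pair never merges
    }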
1682 | 0 | void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { |
1683 | 0 | struct gguf_context * ctx = ml.meta.get(); |
1684 | | |
1685 | | // determine vocab type |
1686 | 0 | { |
1687 | 0 | ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); |
1688 | 0 | ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); |
1689 | |
1690 | 0 | ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false); |
1691 | |
1692 | 0 | if (tokenizer_model == "no_vocab" || tokenizer_model == "none") { |
1693 | 0 | type = LLAMA_VOCAB_TYPE_NONE; |
1694 | | |
1695 | | // default special tokens |
1696 | 0 | special_bos_id = LLAMA_TOKEN_NULL; |
1697 | 0 | special_eos_id = LLAMA_TOKEN_NULL; |
1698 | 0 | special_unk_id = LLAMA_TOKEN_NULL; |
1699 | 0 | special_sep_id = LLAMA_TOKEN_NULL; |
1700 | 0 | special_pad_id = LLAMA_TOKEN_NULL; |
1701 | 0 | special_mask_id = LLAMA_TOKEN_NULL; |
1702 | 0 | linefeed_id = LLAMA_TOKEN_NULL; |
1703 | | |
1704 | | // read vocab size from metadata |
1705 | 0 | uint32_t n_tokens = 0; |
1706 | 0 | if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) { |
1707 | 0 | LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens); |
1708 | 0 | id_to_token.resize(n_tokens); |
1709 | 0 | } |
1710 | |
1711 | 0 | return; |
1712 | 0 | } |
1713 | | |
1714 | 0 | if (tokenizer_model == "llama") { |
1715 | 0 | type = LLAMA_VOCAB_TYPE_SPM; |
1716 | | |
1717 | | // default special tokens |
1718 | 0 | special_bos_id = 1; |
1719 | 0 | special_eos_id = 2; |
1720 | 0 | special_unk_id = 0; |
1721 | 0 | special_sep_id = LLAMA_TOKEN_NULL; |
1722 | 0 | special_pad_id = LLAMA_TOKEN_NULL; |
1723 | 0 | special_mask_id = LLAMA_TOKEN_NULL; |
1724 | 0 | } else if (tokenizer_model == "bert") { |
1725 | 0 | type = LLAMA_VOCAB_TYPE_WPM; |
1726 | | |
1727 | | // default special tokens |
1728 | 0 | special_bos_id = 101; |
1729 | 0 | special_eos_id = LLAMA_TOKEN_NULL; |
1730 | 0 | special_unk_id = 100; |
1731 | 0 | special_sep_id = 102; |
1732 | 0 | special_pad_id = 0; |
1733 | 0 | special_mask_id = 103; |
1734 | |
1735 | 0 | add_sep = true; |
1736 | 0 | } else if (tokenizer_model == "gpt2") { |
1737 | 0 | type = LLAMA_VOCAB_TYPE_BPE; |
1738 | | |
1739 | | // read bpe merges and populate bpe ranks |
1740 | 0 | const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); |
1741 | 0 | if (merges_keyidx == -1) { |
1742 | 0 | throw std::runtime_error("cannot find tokenizer merges in model file\n"); |
1743 | 0 | } |
1744 | | |
1745 | 0 | const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); |
1746 | 0 | for (int i = 0; i < n_merges; i++) { |
1747 | 0 | const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); |
1748 | | //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); |
1749 | |
1750 | 0 | std::string first; |
1751 | 0 | std::string second; |
1752 | |
1753 | 0 | const size_t pos = word.find(' ', 1); |
1754 | |
1755 | 0 | if (pos != std::string::npos) { |
1756 | 0 | first = word.substr(0, pos); |
1757 | 0 | second = word.substr(pos + 1); |
1758 | 0 | } |
1759 | |
1760 | 0 | bpe_ranks.emplace(std::make_pair(first, second), i); |
1761 | 0 | } |
1762 | | |
1763 | | // default special tokens |
1764 | 0 | special_bos_id = 11; |
1765 | 0 | special_eos_id = 11; |
1766 | 0 | special_unk_id = LLAMA_TOKEN_NULL; |
1767 | 0 | special_sep_id = LLAMA_TOKEN_NULL; |
1768 | 0 | special_pad_id = LLAMA_TOKEN_NULL; |
1769 | 0 | special_mask_id = LLAMA_TOKEN_NULL; |
1770 | 0 | } else if (tokenizer_model == "t5") { |
1771 | 0 | type = LLAMA_VOCAB_TYPE_UGM; |
1772 | | |
1773 | | // default special tokens |
1774 | 0 | special_bos_id = LLAMA_TOKEN_NULL; |
1775 | 0 | special_eos_id = 1; |
1776 | 0 | special_unk_id = 2; |
1777 | 0 | special_sep_id = LLAMA_TOKEN_NULL; |
1778 | 0 | special_pad_id = 0; |
1779 | 0 | special_mask_id = LLAMA_TOKEN_NULL; |
1780 | |
1781 | 0 | const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); |
1782 | 0 | if (precompiled_charsmap_keyidx != -1) { |
1783 | 0 | const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); |
1784 | 0 | GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8); |
1785 | |
1786 | 0 | const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx); |
1787 | 0 | const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); |
1788 | 0 | precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); |
1789 | | #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
1790 | |             // correct endianness of data in the precompiled_charsmap binary blob
1791 | | uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0]; |
1792 | | *xcda_blob_size = __builtin_bswap32(*xcda_blob_size); |
1793 | | assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap); |
1794 | | size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t); |
1795 | | uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)]; |
1796 | | for (size_t i = 0; i < xcda_array_size; ++i) { |
1797 | | xcda_array[i] = __builtin_bswap32(xcda_array[i]); |
1798 | | } |
1799 | | #endif |
1800 | 0 | } |
1801 | 0 | } else if (tokenizer_model == "rwkv") { |
1802 | 0 | type = LLAMA_VOCAB_TYPE_RWKV; |
1803 | | |
1804 | | // default special tokens |
1805 | 0 | special_bos_id = LLAMA_TOKEN_NULL; |
1806 | 0 | special_eos_id = LLAMA_TOKEN_NULL; |
1807 | 0 | special_unk_id = LLAMA_TOKEN_NULL; |
1808 | 0 | special_sep_id = LLAMA_TOKEN_NULL; |
1809 | 0 | special_pad_id = LLAMA_TOKEN_NULL; |
1810 | 0 | } else if (tokenizer_model == "plamo2") { |
1811 | 0 | type = LLAMA_VOCAB_TYPE_PLAMO2; |
1812 | | |
1813 | | // PLaMo-2 default special tokens (these will be overridden by model config) |
1814 | 0 | special_bos_id = 1; // <|plamo:bos|> |
1815 | 0 | special_eos_id = 2; // <|plamo:eos|> |
1816 | 0 | special_unk_id = 0; // <|plamo:unk|> |
1817 | 0 | special_sep_id = LLAMA_TOKEN_NULL; |
1818 | 0 | special_pad_id = 3; // <|plamo:pad|> |
1819 | 0 | special_mask_id = LLAMA_TOKEN_NULL; |
1820 | 0 | } else { |
1821 | 0 | throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); |
1822 | 0 | } |
1823 | | |
1824 | | // for now, only BPE models have pre-tokenizers |
1825 | 0 | if (type == LLAMA_VOCAB_TYPE_BPE) { |
1826 | 0 | add_space_prefix = false; |
1827 | 0 | clean_spaces = true; |
1828 | 0 | if (tokenizer_pre.empty()) { |
1829 | 0 | LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__); |
1830 | 0 | LLAMA_LOG_WARN("%s: \n", __func__); |
1831 | 0 | LLAMA_LOG_WARN("%s: ************************************ \n", __func__); |
1832 | 0 | LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__); |
1833 | 0 | LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__); |
1834 | 0 | LLAMA_LOG_WARN("%s: ************************************ \n", __func__); |
1835 | 0 | LLAMA_LOG_WARN("%s: \n", __func__); |
1836 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
1837 | 0 | } else if (tokenizer_pre == "default") { |
1838 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
1839 | 0 | } else if ( |
1840 | 0 | tokenizer_pre == "llama3" || |
1841 | 0 | tokenizer_pre == "llama-v3" || |
1842 | 0 | tokenizer_pre == "llama-bpe"|| |
1843 | 0 | tokenizer_pre == "falcon3" || |
1844 | 0 | tokenizer_pre == "falcon-h1" || |
1845 | 0 | tokenizer_pre == "pixtral" || |
1846 | 0 | tokenizer_pre == "midm-2.0" || |
1847 | 0 | tokenizer_pre == "lfm2") { |
1848 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; |
1849 | 0 | ignore_merges = true; |
1850 | 0 | add_bos = true; |
1851 | 0 | } else if ( |
1852 | 0 | tokenizer_pre == "deepseek-llm") { |
1853 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM; |
1854 | 0 | clean_spaces = false; |
1855 | 0 | } else if ( |
1856 | 0 | tokenizer_pre == "deepseek-coder") { |
1857 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER; |
1858 | 0 | clean_spaces = false; |
1859 | 0 | } else if ( |
1860 | 0 | tokenizer_pre == "deepseek-v3") { |
1861 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM; |
1862 | 0 | clean_spaces = false; |
1863 | 0 | } else if ( |
1864 | 0 | tokenizer_pre == "falcon") { |
1865 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON; |
1866 | 0 | } else if ( |
1867 | 0 | tokenizer_pre == "mpt") { |
1868 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_MPT; |
1869 | 0 | } else if ( |
1870 | 0 | tokenizer_pre == "starcoder") { |
1871 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER; |
1872 | 0 | } else if ( |
1873 | 0 | tokenizer_pre == "gpt-2" || |
1874 | 0 | tokenizer_pre == "phi-2" || |
1875 | 0 | tokenizer_pre == "jina-es" || |
1876 | 0 | tokenizer_pre == "jina-de" || |
1877 | 0 | tokenizer_pre == "gigachat" || |
1878 | 0 | tokenizer_pre == "jina-v2-es" || |
1879 | 0 | tokenizer_pre == "jina-v2-de" || |
1880 | 0 | tokenizer_pre == "a.x-4.0" || |
1881 | 0 | tokenizer_pre == "mellum") { |
1882 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; |
1883 | 0 | } else if ( |
1884 | 0 | tokenizer_pre == "jina-v1-en" || |
1885 | 0 | tokenizer_pre == "jina-v2-code" || |
1886 | 0 | tokenizer_pre == "roberta-bpe") { |
1887 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; |
1888 | 0 | add_sep = true; |
1889 | 0 | } else if ( |
1890 | 0 | tokenizer_pre == "refact") { |
1891 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT; |
1892 | 0 | } else if ( |
1893 | 0 | tokenizer_pre == "command-r") { |
1894 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; |
1895 | 0 | clean_spaces = false; |
1896 | 0 | } else if ( |
1897 | 0 | tokenizer_pre == "qwen2" || |
1898 | 0 | tokenizer_pre == "deepseek-r1-qwen") { |
1899 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; |
1900 | 0 | clean_spaces = false; |
1901 | 0 | } else if ( |
1902 | 0 | tokenizer_pre == "stablelm2") { |
1903 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2; |
1904 | 0 | } else if ( |
1905 | 0 | tokenizer_pre == "olmo") { |
1906 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO; |
1907 | 0 | } else if ( |
1908 | 0 | tokenizer_pre == "dbrx") { |
1909 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX; |
1910 | 0 | } else if ( |
1911 | 0 | tokenizer_pre == "smaug-bpe") { |
1912 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG; |
1913 | 0 | } else if ( |
1914 | 0 | tokenizer_pre == "poro-chat") { |
1915 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_PORO; |
1916 | 0 | clean_spaces = false; |
1917 | 0 | } else if ( |
1918 | 0 | tokenizer_pre == "glm4" || |
1919 | 0 | tokenizer_pre == "chatglm-bpe") { |
1920 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; |
1921 | 0 | special_bos_id = LLAMA_TOKEN_NULL; |
1922 | 0 | } else if ( |
1923 | 0 | tokenizer_pre == "viking") { |
1924 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING; |
1925 | 0 | clean_spaces = false; |
1926 | 0 | } else if ( |
1927 | 0 | tokenizer_pre == "jais") { |
1928 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS; |
1929 | 0 | } else if ( |
1930 | 0 | tokenizer_pre == "tekken") { |
1931 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN; |
1932 | 0 | clean_spaces = false; |
1933 | 0 | ignore_merges = true; |
1934 | 0 | add_bos = true; |
1935 | 0 | } else if ( |
1936 | 0 | tokenizer_pre == "smollm") { |
1937 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM; |
1938 | 0 | clean_spaces = false; |
1939 | 0 | } else if ( |
1940 | 0 | tokenizer_pre == "codeshell") { |
1941 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL; |
1942 | 0 | } else if ( |
1943 | 0 | tokenizer_pre == "bloom") { |
1944 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM; |
1945 | 0 | } else if ( |
1946 | 0 | tokenizer_pre == "gpt3-finnish") { |
1947 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH; |
1948 | 0 | } else if ( |
1949 | 0 | tokenizer_pre == "exaone") { |
1950 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE; |
1951 | 0 | } else if ( |
1952 | 0 | tokenizer_pre == "exaone4") { |
1953 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; |
1954 | 0 | } else if ( |
1955 | 0 | tokenizer_pre == "chameleon") { |
1956 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; |
1957 | 0 | add_bos = true; |
1958 | 0 | clean_spaces = false; |
1959 | 0 | } else if ( |
1960 | 0 | tokenizer_pre == "minerva-7b") { |
1961 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA; |
1962 | 0 | } else if ( |
1963 | 0 | tokenizer_pre == "megrez") { |
1964 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; |
1965 | 0 | } else if ( |
1966 | 0 | tokenizer_pre == "gpt-4o" || |
1967 | 0 | tokenizer_pre == "llama4") { |
1968 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; |
1969 | 0 | clean_spaces = false; |
1970 | 0 | } else if ( |
1971 | 0 | tokenizer_pre == "superbpe") { |
1972 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE; |
1973 | 0 | clean_spaces = false; |
1974 | 0 | } else if ( |
1975 | 0 | tokenizer_pre == "trillion") { |
1976 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION; |
1977 | 0 | clean_spaces = false; |
1978 | 0 | } else if ( |
1979 | 0 | tokenizer_pre == "granite-docling") { |
1980 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING; |
1981 | 0 | clean_spaces = false; |
1982 | 0 | } else if ( |
1983 | 0 | tokenizer_pre == "bailingmoe" || |
1984 | 0 | tokenizer_pre == "bailingmoe2" || |
1985 | 0 | tokenizer_pre == "llada-moe") { |
1986 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; |
1987 | 0 | clean_spaces = false; |
1988 | 0 | } else if ( |
1989 | 0 | tokenizer_pre == "seed-coder") { |
1990 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; |
1991 | 0 | clean_spaces = false; |
1992 | 0 | } else if ( |
1993 | 0 | tokenizer_pre == "hunyuan") { |
1994 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN; |
1995 | 0 | clean_spaces = false; |
1996 | 0 | } else if ( |
1997 | 0 | tokenizer_pre == "hunyuan-dense") { |
1998 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE; |
1999 | 0 | clean_spaces = false; |
2000 | 0 | } else if ( |
2001 | 0 | tokenizer_pre == "kimi-k2") { |
2002 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2; |
2003 | 0 | clean_spaces = false; |
2004 | 0 | } else if ( |
2005 | 0 | tokenizer_pre == "grok-2") { |
2006 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2; |
2007 | 0 | clean_spaces = false; |
2008 | 0 | } else if ( |
2009 | 0 | tokenizer_pre == "afmoe") { |
2010 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_AFMOE; |
2011 | 0 | clean_spaces = false; |
2012 | 0 | } else if ( |
2013 | 0 | tokenizer_pre == "minimax-m2") { |
2014 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2; |
2015 | 0 | clean_spaces = false; |
2016 | 0 | } else { |
2017 | 0 | throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); |
2018 | 0 | } |
2019 | 0 | } else if (type == LLAMA_VOCAB_TYPE_SPM) { |
2020 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
2021 | 0 | add_space_prefix = true; |
2022 | 0 | clean_spaces = false; |
2023 | 0 | add_bos = true; |
2024 | 0 | add_eos = false; |
2025 | 0 | } else if (type == LLAMA_VOCAB_TYPE_WPM) { |
2026 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
2027 | 0 | add_space_prefix = false; |
2028 | 0 | clean_spaces = true; |
2029 | 0 | add_bos = true; |
2030 | 0 | add_eos = false; |
2031 | 0 | add_sep = true; |
2032 | 0 | } else if (type == LLAMA_VOCAB_TYPE_UGM) { |
2033 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
2034 | 0 | add_bos = false; |
2035 | 0 | add_eos = true; |
2036 | 0 | } else if (type == LLAMA_VOCAB_TYPE_RWKV) { |
2037 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
2038 | 0 | add_space_prefix = false; |
2039 | 0 | clean_spaces = false; |
2040 | 0 | add_bos = false; |
2041 | 0 | add_eos = false; |
2042 | 0 | } else { |
2043 | 0 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
2044 | 0 | } |
2045 | | |
2046 | 0 | ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false); |
2047 | 0 | ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false); |
2048 | 0 | } |
2049 | | |
2050 | 0 | const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); |
2051 | 0 | if (token_idx == -1) { |
2052 | 0 | throw std::runtime_error("cannot find tokenizer vocab in model file\n"); |
2053 | 0 | } |
2054 | | |
2055 | 0 | const float * scores = nullptr; |
2056 | 0 | const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); |
2057 | 0 | if (score_idx != -1) { |
2058 | 0 | scores = (const float * ) gguf_get_arr_data(ctx, score_idx); |
2059 | 0 | } |
2060 | |
2061 | 0 | const int * toktypes = nullptr; |
2062 | 0 | const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); |
2063 | 0 | if (toktype_idx != -1) { |
2064 | 0 | toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); |
2065 | 0 | } |
2066 | |
2067 | 0 | uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx); |
2068 | 0 | id_to_token.resize(n_tokens); |
2069 | |
2070 | 0 | for (uint32_t i = 0; i < n_tokens; i++) { |
2071 | 0 | std::string word = gguf_get_arr_str(ctx, token_idx, i); |
2072 | 0 | if (word.empty()) { |
2073 | 0 | LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i); |
2074 | 0 | word = "[EMPTY_" + std::to_string(i) + "]"; |
2075 | 0 | } |
2076 | |
2077 | 0 | token_to_id[word] = i; |
2078 | 0 | max_token_len = std::max(max_token_len, (int) word.size()); |
2079 | |
2080 | 0 | auto & token_data = id_to_token[i]; |
2081 | 0 | token_data.text = std::move(word); |
2082 | 0 | token_data.score = scores ? scores[i] : 0.0f; |
2083 | 0 | token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; |
2084 | |
2085 | 0 | if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file |
2086 | 0 | switch(toktypes[i]) { |
2087 | 0 | case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break; |
2088 | 0 | case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break; |
2089 | 0 | case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break; |
2090 | 0 | case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break; |
2091 | 0 | case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break; |
2092 | 0 | case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break; |
2093 | 0 | case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; |
2094 | 0 | default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; |
2095 | 0 | } |
2096 | 0 | } |
2097 | 0 | } |
2098 | 0 | GGML_ASSERT(id_to_token.size() == token_to_id.size()); |
2099 | |
2100 | 0 | init_tokenizer(type); |
2101 | | |
2102 | | // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' |
2103 | 0 | if (type == LLAMA_VOCAB_TYPE_SPM) { |
2104 | 0 | try { |
2105 | 0 | linefeed_id = vocab.byte_to_token('\n'); |
2106 | 0 | } catch (const std::exception & e) { |
2107 | 0 | LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what()); |
2108 | 0 | linefeed_id = special_pad_id; |
2109 | 0 | } |
2110 | 0 | } else if (type == LLAMA_VOCAB_TYPE_WPM) { |
2111 | 0 | linefeed_id = special_pad_id; |
2112 | 0 | } else if (type == LLAMA_VOCAB_TYPE_RWKV) { |
2113 | 0 | const std::vector<int> ids = tokenize("\n", false); |
2114 | 0 | GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); |
2115 | 0 | linefeed_id = ids[0]; |
2116 | 0 | } else { |
2117 | 0 | const std::vector<int> ids = tokenize("\n", false); |
2118 | | |
2119 | | //GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); |
2120 | 0 | if (ids.empty()) { |
2121 | 0 | LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__); |
2122 | 0 | linefeed_id = special_pad_id; |
2123 | 0 | } else { |
2124 | 0 | linefeed_id = ids[0]; |
2125 | 0 | } |
2126 | 0 | } |
2127 | | |
2128 | | // special tokens |
2129 | 0 | { |
2130 | 0 | const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = { |
2131 | 0 | { LLM_KV_TOKENIZER_BOS_ID, special_bos_id }, |
2132 | 0 | { LLM_KV_TOKENIZER_EOS_ID, special_eos_id }, |
2133 | 0 | { LLM_KV_TOKENIZER_EOT_ID, special_eot_id }, |
2134 | 0 | { LLM_KV_TOKENIZER_EOM_ID, special_eom_id }, |
2135 | 0 | { LLM_KV_TOKENIZER_UNK_ID, special_unk_id }, |
2136 | 0 | { LLM_KV_TOKENIZER_SEP_ID, special_sep_id }, |
2137 | 0 | { LLM_KV_TOKENIZER_PAD_ID, special_pad_id }, |
2138 | 0 | { LLM_KV_TOKENIZER_MASK_ID, special_mask_id }, |
2139 | 0 | { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id }, |
2140 | 0 | { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id }, |
2141 | 0 | { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id }, |
2142 | 0 | { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id }, |
2143 | 0 | { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id }, |
2144 | 0 | { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id }, |
2145 | | |
2146 | | // deprecated |
2147 | 0 | { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id }, |
2148 | 0 | { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id }, |
2149 | 0 | { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id }, |
2150 | 0 | }; |
2151 | |
2152 | 0 | for (const auto & it : special_token_types) { |
2153 | 0 | const std::string & key = kv(std::get<0>(it)); |
2154 | 0 | int32_t & id = std::get<1>(it); |
2155 | |
2156 | 0 | uint32_t new_id; |
2157 | 0 | if (!ml.get_key(std::get<0>(it), new_id, false)) { |
2158 | 0 | continue; |
2159 | 0 | } |
2160 | 0 | if (new_id >= id_to_token.size()) { |
2161 | 0 | LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n", |
2162 | 0 | __func__, key.c_str(), new_id, id); |
2163 | 0 | } else { |
2164 | 0 | id = new_id; |
2165 | 0 | } |
2166 | 0 | } |
2167 | | |
2168 | | // Handle add_bos, add_eos and add_sep |
2169 | 0 | { |
2170 | 0 | bool temp = true; |
2171 | |
2172 | 0 | if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) { |
2173 | 0 | add_bos = temp; |
2174 | 0 | } |
2175 | 0 | if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) { |
2176 | 0 | add_eos = temp; |
2177 | 0 | } |
2178 | 0 | if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) { |
2179 | 0 | add_sep = temp; |
2180 | 0 | } |
2181 | 0 | } |
2182 | | |
2183 | | // auto-detect special tokens by text |
2184 | | // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_... |
2185 | | // for now, we apply this workaround to find the tokens based on their text |
2186 | |
2187 | 0 | for (const auto & t : token_to_id) { |
2188 | | // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc. |
2189 | 0 | if (special_eot_id == LLAMA_TOKEN_NULL) { |
2190 | 0 | if (false |
2191 | 0 | || t.first == "<|eot_id|>" |
2192 | 0 | || t.first == "<|im_end|>" |
2193 | 0 | || t.first == "<|end|>" |
2194 | 0 | || t.first == "<end_of_turn>" |
2195 | 0 | || t.first == "<|endoftext|>" |
2196 | 0 | || t.first == "<|end_of_text|>" // granite |
2197 | 0 | || t.first == "<EOT>" |
2198 | 0 | || t.first == "_<EOT>" |
2199 | 0 | || t.first == "<|end▁of▁sentence|>" // DeepSeek |
2200 | 0 | || t.first == "<end_of_utterance>" // smoldocling |
2201 | 0 | ) { |
2202 | 0 | special_eot_id = t.second; |
2203 | 0 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
2204 | 0 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", |
2205 | 0 | __func__, t.second, t.first.c_str()); |
2206 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
2207 | 0 | } |
2208 | 0 | } |
2209 | 0 | } |
2210 | | |
2211 | | // find EOM token: "<|eom_id|>" |
2212 | 0 | if (special_eom_id == LLAMA_TOKEN_NULL) { |
2213 | 0 | if (false |
2214 | 0 | || t.first == "<|eom_id|>" |
2215 | 0 | ) { |
2216 | 0 | special_eom_id = t.second; |
2217 | 0 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
2218 | 0 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", |
2219 | 0 | __func__, t.second, t.first.c_str()); |
2220 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
2221 | 0 | } |
2222 | 0 | } |
2223 | 0 | } |
2224 | | |
2225 | | // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc. |
2226 | 0 | if (special_fim_pre_id == LLAMA_TOKEN_NULL) { |
2227 | 0 | if (false |
2228 | 0 | || t.first == "<|fim_prefix|>" // Qwen |
2229 | 0 | || t.first == "<fim-prefix>" |
2230 | 0 | || t.first == "<fim_prefix>" // Granite |
2231 | 0 | || t.first == "<|fim▁begin|>" // DeepSeek |
2232 | 0 | || t.first == "<PRE>" |
2233 | 0 | || t.first == "▁<PRE>" // CodeLlama |
2234 | 0 | || t.first == "<|code_prefix|>" // GLM-4.5 |
2235 | 0 | ) { |
2236 | 0 | special_fim_pre_id = t.second; |
2237 | 0 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
2238 | 0 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", |
2239 | 0 | __func__, t.second, t.first.c_str()); |
2240 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
2241 | 0 | } |
2242 | 0 | } |
2243 | 0 | } |
2244 | | |
2245 | | // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc. |
2246 | 0 | if (special_fim_suf_id == LLAMA_TOKEN_NULL) { |
2247 | 0 | if (false |
2248 | 0 | || t.first == "<|fim_suffix|>" // Qwen |
2249 | 0 | || t.first == "<fim-suffix>" |
2250 | 0 | || t.first == "<fim_suffix>" // Granite |
2251 | 0 | || t.first == "<|fim▁hole|>" // DeepSeek |
2252 | 0 | || t.first == "<SUF>" |
2253 | 0 | || t.first == "▁<SUF>" // CodeLlama |
2254 | 0 | || t.first == "<|code_suffix|>" // GLM-4.5 |
2255 | 0 | ) { |
2256 | 0 | special_fim_suf_id = t.second; |
2257 | 0 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
2258 | 0 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", |
2259 | 0 | __func__, t.second, t.first.c_str()); |
2260 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
2261 | 0 | } |
2262 | 0 | } |
2263 | 0 | } |
2264 | | |
2265 | | // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc. |
2266 | 0 | if (special_fim_mid_id == LLAMA_TOKEN_NULL) { |
2267 | 0 | if (false |
2268 | 0 | || t.first == "<|fim_middle|>" // Qwen |
2269 | 0 | || t.first == "<fim-middle>" |
2270 | 0 | || t.first == "<fim_middle>" // Granite |
2271 | 0 | || t.first == "<|fim▁end|>" // DeepSeek |
2272 | 0 | || t.first == "<MID>" |
2273 | 0 | || t.first == "▁<MID>" // CodeLlama |
2274 | 0 | || t.first == "<|code_middle|>" // GLM-4.5 |
2275 | 0 | ) { |
2276 | 0 | special_fim_mid_id = t.second; |
2277 | 0 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
2278 | 0 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", |
2279 | 0 | __func__, t.second, t.first.c_str()); |
2280 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
2281 | 0 | } |
2282 | 0 | } |
2283 | 0 | } |
2284 | | |
2285 | | // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc. |
2286 | 0 | if (special_fim_pad_id == LLAMA_TOKEN_NULL) { |
2287 | 0 | if (false |
2288 | 0 | || t.first == "<|fim_pad|>" // Qwen |
2289 | 0 | || t.first == "<fim-pad>" |
2290 | 0 | || t.first == "<fim_pad>" // Granite |
2291 | 0 | || t.first == "<PAD>" |
2292 | 0 | ) { |
2293 | 0 | special_fim_pad_id = t.second; |
2294 | 0 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
2295 | 0 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", |
2296 | 0 | __func__, t.second, t.first.c_str()); |
2297 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
2298 | 0 | } |
2299 | 0 | } |
2300 | 0 | } |
2301 | | |
2302 | | // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc. |
2303 | 0 | if (special_fim_rep_id == LLAMA_TOKEN_NULL) { |
2304 | 0 | if (false |
2305 | 0 | || t.first == "<|fim_repo|>" // Qwen |
2306 | 0 | || t.first == "<|repo_name|>" |
2307 | 0 | || t.first == "<fim-repo>" |
2308 | 0 | || t.first == "<REPO>" |
2309 | 0 | || t.first == "<reponame>" // Granite |
2310 | 0 | ) { |
2311 | 0 | special_fim_rep_id = t.second; |
2312 | 0 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
2313 | 0 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", |
2314 | 0 | __func__, t.second, t.first.c_str()); |
2315 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
2316 | 0 | } |
2317 | 0 | } |
2318 | 0 | } |
2319 | | |
2320 | | // find FIM_SEP token: "<|file_sep|>" |
2321 | 0 | if (special_fim_sep_id == LLAMA_TOKEN_NULL) { |
2322 | 0 | if (false |
2323 | 0 | || t.first == "<|file_sep|>" // Qwen |
2324 | 0 | ) { |
2325 | 0 | special_fim_sep_id = t.second; |
2326 | 0 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
2327 | 0 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", |
2328 | 0 | __func__, t.second, t.first.c_str()); |
2329 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
2330 | 0 | } |
2331 | 0 | } |
2332 | 0 | } |
2333 | 0 | } |
2334 | | |
2335 | | // maintain a list of tokens that cause end-of-generation |
2336 | | // this is currently determined based on the token text, which is obviously not ideal |
2337 | | // ref: https://github.com/ggerganov/llama.cpp/issues/9606 |
2338 | 0 | special_eog_ids.clear(); |
2339 | |
2340 | 0 | if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) { |
2341 | 0 | special_eog_ids.insert(special_fim_pad_id); |
2342 | 0 | } |
2343 | |
2344 | 0 | if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) { |
2345 | 0 | special_eog_ids.insert(special_fim_rep_id); |
2346 | 0 | } |
2347 | |
2348 | 0 | if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) { |
2349 | 0 | special_eog_ids.insert(special_fim_sep_id); |
2350 | 0 | } |
2351 | |
2352 | 0 | for (const auto & t : token_to_id) { |
2353 | 0 | if (false |
2354 | 0 | || t.first == "<|eot_id|>" |
2355 | 0 | || t.first == "<|im_end|>" |
2356 | 0 | || t.first == "<|end|>" |
2357 | 0 | || t.first == "<|return|>" // o200k_harmony |
2358 | 0 | || t.first == "<|call|>" // o200k_harmony |
2359 | 0 | || t.first == "<end_of_turn>" |
2360 | 0 | || t.first == "<|endoftext|>" |
2361 | 0 | || t.first == "<|eom_id|>" |
2362 | 0 | || t.first == "<EOT>" |
2363 | 0 | || t.first == "_<EOT>" |
2364 | 0 | || t.first == "<|end_of_text|>" |
2365 | 0 | || t.first == "<end_of_utterance>" // smoldocling |
2366 | 0 | ) { |
2367 | 0 | special_eog_ids.insert(t.second); |
2368 | 0 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
2369 | 0 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", |
2370 | 0 | __func__, t.second, t.first.c_str()); |
2371 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
2372 | 0 | } |
2373 | 0 | } else { |
2374 | | // token is control, but not marked as EOG -> print a debug log |
2375 | 0 | if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) { |
2376 | 0 | LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n", |
2377 | 0 | __func__, t.second, t.first.c_str()); |
2378 | 0 | } |
2379 | 0 | } |
2380 | 0 | } |
2381 | | |
2382 | | // @ngxson : quick hack for gpt-oss, always render these tokens |
2383 | 0 | for (const auto & t : token_to_id) { |
2384 | 0 | if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") { |
2385 | 0 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED; |
2386 | 0 | } |
2387 | 0 | } |
2388 | | |
2389 | | // sanity checks |
2390 | 0 | if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) { |
2391 | 0 | special_eog_ids.insert(special_eos_id); |
2392 | 0 | LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); |
2393 | 0 | } |
2394 | |
2395 | 0 | if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) { |
2396 | 0 | special_eog_ids.insert(special_eot_id); |
2397 | 0 | LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); |
2398 | 0 | } |
2399 | |
2400 | 0 | if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) { |
2401 | 0 | special_eog_ids.insert(special_eom_id); |
2402 | 0 | LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); |
2403 | 0 | } |
2404 | | |
2405 | | // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG |
2406 | | // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens, |
2407 | | // we remove the "<|end|>" token from the EOG list |
2408 | 0 | { |
2409 | 0 | bool has_return = false; |
2410 | 0 | bool has_call = false; |
2411 | 0 | bool has_end = false; |
2412 | |
2413 | 0 | llama_token end_id = LLAMA_TOKEN_NULL; |
2414 | |
2415 | 0 | LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__); |
2416 | 0 | for (auto tid : special_eog_ids) { |
2417 | 0 | LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str()); |
2418 | |
2419 | 0 | if (id_to_token[tid].text == "<|return|>") { |
2420 | 0 | has_return = true; |
2421 | 0 | } else if (id_to_token[tid].text == "<|call|>") { |
2422 | 0 | has_call = true; |
2423 | 0 | } else if (id_to_token[tid].text == "<|end|>") { |
2424 | 0 | has_end = true; |
2425 | 0 | end_id = tid; |
2426 | 0 | } |
2427 | 0 | } |
2428 | |
2429 | 0 | if (has_return && has_call && has_end) { |
2430 | 0 | special_eog_ids.erase(end_id); |
2431 | 0 | id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED; |
2432 | 0 | LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__); |
2433 | 0 | } |
2434 | 0 | } |
2435 | 0 | } |
2436 | | |
2437 | | // build special tokens cache |
2438 | 0 | { |
2439 | 0 | for (llama_token id = 0; id < (llama_token) n_tokens; ++id) { |
2440 | 0 | if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) { |
2441 | 0 | cache_special_tokens.push_back(id); |
2442 | 0 | } |
2443 | 0 | } |
2444 | |
2445 | 0 | std::sort(cache_special_tokens.begin(), cache_special_tokens.end(), |
2446 | 0 | [&] (const llama_token a, const llama_token b) { |
2447 | 0 | return id_to_token[a].text.size() > id_to_token[b].text.size(); |
2448 | 0 | } |
2449 | 0 | ); |
2450 | |
2451 | 0 | LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size()); |
2452 | 0 | } |
2453 | | |
2454 | | // build token to piece cache |
2455 | 0 | { |
2456 | 0 | size_t size_cache = 0; |
2457 | |
2458 | 0 | std::vector<std::string> cache(n_tokens); |
2459 | |
2460 | 0 | for (uint32_t id = 0; id < n_tokens; ++id) { |
2461 | 0 | cache[id] = token_to_piece_for_cache(id, true); |
2462 | |
2463 | 0 | size_cache += cache[id].size(); |
2464 | 0 | } |
2465 | |
2466 | 0 | std::swap(cache_token_to_piece, cache); |
2467 | |
2468 | 0 | LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0); |
2469 | 0 | } |
2470 | | |
2471 | | // Handle per token attributes |
2472 | | //NOTE: Each model customizes per token attributes. |
2473 | | //NOTE: Per token attributes are missing from the GGUF file. |
2474 | | //TODO: Extract attributes from GGUF file. |
2475 | 0 | { |
2476 | 0 | auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool { |
2477 | 0 | for (const auto & substr : substrs) { |
2478 | 0 | if (str.find(substr) != std::string::npos) { |
2479 | 0 | return true; |
2480 | 0 | } |
2481 | 0 | } |
2482 | 0 | return false; |
2483 | 0 | }; |
2484 | |
2485 | 0 | auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) { |
2486 | 0 | uint32_t current = id_to_token.at(id).attr; |
2487 | 0 | current = value ? (current | attr) : (current & ~attr); |
2488 | 0 | id_to_token[id].attr = (llama_token_attr) current; |
2489 | 0 | }; |
2490 | |
2491 | 0 | auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) { |
2492 | 0 | _set_tokenid_attr(token_to_id.at(token), attr, value); |
2493 | 0 | }; |
2494 | |
2495 | 0 | std::string model_name; |
2496 | 0 | std::string tokenizer_pre; |
2497 | 0 | std::string general_arch; |
2498 | |
2499 | 0 | ml.get_key(LLM_KV_GENERAL_NAME, model_name, false); |
2500 | 0 | ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); |
2501 | 0 | ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false); |
2502 | | |
2503 | | // model name to lowercase |
2504 | 0 | std::transform(model_name.begin(), model_name.end(), model_name.begin(), |
2505 | 0 | [] (const std::string::value_type x) { |
2506 | 0 | return std::tolower(x); |
2507 | 0 | } |
2508 | 0 | ); |
2509 | | |
2510 | | // set attributes by model/tokenizer/architecture name |
2511 | 0 | if (false |
2512 | 0 | || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"}) |
2513 | 0 | || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"}) |
2514 | 0 | ) { |
2515 | 0 | if (token_to_id.count("<mask>") == 0) { |
2516 | 0 | LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__); |
2517 | 0 | } else { |
2518 | 0 | _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true); |
2519 | 0 | } |
2520 | 0 | } else if (_contains_any(model_name, {"phi-3", "phi3"})) { |
2521 | 0 | for (auto id : cache_special_tokens) { |
2522 | 0 | _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true); |
2523 | 0 | } |
2524 | 0 | for (const auto * token : {"</s>"}) { |
2525 | 0 | _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true); |
2526 | 0 | } |
2527 | 0 | for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) { |
2528 | 0 | _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false); |
2529 | 0 | } |
2530 | 0 | } |
2531 | 0 | } |
2532 | 0 | } |
2533 | | |
2534 | 0 | enum llama_vocab_type llama_vocab::impl::get_type() const { |
2535 | 0 | return type; |
2536 | 0 | } |
2537 | | |
2538 | 0 | std::string llama_vocab::impl::type_name() const {
2539 | 0 | switch (type) { |
2540 | 0 | case LLAMA_VOCAB_TYPE_NONE: return "no vocab"; |
2541 | 0 | case LLAMA_VOCAB_TYPE_SPM: return "SPM"; |
2542 | 0 | case LLAMA_VOCAB_TYPE_BPE: return "BPE"; |
2543 | 0 | case LLAMA_VOCAB_TYPE_WPM: return "WPM"; |
2544 | 0 | case LLAMA_VOCAB_TYPE_UGM: return "UGM"; |
2545 | 0 | case LLAMA_VOCAB_TYPE_RWKV: return "RWKV"; |
2546 | 0 | case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2"; |
2547 | 0 | default: return "unknown"; |
2548 | 0 | } |
2549 | 0 | } |
2550 | | |
2551 | 0 | bool llama_vocab::impl::is_normal(llama_token id) const { |
2552 | 0 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
2553 | 0 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL; |
2554 | 0 | } |
2555 | | |
2556 | 0 | bool llama_vocab::impl::is_unknown(llama_token id) const { |
2557 | 0 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
2558 | 0 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN; |
2559 | 0 | } |
2560 | | |
2561 | 0 | bool llama_vocab::impl::is_control(llama_token id) const { |
2562 | 0 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
2563 | 0 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL; |
2564 | 0 | } |
2565 | | |
2566 | 0 | bool llama_vocab::impl::is_byte(llama_token id) const { |
2567 | 0 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
2568 | 0 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE; |
2569 | 0 | } |
2570 | | |
2571 | 0 | bool llama_vocab::impl::is_user_defined(llama_token id) const { |
2572 | 0 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
2573 | 0 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED; |
2574 | 0 | } |
2575 | | |
2576 | 0 | bool llama_vocab::impl::is_unused(llama_token id) const { |
2577 | 0 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
2578 | 0 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED; |
2579 | 0 | } |
2580 | | |
2581 | 0 | bool llama_vocab::impl::is_eog(llama_token id) const { |
2582 | 0 | return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0; |
2583 | 0 | } |
2584 | | |
2585 | 0 | uint8_t llama_vocab::impl::token_to_byte(llama_token id) const { |
2586 | 0 | GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE); |
2587 | 0 | GGML_ASSERT(is_byte(id)); |
2588 | 0 | const auto & token_data = id_to_token.at(id); |
2589 | 0 | switch (get_type()) { |
2590 | 0 | case LLAMA_VOCAB_TYPE_SPM: |
2591 | 0 | case LLAMA_VOCAB_TYPE_UGM: { |
2592 | 0 | auto buf = token_data.text.substr(3, 2); |
2593 | 0 | return strtol(buf.c_str(), NULL, 16); |
2594 | 0 | } |
2595 | 0 | case LLAMA_VOCAB_TYPE_BPE: { |
2596 | 0 | GGML_ABORT("fatal error"); |
2597 | 0 | } |
2598 | 0 | case LLAMA_VOCAB_TYPE_WPM: { |
2599 | 0 | GGML_ABORT("fatal error"); |
2600 | 0 | } |
2601 | 0 | default: |
2602 | 0 | GGML_ABORT("fatal error"); |
2603 | 0 | } |
2604 | 0 | } |
2605 | | |
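For SPM and UGM vocabs, byte-fallback tokens are stored with texts of the form <0xNN>; token_to_byte() slices out the two hex digits (substr(3, 2)) and parses them base-16. A quick illustration of that slicing (hypothetical token text):

    #include <cstdint>
    #include <cstdlib>
    #include <string>

    const std::string text = "<0x0A>";  // the newline byte token
    // characters 3..4 are the hex digits: "0A" -> 10
    const uint8_t byte = (uint8_t) strtol(text.substr(3, 2).c_str(), nullptr, 16);
    // byte == 0x0A == '\n'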
2606 | 0 | llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const { |
2607 | 0 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
2608 | 0 | return id_to_token.at(id).attr; |
2609 | 0 | } |
2610 | | |
2611 | 0 | void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) { |
2612 | 0 | LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type); |
2613 | |
2614 | 0 | switch (type) { |
2615 | 0 | case LLAMA_VOCAB_TYPE_SPM: |
2616 | 0 | tokenizer = std::make_unique<llm_tokenizer_spm>(vocab); |
2617 | 0 | break; |
2618 | 0 | case LLAMA_VOCAB_TYPE_BPE: |
2619 | 0 | tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab); |
2620 | 0 | break; |
2621 | 0 | case LLAMA_VOCAB_TYPE_WPM: |
2622 | 0 | tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab); |
2623 | 0 | break; |
2624 | 0 | case LLAMA_VOCAB_TYPE_UGM: |
2625 | 0 | tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap); |
2626 | 0 | break; |
2627 | 0 | case LLAMA_VOCAB_TYPE_RWKV: |
2628 | 0 | tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab); |
2629 | 0 | break; |
2630 | 0 | case LLAMA_VOCAB_TYPE_PLAMO2: |
2631 | 0 | tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab); |
2632 | 0 | break; |
2633 | 0 | default: |
2634 | 0 | GGML_ABORT("unsupported vocab type"); |
2635 | 0 | } |
2636 | 0 | } |
2637 | | |
2638 | | // |
2639 | | // (de-) tokenize |
2640 | | // |
2641 | | |
2642 | | // #define PRETOKENIZERDEBUG |
2643 | | |
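tokenizer_st_partition() below splits every raw-text fragment around each occurrence of each special token, producing [left remainder][special token id][right remainder] while tracking offsets into the original string and honoring LSTRIP/RSTRIP. A condensed sketch of one pass for a single special token, on owned strings instead of offset/length views (simplified relative to the forward_list splicing below):

    #include <cstdint>
    #include <string>
    #include <vector>

    struct frag {
        bool        is_token; // true: resolved special token; false: raw text
        int32_t     token_id; // valid when is_token
        std::string text;     // valid when !is_token
    };

    // Split every raw-text fragment around each occurrence of one special token.
    static void split_on_special(std::vector<frag> & frags, const std::string & special, int32_t id) {
        std::vector<frag> out;
        for (const auto & f : frags) {
            if (f.is_token) { out.push_back(f); continue; }
            size_t pos = 0;
            size_t match;
            while ((match = f.text.find(special, pos)) != std::string::npos) {
                if (match > pos) {
                    out.push_back({false, -1, f.text.substr(pos, match - pos)}); // left remainder
                }
                out.push_back({true, id, ""}); // the special token itself
                pos = match + special.size();
            }
            if (pos < f.text.size()) {
                out.push_back({false, -1, f.text.substr(pos)}); // right remainder
            }
        }
        frags = std::move(out);
    }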
2644 | 0 | void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const { |
2645 | | // for each special token |
2646 | 0 | for (const llama_token special_id : cache_special_tokens) { |
2647 | 0 | const auto & data = vocab.get_token_data(special_id); |
2648 | 0 | const auto & text = data.text; |
2649 | |
2650 | 0 | if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) { |
2651 | | // Ignore control and unknown tokens when parse_special == false |
2652 | 0 | continue; |
2653 | | // User-defined tokens are still pre-tokenized before everything else |
2654 | | // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726 |
2655 | | // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.) |
2656 | 0 | } |
2657 | | |
2658 | | // for each text fragment |
2659 | 0 | std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin(); |
2660 | 0 | while (it != buffer.end()) { |
2661 | 0 | auto & fragment = (*it); |
2662 | | |
2663 | |             // if a fragment is text (not yet processed)
2664 | 0 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
2665 | 0 | const auto & raw_text = fragment.raw_text; |
2666 | |
2667 | 0 | auto raw_text_base_offset = fragment.offset; |
2668 | 0 | auto raw_text_base_length = fragment.length; |
2669 | | |
2670 | | // loop over the text |
2671 | 0 | while (true) { |
2672 | | // find the first occurrence of a given special token in this fragment |
2673 | |                     // passing the offset argument only limits the "search area", but match
2674 | |                     // coordinates are still relative to the full source raw_text
2675 | | // string_view begins at pos 0 for the same reason |
2676 | 0 | auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset); |
2677 | | |
2678 | | // no occurrences found, stop processing this fragment for this special token
2679 | 0 | if (match == std::string::npos) break; |
2680 | | |
2681 | | #ifdef PRETOKENIZERDEBUG |
2682 | | LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text.length(), raw_text_base_offset, raw_text_base_length, raw_text.substr(raw_text_base_offset, raw_text_base_length).c_str());
2683 | | #endif |
2684 | 0 | auto source = std::distance(buffer.begin(), it); |
2685 | | |
2686 | | // if match is further than base offset |
2687 | | // then we have some text to the left of it |
2688 | 0 | if (match > raw_text_base_offset) { |
2689 | | // left |
2690 | 0 | const int64_t left_reminder_offset = raw_text_base_offset + 0; |
2691 | 0 | int64_t left_reminder_length = match - raw_text_base_offset; |
2692 | |
2693 | 0 | if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) { |
2694 | 0 | while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) { |
2695 | 0 | left_reminder_length--; |
2696 | 0 | } |
2697 | 0 | } |
2698 | |
2699 | 0 | if (left_reminder_length > 0) { |
2700 | 0 | buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length); |
2701 | 0 | it++; |
2702 | 0 | } |
2703 | |
2704 | | #ifdef PRETOKENIZERDEBUG |
2705 | | LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text.substr(left_reminder_offset, left_reminder_length).c_str());
2706 | | #endif |
2707 | 0 | } |
2708 | | |
2709 | | // special token |
2710 | 0 | buffer.emplace_after(it, special_id); |
2711 | 0 | it++; |
2712 | | |
2713 | | // right |
2714 | 0 | if (match + text.length() < raw_text_base_offset + raw_text_base_length) { |
2715 | 0 | int64_t right_reminder_offset = match + text.length(); |
2716 | 0 | int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length()); |
2717 | |
2718 | 0 | if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) { |
2719 | 0 | while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) { |
2720 | 0 | right_reminder_offset++; |
2721 | 0 | right_reminder_length--; |
2722 | 0 | } |
2723 | 0 | } |
2724 | |
2725 | 0 | if (right_reminder_length > 0) { |
2726 | 0 | buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length); |
2727 | 0 | it++; |
2728 | 0 | } |
2729 | |
2730 | | #ifdef PRETOKENIZERDEBUG |
2731 | | LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text.substr(right_reminder_offset, right_reminder_length).c_str());
2732 | | #endif |
2733 | |
2734 | 0 | if (source == 0) { |
2735 | 0 | buffer.erase_after(buffer.before_begin()); |
2736 | 0 | } else { |
2737 | 0 | buffer.erase_after(std::next(buffer.begin(), (source - 1))); |
2738 | 0 | } |
2739 | | |
2740 | | // repeat for the right side |
2741 | 0 | raw_text_base_offset = right_reminder_offset; |
2742 | 0 | raw_text_base_length = right_reminder_length; |
2743 | |
2744 | | #ifdef PRETOKENIZERDEBUG |
2745 | | LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text.substr(raw_text_base_offset, raw_text_base_length).c_str());
2746 | | #endif |
2747 | 0 | } else { |
2748 | 0 | if (source == 0) { |
2749 | 0 | buffer.erase_after(buffer.before_begin()); |
2750 | 0 | } else { |
2751 | 0 | buffer.erase_after(std::next(buffer.begin(), (source - 1))); |
2752 | 0 | } |
2753 | 0 | break; |
2754 | 0 | } |
2755 | 0 | } |
2756 | 0 | } |
2757 | 0 | it++; |
2758 | 0 | } |
2759 | 0 | } |
2760 | 0 | } |
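 | |
 | | // Example (editorial sketch, not part of the upstream source): how the
 | | // partitioning above behaves. Suppose "<s>" is a special token with id 1
 | | // and parse_special == true. Starting from a single RAW_TEXT fragment
 | | //
 | | //     "Hello <s> world"
 | | //
 | | // the buffer is rewritten into three fragments:
 | | //
 | | //     RAW_TEXT "Hello "  ->  TOKEN 1  ->  RAW_TEXT " world"
 | | //
 | | // If the token carried LLAMA_TOKEN_ATTR_LSTRIP / RSTRIP, the spaces next
 | | // to it would be trimmed from the left / right remainders instead.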
2761 | | |
2762 | | // NOTE: avoid ever using this except for building the token_to_piece caches |
2763 | 0 | std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const { |
2764 | 0 | std::string piece; |
2765 | 0 | piece.resize(piece.capacity()); // reuse the string's existing capacity for the first attempt
2766 | 0 | const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special); |
2767 | 0 | if (n_chars < 0) { |
2768 | 0 | piece.resize(-n_chars); |
2769 | 0 | int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special); |
2770 | 0 | GGML_ASSERT(check == -n_chars); |
2771 | 0 | } |
2772 | 0 | else { |
2773 | 0 | piece.resize(n_chars); |
2774 | 0 | } |
2775 | |
2776 | 0 | return piece; |
2777 | 0 | } |
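 | |
 | | // Example (editorial sketch, not part of the upstream source): a negative
 | | // return from token_to_piece() encodes the required buffer size, which is
 | | // what the retry above relies on. The same idiom from the caller's side:
 | |
 | | //     std::string piece(8, '\0');
 | | //     int32_t n = vocab.token_to_piece(id, &piece[0], (int32_t) piece.size(), 0, true);
 | | //     if (n < 0) {            // buffer too small: -n is the required size
 | | //         piece.resize(-n);
 | | //         n = vocab.token_to_piece(id, &piece[0], (int32_t) piece.size(), 0, true);
 | | //     }
 | | //     piece.resize(n);        // trim to the actual length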
2778 | | |
2779 | 0 | static void llama_escape_whitespace(std::string & text) { |
2780 | 0 | replace_all(text, " ", "\xe2\x96\x81"); |
2781 | 0 | } |
2782 | | |
2783 | 0 | static void llama_unescape_whitespace(std::string & word) { |
2784 | 0 | replace_all(word, "\xe2\x96\x81", " "); |
2785 | 0 | } |
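 | |
 | | // Note (editorial): "\xe2\x96\x81" is the UTF-8 encoding of U+2581 LOWER
 | | // ONE EIGHTH BLOCK, the metaspace symbol SentencePiece substitutes for a
 | | // space. Escaping turns "Hello world" into "Hello\u2581world" before SPM
 | | // tokenization; unescaping reverses it when decoding pieces.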
2786 | | |
2787 | 0 | static std::string llama_decode_text(const std::string & text) { |
2788 | 0 | std::string decoded_text; |
2789 | |
2790 | 0 | const auto cpts = unicode_cpts_from_utf8(text); |
2791 | 0 | for (const auto cpt : cpts) { |
2792 | 0 | const auto utf8 = unicode_cpt_to_utf8(cpt); |
2793 | 0 | try { |
2794 | 0 | decoded_text += unicode_utf8_to_byte(utf8); |
2795 | 0 | } catch (const std::out_of_range & /*e*/) { |
2796 | 0 | decoded_text += "[UNK_BYTE_0x"; |
2797 | 0 | for (const auto c : utf8) { |
2798 | 0 | decoded_text += format("%02x", (uint8_t) c); |
2799 | 0 | } |
2800 | 0 | decoded_text += text + "]"; |
2801 | 0 | } |
2802 | 0 | } |
2803 | |
2804 | 0 | return decoded_text; |
2805 | 0 | } |
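 | |
 | | // Example (editorial sketch, not part of the upstream source): BPE vocabs
 | | // store raw bytes remapped to printable code points (GPT-2 style), and
 | | // llama_decode_text() maps them back. The piece "\u0120hello" ("Ġhello")
 | | // decodes to " hello", since U+0120 is the printable stand-in for byte
 | | // 0x20 (space) in the byte-to-unicode table.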
2806 | | |
2807 | | std::vector<llama_token> llama_vocab::impl::tokenize( |
2808 | | const std::string & raw_text, |
2809 | | bool add_special, |
2810 | 0 | bool parse_special) const { |
2811 | 0 | GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first."); |
2812 | |
2813 | 0 | std::vector<llama_token> output; |
2814 | 0 | std::forward_list<fragment_buffer_variant> fragment_buffer; |
2815 | |
2816 | 0 | if (!raw_text.empty()) { |
2817 | 0 | fragment_buffer.emplace_front(raw_text, 0, raw_text.length()); |
2818 | 0 | tokenizer_st_partition(fragment_buffer, parse_special); |
2819 | 0 | } |
2820 | |
2821 | 0 | switch (get_type()) { |
2822 | 0 | case LLAMA_VOCAB_TYPE_SPM: |
2823 | 0 | { |
2824 | | // OG tokenizer behavior: |
2825 | | // |
2826 | | // tokenizer.encode('', add_special_tokens=True) returns [1] |
2827 | | // tokenizer.encode('', add_special_tokens=False) returns [] |
2828 | |
2829 | 0 | bool is_prev_special = true; // prefix with space if first token |
2830 | |
2831 | 0 | if (add_special && add_bos) { |
2832 | 0 | GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL); |
2833 | 0 | output.push_back(special_bos_id); |
2834 | 0 | is_prev_special = true; |
2835 | 0 | } |
2836 | |
2837 | 0 | for (const auto & fragment : fragment_buffer) { |
2838 | 0 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
2839 | 0 | std::string text; |
2840 | | |
2841 | | // prefix with space if previous is special |
2842 | 0 | if (add_space_prefix && is_prev_special) { |
2843 | 0 | text = ' '; |
2844 | 0 | } |
2845 | |
2846 | 0 | text += fragment.raw_text.substr(fragment.offset, fragment.length); |
2847 | |
2848 | | #ifdef PRETOKENIZERDEBUG |
2849 | | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); |
2850 | | #endif |
2851 | 0 | llama_escape_whitespace(text); |
2852 | 0 | llm_tokenizer_spm_session session(vocab); |
2853 | 0 | session.tokenize(text, output); |
2854 | 0 | is_prev_special = false; |
2855 | 0 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
2856 | 0 | output.push_back(fragment.token); |
2857 | 0 | is_prev_special = true; |
2858 | 0 | } |
2859 | 0 | } |
2860 | |
2861 | 0 | if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) { |
2862 | 0 | LLAMA_LOG_WARN( |
2863 | 0 | "%s: Added a BOS token to the prompt as specified by the model but the prompt " |
2864 | 0 | "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " |
2865 | 0 | "Are you sure this is what you want?\n", __FUNCTION__); |
2866 | 0 | } |
2867 | |
2868 | 0 | if (add_special && add_eos) { |
2869 | 0 | GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL); |
2870 | 0 | output.push_back(special_eos_id); |
2871 | 0 | } |
2872 | 0 | } break; |
2873 | 0 | case LLAMA_VOCAB_TYPE_BPE: |
2874 | 0 | { |
2875 | 0 | llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get())); |
2876 | | // the session calls methods that do not exist on the base llm_tokenizer,
2877 | | // so cast the stored tokenizer to the BPE tokenizer object here
2878 | 0 | if (add_special) { |
2879 | 0 | session.append_bos(output); |
2880 | 0 | } |
2881 | 0 | for (const auto & fragment : fragment_buffer) { |
2882 | 0 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
2883 | 0 | std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); |
2884 | |
2885 | | #ifdef PRETOKENIZERDEBUG |
2886 | | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); |
2887 | | #endif |
2888 | 0 | session.tokenize(text, output); |
2889 | 0 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
2890 | 0 | session.append(fragment.token, output); |
2891 | 0 | } |
2892 | 0 | } |
2893 | |
2894 | 0 | if (add_special) { |
2895 | 0 | session.append_eos(output); |
2896 | 0 | session.check_double_bos_eos(output); |
2897 | 0 | } |
2898 | 0 | } break; |
2899 | 0 | case LLAMA_VOCAB_TYPE_WPM: |
2900 | 0 | { |
2901 | 0 | if (add_special) { |
2902 | 0 | GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL); |
2903 | 0 | output.push_back(special_bos_id); |
2904 | 0 | } |
2905 | |
2906 | 0 | llm_tokenizer_wpm_session session(vocab); |
2907 | |
2908 | 0 | for (const auto & fragment : fragment_buffer) { |
2909 | 0 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
2910 | 0 | std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); |
2911 | |
2912 | | #ifdef PRETOKENIZERDEBUG |
2913 | | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); |
2914 | | #endif |
2915 | 0 | session.tokenize(text, output); |
2916 | 0 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
2917 | 0 | output.push_back(fragment.token); |
2918 | 0 | } |
2919 | 0 | } |
2920 | |
2921 | 0 | if (add_special) { |
2922 | 0 | GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL); |
2923 | 0 | output.push_back(special_sep_id); |
2924 | 0 | } |
2925 | 0 | } break; |
2926 | 0 | case LLAMA_VOCAB_TYPE_UGM: |
2927 | 0 | { |
2928 | 0 | if (add_special && add_bos) { |
2929 | 0 | GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL); |
2930 | 0 | output.push_back(special_bos_id); |
2931 | 0 | } |
2932 | 0 | llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get())); |
2933 | |
2934 | 0 | for (const auto & fragment : fragment_buffer) { |
2935 | 0 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
2936 | 0 | std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); |
2937 | | #ifdef PRETOKENIZERDEBUG |
2938 | | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); |
2939 | | #endif |
2940 | 0 | session.tokenize(text, output); |
2941 | 0 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
2942 | 0 | output.push_back(fragment.token); |
2943 | 0 | } |
2944 | 0 | } |
2945 | |
2946 | 0 | if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) { |
2947 | 0 | LLAMA_LOG_WARN( |
2948 | 0 | "%s: Added a BOS token to the prompt as specified by the model but the prompt " |
2949 | 0 | "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " |
2950 | 0 | "Are you sure this is what you want?\n", __FUNCTION__); |
2951 | 0 | } |
2952 | |
2953 | 0 | if (add_special && add_eos) { |
2954 | 0 | GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL); |
2955 | 0 | output.push_back(special_eos_id); |
2956 | 0 | } |
2957 | 0 | } break; |
2958 | 0 | case LLAMA_VOCAB_TYPE_RWKV: |
2959 | 0 | { |
2960 | 0 | llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get())); |
2961 | 0 | for (const auto & fragment : fragment_buffer) { |
2962 | 0 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
2963 | 0 | std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); |
2964 | |
2965 | | #ifdef PRETOKENIZERDEBUG |
2966 | | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); |
2967 | | #endif |
2968 | |
2969 | 0 | session.tokenize(text, output); |
2970 | 0 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
2971 | 0 | output.push_back(fragment.token); |
2972 | 0 | } |
2973 | 0 | } |
2974 | 0 | } break; |
2975 | 0 | case LLAMA_VOCAB_TYPE_PLAMO2: |
2976 | 0 | { |
2977 | 0 | llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get())); |
2978 | 0 | for (const auto & fragment : fragment_buffer) { |
2979 | 0 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
2980 | 0 | std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); |
2981 | |
2982 | | #ifdef PRETOKENIZERDEBUG |
2983 | | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); |
2984 | | #endif |
2985 | |
2986 | 0 | session.tokenize(text, output); |
2987 | 0 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
2988 | 0 | output.push_back(fragment.token); |
2989 | 0 | } |
2990 | 0 | } |
2991 | 0 | } break; |
2992 | 0 | case LLAMA_VOCAB_TYPE_NONE: |
2993 | 0 | GGML_ABORT("fatal error"); |
2994 | 0 | } |
2995 | | |
2996 | 0 | return output; |
2997 | 0 | } |
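 | |
 | | // Example (editorial sketch, not part of the upstream source): calling the
 | | // public wrapper. With add_special == true an SPM vocab with add_bos set
 | | // prepends BOS; parse_special == true lets literal strings such as "</s>"
 | | // in the prompt resolve to their special token ids.
 | |
 | | //     const llama_vocab & v = ...; // from a loaded model
 | | //     std::vector<llama_token> toks =
 | | //         v.tokenize("Hello world", /*add_special=*/true, /*parse_special=*/false);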
2998 | | |
2999 | 0 | int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const { |
3000 | | // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843 |
3001 | 0 | static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL; |
3002 | 0 | const llama_token_attr attr = token_get_attr(token); |
3003 | 0 | if (!special && (attr & attr_special)) { |
3004 | 0 | return 0; |
3005 | 0 | } |
3006 | | |
3007 | | // copy piece chars to output text buffer |
3008 | | // skip up to 'lstrip' leading spaces before copying |
3009 | 0 | auto _try_copy = [=] (const char * token, size_t size) -> int32_t { |
3010 | 0 | if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) { |
3011 | 0 | GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size); |
3012 | 0 | } |
3013 | |
3014 | 0 | for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { |
3015 | 0 | token++; |
3016 | 0 | size--; |
3017 | 0 | } |
3018 | 0 | if (length < (int32_t)size) { |
3019 | 0 | return -(int32_t) size; |
3020 | 0 | } |
3021 | 0 | memcpy(buf, token, size); |
3022 | 0 | return (int32_t) size; |
3023 | 0 | }; |
3024 | | |
3025 | | // if we have a cache - use it |
3026 | 0 | { |
3027 | 0 | const auto & cache = cache_token_to_piece; |
3028 | |
3029 | 0 | if (!cache.empty()) { |
3030 | 0 | const auto & result = cache.at(token); |
3031 | 0 | return _try_copy(result.data(), result.size()); |
3032 | 0 | } |
3033 | 0 | } |
3034 | | |
3035 | 0 | if (0 <= token && token < (int32_t) id_to_token.size()) { |
3036 | 0 | const std::string & token_text = id_to_token[token].text; |
3037 | 0 | switch (get_type()) { |
3038 | 0 | case LLAMA_VOCAB_TYPE_WPM: |
3039 | 0 | case LLAMA_VOCAB_TYPE_SPM: |
3040 | 0 | case LLAMA_VOCAB_TYPE_UGM: { |
3041 | | // NOTE: we accept all unsupported token types, |
3042 | | // suppressing them like CONTROL tokens. |
3043 | 0 | if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) { |
3044 | 0 | return _try_copy(token_text.data(), token_text.size()); |
3045 | 0 | } |
3046 | 0 | if (attr & LLAMA_TOKEN_ATTR_NORMAL) { |
3047 | 0 | std::string result = token_text; |
3048 | 0 | llama_unescape_whitespace(result); |
3049 | 0 | return _try_copy(result.data(), result.size()); |
3050 | 0 | } |
3051 | 0 | if (attr & LLAMA_TOKEN_ATTR_BYTE) { |
3052 | 0 | char byte = (char) token_to_byte(token); |
3053 | 0 | return _try_copy((char*) &byte, 1); |
3054 | 0 | } |
3055 | 0 | break; |
3056 | 0 | } |
3057 | 0 | case LLAMA_VOCAB_TYPE_BPE: { |
3058 | | // NOTE: we accept all unsupported token types, |
3059 | | // suppressing them like CONTROL tokens. |
3060 | 0 | if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) { |
3061 | 0 | return _try_copy(token_text.data(), token_text.size()); |
3062 | 0 | } |
3063 | 0 | if (attr & LLAMA_TOKEN_ATTR_NORMAL) { |
3064 | 0 | std::string result = llama_decode_text(token_text); |
3065 | 0 | return _try_copy(result.data(), result.size()); |
3066 | 0 | } |
3067 | 0 | break; |
3068 | 0 | } |
3069 | 0 | case LLAMA_VOCAB_TYPE_RWKV: { |
3070 | 0 | std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text); |
3071 | | |
3072 | | // If we don't have enough space, return an error |
3073 | 0 | if (result.size() > (size_t)length) { |
3074 | 0 | return -(int)result.size(); |
3075 | 0 | } |
3076 | | |
3077 | 0 | memcpy(buf, result.data(), result.size()); |
3078 | 0 | return (int)result.size(); |
3079 | 0 | } |
3080 | 0 | case LLAMA_VOCAB_TYPE_PLAMO2: { |
3081 | | // PLaMo-2 uses token handling similar to BPE/SPM
3082 | 0 | if (vocab.is_byte(token)) { |
3083 | | // Handle byte tokens like <0xXX> |
3084 | 0 | if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') { |
3085 | 0 | int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16); |
3086 | 0 | if (length < 1) { |
3087 | 0 | return -1; |
3088 | 0 | } |
3089 | 0 | buf[0] = static_cast<char>(hex_val); |
3090 | 0 | return 1; |
3091 | 0 | } |
3092 | 0 | } |
3093 | | |
3094 | | // Normal token - just copy the text |
3095 | 0 | std::string result = token_text; |
3096 | 0 | return _try_copy(result.data(), result.size()); |
3097 | 0 | } |
3098 | 0 | default: |
3099 | 0 | GGML_ABORT("fatal error"); |
3100 | 0 | } |
3101 | 0 | } |
3102 | | |
3103 | 0 | return 0; |
3104 | 0 | } |
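 | |
 | | // Example (editorial sketch, not part of the upstream source): the lstrip
 | | // parameter drops up to that many leading spaces from the piece before
 | | // copying. An SPM piece "\u2581Hello" unescapes to " Hello", so
 | |
 | | //     token_to_piece(id, buf, len, /*lstrip=*/1, false)
 | |
 | | // yields "Hello"; this is how detokenize() below strips the leading space
 | | // introduced by add_space_prefix for the first token.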
3105 | | |
3106 | 0 | const std::string & llama_vocab::impl::token_to_piece(llama_token token) const { |
3107 | 0 | return cache_token_to_piece.at(token); |
3108 | 0 | } |
3109 | | |
3110 | | int32_t llama_vocab::impl::detokenize( |
3111 | | const llama_token * tokens, |
3112 | | int32_t n_tokens, |
3113 | | char * text, |
3114 | | int32_t text_len_max, |
3115 | | bool remove_special, |
3116 | 0 | bool unparse_special) const { |
3117 | 0 | if (type == LLAMA_VOCAB_TYPE_NONE) { |
3118 | 0 | return 0; |
3119 | 0 | } |
3120 | | |
3121 | 0 | GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first."); |
3122 | |
3123 | 0 | int32_t avail = text_len_max; |
3124 | 0 | int32_t total = 0; |
3125 | | |
3126 | | // remove the leading space |
3127 | 0 | bool remove_space = add_space_prefix; |
3128 | |
3129 | 0 | if (remove_special && add_bos) { |
3130 | 0 | if (n_tokens > 0 && tokens[0] == special_bos_id) { |
3131 | 0 | remove_space = false; |
3132 | 0 | n_tokens--; |
3133 | 0 | tokens++; |
3134 | 0 | } |
3135 | 0 | } |
3136 | |
3137 | 0 | if (remove_special && add_eos) { |
3138 | 0 | if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) { |
3139 | 0 | n_tokens--; |
3140 | 0 | } |
3141 | 0 | } |
3142 | |
3143 | 0 | for (int32_t i = 0; i < n_tokens; ++i) { |
3144 | 0 | GGML_ASSERT(avail >= 0); |
3145 | 0 | int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special); |
3146 | 0 | remove_space = false; |
3147 | 0 | if (n_chars < 0) { |
3148 | 0 | avail = 0; |
3149 | 0 | total -= n_chars; |
3150 | 0 | } else if (n_chars > 0) { |
3151 | 0 | avail -= n_chars; |
3152 | 0 | text += n_chars; |
3153 | 0 | total += n_chars; |
3154 | 0 | } |
3155 | 0 | } |
3156 | |
3157 | 0 | if (total > text_len_max) { |
3158 | 0 | return -total; |
3159 | 0 | } |
3160 | | |
3161 | 0 | if (clean_spaces) { |
3162 | 0 | text -= total; // restart text |
3163 | | |
3164 | | // first pass: characters ?!., //TODO: where do these characters come from? |
3165 | 0 | const int32_t total1 = total; |
3166 | 0 | total = total ? 1 : 0; |
3167 | 0 | for (int32_t i = 1; i < total1; ++i) { |
3168 | 0 | const char x = text[i]; |
3169 | 0 | if (text[i - 1] == ' ') { |
3170 | 0 | if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ," |
3171 | 0 | total--; // remove space |
3172 | 0 | } |
3173 | 0 | } |
3174 | 0 | text[total++] = x; |
3175 | 0 | } |
3176 | | |
3177 | | // second pass: strip single apostrophe between spaces |
3178 | 0 | const int32_t total2 = total; |
3179 | 0 | total = total ? 1 : 0; |
3180 | 0 | for (int32_t i = 1; i < total2; ++i) { |
3181 | 0 | const char x = text[i]; |
3182 | 0 | if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' " |
3183 | 0 | total--; // remove prev space |
3184 | 0 | text[++i] = '\0'; // remove next space |
3185 | 0 | } |
3186 | 0 | text[total++] = x; |
3187 | 0 | } |
3188 | | |
3189 | | // third pass: apostrophe contractions //NOTE: does this make sense?
3190 | 0 | const int32_t total3 = total; |
3191 | 0 | total = total ? 1 : 0; |
3192 | 0 | for (int32_t i = 1; i < total3; ++i) { |
3193 | 0 | const char x = text[i]; |
3194 | 0 | if (text[i - 1] == ' ') { |
3195 | 0 | if (x == '\'' && i + 1 < total3) { |
3196 | 0 | const char x1 = text[i + 1]; |
3197 | 0 | if (x1 == 't' || x1 == 'd') { // " 't", " 'd" |
3198 | | //total--; // remove space |
3199 | 0 | } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm" |
3200 | 0 | total--; // remove space |
3201 | 0 | } else if (i + 2 < total3) { |
3202 | 0 | const char x2 = text[i + 2]; |
3203 | 0 | if ((x1 == 'l' && x2 == 'l')) { // " 'll" |
3204 | | //total--; // remove space |
3205 | 0 | } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've" |
3206 | 0 | total--; // remove space |
3207 | 0 | } else { |
3208 | | //total--; // remove space |
3209 | 0 | } |
3210 | 0 | } else { |
3211 | | //total--; // remove space |
3212 | 0 | } |
3213 | 0 | } |
3214 | 0 | } |
3215 | 0 | text[total++] = x; |
3216 | 0 | } |
3217 | 0 | } |
3218 | |
3219 | 0 | return total <= text_len_max ? total : -total; |
3220 | 0 | } |
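 | |
 | | // Example (editorial sketch, not part of the upstream source): with
 | | // clean_spaces enabled, the three passes above turn piece-joined text like
 | |
 | | //     "Hello , world ! It ' s fine"
 | |
 | | // into "Hello, world! It's fine": pass one drops the space before
 | | // '?', '!', '.', ',', pass two collapses a lone " ' ", and pass three
 | | // handles contractions such as " 's", " 'm", " 're" and " 've".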
3221 | | |
3222 | 0 | void llama_vocab::impl::print_info() const { |
3223 | 0 | LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str()); |
3224 | 0 | LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens()); |
3225 | 0 | LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size()); |
3226 | | |
3227 | | // special tokens |
3228 | 0 | if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); } |
3229 | 0 | if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); } |
3230 | 0 | if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); } |
3231 | 0 | if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); } |
3232 | 0 | if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); } |
3233 | 0 | if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); } |
3234 | 0 | if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); } |
3235 | 0 | if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); } |
3236 | |
3237 | 0 | if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); } |
3238 | |
3239 | 0 | if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); } |
3240 | 0 | if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); } |
3241 | 0 | if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); } |
3242 | 0 | if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); } |
3243 | 0 | if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); } |
3244 | 0 | if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); } |
3245 | |
3246 | 0 | for (const auto & id : special_eog_ids) { |
3247 | 0 | LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() ); |
3248 | 0 | } |
3249 | |
3250 | 0 | LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len); |
3251 | 0 | } |
3252 | | |
3253 | 862 | llama_vocab::llama_vocab() : pimpl(new impl(*this)) { |
3254 | 862 | } |
3255 | | |
3256 | 807 | llama_vocab::~llama_vocab() { |
3257 | 807 | } |
3258 | | |
3259 | 0 | void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) { |
3260 | 0 | pimpl->load(ml, kv); |
3261 | 0 | } |
3262 | | |
3263 | 0 | std::string llama_vocab::get_tokenizer_model() const { |
3264 | 0 | return pimpl->tokenizer_model; |
3265 | 0 | } |
3266 | | |
3267 | 0 | std::string llama_vocab::get_tokenizer_pre() const { |
3268 | 0 | return pimpl->tokenizer_pre; |
3269 | 0 | } |
3270 | | |
3271 | 0 | enum llama_vocab_type llama_vocab::get_type() const { |
3272 | 0 | return pimpl->type; |
3273 | 0 | } |
3274 | | |
3275 | 0 | enum llama_vocab_pre_type llama_vocab::get_pre_type() const { |
3276 | 0 | return pimpl->pre_type; |
3277 | 0 | } |
3278 | | |
3279 | 0 | uint32_t llama_vocab::n_tokens() const { |
3280 | 0 | return (uint32_t) pimpl->id_to_token.size(); |
3281 | 0 | } |
3282 | | |
3283 | 0 | uint32_t llama_vocab::n_token_types() const { |
3284 | 0 | return (uint32_t) pimpl->n_token_types; |
3285 | 0 | } |
3286 | | |
3287 | 0 | std::string llama_vocab::type_name() const {
3288 | 0 | return pimpl->type_name(); |
3289 | 0 | } |
3290 | | |
3291 | 0 | bool llama_vocab::is_normal(llama_token id) const { |
3292 | 0 | return pimpl->is_normal(id); |
3293 | 0 | } |
3294 | | |
3295 | 0 | bool llama_vocab::is_unknown(llama_token id) const { |
3296 | 0 | return pimpl->is_unknown(id); |
3297 | 0 | } |
3298 | | |
3299 | 0 | bool llama_vocab::is_control(llama_token id) const { |
3300 | 0 | return pimpl->is_control(id); |
3301 | 0 | } |
3302 | | |
3303 | 0 | bool llama_vocab::is_byte(llama_token id) const { |
3304 | 0 | return pimpl->is_byte(id); |
3305 | 0 | } |
3306 | | |
3307 | 0 | bool llama_vocab::is_user_defined(llama_token id) const { |
3308 | 0 | return pimpl->is_user_defined(id); |
3309 | 0 | } |
3310 | | |
3311 | 0 | bool llama_vocab::is_unused(llama_token id) const { |
3312 | 0 | return pimpl->is_unused(id); |
3313 | 0 | } |
3314 | | |
3315 | 0 | bool llama_vocab::is_eog(llama_token id) const { |
3316 | 0 | return pimpl->is_eog(id); |
3317 | 0 | } |
3318 | | |
3319 | 0 | uint8_t llama_vocab::token_to_byte(llama_token id) const { |
3320 | 0 | return pimpl->token_to_byte(id); |
3321 | 0 | } |
3322 | | |
3323 | 0 | llama_token llama_vocab::byte_to_token(uint8_t ch) const { |
3324 | 0 | GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE); |
3325 | 0 | static const char * hex = "0123456789ABCDEF"; |
3326 | 0 | switch (get_type()) { |
3327 | 0 | case LLAMA_VOCAB_TYPE_SPM: |
3328 | 0 | case LLAMA_VOCAB_TYPE_UGM: { |
3329 | 0 | const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; |
3330 | 0 | auto token = pimpl->token_to_id.find(buf); |
3331 | 0 | if (token != pimpl->token_to_id.end()) { |
3332 | 0 | return (*token).second; |
3333 | 0 | } |
3334 | | // Try to fall back to just the byte as a string |
3335 | 0 | const char buf2[2] = { (char)ch, 0 }; |
3336 | 0 | return pimpl->token_to_id.at(buf2); |
3337 | 0 | } |
3338 | 0 | case LLAMA_VOCAB_TYPE_WPM: |
3339 | 0 | case LLAMA_VOCAB_TYPE_BPE: { |
3340 | 0 | return pimpl->token_to_id.at(unicode_byte_to_utf8(ch)); |
3341 | 0 | } |
3342 | 0 | case LLAMA_VOCAB_TYPE_PLAMO2: { |
3343 | | // PLaMo-2 uses byte tokens in format <0xXX> |
3344 | 0 | char hex_str[8]; |
3345 | 0 | snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch); |
3346 | 0 | return pimpl->token_to_id.at(hex_str); |
3347 | 0 | } |
3348 | 0 | default: |
3349 | 0 | GGML_ABORT("fatal error"); |
3350 | 0 | } |
3351 | 0 | } |
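 | |
 | | // Example (editorial sketch, not part of the upstream source): how a raw
 | | // byte resolves to a token per vocab family, for ch == 0x41 ('A'):
 | |
 | | //     SPM/UGM : look up "<0x41>", falling back to the literal string "A"
 | | //     WPM/BPE : look up unicode_byte_to_utf8(0x41), i.e. "A"
 | | //     PLAMO2  : look up "<0x41>" (always the <0xXX> form)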
3352 | | |
3353 | 0 | llama_token llama_vocab::text_to_token(const std::string & text) const { |
3354 | 0 | GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE); |
3355 | 0 | auto it = pimpl->token_to_id.find(text); |
3356 | 0 | if (it != pimpl->token_to_id.end()) { |
3357 | 0 | return (*it).second; |
3358 | 0 | } |
3359 | 0 | return LLAMA_TOKEN_NULL; |
3360 | 0 | } |
3361 | | |
3362 | 0 | const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const { |
3363 | 0 | GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE); |
3364 | 0 | return pimpl->id_to_token.at(id); |
3365 | 0 | } |
3366 | | |
3367 | 0 | const char * llama_vocab::token_get_text(llama_token id) const { |
3368 | 0 | GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE); |
3369 | 0 | return pimpl->id_to_token.at(id).text.c_str(); |
3370 | 0 | } |
3371 | | |
3372 | 0 | float llama_vocab::token_get_score(llama_token id) const { |
3373 | 0 | GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE); |
3374 | 0 | return pimpl->id_to_token.at(id).score; |
3375 | 0 | } |
3376 | | |
3377 | 0 | llama_token_attr llama_vocab::token_get_attr(llama_token id) const { |
3378 | 0 | return pimpl->token_get_attr(id); |
3379 | 0 | } |
3380 | | |
3381 | 0 | llama_token llama_vocab::token_bos() const { |
3382 | 0 | return pimpl->special_bos_id; |
3383 | 0 | } |
3384 | | |
3385 | 0 | llama_token llama_vocab::token_eos() const { |
3386 | 0 | return pimpl->special_eos_id; |
3387 | 0 | } |
3388 | | |
3389 | 0 | llama_token llama_vocab::token_eot() const { |
3390 | 0 | return pimpl->special_eot_id; |
3391 | 0 | } |
3392 | | |
3393 | 0 | llama_token llama_vocab::token_eom() const { |
3394 | 0 | return pimpl->special_eom_id; |
3395 | 0 | } |
3396 | | |
3397 | 0 | llama_token llama_vocab::token_unk() const { |
3398 | 0 | return pimpl->special_unk_id; |
3399 | 0 | } |
3400 | | |
3401 | 0 | llama_token llama_vocab::token_sep() const { |
3402 | 0 | return pimpl->special_sep_id; |
3403 | 0 | } |
3404 | | |
3405 | 0 | llama_token llama_vocab::token_nl() const { |
3406 | 0 | return pimpl->linefeed_id; |
3407 | 0 | } |
3408 | | |
3409 | 0 | llama_token llama_vocab::token_pad() const { |
3410 | 0 | return pimpl->special_pad_id; |
3411 | 0 | } |
3412 | | |
3413 | 0 | llama_token llama_vocab::token_prefix() const { |
3414 | 0 | return pimpl->special_fim_pre_id; |
3415 | 0 | } |
3416 | | |
3417 | 0 | llama_token llama_vocab::token_middle() const { |
3418 | 0 | return pimpl->special_fim_mid_id; |
3419 | 0 | } |
3420 | | |
3421 | 0 | llama_token llama_vocab::token_suffix() const { |
3422 | 0 | return pimpl->special_fim_suf_id; |
3423 | 0 | } |
3424 | | |
3425 | 0 | llama_token llama_vocab::token_fim_pre() const { |
3426 | 0 | return pimpl->special_fim_pre_id; |
3427 | 0 | } |
3428 | | |
3429 | 0 | llama_token llama_vocab::token_fim_suf() const { |
3430 | 0 | return pimpl->special_fim_suf_id; |
3431 | 0 | } |
3432 | | |
3433 | 0 | llama_token llama_vocab::token_fim_mid() const { |
3434 | 0 | return pimpl->special_fim_mid_id; |
3435 | 0 | } |
3436 | | |
3437 | 0 | llama_token llama_vocab::token_fim_pad() const { |
3438 | 0 | return pimpl->special_fim_pad_id; |
3439 | 0 | } |
3440 | | |
3441 | 0 | llama_token llama_vocab::token_fim_rep() const { |
3442 | 0 | return pimpl->special_fim_rep_id; |
3443 | 0 | } |
3444 | | |
3445 | 0 | llama_token llama_vocab::token_fim_sep() const { |
3446 | 0 | return pimpl->special_fim_sep_id; |
3447 | 0 | } |
3448 | | |
3449 | 0 | llama_token llama_vocab::token_mask() const { |
3450 | 0 | return pimpl->special_mask_id; |
3451 | 0 | } |
3452 | | |
3453 | 0 | bool llama_vocab::get_add_space_prefix() const { |
3454 | 0 | return pimpl->add_space_prefix; |
3455 | 0 | } |
3456 | | |
3457 | 0 | bool llama_vocab::get_add_bos() const { |
3458 | 0 | return pimpl->add_bos; |
3459 | 0 | } |
3460 | | |
3461 | 0 | bool llama_vocab::get_add_eos() const { |
3462 | 0 | return pimpl->add_eos; |
3463 | 0 | } |
3464 | | |
3465 | 0 | bool llama_vocab::get_add_sep() const { |
3466 | 0 | return pimpl->add_sep; |
3467 | 0 | } |
3468 | | |
3469 | 0 | bool llama_vocab::get_ignore_merges() const { |
3470 | 0 | return pimpl->ignore_merges; |
3471 | 0 | } |
3472 | | |
3473 | 0 | bool llama_vocab::get_clean_spaces() const { |
3474 | 0 | return pimpl->clean_spaces; |
3475 | 0 | } |
3476 | | |
3477 | 0 | bool llama_vocab::get_remove_extra_whitespaces() const { |
3478 | 0 | return pimpl->remove_extra_whitespaces; |
3479 | 0 | } |
3480 | | |
3481 | 0 | bool llama_vocab::get_escape_whitespaces() const { |
3482 | 0 | return pimpl->escape_whitespaces; |
3483 | 0 | } |
3484 | | |
3485 | 0 | bool llama_vocab::get_treat_whitespace_as_suffix() const { |
3486 | 0 | return pimpl->treat_whitespace_as_suffix; |
3487 | 0 | } |
3488 | | |
3489 | 0 | int llama_vocab::max_token_len() const { |
3490 | 0 | return pimpl->max_token_len; |
3491 | 0 | } |
3492 | | |
3493 | 0 | int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const { |
3494 | 0 | GGML_ASSERT(token_left.find(' ') == std::string::npos); |
3495 | 0 | GGML_ASSERT(token_left.find('\n') == std::string::npos); |
3496 | 0 | GGML_ASSERT(token_right.find(' ') == std::string::npos); |
3497 | 0 | GGML_ASSERT(token_right.find('\n') == std::string::npos); |
3498 | |
3499 | 0 | auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right)); |
3500 | 0 | if (it == pimpl->bpe_ranks.end()) { |
3501 | 0 | return -1; |
3502 | 0 | } |
3503 | | |
3504 | 0 | return it->second; |
3505 | 0 | } |
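 | |
 | | // Example (editorial sketch, not part of the upstream source): bpe_ranks
 | | // maps a merge pair to its priority; a lower rank means an earlier merge.
 | | // Assuming the vocab contains the merge "th" + "e" -> "the":
 | |
 | | //     int rank = vocab.find_bpe_rank("th", "e");  // small non-negative rank
 | | //     int miss = vocab.find_bpe_rank("xq", "zk"); // -1: no such merge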
3506 | | |
3507 | 0 | std::vector<std::string> llama_vocab::get_bpe_merges() const { |
3508 | 0 | std::vector<std::string> result(pimpl->bpe_ranks.size()); |
3509 | |
3510 | 0 | for (const auto & pair : pimpl->bpe_ranks) { |
3511 | 0 | result[pair.second] = pair.first.first + " " + pair.first.second; |
3512 | 0 | } |
3513 | |
3514 | 0 | return result; |
3515 | 0 | } |
3516 | | |
3517 | 0 | std::vector<char> llama_vocab::get_precompiled_charsmap() const { |
3518 | 0 | return pimpl->precompiled_charsmap; |
3519 | 0 | } |
3520 | | |
3521 | | int32_t llama_vocab::tokenize( |
3522 | | const char * text, |
3523 | | int32_t text_len, |
3524 | | llama_token * tokens, |
3525 | | int32_t n_tokens_max, |
3526 | | bool add_special, |
3527 | 0 | bool parse_special) const { |
3528 | 0 | auto res = tokenize(std::string(text, text_len), add_special, parse_special); |
3529 | 0 | if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) { |
3530 | 0 | LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size()); |
3531 | 0 | return std::numeric_limits<int32_t>::min(); |
3532 | 0 | } |
3533 | | |
3534 | 0 | if (n_tokens_max < (int) res.size()) { |
3535 | | // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); |
3536 | 0 | return -((int) res.size()); |
3537 | 0 | } |
3538 | | |
3539 | 0 | for (size_t i = 0; i < res.size(); i++) { |
3540 | 0 | tokens[i] = res[i]; |
3541 | 0 | } |
3542 | |
3543 | 0 | return res.size(); |
3544 | 0 | } |
3545 | | |
3546 | | std::vector<llama_token> llama_vocab::tokenize( |
3547 | | const std::string & raw_text, |
3548 | | bool add_special, |
3549 | 0 | bool parse_special) const { |
3550 | 0 | return pimpl->tokenize(raw_text, add_special, parse_special); |
3551 | 0 | } |
3552 | | |
3553 | 0 | const std::string & llama_vocab::token_to_piece(llama_token token) const { |
3554 | 0 | return pimpl->token_to_piece(token); |
3555 | 0 | } |
3556 | | |
3557 | 0 | int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const { |
3558 | 0 | return pimpl->token_to_piece(token, buf, length, lstrip, special); |
3559 | 0 | } |
3560 | | |
3561 | | int32_t llama_vocab::detokenize( |
3562 | | const llama_token * tokens, |
3563 | | int32_t n_tokens, |
3564 | | char * text, |
3565 | | int32_t text_len_max, |
3566 | | bool remove_special, |
3567 | 0 | bool unparse_special) const { |
3568 | 0 | return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special); |
3569 | 0 | } |
3570 | | |
3571 | 0 | std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const { |
3572 | 0 | std::string text; |
3573 | 0 | text.resize(std::max(text.capacity(), tokens.size())); |
3574 | 0 | int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); |
3575 | 0 | if (n_chars < 0) { |
3576 | 0 | text.resize(-n_chars); |
3577 | 0 | n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); |
3578 | 0 | GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization |
3579 | 0 | } |
3580 | |
3581 | 0 | text.resize(n_chars); |
3582 | | |
3583 | | // NOTE: the original tokenizer decodes bytes after collecting the pieces. |
3584 | 0 | return text; |
3585 | 0 | } |
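 | |
 | | // Example (editorial sketch, not part of the upstream source): a round trip
 | | // through the convenience overloads; the resize-on-negative retry inside
 | | // means the caller never sizes the output buffer itself:
 | |
 | | //     std::vector<llama_token> toks = vocab.tokenize("Hello world", true, false);
 | | //     std::string text = vocab.detokenize(toks, /*special=*/false);
 | | //     // text == "Hello world" again (modulo special tokens and space cleanup)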
3586 | | |
3587 | 0 | void llama_vocab::print_info() const { |
3588 | 0 | pimpl->print_info(); |
3589 | 0 | } |
3590 | | |
3591 | | // |
3592 | | // interface implementation |
3593 | | // |
3594 | | |
3595 | 0 | int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) { |
3596 | 0 | return vocab->n_tokens(); |
3597 | 0 | } |
3598 | | |
3599 | | // deprecated |
3600 | 0 | int32_t llama_n_vocab(const struct llama_vocab * vocab) { |
3601 | 0 | return llama_vocab_n_tokens(vocab); |
3602 | 0 | } |
3603 | | |
3604 | 0 | enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) { |
3605 | 0 | return vocab->get_type(); |
3606 | 0 | } |
3607 | | |
3608 | 0 | const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) { |
3609 | 0 | return vocab->token_get_text(token); |
3610 | 0 | } |
3611 | | |
3612 | 0 | float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) { |
3613 | 0 | return vocab->token_get_score(token); |
3614 | 0 | } |
3615 | | |
3616 | 0 | enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) { |
3617 | 0 | return vocab->token_get_attr(token); |
3618 | 0 | } |
3619 | | |
3620 | 0 | bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) { |
3621 | 0 | return vocab->is_eog(token); |
3622 | 0 | } |
3623 | | |
3624 | 0 | bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) { |
3625 | 0 | return vocab->is_control(token); |
3626 | 0 | } |
3627 | | |
3628 | 0 | llama_token llama_vocab_bos(const struct llama_vocab * vocab) { |
3629 | 0 | return vocab->token_bos(); |
3630 | 0 | } |
3631 | | |
3632 | 0 | llama_token llama_vocab_eos(const struct llama_vocab * vocab) { |
3633 | 0 | return vocab->token_eos(); |
3634 | 0 | } |
3635 | | |
3636 | 0 | llama_token llama_vocab_eot(const struct llama_vocab * vocab) { |
3637 | 0 | return vocab->token_eot(); |
3638 | 0 | } |
3639 | | |
3640 | | // deprecated |
3641 | 0 | llama_token llama_vocab_cls(const struct llama_vocab * vocab) { |
3642 | 0 | return vocab->token_bos(); |
3643 | 0 | } |
3644 | | |
3645 | 0 | llama_token llama_vocab_sep(const struct llama_vocab * vocab) { |
3646 | 0 | return vocab->token_sep(); |
3647 | 0 | } |
3648 | | |
3649 | 0 | llama_token llama_vocab_nl (const struct llama_vocab * vocab) { |
3650 | 0 | return vocab->token_nl(); |
3651 | 0 | } |
3652 | | |
3653 | 0 | llama_token llama_vocab_pad(const struct llama_vocab * vocab) { |
3654 | 0 | return vocab->token_pad(); |
3655 | 0 | } |
3656 | | |
3657 | 0 | bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) { |
3658 | 0 | return vocab->get_add_bos(); |
3659 | 0 | } |
3660 | | |
3661 | 0 | bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) { |
3662 | 0 | return vocab->get_add_eos(); |
3663 | 0 | } |
3664 | | |
3665 | 0 | bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) { |
3666 | 0 | return vocab->get_add_sep(); |
3667 | 0 | } |
3668 | | |
3669 | 0 | llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) { |
3670 | 0 | return vocab->token_fim_pre(); |
3671 | 0 | } |
3672 | | |
3673 | 0 | llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) { |
3674 | 0 | return vocab->token_fim_suf(); |
3675 | 0 | } |
3676 | | |
3677 | 0 | llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) { |
3678 | 0 | return vocab->token_fim_mid(); |
3679 | 0 | } |
3680 | | |
3681 | 0 | llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) { |
3682 | 0 | return vocab->token_fim_pad(); |
3683 | 0 | } |
3684 | | |
3685 | 0 | llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) { |
3686 | 0 | return vocab->token_fim_rep(); |
3687 | 0 | } |
3688 | | |
3689 | 0 | llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) { |
3690 | 0 | return vocab->token_fim_sep(); |
3691 | 0 | } |
3692 | | |
3693 | 0 | llama_token llama_vocab_mask(const struct llama_vocab* vocab) { |
3694 | 0 | return vocab->token_mask(); |
3695 | 0 | } |
3696 | | |
3697 | | // deprecated |
3698 | 0 | const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) { |
3699 | 0 | return llama_vocab_get_text(vocab, token); |
3700 | 0 | } |
3701 | | |
3702 | | // deprecated |
3703 | 0 | float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) { |
3704 | 0 | return llama_vocab_get_score(vocab, token); |
3705 | 0 | } |
3706 | | |
3707 | | // deprecated |
3708 | 0 | enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) { |
3709 | 0 | return llama_vocab_get_attr(vocab, token); |
3710 | 0 | } |
3711 | | |
3712 | | // deprecated |
3713 | 0 | bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) { |
3714 | 0 | return llama_vocab_is_eog(vocab, token); |
3715 | 0 | } |
3716 | | |
3717 | | // deprecated |
3718 | 0 | bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) { |
3719 | 0 | return llama_vocab_is_control(vocab, token); |
3720 | 0 | } |
3721 | | |
3722 | | // deprecated |
3723 | 0 | llama_token llama_token_bos(const struct llama_vocab * vocab) { |
3724 | 0 | return llama_vocab_bos(vocab); |
3725 | 0 | } |
3726 | | |
3727 | | // deprecated |
3728 | 0 | llama_token llama_token_eos(const struct llama_vocab * vocab) { |
3729 | 0 | return llama_vocab_eos(vocab); |
3730 | 0 | } |
3731 | | |
3732 | | // deprecated |
3733 | 0 | llama_token llama_token_eot(const struct llama_vocab * vocab) { |
3734 | 0 | return llama_vocab_eot(vocab); |
3735 | 0 | } |
3736 | | |
3737 | | // deprecated |
3738 | 0 | llama_token llama_token_cls(const struct llama_vocab * vocab) { |
3739 | | //return llama_vocab_cls(vocab); |
3740 | 0 | return llama_vocab_bos(vocab); // avoid deprecation warning |
3741 | 0 | } |
3742 | | |
3743 | | // deprecated |
3744 | 0 | llama_token llama_token_sep(const struct llama_vocab * vocab) { |
3745 | 0 | return llama_vocab_sep(vocab); |
3746 | 0 | } |
3747 | | |
3748 | | // deprecated |
3749 | 0 | llama_token llama_token_nl (const struct llama_vocab * vocab) { |
3750 | 0 | return llama_vocab_nl(vocab); |
3751 | 0 | } |
3752 | | |
3753 | | // deprecated |
3754 | 0 | llama_token llama_token_pad(const struct llama_vocab * vocab) { |
3755 | 0 | return llama_vocab_pad(vocab); |
3756 | 0 | } |
3757 | | |
3758 | | // deprecated |
3759 | 0 | bool llama_add_bos_token(const struct llama_vocab * vocab) { |
3760 | 0 | return llama_vocab_get_add_bos(vocab); |
3761 | 0 | } |
3762 | | |
3763 | | // deprecated |
3764 | 0 | bool llama_add_eos_token(const struct llama_vocab * vocab) { |
3765 | 0 | return llama_vocab_get_add_eos(vocab); |
3766 | 0 | } |
3767 | | |
3768 | | // deprecated |
3769 | 0 | llama_token llama_token_fim_pre(const struct llama_vocab * vocab) { |
3770 | 0 | return llama_vocab_fim_pre(vocab); |
3771 | 0 | } |
3772 | | |
3773 | | // deprecated |
3774 | 0 | llama_token llama_token_fim_suf(const struct llama_vocab * vocab) { |
3775 | 0 | return llama_vocab_fim_suf(vocab); |
3776 | 0 | } |
3777 | | |
3778 | | // deprecated |
3779 | 0 | llama_token llama_token_fim_mid(const struct llama_vocab * vocab) { |
3780 | 0 | return llama_vocab_fim_mid(vocab); |
3781 | 0 | } |
3782 | | |
3783 | | // deprecated |
3784 | 0 | llama_token llama_token_fim_pad(const struct llama_vocab * vocab) { |
3785 | 0 | return llama_vocab_fim_pad(vocab); |
3786 | 0 | } |
3787 | | |
3788 | | // deprecated |
3789 | 0 | llama_token llama_token_fim_rep(const struct llama_vocab * vocab) { |
3790 | 0 | return llama_vocab_fim_rep(vocab); |
3791 | 0 | } |
3792 | | |
3793 | | // deprecated |
3794 | 0 | llama_token llama_token_fim_sep(const struct llama_vocab * vocab) { |
3795 | 0 | return llama_vocab_fim_sep(vocab); |
3796 | 0 | } |
3797 | | |
3798 | | // |
3799 | | // tokenization |
3800 | | // |
3801 | | |
3802 | | int32_t llama_tokenize( |
3803 | | const struct llama_vocab * vocab, |
3804 | | const char * text, |
3805 | | int32_t text_len, |
3806 | | llama_token * tokens, |
3807 | | int32_t n_tokens_max, |
3808 | | bool add_special, |
3809 | 0 | bool parse_special) { |
3810 | 0 | return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special); |
3811 | 0 | } |
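 | |
 | | // Example (editorial sketch, not part of the upstream source): the usual
 | | // two-call pattern for the C API; a negative return is the negated token
 | | // count, so the caller grows the buffer and retries:
 | |
 | | //     std::vector<llama_token> toks(16);
 | | //     int32_t n = llama_tokenize(vocab, text, text_len,
 | | //                                toks.data(), (int32_t) toks.size(),
 | | //                                /*add_special=*/true, /*parse_special=*/false);
 | | //     if (n < 0) {
 | | //         toks.resize(-n);
 | | //         n = llama_tokenize(vocab, text, text_len,
 | | //                            toks.data(), (int32_t) toks.size(), true, false);
 | | //     }
 | | //     toks.resize(n);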
3812 | | |
3813 | | int32_t llama_token_to_piece( |
3814 | | const struct llama_vocab * vocab, |
3815 | | llama_token token, |
3816 | | char * buf, |
3817 | | int32_t length, |
3818 | | int32_t lstrip, |
3819 | 0 | bool special) { |
3820 | 0 | return vocab->token_to_piece(token, buf, length, lstrip, special); |
3821 | 0 | } |
3822 | | |
3823 | | int32_t llama_detokenize( |
3824 | | const struct llama_vocab * vocab, |
3825 | | const llama_token * tokens, |
3826 | | int32_t n_tokens, |
3827 | | char * text, |
3828 | | int32_t text_len_max, |
3829 | | bool remove_special, |
3830 | 0 | bool unparse_special) { |
3831 | 0 | return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special); |
3832 | 0 | } |