/src/llama.cpp/src/models/models.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include "llama-model.h" |
4 | | #include "llama-graph.h" |
5 | | #include "llama-model-loader.h" |
6 | | |
7 | | // note: almost all graphs require at least sqrtf, so include cmath globally |
8 | | #include <cmath> |
9 | | |
10 | | // |
11 | | // base classes |
12 | | // |
13 | | |
// Shared base for graph builders that contain Mamba-style layers; extends llm_graph_context.
// llama_model_plamo2::graph, declared later in this header, derives from it.
14 | | struct llm_build_mamba_base : public llm_graph_context { |
15 | | llm_build_mamba_base(const llm_graph_params & params); |
16 | | |
// virtual destructor: derived builders may be destroyed through a pointer to this base
17 | | virtual ~llm_build_mamba_base() = default; |
18 | | |
// Layer builders (defined out of line). Both take the llm_graph_input_rs input, the current tensor `cur`,
// the model, the ubatch and the layer index `il`, and return a ggml_tensor *.
// NOTE(review): build_mamba_layer is non-const while build_mamba2_layer is const - confirm the asymmetry is intended.
19 | | ggml_tensor * build_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il); |
20 | | ggml_tensor * build_mamba2_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il) const; |
21 | | |
22 | | }; |
23 | | |
// Shared base for graph builders that use delta-net layers; extends llm_graph_context.
// Declares three delta-net variants (chunking, autoregressive, fused), a dispatcher, and two helpers
// that read/write cached conv and recurrent state. All members are defined out of line.
24 | | struct llm_build_delta_net_base : public llm_graph_context { |
25 | | llm_build_delta_net_base(const llm_graph_params & params); |
26 | | |
// virtual destructor: derived builders may be destroyed through a pointer to this base
27 | | virtual ~llm_build_delta_net_base() = default; |
28 | | |
// All four build_delta_net* functions share one signature: tensors q, k, v, g, b, s plus the layer index il.
// (presumably s is the incoming recurrent state, per the shape note on build_recurrent_attn below - confirm)
29 | | // returns pair of output and new state |
30 | | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking( |
31 | | ggml_tensor * q, |
32 | | ggml_tensor * k, |
33 | | ggml_tensor * v, |
34 | | ggml_tensor * g, |
35 | | ggml_tensor * b, |
36 | | ggml_tensor * s, |
37 | | int il); |
38 | | |
39 | | // returns pair of output and new state |
40 | | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive( |
41 | | ggml_tensor * q, |
42 | | ggml_tensor * k, |
43 | | ggml_tensor * v, |
44 | | ggml_tensor * g, |
45 | | ggml_tensor * b, |
46 | | ggml_tensor * s, |
47 | | int il); |
48 | | |
49 | | // use the ggml_gated_delta_net fused operator (K=1; state has shape [S_v, S_v, H_v, n_seqs]) |
50 | | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_fused( |
51 | | ggml_tensor * q, |
52 | | ggml_tensor * k, |
53 | | ggml_tensor * v, |
54 | | ggml_tensor * g, |
55 | | ggml_tensor * b, |
56 | | ggml_tensor * s, |
57 | | int il); |
58 | | |
59 | | // choose one of the implementations above (documented as based on the number of tokens; NOTE(review): three variants are declared above, not two - confirm the selection logic in the definition) |
60 | | std::pair<ggml_tensor *, ggml_tensor *> build_delta_net( |
61 | | ggml_tensor * q, |
62 | | ggml_tensor * k, |
63 | | ggml_tensor * v, |
64 | | ggml_tensor * g, |
65 | | ggml_tensor * b, |
66 | | ggml_tensor * s, |
67 | | int il); |
68 | | |
69 | | // read conv state from cache, concat with qkv_mixed, write back (single slot or per-token) |
70 | | // qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs) |
71 | | ggml_tensor * build_conv_state( |
72 | | llm_graph_input_rs * inp, |
73 | | ggml_tensor * conv_states_all, |
74 | | ggml_tensor * qkv_mixed, |
75 | | int64_t conv_kernel_size, |
76 | | int64_t conv_channels, |
77 | | int il); |
78 | | |
79 | | // run delta-net attention and write the new recurrent state(s) back to ssm_states_all |
80 | | // s: (head_v_dim, head_v_dim, num_v_heads, n_seqs); returns output: (head_v_dim, num_v_heads, n_seq_tokens, n_seqs) |
81 | | ggml_tensor * build_recurrent_attn( |
82 | | llm_graph_input_rs * inp, |
83 | | ggml_tensor * ssm_states_all, |
84 | | ggml_tensor * q, |
85 | | ggml_tensor * k, |
86 | | ggml_tensor * v, |
87 | | ggml_tensor * g, |
88 | | ggml_tensor * b, |
89 | | ggml_tensor * s, |
90 | | int il); |
91 | | }; |
92 | | |
// Shared base for RWKV6-related graph builders; extends llm_graph_context.
93 | | struct llm_build_rwkv6_base : public llm_graph_context { |
// held by reference: the referenced llama_model must outlive this builder
94 | | const llama_model & model; |
95 | | |
96 | | llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params); |
97 | | |
// virtual destructor: derived builders may be destroyed through a pointer to this base
98 | | virtual ~llm_build_rwkv6_base() = default; |
99 | | |
// channel-mix builder: takes a layer, the current tensor, x_prev and the architecture id; returns a ggml_tensor *
100 | | ggml_tensor * build_rwkv6_channel_mix(const llama_layer * layer, |
101 | | ggml_tensor * cur, |
102 | | ggml_tensor * x_prev, |
103 | | llm_arch arch) const; |
104 | | |
// time-mix builder: takes the llm_graph_input_rs input, the current tensor, x_prev, the ubatch and layer index il
105 | | ggml_tensor * build_rwkv6_time_mix(llm_graph_input_rs * inp, |
106 | | ggml_tensor * cur, |
107 | | ggml_tensor * x_prev, |
108 | | const llama_ubatch & ubatch, |
109 | | int il) const; |
110 | | }; |
111 | | |
112 | | // Base class for RWKV7-related graph builders; extends llm_graph_context |
113 | | struct llm_build_rwkv7_base : public llm_graph_context { |
// held by reference: the referenced llama_model must outlive this builder
114 | | const llama_model & model; |
115 | | |
116 | | llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params); |
117 | | |
// virtual destructor: derived builders may be destroyed through a pointer to this base
118 | | virtual ~llm_build_rwkv7_base() = default; |
119 | | |
120 | | // RWKV7-specific graph building methods (both const, defined out of line) |
121 | | ggml_tensor * build_rwkv7_channel_mix(const llama_layer * layer, |
122 | | ggml_tensor * cur, |
123 | | ggml_tensor * x_prev, |
124 | | llm_arch arch) const; |
// first_layer_value is passed as a reference to a pointer, so the callee can rebind the caller's pointer
// (presumably to share a value computed in the first layer - confirm in the definition)
125 | | ggml_tensor * build_rwkv7_time_mix(llm_graph_input_rs * inp, |
126 | | ggml_tensor * cur, |
127 | | ggml_tensor * x_prev, |
128 | | ggml_tensor *& first_layer_value, |
129 | | const llama_ubatch & ubatch, |
130 | | int il) const; |
131 | | }; |
132 | | |
133 | | // |
134 | | // models |
135 | | // |
136 | | |
// llama_model_llama: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
// llama_model_llama_embed, declared later in this header, derives from it.
137 | | struct llama_model_llama : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
138 | 0 | llama_model_llama(const struct llama_model_params & params) : llama_model_base(params) {} |
139 | | void load_arch_hparams(llama_model_loader & ml) override; |
140 | | void load_arch_tensors(llama_model_loader & ml) override; |
141 | | |
// graph builder templated on the compile-time flag `embed`; its effect is decided in the out-of-line constructor (not visible here)
142 | | template <bool embed> |
143 | | struct graph : public llm_graph_context { |
144 | | graph(const llama_model & model, const llm_graph_params & params); |
145 | | }; |
146 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
147 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
148 | | }; |
149 | | |
150 | | |
// llama_model_llama4: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
151 | | struct llama_model_llama4 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
152 | 0 | llama_model_llama4(const struct llama_model_params & params) : llama_model_base(params) {} |
153 | | void load_arch_hparams(llama_model_loader & ml) override; |
154 | | void load_arch_tensors(llama_model_loader & ml) override; |
155 | | |
// graph builder templated on the compile-time flag `iswa`; its effect is decided in the out-of-line constructor (not visible here)
156 | | template <bool iswa> |
157 | | struct graph : public llm_graph_context { |
158 | | graph(const llama_model & model, const llm_graph_params & params); |
159 | | }; |
160 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
161 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
162 | | }; |
163 | | |
164 | | |
// llama_model_llama_embed: derives from llama_model_llama and overrides only build_arch_graph.
165 | | struct llama_model_llama_embed : public llama_model_llama { |
// ctor only forwards params to llama_model_llama; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
166 | 0 | llama_model_llama_embed(const struct llama_model_params & params) : llama_model_llama(params) {} |
167 | | // reuse load_arch_hparams and load_arch_tensors from llama_model_llama |
168 | | |
// alias template: graph<embed> names the same type as llama_model_llama::graph<embed>
169 | | template <bool embed> |
170 | | using graph = llama_model_llama::graph<embed>; |
171 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
172 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
173 | | }; |
174 | | |
175 | | |
// llama_model_maincoder: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
176 | | struct llama_model_maincoder : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
177 | 0 | llama_model_maincoder(const struct llama_model_params & params) : llama_model_base(params) {} |
178 | | void load_arch_hparams(llama_model_loader & ml) override; |
179 | | void load_arch_tensors(llama_model_loader & ml) override; |
180 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
181 | | struct graph : public llm_graph_context { |
182 | | graph(const llama_model & model, const llm_graph_params & params); |
183 | | }; |
184 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
185 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
186 | | }; |
187 | | |
188 | | |
// llama_model_talkie: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
189 | | struct llama_model_talkie : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
190 | 0 | llama_model_talkie(const struct llama_model_params & params) : llama_model_base(params) {} |
191 | | void load_arch_hparams(llama_model_loader & ml) override; |
192 | | void load_arch_tensors(llama_model_loader & ml) override; |
193 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
194 | | struct graph : public llm_graph_context { |
195 | | graph(const llama_model & model, const llm_graph_params & params); |
196 | | }; |
197 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
198 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
199 | | }; |
200 | | |
201 | | |
// llama_model_deci: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
202 | | struct llama_model_deci : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
203 | 0 | llama_model_deci(const struct llama_model_params & params) : llama_model_base(params) {} |
204 | | void load_arch_hparams(llama_model_loader & ml) override; |
205 | | void load_arch_tensors(llama_model_loader & ml) override; |
206 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
207 | | struct graph : public llm_graph_context { |
208 | | graph(const llama_model & model, const llm_graph_params & params); |
209 | | }; |
210 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
211 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
212 | | }; |
213 | | |
214 | | |
// llama_model_baichuan: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
215 | | struct llama_model_baichuan : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
216 | 0 | llama_model_baichuan(const struct llama_model_params & params) : llama_model_base(params) {} |
217 | | void load_arch_hparams(llama_model_loader & ml) override; |
218 | | void load_arch_tensors(llama_model_loader & ml) override; |
219 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
220 | | struct graph : public llm_graph_context { |
221 | | graph(const llama_model & model, const llm_graph_params & params); |
222 | | }; |
223 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
224 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
225 | | }; |
226 | | |
227 | | |
// llama_model_falcon: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
228 | | struct llama_model_falcon : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
229 | 0 | llama_model_falcon(const struct llama_model_params & params) : llama_model_base(params) {} |
230 | | void load_arch_hparams(llama_model_loader & ml) override; |
231 | | void load_arch_tensors(llama_model_loader & ml) override; |
232 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
233 | | struct graph : public llm_graph_context { |
234 | | graph(const llama_model & model, const llm_graph_params & params); |
235 | | }; |
236 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
237 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
238 | | }; |
239 | | |
240 | | |
// llama_model_grok: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
241 | | struct llama_model_grok : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
242 | 0 | llama_model_grok(const struct llama_model_params & params) : llama_model_base(params) {} |
243 | | void load_arch_hparams(llama_model_loader & ml) override; |
244 | | void load_arch_tensors(llama_model_loader & ml) override; |
245 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
246 | | struct graph : public llm_graph_context { |
247 | | graph(const llama_model & model, const llm_graph_params & params); |
248 | | }; |
249 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
250 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
251 | | }; |
252 | | |
253 | | |
// llama_model_starcoder: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
254 | | struct llama_model_starcoder : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
255 | 0 | llama_model_starcoder(const struct llama_model_params & params) : llama_model_base(params) {} |
256 | | void load_arch_hparams(llama_model_loader & ml) override; |
257 | | void load_arch_tensors(llama_model_loader & ml) override; |
258 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
259 | | struct graph : public llm_graph_context { |
260 | | graph(const llama_model & model, const llm_graph_params & params); |
261 | | }; |
262 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
263 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
264 | | }; |
265 | | |
266 | | |
// llama_model_refact: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
267 | | struct llama_model_refact : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
268 | 0 | llama_model_refact(const struct llama_model_params & params) : llama_model_base(params) {} |
269 | | void load_arch_hparams(llama_model_loader & ml) override; |
270 | | void load_arch_tensors(llama_model_loader & ml) override; |
271 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
272 | | struct graph : public llm_graph_context { |
273 | | graph(const llama_model & model, const llm_graph_params & params); |
274 | | }; |
275 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
276 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
277 | | }; |
278 | | |
279 | | |
// llama_model_bert: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
// Its nested graph type is aliased by the jina_bert_v2, jina_bert_v3, nomic_bert and nomic_bert_moe models in this header.
280 | | struct llama_model_bert : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
281 | 0 | llama_model_bert(const struct llama_model_params & params) : llama_model_base(params) {} |
282 | | void load_arch_hparams(llama_model_loader & ml) override; |
283 | | void load_arch_tensors(llama_model_loader & ml) override; |
284 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
285 | | struct graph : public llm_graph_context { |
286 | | graph(const llama_model & model, const llm_graph_params & params); |
287 | | }; |
288 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
289 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
290 | | }; |
291 | | |
292 | | |
// llama_model_jina_bert_v2: llama_model_base subclass with its own hparam/tensor loading; shares the BERT graph builder type.
293 | | struct llama_model_jina_bert_v2 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
294 | 0 | llama_model_jina_bert_v2(const struct llama_model_params & params) : llama_model_base(params) {} |
295 | | void load_arch_hparams(llama_model_loader & ml) override; |
296 | | void load_arch_tensors(llama_model_loader & ml) override; |
297 | | |
// graph is an alias for llama_model_bert::graph (no separate builder type is declared here)
298 | | using graph = llama_model_bert::graph; |
299 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
300 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
301 | | }; |
302 | | |
303 | | |
// llama_model_jina_bert_v3: llama_model_base subclass with its own hparam/tensor loading; shares the BERT graph builder type.
304 | | struct llama_model_jina_bert_v3 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
305 | 0 | llama_model_jina_bert_v3(const struct llama_model_params & params) : llama_model_base(params) {} |
306 | | void load_arch_hparams(llama_model_loader & ml) override; |
307 | | void load_arch_tensors(llama_model_loader & ml) override; |
308 | | |
// graph is an alias for llama_model_bert::graph (no separate builder type is declared here)
309 | | using graph = llama_model_bert::graph; |
310 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
311 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
312 | | }; |
313 | | |
314 | | |
// llama_model_nomic_bert: llama_model_base subclass with its own hparam/tensor loading; shares the BERT graph builder type.
315 | | struct llama_model_nomic_bert : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
316 | 0 | llama_model_nomic_bert(const struct llama_model_params & params) : llama_model_base(params) {} |
317 | | void load_arch_hparams(llama_model_loader & ml) override; |
318 | | void load_arch_tensors(llama_model_loader & ml) override; |
319 | | |
// graph is an alias for llama_model_bert::graph (no separate builder type is declared here)
320 | | using graph = llama_model_bert::graph; |
321 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
322 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
323 | | }; |
324 | | |
325 | | |
// llama_model_nomic_bert_moe: llama_model_base subclass with its own hparam/tensor loading; shares the BERT graph builder type.
326 | | struct llama_model_nomic_bert_moe : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
327 | 0 | llama_model_nomic_bert_moe(const struct llama_model_params & params) : llama_model_base(params) {} |
328 | | void load_arch_hparams(llama_model_loader & ml) override; |
329 | | void load_arch_tensors(llama_model_loader & ml) override; |
330 | | |
// graph is an alias for llama_model_bert::graph (no separate builder type is declared here)
331 | | using graph = llama_model_bert::graph; |
332 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
333 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
334 | | }; |
335 | | |
336 | | |
// llama_model_modern_bert: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
337 | | struct llama_model_modern_bert : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
338 | 0 | llama_model_modern_bert(const struct llama_model_params & params) : llama_model_base(params) {} |
339 | | void load_arch_hparams(llama_model_loader & ml) override; |
340 | | void load_arch_tensors(llama_model_loader & ml) override; |
341 | | |
// own nested graph builder type (extends llm_graph_context); not an alias of llama_model_bert::graph
342 | | struct graph : public llm_graph_context { |
343 | | graph(const llama_model & model, const llm_graph_params & params); |
344 | | }; |
345 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
346 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
347 | | }; |
348 | | |
349 | | |
// llama_model_neo_bert: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
350 | | struct llama_model_neo_bert : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
351 | 0 | llama_model_neo_bert(const struct llama_model_params & params) : llama_model_base(params) {} |
352 | | void load_arch_hparams(llama_model_loader & ml) override; |
353 | | void load_arch_tensors(llama_model_loader & ml) override; |
354 | | |
// own nested graph builder type (extends llm_graph_context); not an alias of llama_model_bert::graph
355 | | struct graph : public llm_graph_context { |
356 | | graph(const llama_model & model, const llm_graph_params & params); |
357 | | }; |
358 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
359 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
360 | | }; |
361 | | |
362 | | |
// llama_model_eurobert: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
363 | | struct llama_model_eurobert : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
364 | 0 | llama_model_eurobert(const struct llama_model_params & params) : llama_model_base(params) {} |
365 | | void load_arch_hparams(llama_model_loader & ml) override; |
366 | | void load_arch_tensors(llama_model_loader & ml) override; |
367 | | |
// own nested graph builder type (extends llm_graph_context); not an alias of llama_model_bert::graph
368 | | struct graph : public llm_graph_context { |
369 | | graph(const llama_model & model, const llm_graph_params & params); |
370 | | }; |
371 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
372 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
373 | | }; |
374 | | |
375 | | |
// llama_model_bloom: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
376 | | struct llama_model_bloom : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
377 | 0 | llama_model_bloom(const struct llama_model_params & params) : llama_model_base(params) {} |
378 | | void load_arch_hparams(llama_model_loader & ml) override; |
379 | | void load_arch_tensors(llama_model_loader & ml) override; |
380 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
381 | | struct graph : public llm_graph_context { |
382 | | graph(const llama_model & model, const llm_graph_params & params); |
383 | | }; |
384 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
385 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
386 | | }; |
387 | | |
388 | | |
// llama_model_mpt: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
389 | | struct llama_model_mpt : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
390 | 0 | llama_model_mpt(const struct llama_model_params & params) : llama_model_base(params) {} |
391 | | void load_arch_hparams(llama_model_loader & ml) override; |
392 | | void load_arch_tensors(llama_model_loader & ml) override; |
393 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
394 | | struct graph : public llm_graph_context { |
395 | | graph(const llama_model & model, const llm_graph_params & params); |
396 | | }; |
397 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
398 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
399 | | }; |
400 | | |
401 | | |
// llama_model_stablelm: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
402 | | struct llama_model_stablelm : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
403 | 0 | llama_model_stablelm(const struct llama_model_params & params) : llama_model_base(params) {} |
404 | | void load_arch_hparams(llama_model_loader & ml) override; |
405 | | void load_arch_tensors(llama_model_loader & ml) override; |
406 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
407 | | struct graph : public llm_graph_context { |
408 | | graph(const llama_model & model, const llm_graph_params & params); |
409 | | }; |
410 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
411 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
412 | | }; |
413 | | |
// llama_model_mellum: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
414 | | struct llama_model_mellum : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
415 | 0 | llama_model_mellum(const struct llama_model_params & params) : llama_model_base(params) {} |
416 | | void load_arch_hparams(llama_model_loader & ml) override; |
417 | | void load_arch_tensors(llama_model_loader & ml) override; |
418 | | |
// graph builder templated on the compile-time flag `iswa`; its effect is decided in the out-of-line constructor (not visible here)
419 | | template <bool iswa> |
420 | | struct graph : public llm_graph_context { |
421 | | graph(const llama_model & model, const llm_graph_params & params); |
422 | | }; |
423 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
424 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
425 | | }; |
426 | | |
// llama_model_qwen: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
427 | | struct llama_model_qwen : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
428 | 0 | llama_model_qwen(const struct llama_model_params & params) : llama_model_base(params) {} |
429 | | void load_arch_hparams(llama_model_loader & ml) override; |
430 | | void load_arch_tensors(llama_model_loader & ml) override; |
431 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
432 | | struct graph : public llm_graph_context { |
433 | | graph(const llama_model & model, const llm_graph_params & params); |
434 | | }; |
435 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
436 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
437 | | }; |
438 | | |
439 | | |
// llama_model_qwen2: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
440 | | struct llama_model_qwen2 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
441 | 0 | llama_model_qwen2(const struct llama_model_params & params) : llama_model_base(params) {} |
442 | | void load_arch_hparams(llama_model_loader & ml) override; |
443 | | void load_arch_tensors(llama_model_loader & ml) override; |
444 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
445 | | struct graph : public llm_graph_context { |
446 | | graph(const llama_model & model, const llm_graph_params & params); |
447 | | }; |
448 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
449 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
450 | | }; |
451 | | |
452 | | |
// llama_model_dream: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
453 | | struct llama_model_dream : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
454 | 0 | llama_model_dream(const struct llama_model_params & params) : llama_model_base(params) {} |
455 | | void load_arch_hparams(llama_model_loader & ml) override; |
456 | | void load_arch_tensors(llama_model_loader & ml) override; |
457 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
458 | | struct graph : public llm_graph_context { |
459 | | graph(const llama_model & model, const llm_graph_params & params); |
460 | | }; |
461 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
462 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
463 | | }; |
464 | | |
465 | | |
// llama_model_llada: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
466 | | struct llama_model_llada : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
467 | 0 | llama_model_llada(const struct llama_model_params & params) : llama_model_base(params) {} |
468 | | void load_arch_hparams(llama_model_loader & ml) override; |
469 | | void load_arch_tensors(llama_model_loader & ml) override; |
470 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
471 | | struct graph : public llm_graph_context { |
472 | | graph(const llama_model & model, const llm_graph_params & params); |
473 | | }; |
474 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
475 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
476 | | }; |
477 | | |
478 | | |
// llama_model_llada_moe: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
479 | | struct llama_model_llada_moe : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
480 | 0 | llama_model_llada_moe(const struct llama_model_params & params) : llama_model_base(params) {} |
481 | | void load_arch_hparams(llama_model_loader & ml) override; |
482 | | void load_arch_tensors(llama_model_loader & ml) override; |
483 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
484 | | struct graph : public llm_graph_context { |
485 | | graph(const llama_model & model, const llm_graph_params & params); |
486 | | }; |
487 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
488 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
489 | | }; |
490 | | |
491 | | |
// llama_model_rnd1: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
492 | | struct llama_model_rnd1 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
493 | 0 | llama_model_rnd1(const struct llama_model_params & params) : llama_model_base(params) {} |
494 | | void load_arch_hparams(llama_model_loader & ml) override; |
495 | | void load_arch_tensors(llama_model_loader & ml) override; |
496 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
497 | | struct graph : public llm_graph_context { |
498 | | graph(const llama_model & model, const llm_graph_params & params); |
499 | | }; |
500 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
501 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
502 | | }; |
503 | | |
504 | | |
// llama_model_qwen2vl: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
505 | | struct llama_model_qwen2vl : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
506 | 0 | llama_model_qwen2vl(const struct llama_model_params & params) : llama_model_base(params) {} |
507 | | void load_arch_hparams(llama_model_loader & ml) override; |
508 | | void load_arch_tensors(llama_model_loader & ml) override; |
509 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
510 | | struct graph : public llm_graph_context { |
511 | | graph(const llama_model & model, const llm_graph_params & params); |
512 | | }; |
513 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
514 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
515 | | }; |
516 | | |
517 | | |
// llama_model_qwen2moe: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
518 | | struct llama_model_qwen2moe : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
519 | 0 | llama_model_qwen2moe(const struct llama_model_params & params) : llama_model_base(params) {} |
520 | | void load_arch_hparams(llama_model_loader & ml) override; |
521 | | void load_arch_tensors(llama_model_loader & ml) override; |
522 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
523 | | struct graph : public llm_graph_context { |
524 | | graph(const llama_model & model, const llm_graph_params & params); |
525 | | }; |
526 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
527 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
528 | | }; |
529 | | |
530 | | |
// llama_model_qwen3: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
531 | | struct llama_model_qwen3 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
532 | 0 | llama_model_qwen3(const struct llama_model_params & params) : llama_model_base(params) {} |
533 | | void load_arch_hparams(llama_model_loader & ml) override; |
534 | | void load_arch_tensors(llama_model_loader & ml) override; |
535 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
536 | | struct graph : public llm_graph_context { |
537 | | graph(const llama_model & model, const llm_graph_params & params); |
538 | | }; |
539 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
540 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
541 | | }; |
542 | | |
543 | | |
// llama_model_qwen3moe: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
544 | | struct llama_model_qwen3moe : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
545 | 0 | llama_model_qwen3moe(const struct llama_model_params & params) : llama_model_base(params) {} |
546 | | void load_arch_hparams(llama_model_loader & ml) override; |
547 | | void load_arch_tensors(llama_model_loader & ml) override; |
548 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
549 | | struct graph : public llm_graph_context { |
550 | | graph(const llama_model & model, const llm_graph_params & params); |
551 | | }; |
552 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
553 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
554 | | }; |
555 | | |
556 | | |
// llama_model_qwen3vl: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
557 | | struct llama_model_qwen3vl : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
558 | 0 | llama_model_qwen3vl(const struct llama_model_params & params) : llama_model_base(params) {} |
559 | | void load_arch_hparams(llama_model_loader & ml) override; |
560 | | void load_arch_tensors(llama_model_loader & ml) override; |
561 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
562 | | struct graph : public llm_graph_context { |
563 | | graph(const llama_model & model, const llm_graph_params & params); |
564 | | }; |
565 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
566 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
567 | | }; |
568 | | |
569 | | |
// llama_model_qwen3vlmoe: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
570 | | struct llama_model_qwen3vlmoe : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
571 | 0 | llama_model_qwen3vlmoe(const struct llama_model_params & params) : llama_model_base(params) {} |
572 | | void load_arch_hparams(llama_model_loader & ml) override; |
573 | | void load_arch_tensors(llama_model_loader & ml) override; |
574 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
575 | | struct graph : public llm_graph_context { |
576 | | graph(const llama_model & model, const llm_graph_params & params); |
577 | | }; |
578 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
579 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
580 | | }; |
581 | | |
582 | | |
// llama_model_phi2: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
583 | | struct llama_model_phi2 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
584 | 0 | llama_model_phi2(const struct llama_model_params & params) : llama_model_base(params) {} |
585 | | void load_arch_hparams(llama_model_loader & ml) override; |
586 | | void load_arch_tensors(llama_model_loader & ml) override; |
587 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
588 | | struct graph : public llm_graph_context { |
589 | | graph(const llama_model & model, const llm_graph_params & params); |
590 | | }; |
591 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
592 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
593 | | }; |
594 | | |
595 | | |
// llama_model_phi3: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
// Its graph template is aliased by llama_model_phimoe, declared later in this header.
596 | | struct llama_model_phi3 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
597 | 0 | llama_model_phi3(const struct llama_model_params & params) : llama_model_base(params) {} |
598 | | void load_arch_hparams(llama_model_loader & ml) override; |
599 | | void load_arch_tensors(llama_model_loader & ml) override; |
600 | | |
// graph builder templated on the compile-time flag `iswa`; its effect is decided in the out-of-line constructor (not visible here)
601 | | template <bool iswa> |
602 | | struct graph : public llm_graph_context { |
603 | | graph(const llama_model & model, const llm_graph_params & params); |
604 | | }; |
605 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
606 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
607 | | }; |
608 | | |
609 | | |
// llama_model_phimoe: llama_model_base subclass with its own hparam/tensor loading; shares the phi3 graph builder template.
610 | | struct llama_model_phimoe : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
611 | 0 | llama_model_phimoe(const struct llama_model_params & params) : llama_model_base(params) {} |
612 | | void load_arch_hparams(llama_model_loader & ml) override; |
613 | | void load_arch_tensors(llama_model_loader & ml) override; |
614 | | |
// alias template: graph<iswa> names the same type as llama_model_phi3::graph<iswa>
615 | | template <bool iswa> |
616 | | using graph = llama_model_phi3::graph<iswa>; |
617 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
618 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
619 | | }; |
620 | | |
621 | | |
// llama_model_plamo: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
622 | | struct llama_model_plamo : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
623 | 0 | llama_model_plamo(const struct llama_model_params & params) : llama_model_base(params) {} |
624 | | void load_arch_hparams(llama_model_loader & ml) override; |
625 | | void load_arch_tensors(llama_model_loader & ml) override; |
626 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
627 | | struct graph : public llm_graph_context { |
628 | | graph(const llama_model & model, const llm_graph_params & params); |
629 | | }; |
630 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
631 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
632 | | }; |
633 | | |
634 | | |
// llama_model_plamo2: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
635 | | struct llama_model_plamo2 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
636 | 0 | llama_model_plamo2(const struct llama_model_params & params) : llama_model_base(params) {} |
637 | | void load_arch_hparams(llama_model_loader & ml) override; |
638 | | void load_arch_tensors(llama_model_loader & ml) override; |
639 | | |
// graph builder derives from llm_build_mamba_base (not directly from llm_graph_context),
// so it inherits build_mamba_layer / build_mamba2_layer
640 | | struct graph : public llm_build_mamba_base { |
641 | | graph(const llama_model & model, const llm_graph_params & params); |
// private per-layer helpers, one taking an llm_graph_input_rs input and one taking an llm_graph_input_attn_kv input;
// both return a ggml_tensor * and are defined out of line
642 | | private: |
643 | | ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il); |
644 | | ggml_tensor * build_plamo2_attn_layer(llm_graph_input_attn_kv * inp, ggml_tensor * inp_pos, ggml_tensor * cur, |
645 | | const llama_model & model, int il); |
646 | | }; |
647 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
648 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
649 | | }; |
650 | | |
651 | | |
// llama_model_plamo3: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
652 | | struct llama_model_plamo3 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
653 | 0 | llama_model_plamo3(const struct llama_model_params & params) : llama_model_base(params) {} |
654 | | void load_arch_hparams(llama_model_loader & ml) override; |
655 | | void load_arch_tensors(llama_model_loader & ml) override; |
656 | | |
// graph builder templated on the compile-time flag `iswa`; its effect is decided in the out-of-line constructor (not visible here)
657 | | template <bool iswa> |
658 | | struct graph : public llm_graph_context { |
659 | | graph(const llama_model & model, const llm_graph_params & params); |
660 | | }; |
661 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
662 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
663 | | }; |
664 | | |
665 | | |
// llama_model_gpt2: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
666 | | struct llama_model_gpt2 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
667 | 0 | llama_model_gpt2(const struct llama_model_params & params) : llama_model_base(params) {} |
668 | | void load_arch_hparams(llama_model_loader & ml) override; |
669 | | void load_arch_tensors(llama_model_loader & ml) override; |
670 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
671 | | struct graph : public llm_graph_context { |
672 | | graph(const llama_model & model, const llm_graph_params & params); |
673 | | }; |
674 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
675 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
676 | | }; |
677 | | |
678 | | |
// llama_model_codeshell: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
679 | | struct llama_model_codeshell : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
680 | 0 | llama_model_codeshell(const struct llama_model_params & params) : llama_model_base(params) {} |
681 | | void load_arch_hparams(llama_model_loader & ml) override; |
682 | | void load_arch_tensors(llama_model_loader & ml) override; |
683 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
684 | | struct graph : public llm_graph_context { |
685 | | graph(const llama_model & model, const llm_graph_params & params); |
686 | | }; |
687 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
688 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
689 | | }; |
690 | | |
691 | | |
// llama_model_orion: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
692 | | struct llama_model_orion : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
693 | 0 | llama_model_orion(const struct llama_model_params & params) : llama_model_base(params) {} |
694 | | void load_arch_hparams(llama_model_loader & ml) override; |
695 | | void load_arch_tensors(llama_model_loader & ml) override; |
696 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
697 | | struct graph : public llm_graph_context { |
698 | | graph(const llama_model & model, const llm_graph_params & params); |
699 | | }; |
700 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
701 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
702 | | }; |
703 | | |
704 | | |
// llama_model_internlm2: llama_model_base subclass that overrides hparam loading, tensor loading and graph construction.
705 | | struct llama_model_internlm2 : public llama_model_base { |
// ctor only forwards params to llama_model_base; Count 0 = not executed in this coverage run.
// NOTE(review): one-argument ctor is not marked `explicit`.
706 | 0 | llama_model_internlm2(const struct llama_model_params & params) : llama_model_base(params) {} |
707 | | void load_arch_hparams(llama_model_loader & ml) override; |
708 | | void load_arch_tensors(llama_model_loader & ml) override; |
709 | | |
// nested graph builder type (extends llm_graph_context); constructor defined out of line
710 | | struct graph : public llm_graph_context { |
711 | | graph(const llama_model & model, const llm_graph_params & params); |
712 | | }; |
713 | | |
// returns an owning pointer to the llm_graph_context built for `params`; defined out of line
714 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
715 | | }; |
716 | | |
717 | | |
718 | | struct llama_model_minicpm3 : public llama_model_base { |
719 | 0 | llama_model_minicpm3(const struct llama_model_params & params) : llama_model_base(params) {} |
720 | | void load_arch_hparams(llama_model_loader & ml) override; |
721 | | void load_arch_tensors(llama_model_loader & ml) override; |
722 | | |
723 | | struct graph : public llm_graph_context { |
724 | | graph(const llama_model & model, const llm_graph_params & params); |
725 | | }; |
726 | | |
727 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
728 | | }; |
729 | | |
730 | | |
731 | | struct llama_model_gemma : public llama_model_base { |
732 | 0 | llama_model_gemma(const struct llama_model_params & params) : llama_model_base(params) {} |
733 | | void load_arch_hparams(llama_model_loader & ml) override; |
734 | | void load_arch_tensors(llama_model_loader & ml) override; |
735 | | |
736 | | struct graph : public llm_graph_context { |
737 | | graph(const llama_model & model, const llm_graph_params & params); |
738 | | }; |
739 | | |
740 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
741 | | }; |
742 | | |
743 | | |
744 | | struct llama_model_gemma2 : public llama_model_base { |
745 | 0 | llama_model_gemma2(const struct llama_model_params & params) : llama_model_base(params) {} |
746 | | void load_arch_hparams(llama_model_loader & ml) override; |
747 | | void load_arch_tensors(llama_model_loader & ml) override; |
748 | | |
749 | | struct graph : public llm_graph_context { |
750 | | graph(const llama_model & model, const llm_graph_params & params); |
751 | | }; |
752 | | |
753 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
754 | | }; |
755 | | |
756 | | |
757 | | struct llama_model_gemma3 : public llama_model_base { |
758 | 0 | llama_model_gemma3(const struct llama_model_params & params) : llama_model_base(params) {} |
759 | | void load_arch_hparams(llama_model_loader & ml) override; |
760 | | void load_arch_tensors(llama_model_loader & ml) override; |
761 | | |
762 | | template <bool iswa> |
763 | | struct graph : public llm_graph_context { |
764 | | graph(const llama_model & model, const llm_graph_params & params); |
765 | | }; |
766 | | |
767 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
768 | | }; |
769 | | |
770 | | |
771 | | struct llama_model_gemma3n : public llama_model_base { |
772 | 0 | llama_model_gemma3n(const struct llama_model_params & params) : llama_model_base(params) {} |
773 | | void load_arch_hparams(llama_model_loader & ml) override; |
774 | | void load_arch_tensors(llama_model_loader & ml) override; |
775 | | |
776 | | struct graph : public llm_graph_context { |
777 | | const llama_model & model; |
778 | | |
779 | | const int64_t n_embd_head; |
780 | | const int64_t n_embd_altup; |
781 | | const int64_t n_altup; |
782 | | const int i_altup_act; |
783 | | const int n_layer_sparsity = 10; // number of layers using activation sparsity |
784 | | const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95) |
785 | | |
786 | | graph(const llama_model & model, const llm_graph_params & params); |
787 | | ggml_tensor * calc_magnitude(ggml_tensor * x); |
788 | | |
789 | | // TODO: refactor in common "per-layer" functionality [TAG_PER_LAYER] |
790 | | ggml_tensor * build_inp_per_layer(); |
791 | | ggml_tensor * project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer); |
792 | | |
793 | | ggml_tensor * gaussian_topk(ggml_tensor * x); |
794 | | ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il); |
795 | | ggml_tensor * altup_predict(ggml_tensor * cur, int il); |
796 | | ggml_tensor * laurel(ggml_tensor * cur, int il); |
797 | | ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il); |
798 | | }; |
799 | | |
800 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
801 | | }; |
802 | | |
803 | | |
804 | | struct llama_model_gemma4 : public llama_model_base { |
805 | 0 | llama_model_gemma4(const struct llama_model_params & params) : llama_model_base(params) {} |
806 | | void load_arch_hparams(llama_model_loader & ml) override; |
807 | | void load_arch_tensors(llama_model_loader & ml) override; |
808 | | |
809 | | struct graph : public llm_graph_context { |
810 | | const llama_model & model; |
811 | | |
812 | | const int64_t n_embd_per_layer; |
813 | | |
814 | | graph(const llama_model & model, const llm_graph_params & params); |
815 | | |
816 | | // TODO: refactor in common "per-layer" functionality [TAG_PER_LAYER] |
817 | | ggml_tensor * build_inp_per_layer(); |
818 | | ggml_tensor * project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer); |
819 | | }; |
820 | | |
821 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
822 | | }; |
823 | | |
824 | | |
825 | | struct llama_model_gemma4_assistant : public llama_model_base { |
826 | 0 | llama_model_gemma4_assistant(const struct llama_model_params & params) : llama_model_base(params) {} |
827 | | void load_arch_hparams(llama_model_loader & ml) override; |
828 | | void load_arch_tensors(llama_model_loader & ml) override; |
829 | | |
830 | | struct graph : public llm_graph_context { |
831 | | graph(const llama_model & model, const llm_graph_params & params); |
832 | | }; |
833 | | |
834 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
835 | | }; |
836 | | |
837 | | |
838 | | struct llama_model_gemma_embedding : public llama_model_base { |
839 | 0 | llama_model_gemma_embedding(const struct llama_model_params & params) : llama_model_base(params) {} |
840 | | void load_arch_hparams(llama_model_loader & ml) override; |
841 | | void load_arch_tensors(llama_model_loader & ml) override; |
842 | | |
843 | | struct graph : public llm_graph_context { |
844 | | graph(const llama_model & model, const llm_graph_params & params); |
845 | | }; |
846 | | |
847 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
848 | | }; |
849 | | |
850 | | |
851 | | struct llama_model_starcoder2 : public llama_model_base { |
852 | 0 | llama_model_starcoder2(const struct llama_model_params & params) : llama_model_base(params) {} |
853 | | void load_arch_hparams(llama_model_loader & ml) override; |
854 | | void load_arch_tensors(llama_model_loader & ml) override; |
855 | | |
856 | | struct graph : public llm_graph_context { |
857 | | graph(const llama_model & model, const llm_graph_params & params); |
858 | | }; |
859 | | |
860 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
861 | | }; |
862 | | |
863 | | |
864 | | struct llama_model_mamba : public llama_model_base { |
865 | 0 | llama_model_mamba(const struct llama_model_params & params) : llama_model_base(params) {} |
866 | | void load_arch_hparams(llama_model_loader & ml) override; |
867 | | void load_arch_tensors(llama_model_loader & ml) override; |
868 | | |
869 | | struct graph : public llm_build_mamba_base { |
870 | | graph(const llama_model & model, const llm_graph_params & params); |
871 | | }; |
872 | | |
873 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
874 | | }; |
875 | | |
876 | | |
877 | | struct llama_model_mamba2 : public llama_model_base { |
878 | 0 | llama_model_mamba2(const struct llama_model_params & params) : llama_model_base(params) {} |
879 | | void load_arch_hparams(llama_model_loader & ml) override; |
880 | | void load_arch_tensors(llama_model_loader & ml) override; |
881 | | |
882 | | using graph = llama_model_mamba::graph; |
883 | | |
884 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
885 | | }; |
886 | | |
887 | | |
888 | | struct llama_model_jamba : public llama_model_base { |
889 | 0 | llama_model_jamba(const struct llama_model_params & params) : llama_model_base(params) {} |
890 | | void load_arch_hparams(llama_model_loader & ml) override; |
891 | | void load_arch_tensors(llama_model_loader & ml) override; |
892 | | |
893 | | struct graph : public llm_build_mamba_base { |
894 | | graph(const llama_model & model, const llm_graph_params & params); |
895 | | }; |
896 | | |
897 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
898 | | }; |
899 | | |
900 | | |
901 | | struct llama_model_xverse : public llama_model_base { |
902 | 0 | llama_model_xverse(const struct llama_model_params & params) : llama_model_base(params) {} |
903 | | void load_arch_hparams(llama_model_loader & ml) override; |
904 | | void load_arch_tensors(llama_model_loader & ml) override; |
905 | | |
906 | | struct graph : public llm_graph_context { |
907 | | graph(const llama_model & model, const llm_graph_params & params); |
908 | | }; |
909 | | |
910 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
911 | | }; |
912 | | |
913 | | |
914 | | struct llama_model_command_r : public llama_model_base { |
915 | 0 | llama_model_command_r(const struct llama_model_params & params) : llama_model_base(params) {} |
916 | | void load_arch_hparams(llama_model_loader & ml) override; |
917 | | void load_arch_tensors(llama_model_loader & ml) override; |
918 | | |
919 | | struct graph : public llm_graph_context { |
920 | | graph(const llama_model & model, const llm_graph_params & params); |
921 | | }; |
922 | | |
923 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
924 | | }; |
925 | | |
926 | | |
927 | | struct llama_model_cohere2 : public llama_model_base { |
928 | 0 | llama_model_cohere2(const struct llama_model_params & params) : llama_model_base(params) {} |
929 | | void load_arch_hparams(llama_model_loader & ml) override; |
930 | | void load_arch_tensors(llama_model_loader & ml) override; |
931 | | |
932 | | struct graph : public llm_graph_context { |
933 | | graph(const llama_model & model, const llm_graph_params & params); |
934 | | }; |
935 | | |
936 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
937 | | }; |
938 | | |
939 | | |
940 | | struct llama_model_cohere2moe : public llama_model_base { |
941 | 0 | llama_model_cohere2moe(const struct llama_model_params & params) : llama_model_base(params) {} |
942 | | void load_arch_hparams(llama_model_loader & ml) override; |
943 | | void load_arch_tensors(llama_model_loader & ml) override; |
944 | | |
945 | | struct graph : public llm_graph_context { |
946 | | graph(const llama_model & model, const llm_graph_params & params); |
947 | | }; |
948 | | |
949 | | struct graph_mtp : public llm_graph_context { |
950 | | graph_mtp(const llama_model & model, const llm_graph_params & params); |
951 | | }; |
952 | | |
953 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
954 | | }; |
955 | | |
956 | | |
957 | | struct llama_model_dbrx : public llama_model_base { |
958 | 0 | llama_model_dbrx(const struct llama_model_params & params) : llama_model_base(params) {} |
959 | | void load_arch_hparams(llama_model_loader & ml) override; |
960 | | void load_arch_tensors(llama_model_loader & ml) override; |
961 | | |
962 | | struct graph : public llm_graph_context { |
963 | | graph(const llama_model & model, const llm_graph_params & params); |
964 | | }; |
965 | | |
966 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
967 | | }; |
968 | | |
969 | | |
970 | | struct llama_model_olmo : public llama_model_base { |
971 | 0 | llama_model_olmo(const struct llama_model_params & params) : llama_model_base(params) {} |
972 | | void load_arch_hparams(llama_model_loader & ml) override; |
973 | | void load_arch_tensors(llama_model_loader & ml) override; |
974 | | |
975 | | struct graph : public llm_graph_context { |
976 | | graph(const llama_model & model, const llm_graph_params & params); |
977 | | }; |
978 | | |
979 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
980 | | }; |
981 | | |
982 | | |
983 | | struct llama_model_olmo2 : public llama_model_base { |
984 | 0 | llama_model_olmo2(const struct llama_model_params & params) : llama_model_base(params) {} |
985 | | void load_arch_hparams(llama_model_loader & ml) override; |
986 | | void load_arch_tensors(llama_model_loader & ml) override; |
987 | | |
988 | | template <bool iswa> |
989 | | struct graph : public llm_graph_context { |
990 | | graph(const llama_model & model, const llm_graph_params & params); |
991 | | }; |
992 | | |
993 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
994 | | }; |
995 | | |
996 | | |
997 | | struct llama_model_olmoe : public llama_model_base { |
998 | 0 | llama_model_olmoe(const struct llama_model_params & params) : llama_model_base(params) {} |
999 | | void load_arch_hparams(llama_model_loader & ml) override; |
1000 | | void load_arch_tensors(llama_model_loader & ml) override; |
1001 | | |
1002 | | struct graph : public llm_graph_context { |
1003 | | graph(const llama_model & model, const llm_graph_params & params); |
1004 | | }; |
1005 | | |
1006 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1007 | | }; |
1008 | | |
1009 | | |
1010 | | struct llama_model_openelm : public llama_model_base { |
1011 | 0 | llama_model_openelm(const struct llama_model_params & params) : llama_model_base(params) {} |
1012 | | void load_arch_hparams(llama_model_loader & ml) override; |
1013 | | void load_arch_tensors(llama_model_loader & ml) override; |
1014 | | |
1015 | | struct graph : public llm_graph_context { |
1016 | | graph(const llama_model & model, const llm_graph_params & params); |
1017 | | }; |
1018 | | |
1019 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1020 | | }; |
1021 | | |
1022 | | |
1023 | | struct llama_model_gptneox : public llama_model_base { |
1024 | 0 | llama_model_gptneox(const struct llama_model_params & params) : llama_model_base(params) {} |
1025 | | void load_arch_hparams(llama_model_loader & ml) override; |
1026 | | void load_arch_tensors(llama_model_loader & ml) override; |
1027 | | |
1028 | | struct graph : public llm_graph_context { |
1029 | | graph(const llama_model & model, const llm_graph_params & params); |
1030 | | }; |
1031 | | |
1032 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1033 | | }; |
1034 | | |
1035 | | |
1036 | | struct llama_model_arctic : public llama_model_base { |
1037 | 0 | llama_model_arctic(const struct llama_model_params & params) : llama_model_base(params) {} |
1038 | | void load_arch_hparams(llama_model_loader & ml) override; |
1039 | | void load_arch_tensors(llama_model_loader & ml) override; |
1040 | | |
1041 | | struct graph : public llm_graph_context { |
1042 | | graph(const llama_model & model, const llm_graph_params & params); |
1043 | | }; |
1044 | | |
1045 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1046 | | }; |
1047 | | |
1048 | | |
1049 | | struct llama_model_deepseek : public llama_model_base { |
1050 | 0 | llama_model_deepseek(const struct llama_model_params & params) : llama_model_base(params) {} |
1051 | | void load_arch_hparams(llama_model_loader & ml) override; |
1052 | | void load_arch_tensors(llama_model_loader & ml) override; |
1053 | | |
1054 | | struct graph : public llm_graph_context { |
1055 | | graph(const llama_model & model, const llm_graph_params & params); |
1056 | | }; |
1057 | | |
1058 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1059 | | }; |
1060 | | |
1061 | | |
1062 | | struct llama_model_deepseek2 : public llama_model_base { |
1063 | 0 | llama_model_deepseek2(const struct llama_model_params & params) : llama_model_base(params) {} |
1064 | | void load_arch_hparams(llama_model_loader & ml) override; |
1065 | | void load_arch_tensors(llama_model_loader & ml) override; |
1066 | | |
1067 | | struct graph : public llm_graph_context { |
1068 | | graph(const llama_model & model, const llm_graph_params & params); |
1069 | | }; |
1070 | | |
1071 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1072 | | }; |
1073 | | |
1074 | | |
1075 | | struct llama_model_deepseek32 : public llama_model_base { |
1076 | 0 | llama_model_deepseek32(const struct llama_model_params & params) : llama_model_base(params) {} |
1077 | | void load_arch_hparams(llama_model_loader & ml) override; |
1078 | | void load_arch_tensors(llama_model_loader & ml) override; |
1079 | | |
1080 | | struct graph : public llm_graph_context { |
1081 | | graph(const llama_model & model, const llm_graph_params & params); |
1082 | | }; |
1083 | | |
1084 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1085 | | }; |
1086 | | |
1087 | | |
1088 | | struct llama_model_deepseek2ocr : public llama_model_base { |
1089 | 0 | llama_model_deepseek2ocr(const struct llama_model_params & params) : llama_model_base(params) {} |
1090 | | void load_arch_hparams(llama_model_loader & ml) override; |
1091 | | void load_arch_tensors(llama_model_loader & ml) override; |
1092 | | |
1093 | | using graph = llama_model_deepseek2::graph; |
1094 | | |
1095 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1096 | | }; |
1097 | | |
1098 | | |
1099 | | struct llama_model_glm_dsa : public llama_model_base { |
1100 | 0 | llama_model_glm_dsa(const struct llama_model_params & params) : llama_model_base(params) {} |
1101 | | void load_arch_hparams(llama_model_loader & ml) override; |
1102 | | void load_arch_tensors(llama_model_loader & ml) override; |
1103 | | |
1104 | | using graph = llama_model_deepseek2::graph; |
1105 | | |
1106 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1107 | | }; |
1108 | | |
1109 | | struct llama_model_eagle3 : public llama_model_base { |
1110 | 0 | llama_model_eagle3(const struct llama_model_params & params) : llama_model_base(params) {} |
1111 | | void load_arch_hparams(llama_model_loader & ml) override; |
1112 | | void load_arch_tensors(llama_model_loader & ml) override; |
1113 | | |
1114 | | template <bool is_enc> |
1115 | | struct graph : public llm_graph_context { |
1116 | | graph(const llama_model & model, const llm_graph_params & params); |
1117 | | |
1118 | | ggml_tensor * build_inp_embd_enc() const; |
1119 | | }; |
1120 | | |
1121 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1122 | | }; |
1123 | | |
1124 | | |
1125 | | struct llama_model_mistral4 : public llama_model_deepseek2 { |
1126 | 0 | llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {} |
1127 | | // reuse load_arch_hparams and load_arch_tensors from llama_model_deepseek2 |
1128 | | |
1129 | | using graph = llama_model_deepseek2::graph; |
1130 | | |
1131 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1132 | | }; |
1133 | | |
1134 | | |
1135 | | struct llama_model_chatglm : public llama_model_base { |
1136 | 0 | llama_model_chatglm(const struct llama_model_params & params) : llama_model_base(params) {} |
1137 | | void load_arch_hparams(llama_model_loader & ml) override; |
1138 | | void load_arch_tensors(llama_model_loader & ml) override; |
1139 | | |
1140 | | struct graph : public llm_graph_context { |
1141 | | graph(const llama_model & model, const llm_graph_params & params); |
1142 | | }; |
1143 | | |
1144 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1145 | | }; |
1146 | | |
1147 | | |
1148 | | struct llama_model_glm4 : public llama_model_base { |
1149 | 0 | llama_model_glm4(const struct llama_model_params & params) : llama_model_base(params) {} |
1150 | | void load_arch_hparams(llama_model_loader & ml) override; |
1151 | | void load_arch_tensors(llama_model_loader & ml) override; |
1152 | | |
1153 | | struct graph : public llm_graph_context { |
1154 | | graph(const llama_model & model, const llm_graph_params & params); |
1155 | | }; |
1156 | | |
1157 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1158 | | }; |
1159 | | |
1160 | | |
1161 | | struct llama_model_glm4_moe : public llama_model_base { |
1162 | 0 | llama_model_glm4_moe(const struct llama_model_params & params) : llama_model_base(params) {} |
1163 | | void load_arch_hparams(llama_model_loader & ml) override; |
1164 | | void load_arch_tensors(llama_model_loader & ml) override; |
1165 | | |
1166 | | struct graph : public llm_graph_context { |
1167 | | graph(const llama_model & model, const llm_graph_params & params); |
1168 | | }; |
1169 | | |
1170 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1171 | | }; |
1172 | | |
1173 | | |
1174 | | struct llama_model_bitnet : public llama_model_base { |
1175 | 0 | llama_model_bitnet(const struct llama_model_params & params) : llama_model_base(params) {} |
1176 | | void load_arch_hparams(llama_model_loader & ml) override; |
1177 | | void load_arch_tensors(llama_model_loader & ml) override; |
1178 | | |
1179 | | struct graph : public llm_graph_context { |
1180 | | graph(const llama_model & model, const llm_graph_params & params); |
1181 | | }; |
1182 | | |
1183 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1184 | | }; |
1185 | | |
1186 | | |
1187 | | struct llama_model_t5 : public llama_model_base { |
1188 | 0 | llama_model_t5(const struct llama_model_params & params) : llama_model_base(params) {} |
1189 | | void load_arch_hparams(llama_model_loader & ml) override; |
1190 | | void load_arch_tensors(llama_model_loader & ml) override; |
1191 | | |
1192 | | template <bool is_enc> |
1193 | | struct graph : public llm_graph_context { |
1194 | | graph(const llama_model & model, const llm_graph_params & params); |
1195 | | }; |
1196 | | |
1197 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1198 | | }; |
1199 | | |
1200 | | |
1201 | | struct llama_model_t5encoder : public llama_model_base { |
1202 | 0 | llama_model_t5encoder(const struct llama_model_params & params) : llama_model_base(params) {} |
1203 | | void load_arch_hparams(llama_model_loader & ml) override; |
1204 | | void load_arch_tensors(llama_model_loader & ml) override; |
1205 | | |
1206 | | using graph = llama_model_t5::graph<true>; |
1207 | | |
1208 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1209 | | }; |
1210 | | |
1211 | | |
1212 | | struct llama_model_jais : public llama_model_base { |
1213 | 0 | llama_model_jais(const struct llama_model_params & params) : llama_model_base(params) {} |
1214 | | void load_arch_hparams(llama_model_loader & ml) override; |
1215 | | void load_arch_tensors(llama_model_loader & ml) override; |
1216 | | |
1217 | | struct graph : public llm_graph_context { |
1218 | | graph(const llama_model & model, const llm_graph_params & params); |
1219 | | }; |
1220 | | |
1221 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1222 | | }; |
1223 | | |
1224 | | |
1225 | | struct llama_model_jais2 : public llama_model_base { |
1226 | 0 | llama_model_jais2(const struct llama_model_params & params) : llama_model_base(params) {} |
1227 | | void load_arch_hparams(llama_model_loader & ml) override; |
1228 | | void load_arch_tensors(llama_model_loader & ml) override; |
1229 | | |
1230 | | struct graph : public llm_graph_context { |
1231 | | graph(const llama_model & model, const llm_graph_params & params); |
1232 | | }; |
1233 | | |
1234 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1235 | | }; |
1236 | | |
1237 | | |
1238 | | struct llama_model_nemotron : public llama_model_base { |
1239 | 0 | llama_model_nemotron(const struct llama_model_params & params) : llama_model_base(params) {} |
1240 | | void load_arch_hparams(llama_model_loader & ml) override; |
1241 | | void load_arch_tensors(llama_model_loader & ml) override; |
1242 | | |
1243 | | struct graph : public llm_graph_context { |
1244 | | graph(const llama_model & model, const llm_graph_params & params); |
1245 | | }; |
1246 | | |
1247 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1248 | | }; |
1249 | | |
1250 | | |
1251 | | struct llama_model_nemotron_h : public llama_model_base { |
1252 | 0 | llama_model_nemotron_h(const struct llama_model_params & params) : llama_model_base(params) {} |
1253 | | void load_arch_hparams(llama_model_loader & ml) override; |
1254 | | void load_arch_tensors(llama_model_loader & ml) override; |
1255 | | |
1256 | | struct graph : public llm_build_mamba_base { |
1257 | | graph(const llama_model & model, const llm_graph_params & params); |
1258 | | ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il); |
1259 | | ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn, |
1260 | | const llama_model & model, int64_t n_embd_head, int il); |
1261 | | }; |
1262 | | |
1263 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1264 | | }; |
1265 | | |
1266 | | |
1267 | | struct llama_model_nemotron_h_moe : public llama_model_nemotron_h { |
1268 | 0 | llama_model_nemotron_h_moe(const struct llama_model_params & params) : llama_model_nemotron_h(params) {} |
1269 | | // reuse load_arch_hparams and load_arch_tensors from llama_model_nemotron_h |
1270 | | |
1271 | | using graph = llama_model_nemotron_h::graph; |
1272 | | |
1273 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1274 | | }; |
1275 | | |
1276 | | |
1277 | | struct llama_model_exaone : public llama_model_base { |
1278 | 0 | llama_model_exaone(const struct llama_model_params & params) : llama_model_base(params) {} |
1279 | | void load_arch_hparams(llama_model_loader & ml) override; |
1280 | | void load_arch_tensors(llama_model_loader & ml) override; |
1281 | | |
1282 | | struct graph : public llm_graph_context { |
1283 | | graph(const llama_model & model, const llm_graph_params & params); |
1284 | | }; |
1285 | | |
1286 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1287 | | }; |
1288 | | |
1289 | | |
1290 | | struct llama_model_exaone4 : public llama_model_base { |
1291 | 0 | llama_model_exaone4(const struct llama_model_params & params) : llama_model_base(params) {} |
1292 | | void load_arch_hparams(llama_model_loader & ml) override; |
1293 | | void load_arch_tensors(llama_model_loader & ml) override; |
1294 | | |
1295 | | template <bool iswa> |
1296 | | struct graph : public llm_graph_context { |
1297 | | graph(const llama_model & model, const llm_graph_params & params); |
1298 | | }; |
1299 | | |
1300 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1301 | | }; |
1302 | | |
1303 | | |
1304 | | struct llama_model_exaone_moe : public llama_model_base { |
1305 | 0 | llama_model_exaone_moe(const struct llama_model_params & params) : llama_model_base(params) {} |
1306 | | void load_arch_hparams(llama_model_loader & ml) override; |
1307 | | void load_arch_tensors(llama_model_loader & ml) override; |
1308 | | |
1309 | | struct graph : public llm_graph_context { |
1310 | | graph(const llama_model & model, const llm_graph_params & params); |
1311 | | }; |
1312 | | |
1313 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1314 | | }; |
1315 | | |
1316 | | |
1317 | | struct llama_model_rwkv6 : public llama_model_base { |
1318 | 0 | llama_model_rwkv6(const struct llama_model_params & params) : llama_model_base(params) {} |
1319 | | void load_arch_hparams(llama_model_loader & ml) override; |
1320 | | void load_arch_tensors(llama_model_loader & ml) override; |
1321 | | |
1322 | | struct graph : public llm_build_rwkv6_base { |
1323 | | graph(const llama_model & model, const llm_graph_params & params); |
1324 | | }; |
1325 | | |
1326 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1327 | | }; |
1328 | | |
1329 | | |
1330 | | struct llama_model_rwkv6qwen2 : public llama_model_base { |
1331 | 0 | llama_model_rwkv6qwen2(const struct llama_model_params & params) : llama_model_base(params) {} |
1332 | | void load_arch_hparams(llama_model_loader & ml) override; |
1333 | | void load_arch_tensors(llama_model_loader & ml) override; |
1334 | | |
1335 | | struct graph : public llm_build_rwkv6_base { |
1336 | | graph(const llama_model & model, const llm_graph_params & params); |
1337 | | }; |
1338 | | |
1339 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1340 | | }; |
1341 | | |
1342 | | |
1343 | | struct llama_model_rwkv7 : public llama_model_base { |
1344 | 0 | llama_model_rwkv7(const struct llama_model_params & params) : llama_model_base(params) {} |
1345 | | void load_arch_hparams(llama_model_loader & ml) override; |
1346 | | void load_arch_tensors(llama_model_loader & ml) override; |
1347 | | |
1348 | | struct graph : public llm_build_rwkv7_base { |
1349 | | graph(const llama_model & model, const llm_graph_params & params); |
1350 | | }; |
1351 | | |
1352 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1353 | | }; |
1354 | | |
1355 | | |
1356 | | struct llama_model_arwkv7 : public llama_model_base { |
1357 | 0 | llama_model_arwkv7(const struct llama_model_params & params) : llama_model_base(params) {} |
1358 | | void load_arch_hparams(llama_model_loader & ml) override; |
1359 | | void load_arch_tensors(llama_model_loader & ml) override; |
1360 | | |
1361 | | struct graph : public llm_build_rwkv7_base { |
1362 | | graph(const llama_model & model, const llm_graph_params & params); |
1363 | | }; |
1364 | | |
1365 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1366 | | }; |
1367 | | |
1368 | | |
1369 | | struct llama_model_granite : public llama_model_base { |
1370 | 0 | llama_model_granite(const struct llama_model_params & params) : llama_model_base(params) {} |
1371 | | void load_arch_hparams(llama_model_loader & ml) override; |
1372 | | void load_arch_tensors(llama_model_loader & ml) override; |
1373 | | |
1374 | | struct graph : public llm_graph_context { |
1375 | | graph(const llama_model & model, const llm_graph_params & params); |
1376 | | |
1377 | | private: |
1378 | | ggml_tensor * build_attention_layer( |
1379 | | ggml_tensor * cur, |
1380 | | ggml_tensor * inp_pos, |
1381 | | llm_graph_input_attn_kv * inp_attn, |
1382 | | const llama_model & model, |
1383 | | const int64_t n_embd_head, |
1384 | | const int il); |
1385 | | |
1386 | | ggml_tensor * build_layer_ffn( |
1387 | | ggml_tensor * cur, |
1388 | | ggml_tensor * inpSA, |
1389 | | const llama_model & model, |
1390 | | const int il); |
1391 | | }; |
1392 | | |
1393 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1394 | | }; |
1395 | | |
1396 | | |
1397 | | struct llama_model_granite_moe : public llama_model_base { |
1398 | 0 | llama_model_granite_moe(const struct llama_model_params & params) : llama_model_base(params) {} |
1399 | | void load_arch_hparams(llama_model_loader & ml) override; |
1400 | | void load_arch_tensors(llama_model_loader & ml) override; |
1401 | | |
1402 | | using graph = llama_model_granite::graph; |
1403 | | |
1404 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1405 | | }; |
1406 | | |
1407 | | |
1408 | | struct llama_model_minicpm : public llama_model_base { |
1409 | 0 | llama_model_minicpm(const struct llama_model_params & params) : llama_model_base(params) {} |
1410 | | void load_arch_hparams(llama_model_loader & ml) override; |
1411 | | void load_arch_tensors(llama_model_loader & ml) override; |
1412 | | |
1413 | | using graph = llama_model_granite::graph; |
1414 | | |
1415 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1416 | | }; |
1417 | | |
1418 | | |
1419 | | struct llama_model_granite_hybrid : public llama_model_base { |
1420 | 0 | llama_model_granite_hybrid(const struct llama_model_params & params) : llama_model_base(params) {} |
1421 | | void load_arch_hparams(llama_model_loader & ml) override; |
1422 | | void load_arch_tensors(llama_model_loader & ml) override; |
1423 | | |
1424 | | struct graph : public llm_build_mamba_base { |
1425 | | graph(const llama_model & model, const llm_graph_params & params); |
1426 | | ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il); |
1427 | | ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, |
1428 | | const llama_model & model,const int64_t n_embd_head, const int il); |
1429 | | }; |
1430 | | |
1431 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1432 | | }; |
1433 | | |
1434 | | |
1435 | | struct llama_model_chameleon : public llama_model_base { |
1436 | 0 | llama_model_chameleon(const struct llama_model_params & params) : llama_model_base(params) {} |
1437 | | void load_arch_hparams(llama_model_loader & ml) override; |
1438 | | void load_arch_tensors(llama_model_loader & ml) override; |
1439 | | |
1440 | | struct graph : public llm_graph_context { |
1441 | | graph(const llama_model & model, const llm_graph_params & params); |
1442 | | }; |
1443 | | |
1444 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1445 | | }; |
1446 | | |
1447 | | |
1448 | | struct llama_model_wavtokenizer_dec : public llama_model_base { |
1449 | 0 | llama_model_wavtokenizer_dec(const struct llama_model_params & params) : llama_model_base(params) {} |
1450 | | void load_arch_hparams(llama_model_loader & ml) override; |
1451 | | void load_arch_tensors(llama_model_loader & ml) override; |
1452 | | |
1453 | | struct graph : public llm_graph_context { |
1454 | | graph(const llama_model & model, const llm_graph_params & params); |
1455 | | }; |
1456 | | |
1457 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1458 | | }; |
1459 | | |
1460 | | |
1461 | | struct llama_model_plm : public llama_model_base { |
1462 | 0 | llama_model_plm(const struct llama_model_params & params) : llama_model_base(params) {} |
1463 | | void load_arch_hparams(llama_model_loader & ml) override; |
1464 | | void load_arch_tensors(llama_model_loader & ml) override; |
1465 | | |
1466 | | struct graph : public llm_graph_context { |
1467 | | graph(const llama_model & model, const llm_graph_params & params); |
1468 | | }; |
1469 | | |
1470 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1471 | | }; |
1472 | | |
1473 | | |
1474 | | struct llama_model_bailingmoe : public llama_model_base { |
1475 | 0 | llama_model_bailingmoe(const struct llama_model_params & params) : llama_model_base(params) {} |
1476 | | void load_arch_hparams(llama_model_loader & ml) override; |
1477 | | void load_arch_tensors(llama_model_loader & ml) override; |
1478 | | |
1479 | | struct graph : public llm_graph_context { |
1480 | | graph(const llama_model & model, const llm_graph_params & params); |
1481 | | }; |
1482 | | |
1483 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1484 | | }; |
1485 | | |
1486 | | |
1487 | | struct llama_model_bailingmoe2 : public llama_model_base { |
1488 | 0 | llama_model_bailingmoe2(const struct llama_model_params & params) : llama_model_base(params) {} |
1489 | | void load_arch_hparams(llama_model_loader & ml) override; |
1490 | | void load_arch_tensors(llama_model_loader & ml) override; |
1491 | | |
1492 | | struct graph : public llm_graph_context { |
1493 | | graph(const llama_model & model, const llm_graph_params & params); |
1494 | | }; |
1495 | | |
1496 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1497 | | }; |
1498 | | |
1499 | | |
1500 | | struct llama_model_seed_oss : public llama_model_base { |
1501 | 0 | llama_model_seed_oss(const struct llama_model_params & params) : llama_model_base(params) {} |
1502 | | void load_arch_hparams(llama_model_loader & ml) override; |
1503 | | void load_arch_tensors(llama_model_loader & ml) override; |
1504 | | |
1505 | | struct graph : public llm_graph_context { |
1506 | | graph(const llama_model & model, const llm_graph_params & params); |
1507 | | }; |
1508 | | |
1509 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1510 | | }; |
1511 | | |
1512 | | |
1513 | | struct llama_model_dots1 : public llama_model_base { |
1514 | 0 | llama_model_dots1(const struct llama_model_params & params) : llama_model_base(params) {} |
1515 | | void load_arch_hparams(llama_model_loader & ml) override; |
1516 | | void load_arch_tensors(llama_model_loader & ml) override; |
1517 | | |
1518 | | struct graph : public llm_graph_context { |
1519 | | graph(const llama_model & model, const llm_graph_params & params); |
1520 | | }; |
1521 | | |
1522 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1523 | | }; |
1524 | | |
1525 | | |
1526 | | struct llama_model_arcee : public llama_model_base { |
1527 | 0 | llama_model_arcee(const struct llama_model_params & params) : llama_model_base(params) {} |
1528 | | void load_arch_hparams(llama_model_loader & ml) override; |
1529 | | void load_arch_tensors(llama_model_loader & ml) override; |
1530 | | |
1531 | | struct graph : public llm_graph_context { |
1532 | | graph(const llama_model & model, const llm_graph_params & params); |
1533 | | }; |
1534 | | |
1535 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1536 | | }; |
1537 | | |
1538 | | |
1539 | | struct llama_model_afmoe : public llama_model_base { |
1540 | 0 | llama_model_afmoe(const struct llama_model_params & params) : llama_model_base(params) {} |
1541 | | void load_arch_hparams(llama_model_loader & ml) override; |
1542 | | void load_arch_tensors(llama_model_loader & ml) override; |
1543 | | |
1544 | | struct graph : public llm_graph_context { |
1545 | | graph(const llama_model & model, const llm_graph_params & params); |
1546 | | }; |
1547 | | |
1548 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1549 | | }; |
1550 | | |
1551 | | |
1552 | | struct llama_model_ernie4_5 : public llama_model_base { |
1553 | 0 | llama_model_ernie4_5(const struct llama_model_params & params) : llama_model_base(params) {} |
1554 | | void load_arch_hparams(llama_model_loader & ml) override; |
1555 | | void load_arch_tensors(llama_model_loader & ml) override; |
1556 | | |
1557 | | struct graph : public llm_graph_context { |
1558 | | graph(const llama_model & model, const llm_graph_params & params); |
1559 | | }; |
1560 | | |
1561 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1562 | | }; |
1563 | | |
1564 | | |
1565 | | struct llama_model_ernie4_5_moe : public llama_model_ernie4_5 { |
1566 | 0 | llama_model_ernie4_5_moe(const struct llama_model_params & params) : llama_model_ernie4_5(params) {} |
1567 | | // reuse load_arch_hparams and load_arch_tensors from llama_model_ernie4_5 |
1568 | | |
1569 | | struct graph : public llm_graph_context { |
1570 | | graph(const llama_model & model, const llm_graph_params & params); |
1571 | | }; |
1572 | | |
1573 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1574 | | }; |
1575 | | |
1576 | | |
1577 | | struct llama_model_paddleocr : public llama_model_ernie4_5 { |
1578 | 0 | llama_model_paddleocr(const struct llama_model_params & params) : llama_model_ernie4_5(params) {} |
1579 | | // reuse load_arch_hparams and load_arch_tensors from llama_model_ernie4_5 |
1580 | | |
1581 | | struct graph : public llm_graph_context { |
1582 | | graph(const llama_model & model, const llm_graph_params & params); |
1583 | | }; |
1584 | | |
1585 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1586 | | }; |
1587 | | |
1588 | | |
1589 | | struct llama_model_hunyuan_moe : public llama_model_base { |
1590 | 0 | llama_model_hunyuan_moe(const struct llama_model_params & params) : llama_model_base(params) {} |
1591 | | void load_arch_hparams(llama_model_loader & ml) override; |
1592 | | void load_arch_tensors(llama_model_loader & ml) override; |
1593 | | |
1594 | | struct graph : public llm_graph_context { |
1595 | | graph(const llama_model & model, const llm_graph_params & params); |
1596 | | }; |
1597 | | |
1598 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1599 | | }; |
1600 | | |
1601 | | |
1602 | | struct llama_model_hunyuan_vl : public llama_model_base { |
1603 | 0 | llama_model_hunyuan_vl(const struct llama_model_params & params) : llama_model_base(params) {} |
1604 | | void load_arch_hparams(llama_model_loader & ml) override; |
1605 | | void load_arch_tensors(llama_model_loader & ml) override; |
1606 | | |
1607 | | struct graph : public llm_graph_context { |
1608 | | graph(const llama_model & model, const llm_graph_params & params); |
1609 | | }; |
1610 | | |
1611 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1612 | | }; |
1613 | | |
1614 | | |
1615 | | struct llama_model_hunyuan_dense : public llama_model_hunyuan_vl { |
1616 | 0 | llama_model_hunyuan_dense(const struct llama_model_params & params) : llama_model_hunyuan_vl(params) {} |
1617 | | // reuse load_arch_hparams and load_arch_tensors from llama_model_hunyuan_vl |
1618 | | |
1619 | | using graph = llama_model_hunyuan_vl::graph; |
1620 | | |
1621 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1622 | | }; |
1623 | | |
1624 | | |
1625 | | struct llama_model_smollm3 : public llama_model_base { |
1626 | 0 | llama_model_smollm3(const struct llama_model_params & params) : llama_model_base(params) {} |
1627 | | void load_arch_hparams(llama_model_loader & ml) override; |
1628 | | void load_arch_tensors(llama_model_loader & ml) override; |
1629 | | |
1630 | | struct graph : public llm_graph_context { |
1631 | | graph(const llama_model & model, const llm_graph_params & params); |
1632 | | }; |
1633 | | |
1634 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1635 | | }; |
1636 | | |
1637 | | |
1638 | | struct llama_model_openai_moe : public llama_model_base { |
1639 | 0 | llama_model_openai_moe(const struct llama_model_params & params) : llama_model_base(params) {} |
1640 | | void load_arch_hparams(llama_model_loader & ml) override; |
1641 | | void load_arch_tensors(llama_model_loader & ml) override; |
1642 | | |
1643 | | struct graph : public llm_graph_context { |
1644 | | graph(const llama_model & model, const llm_graph_params & params); |
1645 | | }; |
1646 | | |
1647 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1648 | | }; |
1649 | | |
1650 | | |
1651 | | struct llama_model_falcon_h1 : public llama_model_base { |
1652 | 0 | llama_model_falcon_h1(const struct llama_model_params & params) : llama_model_base(params) {} |
1653 | | void load_arch_hparams(llama_model_loader & ml) override; |
1654 | | void load_arch_tensors(llama_model_loader & ml) override; |
1655 | | |
1656 | | struct graph : public llm_build_mamba_base { |
1657 | | graph(const llama_model & model, const llm_graph_params & params); |
1658 | | }; |
1659 | | |
1660 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1661 | | }; |
1662 | | |
1663 | | |
1664 | | struct llama_model_lfm2 : public llama_model_base { |
1665 | 0 | llama_model_lfm2(const struct llama_model_params & params) : llama_model_base(params) {} |
1666 | | void load_arch_hparams(llama_model_loader & ml) override; |
1667 | | void load_arch_tensors(llama_model_loader & ml) override; |
1668 | | |
1669 | | template <bool iswa> |
1670 | | struct graph : public llm_graph_context { |
1671 | | graph(const llama_model & model, const llm_graph_params & params); |
1672 | | }; |
1673 | | |
1674 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1675 | | }; |
1676 | | |
1677 | | |
1678 | | struct llama_model_lfm2moe : public llama_model_base { |
1679 | 0 | llama_model_lfm2moe(const struct llama_model_params & params) : llama_model_base(params) {} |
1680 | | void load_arch_hparams(llama_model_loader & ml) override; |
1681 | | void load_arch_tensors(llama_model_loader & ml) override; |
1682 | | |
1683 | | template <bool iswa> |
1684 | | using graph = llama_model_lfm2::graph<iswa>; |
1685 | | |
1686 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1687 | | }; |
1688 | | |
1689 | | |
1690 | | struct llama_model_smallthinker : public llama_model_base { |
1691 | 0 | llama_model_smallthinker(const struct llama_model_params & params) : llama_model_base(params) {} |
1692 | | void load_arch_hparams(llama_model_loader & ml) override; |
1693 | | void load_arch_tensors(llama_model_loader & ml) override; |
1694 | | |
1695 | | template <bool iswa> |
1696 | | struct graph : public llm_graph_context { |
1697 | | graph(const llama_model & model, const llm_graph_params & params); |
1698 | | }; |
1699 | | |
1700 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1701 | | }; |
1702 | | |
1703 | | |
1704 | | struct llama_model_grovemoe : public llama_model_base { |
1705 | 0 | llama_model_grovemoe(const struct llama_model_params & params) : llama_model_base(params) {} |
1706 | | void load_arch_hparams(llama_model_loader & ml) override; |
1707 | | void load_arch_tensors(llama_model_loader & ml) override; |
1708 | | |
1709 | | struct graph : public llm_graph_context { |
1710 | | graph(const llama_model & model, const llm_graph_params & params); |
1711 | | }; |
1712 | | |
1713 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1714 | | }; |
1715 | | |
1716 | | |
1717 | | struct llama_model_apertus : public llama_model_base { |
1718 | 0 | llama_model_apertus(const struct llama_model_params & params) : llama_model_base(params) {} |
1719 | | void load_arch_hparams(llama_model_loader & ml) override; |
1720 | | void load_arch_tensors(llama_model_loader & ml) override; |
1721 | | |
1722 | | struct graph : public llm_graph_context { |
1723 | | graph(const llama_model & model, const llm_graph_params & params); |
1724 | | }; |
1725 | | |
1726 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1727 | | }; |
1728 | | |
1729 | | |
1730 | | struct llama_model_minimax_m2 : public llama_model_base { |
1731 | 0 | llama_model_minimax_m2(const struct llama_model_params & params) : llama_model_base(params) {} |
1732 | | void load_arch_hparams(llama_model_loader & ml) override; |
1733 | | void load_arch_tensors(llama_model_loader & ml) override; |
1734 | | |
1735 | | struct graph : public llm_graph_context { |
1736 | | graph(const llama_model & model, const llm_graph_params & params); |
1737 | | }; |
1738 | | |
1739 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1740 | | }; |
1741 | | |
1742 | | |
1743 | | struct llama_model_cogvlm : public llama_model_base { |
1744 | 0 | llama_model_cogvlm(const struct llama_model_params & params) : llama_model_base(params) {} |
1745 | | void load_arch_hparams(llama_model_loader & ml) override; |
1746 | | void load_arch_tensors(llama_model_loader & ml) override; |
1747 | | |
1748 | | struct graph : public llm_graph_context { |
1749 | | graph(const llama_model & model, const llm_graph_params & params); |
1750 | | }; |
1751 | | |
1752 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1753 | | }; |
1754 | | |
1755 | | |
1756 | | struct llama_model_pangu_embed : public llama_model_base { |
1757 | 0 | llama_model_pangu_embed(const struct llama_model_params & params) : llama_model_base(params) {} |
1758 | | void load_arch_hparams(llama_model_loader & ml) override; |
1759 | | void load_arch_tensors(llama_model_loader & ml) override; |
1760 | | |
1761 | | struct graph : public llm_graph_context { |
1762 | | graph(const llama_model & model, const llm_graph_params & params); |
1763 | | }; |
1764 | | |
1765 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1766 | | }; |
1767 | | |
1768 | | |
1769 | | struct llama_model_qwen3next : public llama_model_base { |
1770 | 0 | llama_model_qwen3next(const struct llama_model_params & params) : llama_model_base(params) {} |
1771 | | void load_arch_hparams(llama_model_loader & ml) override; |
1772 | | void load_arch_tensors(llama_model_loader & ml) override; |
1773 | | |
1774 | | struct graph : public llm_build_delta_net_base { |
1775 | | graph(const llama_model & model, const llm_graph_params & params); |
1776 | | private: |
1777 | | ggml_tensor * build_layer_attn( |
1778 | | llm_graph_input_attn_kv * inp_attn, |
1779 | | ggml_tensor * cur, |
1780 | | ggml_tensor * inp_pos, |
1781 | | int il); |
1782 | | |
1783 | | ggml_tensor * build_layer_attn_linear( |
1784 | | llm_graph_input_rs * inp, |
1785 | | ggml_tensor * cur, |
1786 | | int il); |
1787 | | |
1788 | | ggml_tensor * build_layer_ffn( |
1789 | | ggml_tensor * cur, |
1790 | | int il); |
1791 | | |
1792 | | ggml_tensor * build_norm_gated( |
1793 | | ggml_tensor * input, |
1794 | | ggml_tensor * weights, |
1795 | | ggml_tensor * gate, |
1796 | | int layer); |
1797 | | |
1798 | | // returns pair of qkv, z |
1799 | | std::pair<ggml_tensor *, ggml_tensor *> build_qkvz( |
1800 | | ggml_tensor * input, |
1801 | | int il); |
1802 | | |
1803 | | const llama_model & model; |
1804 | | }; |
1805 | | |
1806 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1807 | | }; |
1808 | | |
1809 | | |
1810 | | struct llama_model_qwen35 : public llama_model_base { |
1811 | 0 | llama_model_qwen35(const struct llama_model_params & params) : llama_model_base(params) {} |
1812 | | void load_arch_hparams(llama_model_loader & ml) override; |
1813 | | void load_arch_tensors(llama_model_loader & ml) override; |
1814 | | |
1815 | | struct graph : public llm_build_delta_net_base { |
1816 | | graph(const llama_model & model, const llm_graph_params & params); |
1817 | | private: |
1818 | | ggml_tensor * build_layer_attn( |
1819 | | llm_graph_input_attn_kv * inp_attn, |
1820 | | ggml_tensor * cur, |
1821 | | ggml_tensor * inp_pos, |
1822 | | int * sections, |
1823 | | int il); |
1824 | | |
1825 | | ggml_tensor * build_layer_attn_linear( |
1826 | | llm_graph_input_rs * inp, |
1827 | | ggml_tensor * cur, |
1828 | | int il); |
1829 | | |
1830 | | ggml_tensor * build_layer_ffn( |
1831 | | ggml_tensor * cur, |
1832 | | int il); |
1833 | | |
1834 | | ggml_tensor * build_norm_gated( |
1835 | | ggml_tensor * input, |
1836 | | ggml_tensor * weights, |
1837 | | ggml_tensor * gate, |
1838 | | int layer); |
1839 | | |
1840 | | // returns pair of qkv, z |
1841 | | std::pair<ggml_tensor *, ggml_tensor *> build_qkvz( |
1842 | | ggml_tensor * input, |
1843 | | int il); |
1844 | | |
1845 | | const llama_model & model; |
1846 | | }; |
1847 | | |
1848 | | struct graph_mtp : public llm_graph_context { |
1849 | | graph_mtp(const llama_model & model, const llm_graph_params & params); |
1850 | | }; |
1851 | | |
1852 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1853 | | }; |
1854 | | |
1855 | | |
1856 | | struct llama_model_qwen35moe : public llama_model_base { |
1857 | 0 | llama_model_qwen35moe(const struct llama_model_params & params) : llama_model_base(params) {} |
1858 | | void load_arch_hparams(llama_model_loader & ml) override; |
1859 | | void load_arch_tensors(llama_model_loader & ml) override; |
1860 | | |
1861 | | struct graph : public llm_build_delta_net_base { |
1862 | | graph(const llama_model & model, const llm_graph_params & params); |
1863 | | private: |
1864 | | ggml_tensor * build_layer_attn( |
1865 | | llm_graph_input_attn_kv * inp_attn, |
1866 | | ggml_tensor * cur, |
1867 | | ggml_tensor * inp_pos, |
1868 | | int * sections, |
1869 | | int il); |
1870 | | |
1871 | | ggml_tensor * build_layer_attn_linear( |
1872 | | llm_graph_input_rs * inp, |
1873 | | ggml_tensor * cur, |
1874 | | int il); |
1875 | | |
1876 | | ggml_tensor * build_layer_ffn( |
1877 | | ggml_tensor * cur, |
1878 | | int il); |
1879 | | |
1880 | | ggml_tensor * build_norm_gated( |
1881 | | ggml_tensor * input, |
1882 | | ggml_tensor * weights, |
1883 | | ggml_tensor * gate, |
1884 | | int layer); |
1885 | | |
1886 | | // returns pair of qkv, z |
1887 | | std::pair<ggml_tensor *, ggml_tensor *> build_qkvz( |
1888 | | ggml_tensor * input, |
1889 | | int il); |
1890 | | |
1891 | | const llama_model & model; |
1892 | | }; |
1893 | | |
1894 | | struct graph_mtp : public llm_graph_context { |
1895 | | graph_mtp(const llama_model & model, const llm_graph_params & params); |
1896 | | }; |
1897 | | |
1898 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1899 | | }; |
1900 | | |
1901 | | |
1902 | | struct llama_model_mistral3 : public llama_model_base { |
1903 | 0 | llama_model_mistral3(const struct llama_model_params & params) : llama_model_base(params) {} |
1904 | | void load_arch_hparams(llama_model_loader & ml) override; |
1905 | | void load_arch_tensors(llama_model_loader & ml) override; |
1906 | | |
1907 | | struct graph : public llm_graph_context { |
1908 | | graph(const llama_model & model, const llm_graph_params & params); |
1909 | | }; |
1910 | | |
1911 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1912 | | }; |
1913 | | |
1914 | | |
1915 | | struct llama_model_mimo2 : public llama_model_base { |
1916 | 0 | llama_model_mimo2(const struct llama_model_params & params) : llama_model_base(params) {} |
1917 | | void load_arch_hparams(llama_model_loader & ml) override; |
1918 | | void load_arch_tensors(llama_model_loader & ml) override; |
1919 | | |
1920 | | struct graph : public llm_graph_context { |
1921 | | graph(const llama_model & model, const llm_graph_params & params); |
1922 | | }; |
1923 | | |
1924 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1925 | | }; |
1926 | | |
1927 | | |
1928 | | struct llama_model_kimi_linear : public llama_model_base { |
1929 | 0 | llama_model_kimi_linear(const struct llama_model_params & params) : llama_model_base(params) {} |
1930 | | void load_arch_hparams(llama_model_loader & ml) override; |
1931 | | void load_arch_tensors(llama_model_loader & ml) override; |
1932 | | |
1933 | | struct graph : public llm_build_delta_net_base { |
1934 | | graph(const llama_model & model, const llm_graph_params & params); |
1935 | | |
1936 | | std::pair<ggml_tensor *, ggml_tensor *> build_kda_autoregressive( |
1937 | | ggml_tensor * q, |
1938 | | ggml_tensor * k, |
1939 | | ggml_tensor * v, |
1940 | | ggml_tensor * gk, |
1941 | | ggml_tensor * beta, |
1942 | | ggml_tensor * state, |
1943 | | int il); |
1944 | | |
1945 | | std::pair<ggml_tensor *, ggml_tensor *> build_kda_chunking( |
1946 | | ggml_tensor * q, |
1947 | | ggml_tensor * k, |
1948 | | ggml_tensor * v, |
1949 | | ggml_tensor * gk, |
1950 | | ggml_tensor * beta, |
1951 | | ggml_tensor * state, |
1952 | | ggml_tensor * causal_mask, |
1953 | | ggml_tensor * identity, |
1954 | | ggml_tensor * diag_mask, |
1955 | | int il); |
1956 | | |
1957 | | const llama_model & model; |
1958 | | }; |
1959 | | |
1960 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1961 | | }; |
1962 | | |
1963 | | |
1964 | | struct llama_model_step35 : public llama_model_base { |
1965 | 0 | llama_model_step35(const struct llama_model_params & params) : llama_model_base(params) {} |
1966 | | void load_arch_hparams(llama_model_loader & ml) override; |
1967 | | void load_arch_tensors(llama_model_loader & ml) override; |
1968 | | |
1969 | | struct graph : public llm_graph_context { |
1970 | | graph(const llama_model & model, const llm_graph_params & params); |
1971 | | }; |
1972 | | |
1973 | | struct graph_mtp : public llm_graph_context { |
1974 | | graph_mtp(const llama_model & model, const llm_graph_params & params); |
1975 | | }; |
1976 | | |
1977 | | std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; |
1978 | | }; |