/src/llama.cpp/src/llama-model.cpp
Line | Count | Source |
1 | | #include "llama-model.h" |
2 | | |
3 | | #include "llama-arch.h" |
4 | | #include "llama-ext.h" |
5 | | #include "llama-hparams.h" |
6 | | #include "llama-impl.h" |
7 | | #include "llama-mmap.h" |
8 | | #include "llama-cparams.h" |
9 | | #include "llama-model-loader.h" |
10 | | |
11 | | #include "llama-kv-cache.h" |
12 | | #include "llama-kv-cache-iswa.h" |
13 | | #include "llama-kv-cache-dsa.h" |
14 | | #include "llama-memory-hybrid.h" |
15 | | #include "llama-memory-hybrid-iswa.h" |
16 | | #include "llama-memory-recurrent.h" |
17 | | |
18 | | #include "models/models.h" |
19 | | |
20 | | #include "ggml.h" |
21 | | #include "ggml-cpp.h" |
22 | | |
23 | | #include <algorithm> |
24 | | #include <cassert> |
25 | | #include <cfloat> |
26 | | #include <cstdint> |
27 | | #include <cstring> |
28 | | #include <cmath> |
29 | | #include <functional> |
30 | | #include <map> |
31 | | #include <numeric> |
32 | | #include <regex> |
33 | | #include <sstream> |
34 | | #include <stdexcept> |
35 | | #include <string> |
36 | | #include <vector> |
37 | | |
38 | 0 | static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params & params) { |
39 | 0 | switch (arch) { |
40 | 0 | case LLM_ARCH_LLAMA: |
41 | 0 | return new llama_model_llama(params); |
42 | 0 | case LLM_ARCH_LLAMA4: |
43 | 0 | return new llama_model_llama4(params); |
44 | 0 | case LLM_ARCH_LLAMA_EMBED: |
45 | 0 | return new llama_model_llama_embed(params); |
46 | 0 | case LLM_ARCH_MAINCODER: |
47 | 0 | return new llama_model_maincoder(params); |
48 | 0 | case LLM_ARCH_TALKIE: |
49 | 0 | return new llama_model_talkie(params); |
50 | 0 | case LLM_ARCH_DECI: |
51 | 0 | return new llama_model_deci(params); |
52 | 0 | case LLM_ARCH_BAICHUAN: |
53 | 0 | return new llama_model_baichuan(params); |
54 | 0 | case LLM_ARCH_FALCON: |
55 | 0 | return new llama_model_falcon(params); |
56 | 0 | case LLM_ARCH_GROK: |
57 | 0 | return new llama_model_grok(params); |
58 | 0 | case LLM_ARCH_STARCODER: |
59 | 0 | return new llama_model_starcoder(params); |
60 | 0 | case LLM_ARCH_REFACT: |
61 | 0 | return new llama_model_refact(params); |
62 | 0 | case LLM_ARCH_BERT: |
63 | 0 | return new llama_model_bert(params); |
64 | 0 | case LLM_ARCH_JINA_BERT_V2: |
65 | 0 | return new llama_model_jina_bert_v2(params); |
66 | 0 | case LLM_ARCH_JINA_BERT_V3: |
67 | 0 | return new llama_model_jina_bert_v3(params); |
68 | 0 | case LLM_ARCH_NOMIC_BERT: |
69 | 0 | return new llama_model_nomic_bert(params); |
70 | 0 | case LLM_ARCH_NOMIC_BERT_MOE: |
71 | 0 | return new llama_model_nomic_bert_moe(params); |
72 | 0 | case LLM_ARCH_MODERN_BERT: |
73 | 0 | return new llama_model_modern_bert(params); |
74 | 0 | case LLM_ARCH_NEO_BERT: |
75 | 0 | return new llama_model_neo_bert(params); |
76 | 0 | case LLM_ARCH_EUROBERT: |
77 | 0 | return new llama_model_eurobert(params); |
78 | 0 | case LLM_ARCH_BLOOM: |
79 | 0 | return new llama_model_bloom(params); |
80 | 0 | case LLM_ARCH_MPT: |
81 | 0 | return new llama_model_mpt(params); |
82 | 0 | case LLM_ARCH_STABLELM: |
83 | 0 | return new llama_model_stablelm(params); |
84 | 0 | case LLM_ARCH_MELLUM: |
85 | 0 | return new llama_model_mellum(params); |
86 | 0 | case LLM_ARCH_QWEN: |
87 | 0 | return new llama_model_qwen(params); |
88 | 0 | case LLM_ARCH_QWEN2: |
89 | 0 | return new llama_model_qwen2(params); |
90 | 0 | case LLM_ARCH_DREAM: |
91 | 0 | return new llama_model_dream(params); |
92 | 0 | case LLM_ARCH_LLADA: |
93 | 0 | return new llama_model_llada(params); |
94 | 0 | case LLM_ARCH_LLADA_MOE: |
95 | 0 | return new llama_model_llada_moe(params); |
96 | 0 | case LLM_ARCH_RND1: |
97 | 0 | return new llama_model_rnd1(params); |
98 | 0 | case LLM_ARCH_QWEN2VL: |
99 | 0 | return new llama_model_qwen2vl(params); |
100 | 0 | case LLM_ARCH_QWEN2MOE: |
101 | 0 | return new llama_model_qwen2moe(params); |
102 | 0 | case LLM_ARCH_QWEN3: |
103 | 0 | return new llama_model_qwen3(params); |
104 | 0 | case LLM_ARCH_QWEN3MOE: |
105 | 0 | return new llama_model_qwen3moe(params); |
106 | 0 | case LLM_ARCH_QWEN3VL: |
107 | 0 | return new llama_model_qwen3vl(params); |
108 | 0 | case LLM_ARCH_QWEN3VLMOE: |
109 | 0 | return new llama_model_qwen3vlmoe(params); |
110 | 0 | case LLM_ARCH_PHI2: |
111 | 0 | return new llama_model_phi2(params); |
112 | 0 | case LLM_ARCH_PHI3: |
113 | 0 | return new llama_model_phi3(params); |
114 | 0 | case LLM_ARCH_PHIMOE: |
115 | 0 | return new llama_model_phimoe(params); |
116 | 0 | case LLM_ARCH_PLAMO: |
117 | 0 | return new llama_model_plamo(params); |
118 | 0 | case LLM_ARCH_PLAMO2: |
119 | 0 | return new llama_model_plamo2(params); |
120 | 0 | case LLM_ARCH_PLAMO3: |
121 | 0 | return new llama_model_plamo3(params); |
122 | 0 | case LLM_ARCH_GPT2: |
123 | 0 | return new llama_model_gpt2(params); |
124 | 0 | case LLM_ARCH_CODESHELL: |
125 | 0 | return new llama_model_codeshell(params); |
126 | 0 | case LLM_ARCH_ORION: |
127 | 0 | return new llama_model_orion(params); |
128 | 0 | case LLM_ARCH_INTERNLM2: |
129 | 0 | return new llama_model_internlm2(params); |
130 | 0 | case LLM_ARCH_MINICPM3: |
131 | 0 | return new llama_model_minicpm3(params); |
132 | 0 | case LLM_ARCH_GEMMA: |
133 | 0 | return new llama_model_gemma(params); |
134 | 0 | case LLM_ARCH_GEMMA2: |
135 | 0 | return new llama_model_gemma2(params); |
136 | 0 | case LLM_ARCH_GEMMA3: |
137 | 0 | return new llama_model_gemma3(params); |
138 | 0 | case LLM_ARCH_GEMMA3N: |
139 | 0 | return new llama_model_gemma3n(params); |
140 | 0 | case LLM_ARCH_GEMMA4: |
141 | 0 | return new llama_model_gemma4(params); |
142 | 0 | case LLM_ARCH_GEMMA4_ASSISTANT: |
143 | 0 | return new llama_model_gemma4_assistant(params); |
144 | 0 | case LLM_ARCH_GEMMA_EMBEDDING: |
145 | 0 | return new llama_model_gemma_embedding(params); |
146 | 0 | case LLM_ARCH_STARCODER2: |
147 | 0 | return new llama_model_starcoder2(params); |
148 | 0 | case LLM_ARCH_MAMBA: |
149 | 0 | return new llama_model_mamba(params); |
150 | 0 | case LLM_ARCH_MAMBA2: |
151 | 0 | return new llama_model_mamba2(params); |
152 | 0 | case LLM_ARCH_JAMBA: |
153 | 0 | return new llama_model_jamba(params); |
154 | 0 | case LLM_ARCH_XVERSE: |
155 | 0 | return new llama_model_xverse(params); |
156 | 0 | case LLM_ARCH_COMMAND_R: |
157 | 0 | return new llama_model_command_r(params); |
158 | 0 | case LLM_ARCH_COHERE2: |
159 | 0 | return new llama_model_cohere2(params); |
160 | 0 | case LLM_ARCH_COHERE2MOE: |
161 | 0 | return new llama_model_cohere2moe(params); |
162 | 0 | case LLM_ARCH_DBRX: |
163 | 0 | return new llama_model_dbrx(params); |
164 | 0 | case LLM_ARCH_OLMO: |
165 | 0 | return new llama_model_olmo(params); |
166 | 0 | case LLM_ARCH_OLMO2: |
167 | 0 | return new llama_model_olmo2(params); |
168 | 0 | case LLM_ARCH_OLMOE: |
169 | 0 | return new llama_model_olmoe(params); |
170 | 0 | case LLM_ARCH_OPENELM: |
171 | 0 | return new llama_model_openelm(params); |
172 | 0 | case LLM_ARCH_GPTNEOX: |
173 | 0 | return new llama_model_gptneox(params); |
174 | 0 | case LLM_ARCH_ARCTIC: |
175 | 0 | return new llama_model_arctic(params); |
176 | 0 | case LLM_ARCH_DEEPSEEK: |
177 | 0 | return new llama_model_deepseek(params); |
178 | 0 | case LLM_ARCH_DEEPSEEK2: |
179 | 0 | return new llama_model_deepseek2(params); |
180 | 0 | case LLM_ARCH_DEEPSEEK2OCR: |
181 | 0 | return new llama_model_deepseek2ocr(params); |
182 | 0 | case LLM_ARCH_DEEPSEEK32: |
183 | 0 | return new llama_model_deepseek32(params); |
184 | 0 | case LLM_ARCH_GLM_DSA: |
185 | 0 | return new llama_model_glm_dsa(params); |
186 | 0 | case LLM_ARCH_MISTRAL4: |
187 | 0 | return new llama_model_mistral4(params); |
188 | 0 | case LLM_ARCH_CHATGLM: |
189 | 0 | return new llama_model_chatglm(params); |
190 | 0 | case LLM_ARCH_GLM4: |
191 | 0 | return new llama_model_glm4(params); |
192 | 0 | case LLM_ARCH_GLM4_MOE: |
193 | 0 | return new llama_model_glm4_moe(params); |
194 | 0 | case LLM_ARCH_BITNET: |
195 | 0 | return new llama_model_bitnet(params); |
196 | 0 | case LLM_ARCH_T5: |
197 | 0 | return new llama_model_t5(params); |
198 | 0 | case LLM_ARCH_T5ENCODER: |
199 | 0 | return new llama_model_t5encoder(params); |
200 | 0 | case LLM_ARCH_JAIS: |
201 | 0 | return new llama_model_jais(params); |
202 | 0 | case LLM_ARCH_JAIS2: |
203 | 0 | return new llama_model_jais2(params); |
204 | 0 | case LLM_ARCH_NEMOTRON: |
205 | 0 | return new llama_model_nemotron(params); |
206 | 0 | case LLM_ARCH_NEMOTRON_H: |
207 | 0 | return new llama_model_nemotron_h(params); |
208 | 0 | case LLM_ARCH_NEMOTRON_H_MOE: |
209 | 0 | return new llama_model_nemotron_h_moe(params); |
210 | 0 | case LLM_ARCH_EXAONE: |
211 | 0 | return new llama_model_exaone(params); |
212 | 0 | case LLM_ARCH_EXAONE4: |
213 | 0 | return new llama_model_exaone4(params); |
214 | 0 | case LLM_ARCH_EXAONE_MOE: |
215 | 0 | return new llama_model_exaone_moe(params); |
216 | 0 | case LLM_ARCH_RWKV6: |
217 | 0 | return new llama_model_rwkv6(params); |
218 | 0 | case LLM_ARCH_RWKV6QWEN2: |
219 | 0 | return new llama_model_rwkv6qwen2(params); |
220 | 0 | case LLM_ARCH_RWKV7: |
221 | 0 | return new llama_model_rwkv7(params); |
222 | 0 | case LLM_ARCH_ARWKV7: |
223 | 0 | return new llama_model_arwkv7(params); |
224 | 0 | case LLM_ARCH_GRANITE: |
225 | 0 | return new llama_model_granite(params); |
226 | 0 | case LLM_ARCH_GRANITE_MOE: |
227 | 0 | return new llama_model_granite_moe(params); |
228 | 0 | case LLM_ARCH_MINICPM: |
229 | 0 | return new llama_model_minicpm(params); |
230 | 0 | case LLM_ARCH_GRANITE_HYBRID: |
231 | 0 | return new llama_model_granite_hybrid(params); |
232 | 0 | case LLM_ARCH_CHAMELEON: |
233 | 0 | return new llama_model_chameleon(params); |
234 | 0 | case LLM_ARCH_WAVTOKENIZER_DEC: |
235 | 0 | return new llama_model_wavtokenizer_dec(params); |
236 | 0 | case LLM_ARCH_PLM: |
237 | 0 | return new llama_model_plm(params); |
238 | 0 | case LLM_ARCH_BAILINGMOE: |
239 | 0 | return new llama_model_bailingmoe(params); |
240 | 0 | case LLM_ARCH_BAILINGMOE2: |
241 | 0 | return new llama_model_bailingmoe2(params); |
242 | 0 | case LLM_ARCH_SEED_OSS: |
243 | 0 | return new llama_model_seed_oss(params); |
244 | 0 | case LLM_ARCH_DOTS1: |
245 | 0 | return new llama_model_dots1(params); |
246 | 0 | case LLM_ARCH_ARCEE: |
247 | 0 | return new llama_model_arcee(params); |
248 | 0 | case LLM_ARCH_AFMOE: |
249 | 0 | return new llama_model_afmoe(params); |
250 | 0 | case LLM_ARCH_ERNIE4_5: |
251 | 0 | return new llama_model_ernie4_5(params); |
252 | 0 | case LLM_ARCH_ERNIE4_5_MOE: |
253 | 0 | return new llama_model_ernie4_5_moe(params); |
254 | 0 | case LLM_ARCH_PADDLEOCR: |
255 | 0 | return new llama_model_paddleocr(params); |
256 | 0 | case LLM_ARCH_HUNYUAN_MOE: |
257 | 0 | return new llama_model_hunyuan_moe(params); |
258 | 0 | case LLM_ARCH_HUNYUAN_VL: |
259 | 0 | return new llama_model_hunyuan_vl(params); |
260 | 0 | case LLM_ARCH_HUNYUAN_DENSE: |
261 | 0 | return new llama_model_hunyuan_dense(params); |
262 | 0 | case LLM_ARCH_SMOLLM3: |
263 | 0 | return new llama_model_smollm3(params); |
264 | 0 | case LLM_ARCH_OPENAI_MOE: |
265 | 0 | return new llama_model_openai_moe(params); |
266 | 0 | case LLM_ARCH_FALCON_H1: |
267 | 0 | return new llama_model_falcon_h1(params); |
268 | 0 | case LLM_ARCH_LFM2: |
269 | 0 | return new llama_model_lfm2(params); |
270 | 0 | case LLM_ARCH_LFM2MOE: |
271 | 0 | return new llama_model_lfm2moe(params); |
272 | 0 | case LLM_ARCH_SMALLTHINKER: |
273 | 0 | return new llama_model_smallthinker(params); |
274 | 0 | case LLM_ARCH_GROVEMOE: |
275 | 0 | return new llama_model_grovemoe(params); |
276 | 0 | case LLM_ARCH_APERTUS: |
277 | 0 | return new llama_model_apertus(params); |
278 | 0 | case LLM_ARCH_MINIMAX_M2: |
279 | 0 | return new llama_model_minimax_m2(params); |
280 | 0 | case LLM_ARCH_COGVLM: |
281 | 0 | return new llama_model_cogvlm(params); |
282 | 0 | case LLM_ARCH_PANGU_EMBED: |
283 | 0 | return new llama_model_pangu_embed(params); |
284 | 0 | case LLM_ARCH_QWEN3NEXT: |
285 | 0 | return new llama_model_qwen3next(params); |
286 | 0 | case LLM_ARCH_QWEN35: |
287 | 0 | return new llama_model_qwen35(params); |
288 | 0 | case LLM_ARCH_QWEN35MOE: |
289 | 0 | return new llama_model_qwen35moe(params); |
290 | 0 | case LLM_ARCH_MISTRAL3: |
291 | 0 | return new llama_model_mistral3(params); |
292 | 0 | case LLM_ARCH_EAGLE3: |
293 | 0 | return new llama_model_eagle3(params); |
294 | 0 | case LLM_ARCH_MIMO2: |
295 | 0 | return new llama_model_mimo2(params); |
296 | 0 | case LLM_ARCH_KIMI_LINEAR: |
297 | 0 | return new llama_model_kimi_linear(params); |
298 | 0 | case LLM_ARCH_STEP35: |
299 | 0 | return new llama_model_step35(params); |
300 | 0 | default: |
301 | 0 | throw std::runtime_error(std::string("unsupported model architecture: '") + llm_arch_name(arch) + "'"); |
302 | 0 | } |
303 | |
|
304 | 0 | } |
305 | | |
306 | 0 | llama_model * llama_model_create(llm_arch arch, const llama_model_params & params) { |
307 | 0 | llama_model * model = llama_model_mapping(arch, params); |
308 | |
|
309 | 0 | if (model != nullptr) { |
310 | 0 | model->arch = arch; |
311 | 0 | auto & devices = model->devices; |
312 | 0 | if (!devices.empty() && devices[0].is_meta && !llm_arch_supports_sm_tensor(arch)) { |
313 | 0 | throw std::runtime_error(std::string("LLAMA_SPLIT_MODE_TENSOR not implemented for architecture '") + llm_arch_name(arch) + "'"); |
314 | 0 | } |
315 | 0 | } |
316 | | |
317 | 0 | return model; |
318 | 0 | } |
319 | | |
320 | 0 | llama_model * llama_model_create(llama_model_loader & ml, const llama_model_params & params) { |
321 | 0 | llm_arch arch = ml.get_arch(); |
322 | 0 | if (arch == LLM_ARCH_UNKNOWN) { |
323 | 0 | throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'"); |
324 | 0 | } |
325 | | |
326 | 0 | return llama_model_create(arch, params); |
327 | 0 | } |
328 | | |
329 | 0 | struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const struct ggml_tensor * tensor, void * userdata) { |
330 | 0 | const llama_meta_device_get_split_state_userdata * ud = (const llama_meta_device_get_split_state_userdata *) userdata; |
331 | 0 | const llama_hparams & hparams = ud->model->hparams; |
332 | 0 | const std::string tensor_name = tensor->name; |
333 | |
|
334 | 0 | const std::regex pattern_q_weight ("blk\\.\\d*\\.attn_q.weight"); |
335 | 0 | const std::regex pattern_kv_weight ("blk\\.\\d*\\.attn_(k|v).weight"); |
336 | 0 | const std::regex pattern_qkv_weight ("blk\\.\\d*\\.attn_qkv.weight"); |
337 | 0 | const std::regex pattern_q_bias ("blk\\.\\d*\\.attn_q\\.bias"); |
338 | 0 | const std::regex pattern_kv_bias ("blk\\.\\d*\\.attn_(k|v)\\.bias"); |
339 | 0 | const std::regex pattern_qkv_bias ("blk\\.\\d*\\.attn_qkv.bias"); |
340 | 0 | const std::regex pattern_qk_norm ("blk\\.\\d*\\.attn_(q|k)_norm\\.weight"); |
341 | 0 | const std::regex pattern_kv_cache ("cache_(k|v)_l\\d*"); |
342 | 0 | const std::regex pattern_attn_sinks ("blk\\.\\d*\\.attn_sinks.weight"); |
343 | 0 | const std::regex pattern_attn_out_weight ("blk\\.\\d*\\.attn_output.weight"); |
344 | 0 | const std::regex pattern_attn_out_bias ("blk\\.\\d*\\.attn_output.bias"); |
345 | 0 | const std::regex pattern_attn_gate_weight("blk\\.\\d*\\.attn_gate.weight"); |
346 | |
|
347 | 0 | const std::regex pattern_ssm_dt ("blk\\.\\d*\\.ssm_dt.bias"); |
348 | 0 | const std::regex pattern_ssm_a ("blk\\.\\d*\\.ssm_a"); |
349 | 0 | const std::regex pattern_ssm_alpha ("blk\\.\\d*\\.ssm_alpha.weight"); |
350 | 0 | const std::regex pattern_ssm_beta ("blk\\.\\d*\\.ssm_beta.weight"); |
351 | 0 | const std::regex pattern_ssm_beta_alpha ("blk\\.\\d*\\.ssm_ba.weight"); |
352 | 0 | const std::regex pattern_r_cache ("cache_r_l\\d*"); |
353 | 0 | const std::regex pattern_s_cache ("cache_s_l\\d*"); |
354 | 0 | const std::regex pattern_ssm_conv1d ("blk\\.\\d*\\.ssm_conv1d.weight"); |
355 | 0 | const std::regex pattern_ssm_out_weight ("blk\\.\\d*\\.ssm_out.weight"); |
356 | |
|
357 | 0 | const std::regex pattern_ffn_up_gate_weight("blk\\.\\d*\\.ffn_(up|gate)(_exps)?.weight"); |
358 | 0 | const std::regex pattern_ffn_up_gate_bias ("blk\\.\\d*\\.ffn_(up|gate)(_exps)?.bias"); |
359 | 0 | const std::regex pattern_ffn_gate_up_weight("blk\\.\\d*\\.ffn_gate_up(_exps)?.weight"); |
360 | 0 | const std::regex pattern_ffn_down_weight ("blk\\.\\d*\\.ffn_down(_exps)?.weight"); |
361 | 0 | const std::regex pattern_ffn_down_bias ("blk\\.\\d*\\.ffn_down.bias"); |
362 | 0 | const std::regex pattern_ffn_down_exps_bias("blk\\.\\d*\\.ffn_down_exps.bias"); |
363 | |
|
364 | 0 | const std::regex pattern_output_weight("output\\.weight"); |
365 | 0 | const std::regex pattern_output_bias ("output\\.bias"); |
366 | |
|
367 | 0 | struct tensor_config { |
368 | 0 | ggml_backend_meta_split_axis axis; |
369 | |
|
370 | 0 | const ggml_tensor * tensor_axis_0; |
371 | |
|
372 | 0 | uint32_t il; |
373 | 0 | size_t rotation; // when assigning tensor slices, rotate how the rounding is done for more even allocation |
374 | 0 | }; |
375 | |
|
376 | 0 | auto get_tensor_config_impl = [&]( |
377 | 0 | const ggml_backend_meta_split_axis axis, const std::string & suffix = "", const std::string & suffix_fallback = "") -> tensor_config { |
378 | | // the layers in a tensor can be inhomogeneous, if the pattern is cleanly divided by the number of GPUs there can be aliasing effects, |
379 | | // count only the same type of previous layers to avoid this |
380 | 0 | auto get_il_eff = [&](const size_t il){ |
381 | 0 | size_t ret = 0; |
382 | 0 | const bool il_is_recr = hparams.is_recr(il); |
383 | 0 | const bool il_is_swa = hparams.is_swa(il); |
384 | 0 | for (size_t il_prev = 0; il_prev < il; il_prev++) { |
385 | 0 | ret += hparams.is_recr(il_prev) == il_is_recr && hparams.is_swa(il_prev) == il_is_swa; |
386 | 0 | } |
387 | 0 | return ret; |
388 | 0 | }; |
389 | |
|
390 | 0 | uint32_t il; |
391 | 0 | std::string prefix; |
392 | 0 | size_t rotation; |
393 | 0 | if (tensor_name.substr(0, 4) == "blk.") { |
394 | 0 | const size_t length_prefix = tensor_name.find('.', 4); |
395 | 0 | GGML_ASSERT(length_prefix != std::string::npos); |
396 | 0 | prefix = tensor_name.substr(0, length_prefix + 1); |
397 | 0 | il = std::stoull(tensor_name.substr(4, length_prefix)); |
398 | 0 | rotation = get_il_eff(il) % ud->n_devices; |
399 | 0 | } else if (tensor_name.substr(0, 6) == "cache_") { |
400 | 0 | const size_t layer_index_start = tensor_name.find("_l", 6); |
401 | 0 | GGML_ASSERT(layer_index_start != std::string::npos); |
402 | 0 | il = std::stoull(tensor_name.substr(layer_index_start + 2)); |
403 | 0 | prefix = "blk." + std::to_string(il) + "."; |
404 | 0 | rotation = get_il_eff(il) % ud->n_devices; |
405 | 0 | } else { |
406 | 0 | il = 0; |
407 | 0 | rotation = hparams.n_layer() % ud->n_devices; |
408 | 0 | } |
409 | 0 | const ggml_tensor * tensor_axis_0 = suffix.empty() ? tensor : ud->model->get_tensor((prefix + suffix).c_str()); |
410 | 0 | if (tensor_axis_0 == nullptr) { |
411 | 0 | GGML_ASSERT(!suffix_fallback.empty()); |
412 | 0 | tensor_axis_0 = ud->model->get_tensor((prefix + suffix_fallback).c_str()); |
413 | 0 | } |
414 | 0 | GGML_ASSERT(tensor_axis_0 != nullptr); |
415 | 0 | return {axis, tensor_axis_0, il, rotation}; |
416 | 0 | }; |
417 | |
|
418 | 0 | auto get_tensor_config = [&]() -> tensor_config { |
419 | | // standard attention |
420 | 0 | if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_kv_weight)) { |
421 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight"); |
422 | 0 | } |
423 | 0 | if (std::regex_match(tensor_name, pattern_q_bias) || std::regex_match(tensor_name, pattern_kv_bias)) { |
424 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight"); |
425 | 0 | } |
426 | 0 | if (std::regex_match(tensor_name, pattern_qkv_weight)) { |
427 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight"); |
428 | 0 | } |
429 | 0 | if ( std::regex_match(tensor_name, pattern_qkv_bias)) { |
430 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight"); |
431 | 0 | } |
432 | 0 | if (std::regex_match(tensor_name, pattern_qk_norm)) { |
433 | 0 | return get_tensor_config_impl(tensor->ne[1] == 1 ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight"); |
434 | 0 | } |
435 | 0 | if (std::regex_match(tensor_name, pattern_kv_cache) || std::regex_match(tensor_name, pattern_attn_sinks)) { |
436 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight"); |
437 | 0 | } |
438 | 0 | if (std::regex_match(tensor_name, pattern_attn_out_weight)) { |
439 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0); |
440 | 0 | } |
441 | 0 | if (std::regex_match(tensor_name, pattern_attn_out_bias)) { |
442 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED); |
443 | 0 | } |
444 | | |
445 | 0 | if (std::regex_match(tensor_name, pattern_attn_gate_weight)) { |
446 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight"); |
447 | 0 | } |
448 | 0 | if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a)) { |
449 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight"); |
450 | 0 | } |
451 | 0 | if (std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta) || |
452 | 0 | std::regex_match(tensor_name, pattern_ssm_beta_alpha)) { |
453 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ssm_out.weight"); |
454 | 0 | } |
455 | 0 | if (std::regex_match(tensor_name, pattern_r_cache) || std::regex_match(tensor_name, pattern_s_cache)) { |
456 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight"); |
457 | 0 | } |
458 | 0 | if (std::regex_match(tensor_name, pattern_ssm_conv1d)) { |
459 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ssm_out.weight"); |
460 | 0 | } |
461 | 0 | if (std::regex_match(tensor_name, pattern_ssm_out_weight)) { |
462 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0); |
463 | 0 | } |
464 | | |
465 | | // FFN |
466 | 0 | if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight)) { |
467 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ffn_down.weight", "ffn_down_exps.weight"); |
468 | 0 | } |
469 | 0 | if (std::regex_match(tensor_name, pattern_ffn_up_gate_bias)) { |
470 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ffn_down.weight", "ffn_down_exps.weight"); |
471 | 0 | } |
472 | 0 | if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) { |
473 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ffn_down.weight", "ffn_down_exps.weight"); |
474 | 0 | } |
475 | 0 | if (std::regex_match(tensor_name, pattern_ffn_down_weight)) { |
476 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ffn_down.weight", "ffn_down_exps.weight"); |
477 | 0 | } |
478 | 0 | if (std::regex_match(tensor_name, pattern_ffn_down_bias)) { |
479 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED); |
480 | 0 | } |
481 | 0 | if (std::regex_match(tensor_name, pattern_ffn_down_exps_bias)) { |
482 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_PARTIAL); |
483 | 0 | } |
484 | | |
485 | | // output |
486 | 0 | if (std::regex_match(tensor_name, pattern_output_weight)) { |
487 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1); |
488 | 0 | } |
489 | 0 | if (std::regex_match(tensor_name, pattern_output_bias)) { |
490 | 0 | const ggml_tensor * output_weight = ud->model->get_tensor("output.weight"); |
491 | 0 | GGML_ASSERT(output_weight != nullptr); |
492 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0); |
493 | 0 | } |
494 | | |
495 | | // everything else |
496 | 0 | return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED); |
497 | 0 | }; |
498 | |
|
499 | 0 | auto get_split_segments = [&](int axis, uint32_t il) -> std::vector<std::pair<int64_t, uint32_t>> { |
500 | 0 | if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) { |
501 | 0 | const int64_t head_k_dim = hparams.ssm_d_state; |
502 | 0 | const int64_t head_v_dim = hparams.ssm_d_state; |
503 | 0 | const int64_t n_k_heads = hparams.ssm_n_group; |
504 | 0 | const int64_t n_v_heads = hparams.ssm_dt_rank; |
505 | 0 | const int64_t key_dim = head_k_dim * n_k_heads; |
506 | 0 | const int64_t value_dim = head_v_dim * n_v_heads; |
507 | | |
508 | | // both Qwen 3 Next and Qwen 3.5 support n_v_heads > n_k_heads but the broadcasting pattern is different: |
509 | | // - Qwen 3 Next: [k0_v0, k0_v1, k1_v2, k1_v3] (this is the default split pattern) |
510 | | // - Qwen 3.5: [k0_v0, k1_v1, k0_v2, k1_v3] (needs segmenting of V on the scale of K to get the correct pattern) |
511 | 0 | if (ud->model->arch == LLM_ARCH_QWEN3NEXT) { |
512 | 0 | if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) { |
513 | 0 | GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim); |
514 | 0 | return {{key_dim, 2}, {value_dim, 1}}; |
515 | 0 | } |
516 | 0 | } else { |
517 | 0 | const int64_t head_ratio = n_v_heads / n_k_heads; |
518 | 0 | if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) { |
519 | 0 | GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim); |
520 | 0 | return {{key_dim, 2 + head_ratio}}; |
521 | 0 | } |
522 | 0 | if (std::regex_match(tensor_name, pattern_attn_gate_weight) || std::regex_match(tensor_name, pattern_ssm_out_weight)) { |
523 | 0 | return {{key_dim, head_ratio}}; |
524 | 0 | } |
525 | 0 | if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) || |
526 | 0 | std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) { |
527 | 0 | return {{n_k_heads, head_ratio}}; |
528 | 0 | } |
529 | 0 | if (std::regex_match(tensor_name, pattern_r_cache)) { |
530 | 0 | return {{key_dim * (hparams.ssm_d_conv - 1), 2 + head_ratio}}; |
531 | 0 | } |
532 | 0 | if (std::regex_match(tensor_name, pattern_s_cache)) { |
533 | 0 | return {{n_k_heads * head_v_dim * head_v_dim, head_ratio}}; |
534 | 0 | } |
535 | 0 | } |
536 | | |
537 | | // the FFN is the same for Qwen 3 Next and Qwen 3.5: |
538 | 0 | if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) { |
539 | 0 | const int64_t n_ff_exp = hparams.n_ff_exp; |
540 | 0 | GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp); |
541 | 0 | return {{n_ff_exp, 2}}; |
542 | 0 | } |
543 | 0 | return {{tensor->ne[axis], 1}}; |
544 | 0 | } |
545 | | |
546 | 0 | if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) { |
547 | 0 | const int64_t n_embd = hparams.n_embd; |
548 | 0 | const int64_t n_embd_gqa = hparams.n_embd_v_gqa(il); |
549 | 0 | GGML_ASSERT(hparams.n_embd_k_gqa() == n_embd_gqa); |
550 | 0 | GGML_ASSERT(tensor->ne[axis] == n_embd + 2*n_embd_gqa); |
551 | 0 | return {{n_embd, 1}, {n_embd_gqa, 2}}; |
552 | 0 | } |
553 | 0 | if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) { |
554 | 0 | const int64_t n_ff_exp = hparams.n_ff_exp; |
555 | 0 | GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp); |
556 | 0 | return {{n_ff_exp, 2}}; |
557 | 0 | } |
558 | 0 | return {{tensor->ne[axis], 1}}; |
559 | 0 | }; |
560 | |
|
561 | 0 | auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector<std::pair<int64_t, uint32_t>> & segments) -> std::vector<int64_t> { |
562 | | // for better performance it may make sense to round up blck_size to a higher power of 2 so that more efficient kernels can be used |
563 | 0 | if (hparams.is_recr(il)) { |
564 | | // linear attention |
565 | 0 | const int64_t head_dim = hparams.ssm_d_state; |
566 | 0 | const int64_t blck_size_perf = std::lcm(blck_size, 128); |
567 | 0 | const int64_t granularity_qkv = std::lcm(blck_size_perf, head_dim); |
568 | 0 | if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_attn_gate_weight) || |
569 | 0 | std::regex_match(tensor_name, pattern_ssm_conv1d) || std::regex_match(tensor_name, pattern_ssm_out_weight)) { |
570 | 0 | return std::vector<int64_t>(segments.size(), granularity_qkv); |
571 | 0 | } |
572 | 0 | if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) || |
573 | 0 | std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) { |
574 | 0 | return std::vector<int64_t>(segments.size(), granularity_qkv / head_dim); |
575 | 0 | } |
576 | 0 | if (std::regex_match(tensor_name, pattern_ssm_beta_alpha)) { |
577 | 0 | return std::vector<int64_t>(segments.size(), 2 * (granularity_qkv / head_dim)); |
578 | 0 | } |
579 | 0 | if (std::regex_match(tensor_name, pattern_r_cache)) { |
580 | 0 | return std::vector<int64_t>(segments.size(), granularity_qkv * (hparams.ssm_d_conv - 1)); |
581 | 0 | } |
582 | 0 | if (std::regex_match(tensor_name, pattern_s_cache)) { |
583 | 0 | return std::vector<int64_t>(segments.size(), granularity_qkv * head_dim); |
584 | 0 | } |
585 | 0 | } else { |
586 | | // regular attention |
587 | 0 | const uint32_t n_gqa = hparams.n_gqa(il); |
588 | 0 | const uint32_t n_embd_q = n_gqa * hparams.n_embd_head_k(il); |
589 | | |
590 | | // to handle head sizes like 80, only increase granularity while it doesn't cause underutilization |
591 | 0 | int64_t blck_size_perf = blck_size; |
592 | 0 | while (blck_size_perf < 128 && blck_size_perf*ud->n_devices < n_embd_q) { |
593 | 0 | blck_size_perf *= 2; |
594 | 0 | } |
595 | |
|
596 | 0 | if (std::regex_match(tensor_name, pattern_attn_sinks)) { |
597 | 0 | GGML_ASSERT(segments.size() == 1); |
598 | 0 | return {std::lcm(n_embd_q, blck_size_perf)/n_embd_q * n_gqa}; |
599 | 0 | } |
600 | | |
601 | 0 | const int64_t granularity_q = std::lcm(n_embd_q, blck_size_perf); |
602 | 0 | if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_q_bias)) { |
603 | 0 | GGML_ASSERT(segments.size() == 1); |
604 | | // some models have Q gate tensors, for those cases the granularity needs to be doubled: |
605 | 0 | if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) { |
606 | 0 | return {std::lcm(2*n_embd_q, blck_size_perf)}; |
607 | 0 | } |
608 | 0 | return {granularity_q}; |
609 | 0 | } |
610 | 0 | if (std::regex_match(tensor_name, pattern_attn_out_weight)) { |
611 | 0 | GGML_ASSERT(segments.size() == 1); |
612 | 0 | return {granularity_q}; |
613 | 0 | } |
614 | | |
615 | 0 | const int64_t granularity_kv = granularity_q / n_gqa; |
616 | 0 | if (std::regex_match(tensor_name, pattern_kv_weight) || |
617 | 0 | std::regex_match(tensor_name, pattern_kv_bias) || |
618 | 0 | std::regex_match(tensor_name, pattern_kv_cache)) { |
619 | 0 | GGML_ASSERT(segments.size() == 1); |
620 | 0 | return {granularity_kv}; |
621 | 0 | } |
622 | 0 | if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) { |
623 | 0 | GGML_ASSERT(segments.size() == 2); |
624 | 0 | return {granularity_q, granularity_kv}; |
625 | 0 | } |
626 | 0 | } |
627 | | |
628 | | // FFN |
629 | 0 | if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight) || std::regex_match(tensor_name, pattern_ffn_up_gate_bias) || |
630 | 0 | std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) { |
631 | 0 | const int64_t blck_size_perf = std::lcm(blck_size, 128); |
632 | 0 | GGML_ASSERT(segments.size() == 1); |
633 | 0 | return {blck_size_perf}; |
634 | 0 | } |
635 | | |
636 | | // everything else |
637 | 0 | GGML_ASSERT(segments.size() == 1); |
638 | 0 | return {1}; |
639 | 0 | }; |
640 | |
|
641 | 0 | ggml_backend_meta_split_state split_state; |
642 | 0 | memset(&split_state, 0, sizeof(split_state)); |
643 | 0 | tensor_config tc = get_tensor_config(); |
644 | 0 | split_state.axis = tc.axis; |
645 | 0 | if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) { |
646 | 0 | const int64_t blck_size = ggml_blck_size(tc.tensor_axis_0->type); |
647 | 0 | const float * tensor_split = ud->model->tensor_split(); |
648 | 0 | std::vector<float> tensor_split_scan; |
649 | 0 | tensor_split_scan.reserve(ud->n_devices); |
650 | 0 | for (size_t j = 0; j < ud->n_devices; j++) { |
651 | 0 | tensor_split_scan.push_back(tensor_split == nullptr ? 0.0f : tensor_split[(j + tc.rotation) % ud->n_devices]); |
652 | 0 | if (j > 0) { |
653 | 0 | tensor_split_scan[j] += tensor_split_scan[j - 1]; |
654 | 0 | } |
655 | 0 | } |
656 | 0 | const std::vector<std::pair<int64_t, uint32_t>> segments = get_split_segments(split_state.axis, tc.il); |
657 | 0 | const std::vector<int64_t> granularity = get_split_granularity(blck_size, tc.il, segments); |
658 | 0 | for (size_t is = 0; is < segments.size(); is++) { |
659 | 0 | const int64_t ne_s = segments[is].first; |
660 | 0 | const uint32_t nr_s = segments[is].second; |
661 | 0 | const int64_t g_s = granularity[is]; |
662 | 0 | int64_t low = 0; |
663 | 0 | size_t j = 0; |
664 | 0 | for (; j < ud->n_devices - 1; j++) { |
665 | 0 | int64_t high = tensor_split_scan.back() == 0.0f ? |
666 | 0 | ne_s * (j+1)/ud->n_devices : ne_s * tensor_split_scan[j]/tensor_split_scan.back(); |
667 | 0 | if (high % g_s != 0) { |
668 | 0 | high -= high % g_s; |
669 | 0 | } |
670 | 0 | split_state.ne[is*ud->n_devices + (j + tc.rotation) % ud->n_devices] = high - low; |
671 | 0 | low = high; |
672 | 0 | } |
673 | 0 | split_state.ne[is*ud->n_devices + (j + tc.rotation) % ud->n_devices] = ne_s - low; |
674 | 0 | split_state.nr[is] = nr_s; |
675 | 0 | } |
676 | 0 | split_state.n_segments = segments.size(); |
677 | 0 | } else { |
678 | 0 | memset(split_state.ne, 0, sizeof(split_state.ne)); |
679 | 0 | split_state.nr[0] = 1; |
680 | 0 | split_state.n_segments = 1; |
681 | 0 | } |
682 | 0 | return split_state; |
683 | 0 | GGML_UNUSED(userdata); |
684 | 0 | } |
685 | | |
686 | 0 | const char * llm_type_name(llm_type type) { |
687 | 0 | switch (type) { |
688 | 0 | case LLM_TYPE_14M: return "14M"; |
689 | 0 | case LLM_TYPE_17M: return "17M"; |
690 | 0 | case LLM_TYPE_22M: return "22M"; |
691 | 0 | case LLM_TYPE_33M: return "33M"; |
692 | 0 | case LLM_TYPE_47M: return "47M"; |
693 | 0 | case LLM_TYPE_60M: return "60M"; |
694 | 0 | case LLM_TYPE_70M: return "70M"; |
695 | 0 | case LLM_TYPE_80M: return "80M"; |
696 | 0 | case LLM_TYPE_109M: return "109M"; |
697 | 0 | case LLM_TYPE_137M: return "137M"; |
698 | 0 | case LLM_TYPE_140M: return "140M"; |
699 | 0 | case LLM_TYPE_149M: return "149M"; |
700 | 0 | case LLM_TYPE_160M: return "160M"; |
701 | 0 | case LLM_TYPE_190M: return "190M"; |
702 | 0 | case LLM_TYPE_220M: return "220M"; |
703 | 0 | case LLM_TYPE_250M: return "250M"; |
704 | 0 | case LLM_TYPE_256M: return "256M"; |
705 | 0 | case LLM_TYPE_270M: return "270M"; |
706 | 0 | case LLM_TYPE_335M: return "335M"; |
707 | 0 | case LLM_TYPE_350M: return "350M"; |
708 | 0 | case LLM_TYPE_360M: return "360M"; |
709 | 0 | case LLM_TYPE_395M: return "395M"; |
710 | 0 | case LLM_TYPE_410M: return "410M"; |
711 | 0 | case LLM_TYPE_450M: return "450M"; |
712 | 0 | case LLM_TYPE_475M: return "475M"; |
713 | 0 | case LLM_TYPE_558M: return "558M"; |
714 | 0 | case LLM_TYPE_700M: return "700M"; |
715 | 0 | case LLM_TYPE_770M: return "770M"; |
716 | 0 | case LLM_TYPE_780M: return "780M"; |
717 | 0 | case LLM_TYPE_950M: return "950M"; |
718 | 0 | case LLM_TYPE_0_3B: return "0.3B"; |
719 | 0 | case LLM_TYPE_0_5B: return "0.5B"; |
720 | 0 | case LLM_TYPE_0_6B: return "0.6B"; |
721 | 0 | case LLM_TYPE_0_8B: return "0.8B"; |
722 | 0 | case LLM_TYPE_1B: return "1B"; |
723 | 0 | case LLM_TYPE_1_2B: return "1.2B"; |
724 | 0 | case LLM_TYPE_1_3B: return "1.3B"; |
725 | 0 | case LLM_TYPE_1_4B: return "1.4B"; |
726 | 0 | case LLM_TYPE_1_5B: return "1.5B"; |
727 | 0 | case LLM_TYPE_1_6B: return "1.6B"; |
728 | 0 | case LLM_TYPE_1_7B: return "1.7B"; |
729 | 0 | case LLM_TYPE_1_8B: return "1.8B"; |
730 | 0 | case LLM_TYPE_2B: return "2B"; |
731 | 0 | case LLM_TYPE_2_6B: return "2.6B"; |
732 | 0 | case LLM_TYPE_2_8B: return "2.8B"; |
733 | 0 | case LLM_TYPE_2_9B: return "2.9B"; |
734 | 0 | case LLM_TYPE_3B: return "3B"; |
735 | 0 | case LLM_TYPE_4B: return "4B"; |
736 | 0 | case LLM_TYPE_6B: return "6B"; |
737 | 0 | case LLM_TYPE_6_9B: return "6.9B"; |
738 | 0 | case LLM_TYPE_7B: return "7B"; |
739 | 0 | case LLM_TYPE_8B: return "8B"; |
740 | 0 | case LLM_TYPE_9B: return "9B"; |
741 | 0 | case LLM_TYPE_11B: return "11B"; |
742 | 0 | case LLM_TYPE_12B: return "12B"; |
743 | 0 | case LLM_TYPE_13B: return "13B"; |
744 | 0 | case LLM_TYPE_14B: return "14B"; |
745 | 0 | case LLM_TYPE_15B: return "15B"; |
746 | 0 | case LLM_TYPE_16B: return "16B"; |
747 | 0 | case LLM_TYPE_20B: return "20B"; |
748 | 0 | case LLM_TYPE_26B: return "26B"; |
749 | 0 | case LLM_TYPE_27B: return "27B"; |
750 | 0 | case LLM_TYPE_30B: return "30B"; |
751 | 0 | case LLM_TYPE_31B: return "31B"; |
752 | 0 | case LLM_TYPE_32B: return "32B"; |
753 | 0 | case LLM_TYPE_34B: return "34B"; |
754 | 0 | case LLM_TYPE_35B: return "35B"; |
755 | 0 | case LLM_TYPE_36B: return "36B"; |
756 | 0 | case LLM_TYPE_40B: return "40B"; |
757 | 0 | case LLM_TYPE_65B: return "65B"; |
758 | 0 | case LLM_TYPE_70B: return "70B"; |
759 | 0 | case LLM_TYPE_120B: return "120B"; |
760 | 0 | case LLM_TYPE_142B: return "142B"; |
761 | 0 | case LLM_TYPE_236B: return "236B"; |
762 | 0 | case LLM_TYPE_290B: return "290B"; |
763 | 0 | case LLM_TYPE_314B: return "314B"; |
764 | 0 | case LLM_TYPE_405B: return "405B"; |
765 | 0 | case LLM_TYPE_671B: return "671B"; |
766 | 0 | case LLM_TYPE_SMALL: return "0.1B"; |
767 | 0 | case LLM_TYPE_MEDIUM: return "0.4B"; |
768 | 0 | case LLM_TYPE_LARGE: return "0.8B"; |
769 | 0 | case LLM_TYPE_XL: return "1.5B"; |
770 | 0 | case LLM_TYPE_A1_7B: return "A1.7B"; |
771 | 0 | case LLM_TYPE_A2_7B: return "A2.7B"; |
772 | 0 | case LLM_TYPE_8x7B: return "8x7B"; |
773 | 0 | case LLM_TYPE_8x22B: return "8x22B"; |
774 | 0 | case LLM_TYPE_16x12B: return "16x12B"; |
775 | 0 | case LLM_TYPE_16x3_8B: return "16x3.8B"; |
776 | 0 | case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B"; |
777 | 0 | case LLM_TYPE_57B_A14B: return "57B.A14B"; |
778 | 0 | case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; |
779 | 0 | case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; |
780 | 0 | case LLM_TYPE_A13B: return "A13B"; |
781 | 0 | case LLM_TYPE_7B_A1B: return "7B.A1B"; |
782 | 0 | case LLM_TYPE_8B_A1B: return "8B.A1B"; |
783 | 0 | case LLM_TYPE_12B_A2_5B: return "12B.A2.5B"; |
784 | 0 | case LLM_TYPE_16B_A1B: return "16B.A1B"; |
785 | 0 | case LLM_TYPE_21B_A3B: return "21B.A3B"; |
786 | 0 | case LLM_TYPE_24B_A2B: return "24B.A2B"; |
787 | 0 | case LLM_TYPE_26B_A4B: return "26B.A4B"; |
788 | 0 | case LLM_TYPE_30B_A3B: return "30B.A3B"; |
789 | 0 | case LLM_TYPE_31B_A3_5B: return "31B.A3.5B"; |
790 | 0 | case LLM_TYPE_35B_A3B: return "35B.A3B"; |
791 | 0 | case LLM_TYPE_48B_A3B: return "48B.A3B"; |
792 | 0 | case LLM_TYPE_80B_A3B: return "80B.A3B"; |
793 | 0 | case LLM_TYPE_100B_A6B: return "100B.A6B"; |
794 | 0 | case LLM_TYPE_102B_A12B: return "102B.A12B"; |
795 | 0 | case LLM_TYPE_106B_A12B: return "106B.A12B"; |
796 | 0 | case LLM_TYPE_120B_A12B: return "120B.A12B"; |
797 | 0 | case LLM_TYPE_122B_A10B: return "122B.A10B"; |
798 | 0 | case LLM_TYPE_196B_A11B: return "196B.A11B"; |
799 | 0 | case LLM_TYPE_230B_A10B: return "230B.A10B"; |
800 | 0 | case LLM_TYPE_235B_A22B: return "235B.A22B"; |
801 | 0 | case LLM_TYPE_300B_A47B: return "300B.A47B"; |
802 | 0 | case LLM_TYPE_310B_A15B: return "310B.A15B"; |
803 | 0 | case LLM_TYPE_355B_A32B: return "355B.A32B"; |
804 | 0 | case LLM_TYPE_397B_A17B: return "397B.A17B"; |
805 | 0 | case LLM_TYPE_685B_A37B: return "685B.A37B"; |
806 | 0 | case LLM_TYPE_744B_A40B: return "744B.A40B"; |
807 | 0 | case LLM_TYPE_E2B: return "E2B"; |
808 | 0 | case LLM_TYPE_E4B: return "E4B"; |
809 | 0 | default: return "?B"; |
810 | 0 | } |
811 | 0 | } |
812 | | |
813 | 0 | static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) { |
814 | 0 | switch (type) { |
815 | 0 | case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax"; |
816 | 0 | case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid"; |
817 | 0 | default: return "unknown"; |
818 | 0 | } |
819 | 0 | } |
820 | | |
821 | | static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = { |
822 | | { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, |
823 | | { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, |
824 | | { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, |
825 | | { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" }, |
826 | | }; |
827 | | |
828 | 0 | std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) { |
829 | 0 | return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type); |
830 | 0 | } |
831 | | |
832 | 0 | static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { |
833 | 0 | for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { |
834 | 0 | if (kv.second == name) { |
835 | 0 | return (llama_rope_scaling_type) kv.first; |
836 | 0 | } |
837 | 0 | } |
838 | | |
839 | 0 | return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; |
840 | 0 | } |
841 | | |
842 | | // Maps the GGUF `<arch>.hidden_activation` string to the FFN op type used by the |
843 | | // graph builders. Only gated activations that map cleanly to llm_ffn_op_type are |
844 | | // listed; unrecognized values fall back to GeGLU, which matches the historical |
845 | | // default for ModernBert-style architectures. |
846 | | static const std::map<std::string, llm_ffn_op_type> LLM_FFN_OP_TYPES_FROM_STRING = { |
847 | | { "gelu", LLM_FFN_GEGLU }, |
848 | | { "geglu", LLM_FFN_GEGLU }, |
849 | | { "silu", LLM_FFN_SWIGLU }, |
850 | | { "swish", LLM_FFN_SWIGLU }, |
851 | | { "swiglu", LLM_FFN_SWIGLU }, |
852 | | { "relu", LLM_FFN_RELU }, |
853 | | { "reglu", LLM_FFN_REGLU }, |
854 | | }; |
855 | | |
856 | 0 | llm_ffn_op_type llm_ffn_op_type_from_string(const std::string & name, llm_ffn_op_type fallback) { |
857 | 0 | const auto it = LLM_FFN_OP_TYPES_FROM_STRING.find(name); |
858 | 0 | if (it != LLM_FFN_OP_TYPES_FROM_STRING.end()) { |
859 | 0 | return it->second; |
860 | 0 | } |
861 | 0 | return fallback; |
862 | 0 | } |
863 | | |
864 | | // CPU: ACCEL -> GPU host -> CPU extra -> CPU |
865 | 0 | static buft_list_t make_cpu_buft_list(const std::vector<llama_device> & devices, bool use_extra_bufts, bool no_host) { |
866 | 0 | buft_list_t buft_list; |
867 | | |
868 | | // add ACCEL buffer types |
869 | 0 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { |
870 | 0 | ggml_backend_dev_t dev = ggml_backend_dev_get(i); |
871 | 0 | if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { |
872 | 0 | auto * buft = ggml_backend_dev_buffer_type(dev); |
873 | | // skip |
874 | 0 | if (buft != ggml_backend_cpu_buffer_type()) { |
875 | 0 | buft_list.emplace_back(dev, buft); |
876 | 0 | } |
877 | 0 | } |
878 | 0 | } |
879 | | |
880 | | // add a host buffer type |
881 | | // storing the tensors in a host buffer is useful when the processing of large batches |
882 | | // is offloaded to a GPU device, since it reduces the time spent on data transfers |
883 | | // generally, this will be done using the first device in the list |
884 | | // a better approach would be to handle this on a weight-by-weight basis using the offload_op |
885 | | // function of the device to determine if it would benefit from being stored in a host buffer |
886 | 0 | if (!no_host) { |
887 | 0 | for (const auto & dev : devices) { |
888 | 0 | ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev.dev); |
889 | 0 | if (buft) { |
890 | 0 | buft_list.emplace_back(dev.dev, buft); |
891 | 0 | break; |
892 | 0 | } |
893 | 0 | } |
894 | 0 | } |
895 | | |
896 | | // add extra buffer types |
897 | 0 | if (use_extra_bufts) { |
898 | 0 | auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
899 | 0 | if (cpu_dev == nullptr) { |
900 | 0 | throw std::runtime_error(format("%s: no CPU backend found", __func__)); |
901 | 0 | } |
902 | | |
903 | 0 | auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); |
904 | 0 | auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t) |
905 | 0 | ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts"); |
906 | 0 | if (ggml_backend_dev_get_extra_bufts_fn) { |
907 | 0 | ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev); |
908 | 0 | while (extra_bufts && *extra_bufts) { |
909 | 0 | buft_list.emplace_back(cpu_dev, *extra_bufts); |
910 | 0 | ++extra_bufts; |
911 | 0 | } |
912 | 0 | } |
913 | 0 | } |
914 | | |
915 | | // add the CPU buffer type |
916 | 0 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { |
917 | 0 | ggml_backend_dev_t dev = ggml_backend_dev_get(i); |
918 | 0 | if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { |
919 | 0 | buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev)); |
920 | 0 | } |
921 | 0 | } |
922 | |
|
923 | 0 | return buft_list; |
924 | 0 | } |
925 | | |
926 | | // GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU |
927 | 0 | static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) { |
928 | 0 | buft_list_t buft_list; |
929 | | |
930 | | // add the device split buffer type if requested and available |
931 | 0 | if (split_mode == LLAMA_SPLIT_MODE_ROW) { |
932 | 0 | ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); |
933 | 0 | auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t) |
934 | 0 | ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type"); |
935 | 0 | if (ggml_backend_split_buffer_type_fn) { |
936 | 0 | size_t dev_index = [&]() { |
937 | 0 | auto * reg = ggml_backend_dev_backend_reg(dev); |
938 | 0 | for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) { |
939 | 0 | if (ggml_backend_reg_dev_get(reg, i) == dev) { |
940 | 0 | return i; |
941 | 0 | } |
942 | 0 | } |
943 | 0 | throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev))); |
944 | 0 | }(); |
945 | 0 | auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split); |
946 | 0 | if (buft != nullptr) { |
947 | 0 | buft_list.emplace_back(dev, buft); |
948 | 0 | } |
949 | 0 | } |
950 | 0 | } |
951 | | |
952 | | // add the device default buffer type |
953 | 0 | buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev)); |
954 | | |
955 | | // add the device extra buffer type (if any) |
956 | 0 | ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); |
957 | 0 | if (reg) { |
958 | 0 | auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t) |
959 | 0 | ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts"); |
960 | |
|
961 | 0 | if (ggml_backend_dev_get_extra_bufts_fn) { |
962 | 0 | ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev); |
963 | 0 | while (extra_bufts && *extra_bufts) { |
964 | 0 | buft_list.emplace_back(dev, *extra_bufts); |
965 | 0 | ++extra_bufts; |
966 | 0 | } |
967 | 0 | } |
968 | 0 | } |
969 | |
|
970 | 0 | return buft_list; |
971 | 0 | } |
972 | | |
973 | | struct llama_model::impl { |
974 | 0 | impl() = default; |
975 | 0 | ~impl() = default; |
976 | | |
977 | | uint64_t n_elements = 0; |
978 | | |
979 | | size_t n_bytes = 0; |
980 | | |
981 | | std::string desc_str; |
982 | | |
983 | | // model memory mapped files |
984 | | llama_mmaps mappings; |
985 | | |
986 | | // objects representing data potentially being locked in memory |
987 | | llama_mlocks mlock_bufs; |
988 | | llama_mlocks mlock_mmaps; |
989 | | |
990 | | // contexts where the model tensors metadata is stored as well as the corresponding buffers: |
991 | | std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs; |
992 | | |
993 | | buft_list_t cpu_buft_list; |
994 | | std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list; |
995 | | |
996 | | struct layer_dev { |
997 | | ggml_backend_dev_t dev; |
998 | | buft_list_t * buft_list; |
999 | | }; |
1000 | | |
1001 | | layer_dev dev_input = {}; |
1002 | | layer_dev dev_output = {}; |
1003 | | std::vector<layer_dev> dev_layer; |
1004 | | |
1005 | | bool has_tensor_overrides; |
1006 | | }; |
1007 | | |
1008 | 0 | llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) { |
1009 | 0 | pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern; |
1010 | 0 | } |
1011 | | |
1012 | 0 | llama_model::~llama_model() { |
1013 | 0 | for (auto * lora : loras) { |
1014 | 0 | delete lora; |
1015 | 0 | } |
1016 | 0 | } |
1017 | | |
1018 | 0 | void llama_model_base::load_stats(llama_model_loader & ml) { |
1019 | 0 | pimpl->n_elements = ml.n_elements; |
1020 | 0 | pimpl->n_bytes = ml.n_bytes; |
1021 | 0 | } |
1022 | | |
1023 | 0 | void llama_model_base::load_hparams(llama_model_loader & ml) { |
1024 | 0 | const gguf_context * ctx = ml.metadata; |
1025 | | |
1026 | | // get metadata as string |
1027 | 0 | for (int i = 0; i < gguf_get_n_kv(ctx); i++) { |
1028 | 0 | gguf_type type = gguf_get_kv_type(ctx, i); |
1029 | 0 | if (type == GGUF_TYPE_ARRAY) { |
1030 | 0 | continue; |
1031 | 0 | } |
1032 | 0 | const char * name = gguf_get_key(ctx, i); |
1033 | 0 | const std::string value = gguf_kv_to_str(ctx, i); |
1034 | 0 | gguf_kv.emplace(name, value); |
1035 | 0 | } |
1036 | | |
1037 | | // get general kv |
1038 | 0 | ml.get_key(LLM_KV_GENERAL_NAME, name, false); |
1039 | | |
1040 | | // everything past this point is not vocab-related |
1041 | | // for CLIP models, we only need to load tensors, no hparams |
1042 | 0 | if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) { |
1043 | 0 | return; |
1044 | 0 | } |
1045 | | |
1046 | 0 | ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); |
1047 | 0 | ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); |
1048 | 0 | ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); |
1049 | 0 | ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); |
1050 | 0 | ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); |
1051 | 0 | ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer_all); |
1052 | 0 | ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); |
1053 | 0 | ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); |
1054 | 0 | ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); |
1055 | 0 | ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); |
1056 | |
|
1057 | 0 | if (arch == LLM_ARCH_HUNYUAN_VL || arch == LLM_ARCH_HUNYUAN_DENSE) { |
1058 | 0 | if (hparams.n_expert <= 1) { |
1059 | 0 | hparams.n_expert = 0; |
1060 | 0 | hparams.n_expert_used = 0; |
1061 | 0 | } |
1062 | 0 | } |
1063 | |
|
1064 | 0 | if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { |
1065 | 0 | ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd); |
1066 | 0 | ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl); |
1067 | |
|
1068 | 0 | ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd); |
1069 | 0 | ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer); |
1070 | |
|
1071 | 0 | ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd); |
1072 | 0 | ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer); |
1073 | 0 | } |
1074 | |
|
1075 | 0 | GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); |
1076 | 0 | GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert); |
1077 | 0 | if (hparams.n_expert > 0) { |
1078 | 0 | GGML_ASSERT(hparams.n_expert_used > 0); |
1079 | 0 | GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert); |
1080 | 0 | if (hparams.n_expert_groups > 1) { |
1081 | 0 | GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0); |
1082 | 0 | GGML_ASSERT(hparams.n_group_used > 0); |
1083 | 0 | GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups); |
1084 | 0 | } |
1085 | 0 | } else { |
1086 | 0 | GGML_ASSERT(hparams.n_expert_used == 0); |
1087 | 0 | GGML_ASSERT(hparams.n_expert_groups == 0); |
1088 | 0 | } |
1089 | |
|
1090 | 0 | std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); |
1091 | 0 | std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); |
1092 | 0 | std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); |
1093 | |
|
1094 | 0 | std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0); |
1095 | 0 | std::fill(hparams.is_swa_impl.begin(), hparams.is_swa_impl.end(), 0); |
1096 | 0 | std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), llm_arch_is_recurrent(ml.get_arch()) ? 1 : 0); |
1097 | |
|
1098 | 0 | std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f); |
1099 | 0 | std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f); |
1100 | 0 | std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f); |
1101 | 0 | std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f); |
1102 | |
|
1103 | 0 | std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f); |
1104 | 0 | std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f); |
1105 | |
|
1106 | 0 | ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer(), false); |
1107 | 0 | ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false); |
1108 | | |
1109 | | // Populate deepstack_mapping_arr - initialized to -1 (no deepstack) |
1110 | 0 | std::fill(hparams.deepstack_mapping_arr.begin(), hparams.deepstack_mapping_arr.end(), -1); |
1111 | | |
1112 | | // n_head_kv is optional, default to n_head |
1113 | 0 | hparams.n_head_kv_arr = hparams.n_head_arr; |
1114 | |
|
1115 | 0 | ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer(), false); |
1116 | |
|
1117 | 0 | bool rope_finetuned = false; |
1118 | 0 | ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); |
1119 | 0 | hparams.rope_finetuned = rope_finetuned; |
1120 | |
|
1121 | 0 | hparams.n_ctx_orig_yarn = hparams.n_ctx_train; |
1122 | 0 | ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false); |
1123 | | |
1124 | | // rope_freq_base (optional) |
1125 | 0 | hparams.rope_freq_base_train = 10000.0f; |
1126 | 0 | ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false); |
1127 | |
|
1128 | 0 | std::string rope_scaling("linear"); |
1129 | 0 | ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); |
1130 | 0 | hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); |
1131 | 0 | GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); |
1132 | | |
1133 | | // TODO: Handle SWA metadata similarly when models start implementing it |
1134 | | // rope_freq_scale (inverse of the kv) is optional |
1135 | 0 | float ropescale = 0.0f; |
1136 | 0 | if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) { |
1137 | | // try the old key name |
1138 | 0 | ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false); |
1139 | 0 | } |
1140 | 0 | hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; |
1141 | |
|
1142 | 0 | ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); |
1143 | 0 | ml.get_key(LLM_KV_ROPE_SCALING_ALPHA, hparams.rope_scaling_alpha, false); |
1144 | | |
1145 | | // non-transformer models do not have attention heads |
1146 | 0 | if (hparams.n_head() > 0) { |
1147 | | // gpt-neox n_rot = rotary_pct * (n_embd / n_head) |
1148 | | // gpt-j n_rot = rotary_dim |
1149 | |
|
1150 | 0 | hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head(); |
1151 | 0 | ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false); |
1152 | |
|
1153 | 0 | hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head(); |
1154 | 0 | ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false); |
1155 | | |
1156 | | // sanity check for n_rot (optional) |
1157 | 0 | hparams.n_rot_full = hparams.n_embd_head_k_full; |
1158 | |
|
1159 | 0 | ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false); |
1160 | |
|
1161 | 0 | if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) { |
1162 | 0 | if (hparams.n_rot_full != hparams.n_embd_head_k_full) { |
1163 | 0 | throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full)); |
1164 | 0 | } |
1165 | 0 | } |
1166 | 0 | } else { |
1167 | 0 | hparams.n_rot_full = 0; |
1168 | 0 | hparams.n_embd_head_k_full = 0; |
1169 | 0 | hparams.n_embd_head_v_full = 0; |
1170 | 0 | } |
1171 | | |
1172 | | // head size and n_rot for SWA layers |
1173 | 0 | { |
1174 | 0 | hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full; |
1175 | 0 | hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full; |
1176 | 0 | ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false); |
1177 | 0 | ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false); |
1178 | |
|
1179 | 0 | hparams.n_rot_swa = hparams.n_rot_full; |
1180 | 0 | ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false); |
1181 | 0 | } |
1182 | | |
1183 | | // for classifier models |
1184 | 0 | ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false); |
1185 | 0 | if (!classifier_labels.empty()) { |
1186 | 0 | hparams.n_cls_out = classifier_labels.size(); |
1187 | 0 | } |
1188 | | |
1189 | | // per-arch hparams |
1190 | 0 | load_arch_hparams(ml); |
1191 | |
|
1192 | 0 | pimpl->n_bytes = ml.n_bytes; |
1193 | |
|
1194 | 0 | pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name(); |
1195 | |
|
1196 | 0 | if (hparams.f_max_alibi_bias > 0.0f) { |
1197 | 0 | hparams.use_alibi = true; |
1198 | 0 | } |
1199 | |
|
1200 | 0 | hparams.rope_type = llama_model_rope_type(this); |
1201 | 0 | } |
1202 | | |
1203 | 0 | void llama_model_base::load_vocab(llama_model_loader & ml) { |
1204 | 0 | const auto kv = LLM_KV(arch); |
1205 | |
|
1206 | 0 | vocab.load(ml, kv); |
1207 | 0 | } |
1208 | | |
1209 | 0 | bool llama_model_base::load_tensors(llama_model_loader & ml) { |
1210 | 0 | const auto & split_mode = params.split_mode; |
1211 | 0 | const auto & use_mlock = params.use_mlock; |
1212 | 0 | const auto & tensor_split = params.tensor_split; |
1213 | |
|
1214 | 0 | const int n_layer_all = hparams.n_layer_all; |
1215 | 0 | const int n_gpu_layers = this->n_gpu_layers(); |
1216 | |
|
1217 | 0 | const bool use_mmap_buffer = true; |
1218 | |
|
1219 | 0 | this->ml = &ml; // to be used by create_tensor() and load_arch_tensors() |
1220 | |
|
1221 | 0 | LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n", |
1222 | 0 | __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false"); |
1223 | | |
1224 | | // build a list of buffer types for the CPU and GPU devices |
1225 | 0 | pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host); |
1226 | 0 | for (const auto & dev : devices) { |
1227 | 0 | buft_list_t buft_list = make_gpu_buft_list(dev.dev, split_mode, tensor_split); |
1228 | | // add CPU buffer types as a fallback |
1229 | 0 | buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end()); |
1230 | 0 | pimpl->gpu_buft_list.emplace(dev.dev, std::move(buft_list)); |
1231 | 0 | } |
1232 | |
|
1233 | 0 | ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
1234 | 0 | if (cpu_dev == nullptr) { |
1235 | 0 | throw std::runtime_error(format("%s: no CPU backend found", __func__)); |
1236 | 0 | } |
1237 | | |
1238 | | // calculate the split points |
1239 | 0 | bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; }); |
1240 | 0 | std::vector<float> splits(n_devices()); |
1241 | 0 | if (all_zero) { |
1242 | | // default split, by free memory |
1243 | 0 | for (size_t i = 0; i < n_devices(); ++i) { |
1244 | 0 | ggml_backend_dev_t dev = devices[i].dev; |
1245 | 0 | size_t total; |
1246 | 0 | size_t free; |
1247 | 0 | ggml_backend_dev_memory(dev, &free, &total); |
1248 | | |
1249 | | // devices can return 0 bytes for free and total memory if they do not |
1250 | | // have any to report. in this case, we will use the host memory as a fallback |
1251 | | // fixes: https://github.com/ggml-org/llama.cpp/issues/18577 |
1252 | 0 | if (free == 0 && total == 0) { |
1253 | 0 | ggml_backend_dev_memory(cpu_dev, &free, &total); |
1254 | 0 | } |
1255 | 0 | splits[i] = free; |
1256 | 0 | } |
1257 | 0 | } else { |
1258 | 0 | std::copy(tensor_split, tensor_split + n_devices(), splits.begin()); |
1259 | 0 | } |
1260 | | |
1261 | | // sum and normalize the splits to get the split points |
1262 | 0 | float split_sum = 0.0f; |
1263 | 0 | for (size_t i = 0; i < n_devices(); ++i) { |
1264 | 0 | split_sum += splits[i]; |
1265 | 0 | splits[i] = split_sum; |
1266 | 0 | } |
1267 | 0 | for (size_t i = 0; i < n_devices(); ++i) { |
1268 | 0 | splits[i] /= split_sum; |
1269 | 0 | } |
1270 | |
|
1271 | 0 | const int i_gpu_start = std::max(n_layer_all + 1 - n_gpu_layers, 0); |
1272 | 0 | const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer_all + 1); |
1273 | 0 | auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { |
1274 | 0 | const bool is_swa = il < n_layer_all && hparams.is_swa(il); |
1275 | 0 | if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { |
1276 | 0 | LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); |
1277 | 0 | return {cpu_dev, &pimpl->cpu_buft_list}; |
1278 | 0 | } |
1279 | 0 | const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin(); |
1280 | 0 | auto * dev = devices.at(layer_gpu).dev; |
1281 | 0 | LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa); |
1282 | 0 | return {dev, &pimpl->gpu_buft_list.at(dev)}; |
1283 | 0 | }; |
1284 | | |
1285 | | // assign the input layer |
1286 | | // there is very little benefit to offloading the input layer, so always keep it on the CPU |
1287 | 0 | pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list }; |
1288 | | |
1289 | | // assign the repeating layers to the devices according to the splits |
1290 | 0 | pimpl->dev_layer.resize(n_layer_all); |
1291 | 0 | for (int il = 0; il < n_layer_all; ++il) { |
1292 | 0 | pimpl->dev_layer[il] = get_layer_buft_list(il); |
1293 | 0 | } |
1294 | | |
1295 | | // assign the output layer |
1296 | 0 | pimpl->dev_output = get_layer_buft_list(n_layer_all); |
1297 | |
|
1298 | 0 | const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED; |
1299 | | |
1300 | | // create tensors for the weights |
1301 | 0 | { |
1302 | | // TODO: move to a separate function |
1303 | 0 | const auto tn = LLM_TN(arch); |
1304 | |
|
1305 | 0 | const int64_t n_expert = hparams.n_expert; |
1306 | 0 | const int64_t n_expert_used = hparams.n_expert_used; |
1307 | |
|
1308 | 0 | if (n_expert > 0 && n_expert_used == 0) { |
1309 | 0 | throw std::runtime_error("model has expert layers but no expert layers are used"); |
1310 | 0 | } |
1311 | | |
1312 | 0 | layers.resize(n_layer_all); |
1313 | | |
1314 | | // call the per-model loading function |
1315 | 0 | load_arch_tensors(ml); |
1316 | | |
1317 | | // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2) |
1318 | | // this avoids having to add scale loading to every architecture |
1319 | 0 | for (int i = 0; i < n_layer_all; ++i) { |
1320 | 0 | auto & layer = layers[i]; |
1321 | | |
1322 | | // attention weight scales (per-tensor, shape {1}) |
1323 | 0 | if (!layer.wq_s && layer.wq) { |
1324 | 0 | layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1325 | 0 | } |
1326 | 0 | if (!layer.wk_s && layer.wk) { |
1327 | 0 | layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1328 | 0 | } |
1329 | 0 | if (!layer.wv_s && layer.wv) { |
1330 | 0 | layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1331 | 0 | } |
1332 | 0 | if (!layer.wo_s && layer.wo) { |
1333 | 0 | layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1334 | 0 | } |
1335 | 0 | if (!layer.wqkv_s && layer.wqkv) { |
1336 | 0 | layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1337 | 0 | } |
1338 | 0 | if (!layer.wqkv_gate_s && layer.wqkv_gate) { |
1339 | 0 | layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1340 | 0 | } |
1341 | | |
1342 | | // dense FFN weight scales (per-tensor, shape {1}) |
1343 | 0 | if (!layer.ffn_gate_s && layer.ffn_gate) { |
1344 | 0 | layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1345 | 0 | } |
1346 | 0 | if (!layer.ffn_down_s && layer.ffn_down) { |
1347 | 0 | layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1348 | 0 | } |
1349 | 0 | if (!layer.ffn_up_s && layer.ffn_up) { |
1350 | 0 | layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1351 | 0 | } |
1352 | 0 | if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) { |
1353 | 0 | layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1354 | 0 | } |
1355 | 0 | if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) { |
1356 | 0 | layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1357 | 0 | } |
1358 | 0 | if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) { |
1359 | 0 | layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1360 | 0 | } |
1361 | | |
1362 | | // MoE expert weight scales (per-expert, shape {n_expert}) |
1363 | 0 | if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) { |
1364 | 0 | layer.ffn_gate_exps_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED); |
1365 | 0 | } |
1366 | 0 | if (!layer.ffn_down_exps_s && layer.ffn_down_exps) { |
1367 | 0 | layer.ffn_down_exps_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED); |
1368 | 0 | } |
1369 | 0 | if (!layer.ffn_up_exps_s && layer.ffn_up_exps) { |
1370 | 0 | layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED); |
1371 | 0 | } |
1372 | | |
1373 | | // recurrent / linear-attention weight scales (per-tensor, shape {1}) |
1374 | 0 | if (!layer.ssm_in_s && layer.ssm_in) { |
1375 | 0 | layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1376 | 0 | } |
1377 | 0 | if (!layer.ssm_out_s && layer.ssm_out) { |
1378 | 0 | layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1379 | 0 | } |
1380 | 0 | if (!layer.ssm_alpha_s && layer.ssm_alpha) { |
1381 | 0 | layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1382 | 0 | } |
1383 | 0 | if (!layer.ssm_beta_s && layer.ssm_beta) { |
1384 | 0 | layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1385 | 0 | } |
1386 | 0 | if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) { |
1387 | 0 | layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1388 | 0 | } |
1389 | 0 | if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) { |
1390 | 0 | layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED); |
1391 | 0 | } |
1392 | | |
1393 | | // input scales |
1394 | 0 | if (!layer.wq_in_s && layer.wq) { |
1395 | 0 | layer.wq_in_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1396 | 0 | } |
1397 | 0 | if (!layer.wk_in_s && layer.wk) { |
1398 | 0 | layer.wk_in_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1399 | 0 | } |
1400 | 0 | if (!layer.wv_in_s && layer.wv) { |
1401 | 0 | layer.wv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1402 | 0 | } |
1403 | 0 | if (!layer.wo_in_s && layer.wo) { |
1404 | 0 | layer.wo_in_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1405 | 0 | } |
1406 | 0 | if (!layer.wqkv_in_s && layer.wqkv) { |
1407 | 0 | layer.wqkv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1408 | 0 | } |
1409 | 0 | if (!layer.wqkv_gate_in_s && layer.wqkv_gate) { |
1410 | 0 | layer.wqkv_gate_in_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1411 | 0 | } |
1412 | 0 | if (!layer.ffn_gate_in_s && layer.ffn_gate) { |
1413 | 0 | layer.ffn_gate_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1414 | 0 | } |
1415 | 0 | if (!layer.ffn_down_in_s && layer.ffn_down) { |
1416 | 0 | layer.ffn_down_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1417 | 0 | } |
1418 | 0 | if (!layer.ffn_up_in_s && layer.ffn_up) { |
1419 | 0 | layer.ffn_up_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1420 | 0 | } |
1421 | 0 | if (!layer.ffn_gate_exps_in_s && layer.ffn_gate_exps) { |
1422 | 0 | layer.ffn_gate_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED); |
1423 | 0 | } |
1424 | 0 | if (!layer.ffn_down_exps_in_s && layer.ffn_down_exps) { |
1425 | 0 | layer.ffn_down_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED); |
1426 | 0 | } |
1427 | 0 | if (!layer.ffn_up_exps_in_s && layer.ffn_up_exps) { |
1428 | 0 | layer.ffn_up_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED); |
1429 | 0 | } |
1430 | 0 | if (!layer.ffn_gate_shexp_in_s && layer.ffn_gate_shexp) { |
1431 | 0 | layer.ffn_gate_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1432 | 0 | } |
1433 | 0 | if (!layer.ffn_down_shexp_in_s && layer.ffn_down_shexp) { |
1434 | 0 | layer.ffn_down_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1435 | 0 | } |
1436 | 0 | if (!layer.ffn_up_shexp_in_s && layer.ffn_up_shexp) { |
1437 | 0 | layer.ffn_up_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1438 | 0 | } |
1439 | 0 | if (!layer.ssm_in_in_s && layer.ssm_in) { |
1440 | 0 | layer.ssm_in_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1441 | 0 | } |
1442 | 0 | if (!layer.ssm_out_in_s && layer.ssm_out) { |
1443 | 0 | layer.ssm_out_in_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1444 | 0 | } |
1445 | 0 | if (!layer.ssm_alpha_in_s && layer.ssm_alpha) { |
1446 | 0 | layer.ssm_alpha_in_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1447 | 0 | } |
1448 | 0 | if (!layer.ssm_beta_in_s && layer.ssm_beta) { |
1449 | 0 | layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1450 | 0 | } |
1451 | 0 | if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) { |
1452 | 0 | layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1453 | 0 | } |
1454 | 0 | if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) { |
1455 | 0 | layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); |
1456 | 0 | } |
1457 | 0 | } |
1458 | | // output scales |
1459 | 0 | if (output && output->type == GGML_TYPE_NVFP4) { |
1460 | | // weight scale |
1461 | 0 | if (!output_s) { |
1462 | 0 | output_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "scale"), {1}, TENSOR_NOT_REQUIRED); |
1463 | 0 | } |
1464 | | // input scale |
1465 | 0 | if (!output_in_s) { |
1466 | 0 | output_in_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "input_scale"), {1}, TENSOR_NOT_REQUIRED); |
1467 | 0 | } |
1468 | 0 | } |
1469 | 0 | } |
1470 | 0 | ml.done_getting_tensors(); |
1471 | | |
1472 | | // Tied NVFP4 output is valid when no separate LM-head scale tensors are present. |
1473 | | // If sidecar scales exist, the output weight must be an actual output tensor. |
1474 | 0 | GGML_ASSERT(!(output && tok_embd && |
1475 | 0 | strcmp(output->name, tok_embd->name) == 0 && |
1476 | 0 | output->type == GGML_TYPE_NVFP4 && |
1477 | 0 | (output_s || output_in_s))); |
1478 | | // populate tensors_by_name |
1479 | 0 | for (auto & [_, ctx_ptr] : ml.ctx_map) { |
1480 | 0 | for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) { |
1481 | 0 | tensors_by_name.emplace_back(ggml_get_name(cur), cur); |
1482 | 0 | } |
1483 | 0 | } |
1484 | |
|
1485 | 0 | ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr); |
1486 | 0 | pimpl->mappings.reserve(ml.mappings.size()); |
1487 | | |
1488 | | // create the backend buffers |
1489 | 0 | std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps; |
1490 | 0 | ctx_buf_maps.reserve(ml.ctx_map.size()); |
1491 | | |
1492 | | // Ensure we have enough capacity for the maximum backend buffer we will potentially create |
1493 | 0 | const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size(); |
1494 | 0 | pimpl->ctxs_bufs.reserve(n_max_backend_buffer); |
1495 | |
|
1496 | 0 | for (auto & [buft, ctx_ptr] : ml.ctx_map) { |
1497 | 0 | ggml_context * ctx = ctx_ptr.get(); |
1498 | | |
1499 | | // skip contexts without tensors |
1500 | 0 | if (ggml_get_first_tensor(ctx) == nullptr) { |
1501 | 0 | continue; |
1502 | 0 | } |
1503 | | |
1504 | 0 | llama_buf_map buf_map; |
1505 | 0 | buf_map.reserve(n_max_backend_buffer); |
1506 | | |
1507 | | // check if it is possible to use buffer_from_host_ptr with this buffer type |
1508 | 0 | ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); |
1509 | 0 | if (!dev) { |
1510 | | // FIXME: workaround for CPU backend buft having a NULL device |
1511 | 0 | dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
1512 | 0 | if (!dev) { |
1513 | 0 | throw std::runtime_error(format("%s: no CPU backend found", __func__)); |
1514 | 0 | } |
1515 | 0 | } |
1516 | 0 | ggml_backend_dev_props props; |
1517 | 0 | ggml_backend_dev_get_props(dev, &props); |
1518 | 0 | bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr; |
1519 | 0 | bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev); |
1520 | |
|
1521 | 0 | std::vector<ggml_backend_buffer_ptr> bufs; |
1522 | 0 | if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) { |
1523 | 0 | GGML_ASSERT(!ml.no_alloc); |
1524 | 0 | for (uint32_t idx = 0; idx < ml.files.size(); idx++) { |
1525 | | // only the mmap region containing the tensors in the model is mapped to the backend buffer |
1526 | | // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, |
1527 | | // then we could just use metal for all layers |
1528 | | // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size |
1529 | 0 | void * addr = nullptr; |
1530 | 0 | size_t first, last; // NOLINT |
1531 | 0 | ml.get_mapping_range(&first, &last, &addr, idx, ctx); |
1532 | 0 | if (first >= last) { |
1533 | 0 | continue; |
1534 | 0 | } |
1535 | 0 | const size_t max_size = ggml_get_max_tensor_size(ctx); |
1536 | 0 | ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size); |
1537 | 0 | if (buf == nullptr) { |
1538 | 0 | throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); |
1539 | 0 | } |
1540 | 0 | bufs.emplace_back(buf); |
1541 | 0 | buf_map.emplace(idx, buf); |
1542 | 0 | } |
1543 | 0 | } else { |
1544 | 0 | ggml_backend_buffer_t buf; |
1545 | 0 | if (ml.no_alloc) { |
1546 | 0 | buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer |
1547 | 0 | for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { |
1548 | 0 | t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them |
1549 | 0 | } |
1550 | 0 | } else { |
1551 | 0 | buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer |
1552 | 0 | } |
1553 | 0 | if (buf == nullptr) { |
1554 | 0 | throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); |
1555 | 0 | } |
1556 | 0 | if (use_mlock && ggml_backend_buffer_is_host(buf)) { |
1557 | 0 | pimpl->mlock_bufs.emplace_back(new llama_mlock); |
1558 | 0 | auto & mlock_buf = pimpl->mlock_bufs.back(); |
1559 | 0 | mlock_buf->init (ggml_backend_buffer_get_base(buf)); |
1560 | 0 | mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); |
1561 | 0 | } |
1562 | 0 | bufs.emplace_back(buf); |
1563 | 0 | for (uint32_t idx = 0; idx < ml.files.size(); idx++) { |
1564 | 0 | buf_map.emplace(idx, buf); |
1565 | 0 | } |
1566 | 0 | } |
1567 | | |
1568 | 0 | for (auto & buf : bufs) { |
1569 | | // indicate that this buffer contains weights |
1570 | | // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight |
1571 | 0 | ggml_backend_buffer_set_usage(buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); |
1572 | 0 | } |
1573 | |
|
1574 | 0 | pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs)); |
1575 | |
|
1576 | 0 | ctx_buf_maps.emplace_back(ctx, buf_map); |
1577 | 0 | } |
1578 | | |
1579 | 0 | if (llama_supports_gpu_offload()) { |
1580 | 0 | const int n_gpu = std::min(n_gpu_layers, n_layer_all); |
1581 | |
|
1582 | 0 | int n_repeating = n_gpu; |
1583 | 0 | if (n_repeating > 0) { |
1584 | 0 | LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__); |
1585 | 0 | n_repeating--; |
1586 | 0 | } |
1587 | 0 | LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating); |
1588 | |
|
1589 | 0 | const int max_backend_supported_layers = n_layer_all + 1; |
1590 | 0 | const int max_offloadable_layers = n_layer_all + 1; |
1591 | |
|
1592 | 0 | LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); |
1593 | 0 | } |
1594 | | |
1595 | | // print memory requirements per buffer type |
1596 | 0 | for (auto & [_, bufs] : pimpl->ctxs_bufs) { |
1597 | 0 | for (auto & buf: bufs) { |
1598 | 0 | LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", |
1599 | 0 | __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); |
1600 | 0 | } |
1601 | 0 | } |
1602 | |
|
1603 | 0 | if (ml.no_alloc) { |
1604 | 0 | return true; |
1605 | 0 | } |
1606 | | |
1607 | | // load tensor data |
1608 | 0 | for (auto & [ctx, buf_map] : ctx_buf_maps) { |
1609 | 0 | if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) { |
1610 | 0 | return false; |
1611 | 0 | } |
1612 | 0 | } |
1613 | | |
1614 | 0 | if (use_mmap_buffer) { |
1615 | 0 | for (auto & mapping : ml.mappings) { |
1616 | 0 | pimpl->mappings.emplace_back(std::move(mapping)); |
1617 | 0 | } |
1618 | 0 | } |
1619 | |
|
1620 | 0 | return true; |
1621 | 0 | } |
1622 | | |
1623 | 0 | ggml_tensor * llama_model_base::create_tensor(llama_model_loader & ml, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) { |
1624 | 0 | const buft_list_t * buft_list_layer = tn.bid == -1 ? nullptr : pimpl->dev_layer.at(tn.bid).buft_list; |
1625 | 0 | return ml.create_tensor( |
1626 | 0 | hparams, &pimpl->cpu_buft_list, pimpl->dev_input.buft_list, pimpl->dev_output.buft_list, buft_list_layer, |
1627 | 0 | tn, ne, flags); |
1628 | 0 | } |
1629 | | |
1630 | 0 | std::string llama_model::arch_name() const { |
1631 | 0 | return llm_arch_name(arch); |
1632 | 0 | } |
1633 | | |
1634 | 0 | std::string llama_model::type_name() const { |
1635 | 0 | return llm_type_name(type); |
1636 | 0 | } |
1637 | | |
1638 | 0 | std::string llama_model::desc() const { |
1639 | 0 | return pimpl->desc_str; |
1640 | 0 | } |
1641 | | |
1642 | 0 | size_t llama_model::size() const { |
1643 | 0 | return pimpl->n_bytes; |
1644 | 0 | } |
1645 | | |
1646 | 0 | size_t llama_model::n_tensors() const { |
1647 | 0 | return tensors_by_name.size(); |
1648 | 0 | } |
1649 | | |
1650 | 0 | size_t llama_model::n_devices() const { |
1651 | 0 | return devices.size(); |
1652 | 0 | } |
1653 | | |
1654 | 0 | const float * llama_model::tensor_split() const { |
1655 | 0 | return params.tensor_split; |
1656 | 0 | } |
1657 | | |
1658 | 0 | uint32_t llama_model::n_gpu_layers() const { |
1659 | | // note: plus 1 for the "output" layer |
1660 | 0 | return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer_all + 1; |
1661 | 0 | } |
1662 | | |
1663 | 0 | llama_split_mode llama_model::split_mode() const { |
1664 | 0 | return params.split_mode; |
1665 | 0 | } |
1666 | | |
1667 | 0 | std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const { |
1668 | 0 | std::map<ggml_backend_buffer_type_t, size_t> ret; |
1669 | 0 | for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) { |
1670 | 0 | if (hparams.no_alloc) { |
1671 | 0 | GGML_ASSERT(bufs.size() == 1); |
1672 | 0 | ggml_backend_buffer_t buf = bufs[0].get(); |
1673 | 0 | GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr); |
1674 | 0 | ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf); |
1675 | 0 | ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft); |
1676 | 0 | } else { |
1677 | 0 | for (const auto & buf : bufs) { |
1678 | | // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base |
1679 | 0 | ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); |
1680 | 0 | } |
1681 | 0 | } |
1682 | 0 | } |
1683 | 0 | return ret; |
1684 | 0 | } |
1685 | | |
1686 | 0 | uint64_t llama_model::n_elements() const { |
1687 | 0 | return pimpl->n_elements; |
1688 | 0 | } |
1689 | | |
1690 | 0 | void llama_model::print_info() const { |
1691 | 0 | const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train); |
1692 | |
|
1693 | 0 | auto print_f = [](const std::function<int32_t(uint32_t)> & f, uint32_t n) { |
1694 | 0 | bool is_var = false; |
1695 | |
|
1696 | 0 | std::vector<int32_t> v; |
1697 | 0 | for (uint32_t i = 0; i < n; ++i) { |
1698 | 0 | v.push_back(f(i)); |
1699 | 0 | if (v[i] != v[0]) { |
1700 | 0 | is_var = true; |
1701 | 0 | } |
1702 | 0 | } |
1703 | |
|
1704 | 0 | std::stringstream ss; |
1705 | |
|
1706 | 0 | if (is_var) { |
1707 | 0 | ss << "["; |
1708 | 0 | for (uint32_t i = 0; i < n; ++i) { |
1709 | 0 | ss << v[i]; |
1710 | 0 | if (i < n - 1) { |
1711 | 0 | ss << ", "; |
1712 | 0 | } |
1713 | 0 | } |
1714 | 0 | ss << "]"; |
1715 | 0 | } else { |
1716 | 0 | ss << v[0]; |
1717 | 0 | } |
1718 | |
|
1719 | 0 | return ss.str(); |
1720 | 0 | }; |
1721 | | |
1722 | | // hparams |
1723 | 0 | LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str()); |
1724 | 0 | LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only); |
1725 | 0 | LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc); |
1726 | |
|
1727 | 0 | if (!hparams.vocab_only) { |
1728 | 0 | LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); |
1729 | 0 | LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp()); |
1730 | 0 | LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); |
1731 | 0 | LLAMA_LOG_INFO("%s: n_embd_out = %u\n", __func__, hparams.n_embd_out()); |
1732 | 0 | LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer()); |
1733 | 0 | LLAMA_LOG_INFO("%s: n_layer_all = %u\n", __func__, hparams.n_layer_all); |
1734 | 0 | LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer_all).c_str()); |
1735 | 0 | LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer_all).c_str()); |
1736 | 0 | LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full); |
1737 | 0 | LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); |
1738 | 0 | LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any()); |
1739 | 0 | LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full); |
1740 | 0 | LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full); |
1741 | 0 | LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer_all).c_str()); |
1742 | 0 | LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer_all).c_str()); |
1743 | 0 | LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer_all).c_str()); |
1744 | 0 | LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); |
1745 | 0 | LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); |
1746 | 0 | LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); |
1747 | 0 | LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); |
1748 | 0 | LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); |
1749 | 0 | LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale); |
1750 | 0 | LLAMA_LOG_INFO("%s: f_attn_value_scale = %.4f\n", __func__, hparams.f_attn_value_scale); |
1751 | 0 | LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer_all).c_str()); |
1752 | 0 | LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); |
1753 | 0 | LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); |
1754 | 0 | LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups); |
1755 | 0 | LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used); |
1756 | 0 | LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn); |
1757 | 0 | LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); |
1758 | 0 | LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); |
1759 | 0 | LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); |
1760 | 0 | LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); |
1761 | 0 | LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); |
1762 | 0 | if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { |
1763 | 0 | LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa); |
1764 | 0 | LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa); |
1765 | 0 | LLAMA_LOG_INFO("%s: n_embd_head_k_swa = %u\n", __func__, hparams.n_embd_head_k_swa); |
1766 | 0 | LLAMA_LOG_INFO("%s: n_embd_head_v_swa = %u\n", __func__, hparams.n_embd_head_v_swa); |
1767 | 0 | LLAMA_LOG_INFO("%s: n_rot_swa = %u\n", __func__, hparams.n_rot_swa); |
1768 | 0 | } |
1769 | 0 | LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); |
1770 | 0 | LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); |
1771 | 0 | LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); |
1772 | 0 | if (arch == LLM_ARCH_GRANITE && |
1773 | 0 | std::any_of(hparams.deepstack_mapping_arr.begin(), |
1774 | 0 | hparams.deepstack_mapping_arr.end(), |
1775 | 0 | [](const auto & entry) { return entry >= 0; })) { |
1776 | 0 | LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__, |
1777 | 0 | print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; }, |
1778 | 0 | hparams.n_layer_all).c_str()); |
1779 | 0 | } |
1780 | | // MRoPE (Multi-axis Rotary Position Embedding) sections |
1781 | 0 | if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { |
1782 | 0 | LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]); |
1783 | 0 | } |
1784 | 0 | if (!classifier_labels.empty()) { |
1785 | 0 | LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out); |
1786 | |
|
1787 | 0 | size_t i = 0; |
1788 | 0 | for (const auto & label : classifier_labels) { |
1789 | 0 | LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str()); |
1790 | 0 | } |
1791 | 0 | } |
1792 | |
|
1793 | 0 | if (arch == LLM_ARCH_MAMBA || |
1794 | 0 | arch == LLM_ARCH_MAMBA2 || |
1795 | 0 | arch == LLM_ARCH_JAMBA || |
1796 | 0 | arch == LLM_ARCH_FALCON_H1 || |
1797 | 0 | arch == LLM_ARCH_PLAMO2 || |
1798 | 0 | arch == LLM_ARCH_GRANITE_HYBRID || |
1799 | 0 | arch == LLM_ARCH_QWEN3NEXT || |
1800 | 0 | arch == LLM_ARCH_QWEN35 || |
1801 | 0 | arch == LLM_ARCH_QWEN35MOE || |
1802 | 0 | arch == LLM_ARCH_NEMOTRON_H || |
1803 | 0 | arch == LLM_ARCH_NEMOTRON_H_MOE) { |
1804 | 0 | LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); |
1805 | 0 | LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); |
1806 | 0 | LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); |
1807 | 0 | LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); |
1808 | 0 | LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group); |
1809 | 0 | LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); |
1810 | 0 | } |
1811 | |
|
1812 | 0 | LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str()); |
1813 | 0 | if (pimpl->n_elements >= 1e12) { |
1814 | 0 | LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12); |
1815 | 0 | } else if (pimpl->n_elements >= 1e9) { |
1816 | 0 | LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9); |
1817 | 0 | } else if (pimpl->n_elements >= 1e6) { |
1818 | 0 | LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6); |
1819 | 0 | } else { |
1820 | 0 | LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3); |
1821 | 0 | } |
1822 | | |
1823 | | // general kv |
1824 | 0 | LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str()); |
1825 | |
|
1826 | 0 | if (arch == LLM_ARCH_DEEPSEEK) { |
1827 | 0 | LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); |
1828 | 0 | LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); |
1829 | 0 | LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); |
1830 | 0 | LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); |
1831 | 0 | } |
1832 | |
|
1833 | 0 | if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_DEEPSEEK32 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { |
1834 | 0 | LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); |
1835 | 0 | LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); |
1836 | 0 | LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); |
1837 | 0 | LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla()); |
1838 | 0 | LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla()); |
1839 | 0 | LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); |
1840 | 0 | LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); |
1841 | 0 | LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); |
1842 | 0 | LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); |
1843 | 0 | LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); |
1844 | 0 | } |
1845 | |
|
1846 | 0 | if (arch == LLM_ARCH_QWEN2MOE) { |
1847 | 0 | LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); |
1848 | 0 | LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); |
1849 | 0 | } |
1850 | |
|
1851 | 0 | if (arch == LLM_ARCH_MELLUM || |
1852 | 0 | arch == LLM_ARCH_COHERE2MOE || |
1853 | 0 | arch == LLM_ARCH_QWEN3MOE || |
1854 | 0 | arch == LLM_ARCH_OPENAI_MOE || |
1855 | 0 | arch == LLM_ARCH_QWEN3VLMOE || |
1856 | 0 | arch == LLM_ARCH_RND1) { |
1857 | 0 | LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); |
1858 | 0 | } |
1859 | |
|
1860 | 0 | if (arch == LLM_ARCH_MINICPM || |
1861 | 0 | arch == LLM_ARCH_GRANITE || |
1862 | 0 | arch == LLM_ARCH_GRANITE_MOE || |
1863 | 0 | arch == LLM_ARCH_GRANITE_HYBRID || |
1864 | 0 | arch == LLM_ARCH_NEMOTRON_H_MOE) { |
1865 | 0 | LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); |
1866 | 0 | LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); |
1867 | 0 | LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); |
1868 | 0 | LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); |
1869 | 0 | } |
1870 | |
|
1871 | 0 | if (arch == LLM_ARCH_BAILINGMOE) { |
1872 | 0 | LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); |
1873 | 0 | LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); |
1874 | 0 | LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); |
1875 | 0 | LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); |
1876 | 0 | LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); |
1877 | 0 | } |
1878 | |
|
1879 | 0 | if (arch == LLM_ARCH_BAILINGMOE2) { |
1880 | 0 | LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); |
1881 | 0 | LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); |
1882 | 0 | LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); |
1883 | 0 | LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); |
1884 | 0 | LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); |
1885 | 0 | LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); |
1886 | 0 | LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); |
1887 | 0 | LLAMA_LOG_INFO("%s: n_layer_nextn = %d\n", __func__, hparams.n_layer_nextn); |
1888 | 0 | } |
1889 | |
|
1890 | 0 | if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) { |
1891 | 0 | LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); |
1892 | 0 | LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); |
1893 | 0 | } |
1894 | |
|
1895 | 0 | if (arch == LLM_ARCH_GROVEMOE) { |
1896 | 0 | LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); |
1897 | 0 | LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp); |
1898 | 0 | LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts); |
1899 | 0 | LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale); |
1900 | 0 | } |
1901 | 0 | } |
1902 | |
|
1903 | 0 | vocab.print_info(); |
1904 | 0 | } |
1905 | | |
1906 | 0 | ggml_backend_dev_t llama_model::dev_layer(int il) const { |
1907 | 0 | return pimpl->dev_layer.at(il).dev; |
1908 | 0 | } |
1909 | | |
1910 | 0 | ggml_backend_dev_t llama_model::dev_output() const { |
1911 | 0 | return pimpl->dev_output.dev; |
1912 | 0 | } |
1913 | | |
1914 | | template<typename F> |
1915 | 0 | static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) { |
1916 | 0 | ggml_init_params params = { |
1917 | 0 | /*.mem_size =*/ ggml_tensor_overhead()*8, |
1918 | 0 | /*.mem_buffer =*/ NULL, |
1919 | 0 | /*.no_alloc =*/ true, |
1920 | 0 | }; |
1921 | |
|
1922 | 0 | ggml_context_ptr ctx { ggml_init(params) }; |
1923 | 0 | if (!ctx) { |
1924 | 0 | throw std::runtime_error(format("failed to create ggml context")); |
1925 | 0 | } |
1926 | | |
1927 | 0 | ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) }; |
1928 | 0 | ggml_tensor * op_tensor = fn(ctx.get()); |
1929 | 0 | for (int i = 0; i < GGML_MAX_SRC; i++) { |
1930 | 0 | if (op_tensor->src[i] != nullptr) { |
1931 | 0 | assert(op_tensor->src[i]->buffer == nullptr); |
1932 | 0 | op_tensor->src[i]->buffer = buf.get(); |
1933 | 0 | } |
1934 | 0 | } |
1935 | |
|
1936 | 0 | bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor); |
1937 | |
|
1938 | 0 | return op_supported; |
1939 | 0 | } |
1940 | | |
1941 | | template<typename F> |
1942 | 0 | static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) { |
1943 | 0 | for (const auto & cur : buft_list) { |
1944 | 0 | ggml_backend_dev_t cur_dev = cur.first; |
1945 | 0 | ggml_backend_buffer_type_t cur_buft = cur.second; |
1946 | 0 | if (buft_supported(cur_buft, cur_dev, fn)) { |
1947 | 0 | return cur_buft; |
1948 | 0 | } |
1949 | 0 | } |
1950 | | |
1951 | 0 | throw std::runtime_error(format("no suitable buffer type found")); |
1952 | 0 | } |
1953 | | |
1954 | 0 | ggml_backend_buffer_type_t llama_model::select_buft(int il) const { |
1955 | 0 | return ::select_buft( |
1956 | 0 | *pimpl->dev_layer.at(il).buft_list, |
1957 | 0 | [&](ggml_context * ctx) { |
1958 | 0 | ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd); |
1959 | 0 | ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd); |
1960 | 0 | return ggml_add(ctx, cur, layer_dir); |
1961 | 0 | }); |
1962 | 0 | } |
1963 | | |
1964 | 0 | bool llama_model::has_tensor_overrides() const { |
1965 | 0 | return pimpl->has_tensor_overrides; |
1966 | 0 | } |
1967 | | |
1968 | 0 | const ggml_tensor * llama_model::get_tensor(const char * name) const { |
1969 | 0 | auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(), |
1970 | 0 | [name](const std::pair<std::string, ggml_tensor *> & it) { |
1971 | 0 | return it.first == name; |
1972 | 0 | }); |
1973 | 0 | if (it == tensors_by_name.end()) { |
1974 | 0 | return nullptr; |
1975 | 0 | } |
1976 | | |
1977 | 0 | return it->second; |
1978 | 0 | } |
1979 | | |
1980 | 0 | float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const { |
1981 | 0 | return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; |
1982 | 0 | } |
1983 | | |
1984 | 0 | float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const { |
1985 | 0 | return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale; |
1986 | 0 | } |
1987 | | |
1988 | 0 | ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const { |
1989 | 0 | const uint32_t n_ctx_seq = cparams.n_ctx_seq; |
1990 | | |
1991 | | // choose long/short freq factors based on the context size |
1992 | 0 | if (layers[il].rope_freqs != nullptr) { |
1993 | 0 | return layers[il].rope_freqs; |
1994 | 0 | } |
1995 | | |
1996 | 0 | if (n_ctx_seq > hparams.n_ctx_orig_yarn) { |
1997 | 0 | return layers[il].rope_long; |
1998 | 0 | } |
1999 | | |
2000 | 0 | return layers[il].rope_short; |
2001 | 0 | } |
2002 | | |
2003 | 0 | llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const { |
2004 | 0 | llama_memory_i * res; |
2005 | |
|
2006 | 0 | switch (arch) { |
2007 | | // Models that need specific instantiation should be handled in the |
2008 | | // switch statement |
2009 | 0 | case LLM_ARCH_BERT: |
2010 | 0 | case LLM_ARCH_JINA_BERT_V2: |
2011 | 0 | case LLM_ARCH_JINA_BERT_V3: |
2012 | 0 | case LLM_ARCH_NOMIC_BERT: |
2013 | 0 | case LLM_ARCH_NOMIC_BERT_MOE: |
2014 | 0 | case LLM_ARCH_NEO_BERT: |
2015 | 0 | case LLM_ARCH_EUROBERT: |
2016 | 0 | case LLM_ARCH_WAVTOKENIZER_DEC: |
2017 | 0 | case LLM_ARCH_MODERN_BERT: |
2018 | 0 | case LLM_ARCH_GEMMA_EMBEDDING: |
2019 | 0 | case LLM_ARCH_DREAM: |
2020 | 0 | case LLM_ARCH_LLADA: |
2021 | 0 | case LLM_ARCH_LLADA_MOE: |
2022 | 0 | case LLM_ARCH_RND1: |
2023 | 0 | { |
2024 | 0 | res = nullptr; |
2025 | 0 | } break; |
2026 | 0 | case LLM_ARCH_DEEPSEEK32: |
2027 | 0 | { |
2028 | 0 | res = new llama_kv_cache_dsa( |
2029 | 0 | *this, |
2030 | 0 | params.type_k, |
2031 | 0 | params.type_v, |
2032 | 0 | !cparams.flash_attn, |
2033 | 0 | cparams.offload_kqv, |
2034 | 0 | cparams.kv_unified, |
2035 | 0 | cparams.n_ctx_seq, |
2036 | 0 | cparams.n_seq_max, |
2037 | 0 | 1, |
2038 | 0 | hparams.n_swa, |
2039 | 0 | hparams.swa_type, |
2040 | 0 | nullptr, |
2041 | 0 | nullptr); |
2042 | 0 | } break; |
2043 | | // Models that need standard caching should rely on recurrent/hybrid |
2044 | | // checks |
2045 | 0 | default: |
2046 | 0 | { |
2047 | | // The MTP head is dense-attention only on hybrid Qwen3.5/3.6, so use a plain |
2048 | | // attention KV cache for the MTP context instead of the hybrid wrapper. |
2049 | 0 | const bool mtp_on_hybrid_qwen35 = |
2050 | 0 | params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && |
2051 | 0 | (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE); |
2052 | |
|
2053 | 0 | if (llm_arch_is_recurrent(arch)) { |
2054 | 0 | res = new llama_memory_recurrent( |
2055 | 0 | *this, |
2056 | 0 | GGML_TYPE_F32, |
2057 | 0 | GGML_TYPE_F32, |
2058 | 0 | cparams.offload_kqv, |
2059 | 0 | std::max((uint32_t) 1, cparams.n_seq_max), |
2060 | 0 | cparams.n_seq_max, |
2061 | 0 | cparams.n_rs_seq, |
2062 | 0 | nullptr); |
2063 | 0 | } else if (llm_arch_is_hybrid(arch) && !mtp_on_hybrid_qwen35) { |
2064 | | // The main difference between hybrid architectures is the |
2065 | | // layer filters, so pick the right one here |
2066 | 0 | llama_memory_hybrid::layer_filter_cb filter_attn = nullptr; |
2067 | 0 | llama_memory_hybrid::layer_filter_cb filter_recr = nullptr; |
2068 | 0 | if (arch == LLM_ARCH_FALCON_H1) { |
2069 | 0 | filter_attn = [&](uint32_t) { return true; }; |
2070 | 0 | filter_recr = [&](uint32_t) { return true; }; |
2071 | 0 | } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { |
2072 | 0 | filter_attn = [&](uint32_t il) { |
2073 | 0 | return !hparams.is_recr(il) && hparams.n_ff(il) == 0; |
2074 | 0 | }; |
2075 | 0 | filter_recr = [&](uint32_t il) { |
2076 | 0 | return hparams.is_recr(il) && hparams.n_ff(il) == 0; |
2077 | 0 | }; |
2078 | 0 | } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) { |
2079 | 0 | filter_attn = [&](uint32_t il) { |
2080 | 0 | return il < hparams.n_layer() && !hparams.is_recr(il); |
2081 | 0 | }; |
2082 | 0 | filter_recr = [&](uint32_t il) { |
2083 | 0 | return il < hparams.n_layer() && hparams.is_recr(il); |
2084 | 0 | }; |
2085 | 0 | } |
2086 | |
|
2087 | 0 | if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { |
2088 | | // Use hybrid-iswa for hybrid models with SWA |
2089 | 0 | res = new llama_memory_hybrid_iswa( |
2090 | 0 | /* model */ *this, |
2091 | 0 | /* attn_type_k */ params.type_k, |
2092 | 0 | /* attn_type_v */ params.type_v, |
2093 | 0 | /* attn_v_trans */ !cparams.flash_attn, |
2094 | 0 | /* attn_swa_full */ params.swa_full, |
2095 | 0 | /* attn_kv_size */ cparams.n_ctx_seq, |
2096 | 0 | /* attn_n_ubatch */ cparams.n_ubatch, |
2097 | 0 | /* attn_n_pad */ 1, |
2098 | 0 | /* recurrent_type_r */ GGML_TYPE_F32, |
2099 | 0 | /* recurrent_type_s */ GGML_TYPE_F32, |
2100 | 0 | /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max), |
2101 | 0 | /* n_seq_max */ cparams.n_seq_max, |
2102 | 0 | /* n_rs_seq */ cparams.n_rs_seq, |
2103 | 0 | /* offload */ cparams.offload_kqv, |
2104 | 0 | /* unified */ cparams.kv_unified, |
2105 | 0 | /* filter_attn */ std::move(filter_attn), |
2106 | 0 | /* filter_recr */ std::move(filter_recr)); |
2107 | 0 | } else { |
2108 | 0 | res = new llama_memory_hybrid( |
2109 | 0 | /* model */ *this, |
2110 | 0 | /* attn_type_k */ params.type_k, |
2111 | 0 | /* attn_type_v */ params.type_v, |
2112 | 0 | /* attn_v_trans */ !cparams.flash_attn, |
2113 | 0 | /* attn_kv_size */ cparams.n_ctx_seq, |
2114 | 0 | /* attn_n_pad */ 1, |
2115 | 0 | /* attn_n_swa */ hparams.n_swa, |
2116 | 0 | /* attn_swa_type */ hparams.swa_type, |
2117 | 0 | /* recurrent_type_k */ GGML_TYPE_F32, |
2118 | 0 | /* recurrent_type_v */ GGML_TYPE_F32, |
2119 | 0 | /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max), |
2120 | 0 | /* n_seq_max */ cparams.n_seq_max, |
2121 | 0 | /* n_rs_seq */ cparams.n_rs_seq, |
2122 | 0 | /* offload */ cparams.offload_kqv, |
2123 | 0 | /* unified */ cparams.kv_unified, |
2124 | 0 | /* filter_attn */ std::move(filter_attn), |
2125 | 0 | /* filter_recr */ std::move(filter_recr)); |
2126 | 0 | } |
2127 | 0 | } else { |
2128 | 0 | llama_kv_cache::layer_filter_cb filter = nullptr; |
2129 | 0 | llama_memory_i::layer_reuse_cb reuse = nullptr; |
2130 | 0 | llama_kv_cache::layer_share_cb share = nullptr; |
2131 | |
|
2132 | 0 | if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { |
2133 | 0 | reuse = [&](uint32_t il) { |
2134 | 0 | GGML_ASSERT(hparams.n_layer_kv_from_start >= 2); |
2135 | |
|
2136 | 0 | if (il >= (uint32_t)hparams.n_layer_kv_from_start) { |
2137 | 0 | return hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1); |
2138 | 0 | } |
2139 | | |
2140 | 0 | return -1; |
2141 | 0 | }; |
2142 | 0 | } |
2143 | |
|
2144 | 0 | if (mtp_on_hybrid_qwen35) { |
2145 | 0 | filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; |
2146 | 0 | } |
2147 | |
|
2148 | 0 | if (arch == LLM_ARCH_STEP35 && hparams.n_layer_nextn > 0) { |
2149 | 0 | if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP) { |
2150 | 0 | filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; |
2151 | 0 | } else { |
2152 | 0 | filter = [&](uint32_t il) { return il < hparams.n_layer(); }; |
2153 | 0 | } |
2154 | 0 | } |
2155 | |
|
2156 | 0 | if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { |
2157 | 0 | GGML_ASSERT(hparams.is_swa_any()); |
2158 | |
|
2159 | 0 | if (arch == LLM_ARCH_GEMMA4_ASSISTANT) { |
2160 | 0 | llama_memory_t mem_other = llama_get_memory(cparams.ctx_other); |
2161 | |
|
2162 | 0 | share = [&](int32_t il) { |
2163 | 0 | const llama_model * model_other = llama_get_model(cparams.ctx_other); |
2164 | |
|
2165 | 0 | if (hparams.is_swa(il)) { |
2166 | 0 | return llama_model_n_layer(model_other) - 2; |
2167 | 0 | } |
2168 | | |
2169 | 0 | return llama_model_n_layer(model_other) - 1; |
2170 | 0 | }; |
2171 | |
|
2172 | 0 | res = new llama_kv_cache_iswa( |
2173 | 0 | *this, |
2174 | 0 | params.type_k, |
2175 | 0 | params.type_v, |
2176 | 0 | !cparams.flash_attn, |
2177 | 0 | cparams.offload_kqv, |
2178 | 0 | params.swa_full, |
2179 | 0 | cparams.kv_unified, |
2180 | 0 | cparams.n_ctx_seq, |
2181 | 0 | cparams.n_seq_max, |
2182 | 0 | cparams.n_ubatch, |
2183 | 0 | 1, |
2184 | 0 | mem_other, |
2185 | 0 | filter, |
2186 | 0 | reuse, |
2187 | 0 | share); |
2188 | 0 | } else { |
2189 | 0 | res = new llama_kv_cache_iswa( |
2190 | 0 | *this, |
2191 | 0 | params.type_k, |
2192 | 0 | params.type_v, |
2193 | 0 | !cparams.flash_attn, |
2194 | 0 | cparams.offload_kqv, |
2195 | 0 | params.swa_full, |
2196 | 0 | cparams.kv_unified, |
2197 | 0 | cparams.n_ctx_seq, |
2198 | 0 | cparams.n_seq_max, |
2199 | 0 | cparams.n_ubatch, |
2200 | 0 | 1, |
2201 | 0 | nullptr, |
2202 | 0 | filter, |
2203 | 0 | reuse, |
2204 | 0 | share); |
2205 | 0 | } |
2206 | 0 | } else { |
2207 | 0 | GGML_ASSERT(!hparams.is_swa_any()); |
2208 | |
|
2209 | 0 | res = new llama_kv_cache( |
2210 | 0 | *this, |
2211 | 0 | hparams, |
2212 | 0 | params.type_k, |
2213 | 0 | params.type_v, |
2214 | 0 | !cparams.flash_attn, |
2215 | 0 | cparams.offload_kqv, |
2216 | 0 | cparams.kv_unified, |
2217 | 0 | cparams.n_ctx_seq, |
2218 | 0 | cparams.n_seq_max, |
2219 | 0 | 1, |
2220 | 0 | hparams.n_swa, |
2221 | 0 | hparams.swa_type, |
2222 | 0 | nullptr, |
2223 | 0 | filter, |
2224 | 0 | nullptr, |
2225 | 0 | nullptr); |
2226 | 0 | } |
2227 | 0 | } |
2228 | 0 | } |
2229 | 0 | } |
2230 | | |
2231 | 0 | return res; |
2232 | 0 | } |
2233 | | |
2234 | 0 | ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { |
2235 | 0 | std::unique_ptr<llm_graph_context> llm = build_arch_graph(params); |
2236 | | |
2237 | | // add on pooling layer |
2238 | 0 | llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm); |
2239 | | |
2240 | | // add backend sampling layers (if any) |
2241 | 0 | llm->build_sampling(); |
2242 | | |
2243 | | // if the gguf model was converted with --sentence-transformers-dense-modules |
2244 | | // there will be two additional dense projection layers |
2245 | | // dense linear projections are applied after pooling |
2246 | | // TODO: move reranking logic here and generalize |
2247 | 0 | llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers); |
2248 | |
|
2249 | 0 | llm->res->set_outputs(params); |
2250 | |
|
2251 | 0 | return llm->res->get_gf(); |
2252 | 0 | } |
2253 | | |
2254 | | |
2255 | | // |
2256 | | // interface implementation |
2257 | | // |
2258 | | |
2259 | 0 | llama_model_params llama_model_default_params() { |
2260 | 0 | llama_model_params result = { |
2261 | 0 | /*.devices =*/ nullptr, |
2262 | 0 | /*.tensor_buft_overrides =*/ nullptr, |
2263 | 0 | /*.n_gpu_layers =*/ -1, |
2264 | 0 | /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, |
2265 | 0 | /*.main_gpu =*/ 0, |
2266 | 0 | /*.tensor_split =*/ nullptr, |
2267 | 0 | /*.progress_callback =*/ nullptr, |
2268 | 0 | /*.progress_callback_user_data =*/ nullptr, |
2269 | 0 | /*.kv_overrides =*/ nullptr, |
2270 | 0 | /*.vocab_only =*/ false, |
2271 | 0 | /*.use_mmap =*/ true, |
2272 | 0 | /*.use_direct_io =*/ false, |
2273 | 0 | /*.use_mlock =*/ false, |
2274 | 0 | /*.check_tensors =*/ false, |
2275 | 0 | /*.use_extra_bufts =*/ true, |
2276 | 0 | /*.no_host =*/ false, |
2277 | 0 | /*.no_alloc =*/ false, |
2278 | 0 | }; |
2279 | |
|
2280 | 0 | return result; |
2281 | 0 | } |
2282 | | |
2283 | 0 | const llama_vocab * llama_model_get_vocab(const llama_model * model) { |
2284 | 0 | return &model->vocab; |
2285 | 0 | } |
2286 | | |
2287 | 0 | void llama_free_model(llama_model * model) { |
2288 | 0 | llama_model_free(model); |
2289 | 0 | } |
2290 | | |
2291 | 0 | void llama_model_free(llama_model * model) { |
2292 | 0 | delete model; |
2293 | 0 | } |
2294 | | |
2295 | 0 | int32_t llama_model_n_ctx_train(const llama_model * model) { |
2296 | 0 | return model->hparams.n_ctx_train; |
2297 | 0 | } |
2298 | | |
2299 | 0 | int32_t llama_model_n_embd(const llama_model * model) { |
2300 | 0 | return model->hparams.n_embd; |
2301 | 0 | } |
2302 | | |
2303 | 0 | int32_t llama_model_n_embd_inp(const llama_model * model) { |
2304 | 0 | return model->hparams.n_embd_inp(); |
2305 | 0 | } |
2306 | | |
2307 | 0 | int32_t llama_model_n_embd_out(const llama_model * model) { |
2308 | 0 | return model->hparams.n_embd_out(); |
2309 | 0 | } |
2310 | | |
2311 | 0 | int32_t llama_model_n_layer(const llama_model * model) { |
2312 | 0 | return model->hparams.n_layer(); |
2313 | 0 | } |
2314 | | |
2315 | 0 | int32_t llama_model_n_layer_nextn(const llama_model * model) { |
2316 | 0 | return model->hparams.n_layer_nextn; |
2317 | 0 | } |
2318 | | |
2319 | 0 | int32_t llama_model_n_head(const llama_model * model) { |
2320 | 0 | return model->hparams.n_head(); |
2321 | 0 | } |
2322 | | |
2323 | 0 | int32_t llama_model_n_head_kv(const llama_model * model) { |
2324 | 0 | return model->hparams.n_head_kv(); |
2325 | 0 | } |
2326 | | |
2327 | 0 | int32_t llama_model_n_swa(const llama_model * model) { |
2328 | 0 | return model->hparams.n_swa; |
2329 | 0 | } |
2330 | | |
2331 | | |
2332 | 0 | uint32_t llama_model_n_cls_out(const struct llama_model * model) { |
2333 | 0 | return model->hparams.n_cls_out; |
2334 | 0 | } |
2335 | | |
2336 | 0 | const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) { |
2337 | 0 | if (i < model->classifier_labels.size()) { |
2338 | 0 | return model->classifier_labels[i].c_str(); |
2339 | 0 | } |
2340 | | |
2341 | 0 | return nullptr; |
2342 | 0 | } |
2343 | | |
2344 | | // deprecated |
2345 | 0 | int32_t llama_n_ctx_train(const llama_model * model) { |
2346 | 0 | return llama_model_n_ctx_train(model); |
2347 | 0 | } |
2348 | | |
2349 | | // deprecated |
2350 | 0 | int32_t llama_n_embd(const llama_model * model) { |
2351 | 0 | return llama_model_n_embd(model); |
2352 | 0 | } |
2353 | | |
2354 | | // deprecated |
2355 | 0 | int32_t llama_n_layer(const llama_model * model) { |
2356 | 0 | return llama_model_n_layer(model); |
2357 | 0 | } |
2358 | | |
2359 | | // deprecated |
2360 | 0 | int32_t llama_n_head(const llama_model * model) { |
2361 | 0 | return llama_model_n_head(model); |
2362 | 0 | } |
2363 | | |
2364 | 0 | llama_rope_type llama_model_rope_type(const llama_model * model) { |
2365 | 0 | switch (model->arch) { |
2366 | | // these models do not use RoPE |
2367 | 0 | case LLM_ARCH_CLIP: |
2368 | 0 | case LLM_ARCH_GPT2: |
2369 | 0 | case LLM_ARCH_GPTJ: |
2370 | 0 | case LLM_ARCH_MPT: |
2371 | 0 | case LLM_ARCH_REFACT: |
2372 | 0 | case LLM_ARCH_BLOOM: |
2373 | 0 | case LLM_ARCH_MAMBA: |
2374 | 0 | case LLM_ARCH_MAMBA2: |
2375 | 0 | case LLM_ARCH_JAMBA: |
2376 | 0 | case LLM_ARCH_JINA_BERT_V2: |
2377 | 0 | case LLM_ARCH_T5: |
2378 | 0 | case LLM_ARCH_T5ENCODER: |
2379 | 0 | case LLM_ARCH_JAIS: |
2380 | 0 | case LLM_ARCH_RWKV6: |
2381 | 0 | case LLM_ARCH_RWKV6QWEN2: |
2382 | 0 | case LLM_ARCH_RWKV7: |
2383 | 0 | case LLM_ARCH_ARWKV7: |
2384 | 0 | case LLM_ARCH_WAVTOKENIZER_DEC: |
2385 | 0 | case LLM_ARCH_NEMOTRON_H: |
2386 | 0 | case LLM_ARCH_NEMOTRON_H_MOE: |
2387 | 0 | case LLM_ARCH_KIMI_LINEAR: |
2388 | 0 | return LLAMA_ROPE_TYPE_NONE; |
2389 | | |
2390 | | // use what we call a normal RoPE, operating on pairs of consecutive head values |
2391 | 0 | case LLM_ARCH_LLAMA: |
2392 | 0 | case LLM_ARCH_LLADA: |
2393 | 0 | case LLM_ARCH_LLAMA4: |
2394 | 0 | case LLM_ARCH_DECI: |
2395 | 0 | case LLM_ARCH_BAICHUAN: |
2396 | 0 | case LLM_ARCH_STARCODER: |
2397 | 0 | case LLM_ARCH_INTERNLM2: |
2398 | 0 | case LLM_ARCH_MINICPM: |
2399 | 0 | case LLM_ARCH_XVERSE: |
2400 | 0 | case LLM_ARCH_COMMAND_R: |
2401 | 0 | case LLM_ARCH_COHERE2: |
2402 | 0 | case LLM_ARCH_COHERE2MOE: |
2403 | 0 | case LLM_ARCH_OLMO: |
2404 | 0 | case LLM_ARCH_ARCTIC: |
2405 | 0 | case LLM_ARCH_DEEPSEEK: |
2406 | 0 | case LLM_ARCH_DEEPSEEK2: |
2407 | 0 | case LLM_ARCH_DEEPSEEK2OCR: |
2408 | 0 | case LLM_ARCH_DEEPSEEK32: |
2409 | 0 | case LLM_ARCH_PLM: |
2410 | 0 | case LLM_ARCH_CHATGLM: |
2411 | 0 | case LLM_ARCH_GRANITE: |
2412 | 0 | case LLM_ARCH_GRANITE_MOE: |
2413 | 0 | case LLM_ARCH_GRANITE_HYBRID: |
2414 | 0 | case LLM_ARCH_CHAMELEON: |
2415 | 0 | case LLM_ARCH_BAILINGMOE: |
2416 | 0 | case LLM_ARCH_NEO_BERT: |
2417 | 0 | case LLM_ARCH_SMOLLM3: |
2418 | 0 | case LLM_ARCH_ARCEE: |
2419 | 0 | case LLM_ARCH_ERNIE4_5: |
2420 | 0 | case LLM_ARCH_ERNIE4_5_MOE: |
2421 | 0 | case LLM_ARCH_MISTRAL3: |
2422 | 0 | case LLM_ARCH_EAGLE3: |
2423 | 0 | case LLM_ARCH_MISTRAL4: |
2424 | 0 | case LLM_ARCH_LLAMA_EMBED: |
2425 | 0 | case LLM_ARCH_MAINCODER: |
2426 | 0 | case LLM_ARCH_GLM_DSA: |
2427 | 0 | return LLAMA_ROPE_TYPE_NORM; |
2428 | | |
2429 | | // the pairs of head values are offset by n_rot/2 |
2430 | 0 | case LLM_ARCH_FALCON: |
2431 | 0 | case LLM_ARCH_FALCON_H1: |
2432 | 0 | case LLM_ARCH_GROK: |
2433 | 0 | case LLM_ARCH_DBRX: |
2434 | 0 | case LLM_ARCH_BERT: |
2435 | 0 | case LLM_ARCH_JINA_BERT_V3: |
2436 | 0 | case LLM_ARCH_MODERN_BERT: |
2437 | 0 | case LLM_ARCH_NOMIC_BERT: |
2438 | 0 | case LLM_ARCH_NOMIC_BERT_MOE: |
2439 | 0 | case LLM_ARCH_EUROBERT: |
2440 | 0 | case LLM_ARCH_STABLELM: |
2441 | 0 | case LLM_ARCH_BITNET: |
2442 | 0 | case LLM_ARCH_QWEN: |
2443 | 0 | case LLM_ARCH_QWEN2: |
2444 | 0 | case LLM_ARCH_DREAM: |
2445 | 0 | case LLM_ARCH_QWEN2MOE: |
2446 | 0 | case LLM_ARCH_QWEN3: |
2447 | 0 | case LLM_ARCH_QWEN3MOE: |
2448 | 0 | case LLM_ARCH_LLADA_MOE: |
2449 | 0 | case LLM_ARCH_RND1: |
2450 | 0 | case LLM_ARCH_OLMO2: |
2451 | 0 | case LLM_ARCH_OLMOE: |
2452 | 0 | case LLM_ARCH_PHI2: |
2453 | 0 | case LLM_ARCH_PHI3: |
2454 | 0 | case LLM_ARCH_PHIMOE: |
2455 | 0 | case LLM_ARCH_PLAMO: |
2456 | 0 | case LLM_ARCH_PLAMO2: |
2457 | 0 | case LLM_ARCH_PLAMO3: |
2458 | 0 | case LLM_ARCH_GEMMA: |
2459 | 0 | case LLM_ARCH_GEMMA2: |
2460 | 0 | case LLM_ARCH_GEMMA3: |
2461 | 0 | case LLM_ARCH_GEMMA3N: |
2462 | 0 | case LLM_ARCH_GEMMA4: |
2463 | 0 | case LLM_ARCH_GEMMA4_ASSISTANT: |
2464 | 0 | case LLM_ARCH_GEMMA_EMBEDDING: |
2465 | 0 | case LLM_ARCH_STARCODER2: |
2466 | 0 | case LLM_ARCH_OPENELM: |
2467 | 0 | case LLM_ARCH_GPTNEOX: |
2468 | 0 | case LLM_ARCH_CODESHELL: |
2469 | 0 | case LLM_ARCH_ORION: |
2470 | 0 | case LLM_ARCH_NEMOTRON: |
2471 | 0 | case LLM_ARCH_EXAONE: |
2472 | 0 | case LLM_ARCH_EXAONE4: |
2473 | 0 | case LLM_ARCH_EXAONE_MOE: |
2474 | 0 | case LLM_ARCH_MINICPM3: |
2475 | 0 | case LLM_ARCH_BAILINGMOE2: |
2476 | 0 | case LLM_ARCH_DOTS1: |
2477 | 0 | case LLM_ARCH_HUNYUAN_MOE: |
2478 | 0 | case LLM_ARCH_JAIS2: |
2479 | 0 | case LLM_ARCH_OPENAI_MOE: |
2480 | 0 | case LLM_ARCH_HUNYUAN_DENSE: |
2481 | 0 | case LLM_ARCH_LFM2: |
2482 | 0 | case LLM_ARCH_LFM2MOE: |
2483 | 0 | case LLM_ARCH_SMALLTHINKER: |
2484 | 0 | case LLM_ARCH_SEED_OSS: |
2485 | 0 | case LLM_ARCH_GROVEMOE: |
2486 | 0 | case LLM_ARCH_APERTUS: |
2487 | 0 | case LLM_ARCH_MINIMAX_M2: |
2488 | 0 | case LLM_ARCH_COGVLM: |
2489 | 0 | case LLM_ARCH_PANGU_EMBED: |
2490 | 0 | case LLM_ARCH_AFMOE: |
2491 | 0 | case LLM_ARCH_QWEN3NEXT: |
2492 | 0 | case LLM_ARCH_MIMO2: |
2493 | 0 | case LLM_ARCH_STEP35: |
2494 | 0 | case LLM_ARCH_TALKIE: |
2495 | 0 | case LLM_ARCH_MELLUM: |
2496 | 0 | return LLAMA_ROPE_TYPE_NEOX; |
2497 | | |
2498 | 0 | case LLM_ARCH_QWEN2VL: |
2499 | 0 | case LLM_ARCH_PADDLEOCR: |
2500 | 0 | return LLAMA_ROPE_TYPE_MROPE; |
2501 | 0 | case LLM_ARCH_QWEN3VL: |
2502 | 0 | case LLM_ARCH_QWEN3VLMOE: |
2503 | 0 | case LLM_ARCH_QWEN35: |
2504 | 0 | case LLM_ARCH_QWEN35MOE: |
2505 | 0 | return LLAMA_ROPE_TYPE_IMROPE; |
2506 | | |
2507 | 0 | case LLM_ARCH_GLM4: |
2508 | 0 | return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM; |
2509 | 0 | case LLM_ARCH_GLM4_MOE: |
2510 | 0 | return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; |
2511 | | |
2512 | 0 | case LLM_ARCH_HUNYUAN_VL: |
2513 | 0 | return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; |
2514 | | |
2515 | | // all model arches should be listed explicitly here |
2516 | 0 | case LLM_ARCH_UNKNOWN: |
2517 | 0 | GGML_ABORT("unknown architecture"); |
2518 | 0 | } |
2519 | | |
2520 | 0 | return LLAMA_ROPE_TYPE_NONE; |
2521 | 0 | } |
2522 | | |
2523 | 0 | float llama_model_rope_freq_scale_train(const llama_model * model) { |
2524 | 0 | return model->hparams.rope_freq_scale_train; |
2525 | 0 | } |
2526 | | |
2527 | 0 | int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) { |
2528 | 0 | const auto & it = model->gguf_kv.find(key); |
2529 | 0 | if (it == model->gguf_kv.end()) { |
2530 | 0 | if (buf_size > 0) { |
2531 | 0 | buf[0] = '\0'; |
2532 | 0 | } |
2533 | 0 | return -1; |
2534 | 0 | } |
2535 | 0 | return snprintf(buf, buf_size, "%s", it->second.c_str()); |
2536 | 0 | } |
2537 | | |
2538 | 0 | int32_t llama_model_meta_count(const llama_model * model) { |
2539 | 0 | return (int)model->gguf_kv.size(); |
2540 | 0 | } |
2541 | | |
2542 | 0 | const char * llama_model_meta_key_str(llama_model_meta_key key) { |
2543 | 0 | switch (key) { |
2544 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE: return "general.sampling.sequence"; |
2545 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K: return "general.sampling.top_k"; |
2546 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P: return "general.sampling.top_p"; |
2547 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P: return "general.sampling.min_p"; |
2548 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability"; |
2549 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD: return "general.sampling.xtc_threshold"; |
2550 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_TEMP: return "general.sampling.temp"; |
2551 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N: return "general.sampling.penalty_last_n"; |
2552 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT: return "general.sampling.penalty_repeat"; |
2553 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT: return "general.sampling.mirostat"; |
2554 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU: return "general.sampling.mirostat_tau"; |
2555 | 0 | case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA: return "general.sampling.mirostat_eta"; |
2556 | 0 | default: return nullptr; |
2557 | 0 | } |
2558 | 0 | } |
2559 | | |
2560 | 0 | int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) { |
2561 | 0 | if (i < 0 || i >= (int)model->gguf_kv.size()) { |
2562 | 0 | if (buf_size > 0) { |
2563 | 0 | buf[0] = '\0'; |
2564 | 0 | } |
2565 | 0 | return -1; |
2566 | 0 | } |
2567 | 0 | auto it = model->gguf_kv.begin(); |
2568 | 0 | std::advance(it, i); |
2569 | 0 | return snprintf(buf, buf_size, "%s", it->first.c_str()); |
2570 | 0 | } |
2571 | | |
2572 | 0 | int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) { |
2573 | 0 | if (i < 0 || i >= (int)model->gguf_kv.size()) { |
2574 | 0 | if (buf_size > 0) { |
2575 | 0 | buf[0] = '\0'; |
2576 | 0 | } |
2577 | 0 | return -1; |
2578 | 0 | } |
2579 | 0 | auto it = model->gguf_kv.begin(); |
2580 | 0 | std::advance(it, i); |
2581 | 0 | return snprintf(buf, buf_size, "%s", it->second.c_str()); |
2582 | 0 | } |
2583 | | |
2584 | 0 | int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) { |
2585 | 0 | return snprintf(buf, buf_size, "%s", model->desc().c_str()); |
2586 | 0 | } |
2587 | | |
2588 | 0 | uint64_t llama_model_size(const llama_model * model) { |
2589 | 0 | return model->size(); |
2590 | 0 | } |
2591 | | |
2592 | 0 | const char * llama_model_chat_template(const llama_model * model, const char * name) { |
2593 | 0 | const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE) |
2594 | 0 | : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE); |
2595 | 0 | const auto & it = model->gguf_kv.find(key); |
2596 | 0 | if (it == model->gguf_kv.end()) { |
2597 | | // one-off fix for very popular models (so we are not flooded with issues) |
2598 | | // do not extend this list unless absolutely necessary |
2599 | | // Mistral-Small-2503 does not have built-in chat template |
2600 | 0 | llama_vocab_pre_type pre_type = model->vocab.get_pre_type(); |
2601 | 0 | if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) { |
2602 | 0 | return "mistral-v7-tekken"; |
2603 | 0 | } |
2604 | | |
2605 | 0 | return nullptr; |
2606 | 0 | } |
2607 | | |
2608 | 0 | return it->second.c_str(); |
2609 | 0 | } |
2610 | | |
2611 | 0 | uint64_t llama_model_n_params(const llama_model * model) { |
2612 | 0 | return model->n_elements(); |
2613 | 0 | } |
2614 | | |
2615 | 0 | bool llama_model_has_encoder(const llama_model * model) { |
2616 | 0 | switch (model->arch) { |
2617 | 0 | case LLM_ARCH_T5: |
2618 | 0 | case LLM_ARCH_T5ENCODER: |
2619 | 0 | case LLM_ARCH_EAGLE3: return true; |
2620 | 0 | default: return false; |
2621 | 0 | } |
2622 | 0 | } |
2623 | | |
2624 | 0 | bool llama_model_has_decoder(const llama_model * model) { |
2625 | 0 | switch (model->arch) { |
2626 | 0 | case LLM_ARCH_T5ENCODER: return false; |
2627 | 0 | default: return true; |
2628 | 0 | } |
2629 | 0 | } |
2630 | | |
2631 | 0 | llama_token llama_model_decoder_start_token(const llama_model * model) { |
2632 | 0 | return model->hparams.dec_start_token_id; |
2633 | 0 | } |
2634 | | |
2635 | 0 | bool llama_model_is_recurrent(const llama_model * model) { |
2636 | 0 | return llm_arch_is_recurrent(model->arch); |
2637 | 0 | } |
2638 | | |
2639 | 0 | bool llama_model_is_hybrid(const llama_model * model) { |
2640 | 0 | return llm_arch_is_hybrid(model->arch); |
2641 | 0 | } |
2642 | | |
2643 | 0 | bool llama_model_is_diffusion(const llama_model * model) { |
2644 | 0 | return llm_arch_is_diffusion(model->arch); |
2645 | 0 | } |
2646 | | |
2647 | 0 | const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) { |
2648 | 0 | return model->tensors_by_name; |
2649 | 0 | } |
2650 | | |
2651 | 0 | int32_t llama_model_n_expert(const struct llama_model * model) { |
2652 | 0 | return model->hparams.n_expert; |
2653 | 0 | } |
2654 | | |
2655 | 0 | int32_t llama_model_n_devices(const struct llama_model * model) { |
2656 | 0 | return (int32_t)model->devices.size(); |
2657 | 0 | } |
2658 | | |
2659 | 0 | ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i) { |
2660 | 0 | if (i < 0 || i >= (int)model->devices.size()) { |
2661 | 0 | return nullptr; |
2662 | 0 | } |
2663 | 0 | return model->devices[i].dev; |
2664 | 0 | } |
2665 | | |
2666 | | // |
2667 | | // llama_model_base |
2668 | | // |
2669 | | |
2670 | 0 | llama_model_base::llama_model_base(const struct llama_model_params & params) : llama_model(params), model(this), tn(model->arch), |
2671 | 0 | TENSOR_DUPLICATED (llama_model_loader::TENSOR_DUPLICATED), |
2672 | 0 | TENSOR_NOT_REQUIRED (llama_model_loader::TENSOR_NOT_REQUIRED), |
2673 | 0 | TENSOR_SKIP (llama_model_loader::TENSOR_SKIP), |
2674 | 0 | TENSOR_SKIP_IF_VIRTUAL(llama_model_loader::TENSOR_SKIP_IF_VIRTUAL) {} |
2675 | | |
2676 | 0 | ggml_tensor * llama_model_base::create_tensor(const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) { |
2677 | 0 | GGML_ASSERT(ml != nullptr); |
2678 | 0 | return create_tensor(*ml, tn, ne, flags); |
2679 | 0 | } |
2680 | | |
2681 | 0 | void llama_model_base::create_tensor_gate_up_exps(llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) { |
2682 | 0 | layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED); |
2683 | 0 | if (layer.ffn_gate_up_exps == nullptr) { |
2684 | 0 | layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags); |
2685 | 0 | layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags); |
2686 | 0 | } |
2687 | 0 | } |
2688 | | |
2689 | | void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid, |
2690 | | int64_t n_embd_, int64_t n_embd_q_, int64_t n_embd_k_, int64_t n_embd_v_, |
2691 | 0 | int flags) { |
2692 | 0 | const int64_t n_embd_qkv = n_embd_q_ + n_embd_k_ + n_embd_v_; |
2693 | 0 | layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", bid), {n_embd_, n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL); |
2694 | 0 | if (layer.wqkv) { |
2695 | 0 | layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL); |
2696 | 0 | } else { |
2697 | 0 | layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", bid), {n_embd_, n_embd_q_}, flags); |
2698 | 0 | layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", bid), {n_embd_, n_embd_k_}, flags); |
2699 | 0 | layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", bid), {n_embd_, n_embd_v_}, flags); |
2700 | 0 | layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED); |
2701 | 0 | layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED); |
2702 | 0 | layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED); |
2703 | 0 | } |
2704 | 0 | } |
2705 | | |
2706 | 0 | const int32_t * llama_model_target_layer_ids(const struct llama_model * model) { |
2707 | 0 | const auto & v = model->target_layer_ids; |
2708 | 0 | return v.empty() ? nullptr : v.data(); |
2709 | 0 | } |
2710 | | |
2711 | 0 | uint32_t llama_model_target_layer_ids_n(const struct llama_model * model) { |
2712 | 0 | return (uint32_t) model->target_layer_ids.size(); |
2713 | 0 | } |