/src/llama.cpp/src/llama-hparams.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include "llama.h" |
4 | | |
5 | | #include <array> |
6 | | #include <cassert> |
7 | | |
// Compile-time capacity limits; bump if necessary.

// upper bound on the number of layers - sizes the per-layer std::array members of llama_hparams
#define LLAMA_MAX_LAYERS  512

// upper bound on the number of experts (raised for Qwen3 Next)
#define LLAMA_MAX_EXPERTS 512
11 | | |
// Identifies the gating function used for expert routing.
// llama_hparams::expert_gating_func holds one of these values (as uint32_t).
// The numeric values are spelled out explicitly and must stay stable.
enum llama_expert_gating_func_type {
    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE           = 0,
    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX        = 1,
    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID        = 2,
    // applied to the router weights instead of the logits
    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3,
};
18 | | |
// Sliding Window Attention (SWA) variants.
// The masking rule for each variant is implemented in llama_hparams::is_masked_swa().
enum llama_swa_type {
    LLAMA_SWA_TYPE_NONE      = 0, // nothing is masked
    LLAMA_SWA_TYPE_STANDARD  = 1, // masked when p1 - p0 >= n_swa
    LLAMA_SWA_TYPE_CHUNKED   = 2, // masked when p0 lies before the start of p1's chunk of size n_swa
    LLAMA_SWA_TYPE_SYMMETRIC = 3, // masked when |p1 - p0| > n_swa/2
};
25 | | |
26 | | // forward declaration; full definition in llama-graph.h |
27 | | enum llm_ffn_op_type : int; |
28 | | |
// Hyperparameters of the "posnet" sub-network (embedded in llama_hparams for WavTokenizer).
// Plain aggregate: members carry no default initializers.
struct llama_hparams_posnet {
    uint32_t n_embd;  // presumably the embedding dimension - confirm against the loader
    uint32_t n_layer; // presumably the number of layers - confirm against the loader
};
33 | | |
// Hyperparameters of the "convnext" sub-network (embedded in llama_hparams for WavTokenizer).
// Plain aggregate: members carry no default initializers.
struct llama_hparams_convnext {
    uint32_t n_embd;  // presumably the embedding dimension - confirm against the loader
    uint32_t n_layer; // presumably the number of layers - confirm against the loader
};
38 | | |
39 | | struct llama_hparams { |
40 | | // note: use the `_impl` suffix to avoid name conflict between members and getters |
41 | | // for example: n_embd_out() vs n_embd_out_impl |
42 | | |
43 | | bool vocab_only; |
44 | | bool no_alloc; |
45 | | bool rope_finetuned; |
46 | | bool use_par_res; |
47 | | bool swin_norm; |
48 | | bool norm_before_residual = false; |
49 | | |
50 | | uint32_t n_ctx_train; // context size the model was trained on |
51 | | uint32_t n_embd; |
52 | | uint32_t n_layer_all; |
53 | | uint32_t n_layer_nextn = 0; |
54 | | uint32_t n_expert = 0; |
55 | | uint32_t n_expert_used = 0; |
56 | | uint32_t n_rel_attn_bkts = 0; |
57 | | |
58 | | // TODO: this needs to be reworked |
59 | | int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache |
60 | | |
61 | | // different head size for full_attention and SWA layers |
62 | | uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads |
63 | | uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head |
64 | | uint32_t n_embd_head_k_swa; |
65 | | uint32_t n_embd_head_v_swa; |
66 | | |
67 | | // different RoPE dimensions for full_attention and SWA layers |
68 | | uint32_t n_rot_full; |
69 | | uint32_t n_rot_swa; |
70 | | |
71 | | // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA |
72 | | uint32_t n_embd_head_k_mla_impl = 0; |
73 | | uint32_t n_embd_head_v_mla_impl = 0; |
74 | | |
75 | | // for WavTokenizer |
76 | | struct llama_hparams_posnet posnet; |
77 | | struct llama_hparams_convnext convnext; |
78 | | |
79 | | uint32_t n_shortconv_l_cache = 0; |
80 | | |
81 | | std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr; |
82 | | std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; |
83 | | std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; |
84 | | |
85 | | uint32_t n_layer_dense_lead = 0; |
86 | | uint32_t n_lora_q = 0; |
87 | | uint32_t n_lora_kv = 0; |
88 | | uint32_t n_ff_exp = 0; |
89 | | uint32_t n_ff_shexp = 0; |
90 | | uint32_t n_ff_chexp = 0; |
91 | | uint32_t n_expert_shared = 0; |
92 | | uint32_t n_norm_groups = 0; |
93 | | uint32_t n_expert_groups = 0; |
94 | | uint32_t n_group_used = 0; |
95 | | uint32_t n_group_experts = 0; |
96 | | |
97 | | float expert_group_scale = 0.05f; |
98 | | float expert_weights_scale = 0.0f; |
99 | | bool expert_weights_norm = false; |
100 | | uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; |
101 | | uint32_t moe_every_n_layers = 0; |
102 | | uint32_t moe_latent_size = 0; |
103 | | |
104 | | float f_norm_eps; |
105 | | float f_norm_rms_eps; |
106 | | float f_norm_group_eps; |
107 | | |
108 | | float f_attn_logit_softcapping = 50.0f; |
109 | | float f_router_logit_softcapping = 30.0f; |
110 | | float f_final_logit_softcapping = 30.0f; |
111 | | |
112 | | // for RWKV |
113 | | uint32_t rescale_every_n_layers = 0; |
114 | | uint32_t time_mix_extra_dim = 0; |
115 | | uint32_t time_decay_extra_dim = 0; |
116 | | uint32_t wkv_head_size = 0; |
117 | | uint32_t token_shift_count = 2; |
118 | | uint32_t n_lora_decay = 0; |
119 | | uint32_t n_lora_iclr = 0; |
120 | | uint32_t n_lora_value_res_mix = 0; |
121 | | uint32_t n_lora_gate = 0; |
122 | | |
123 | | float rope_attn_factor = 1.0f; |
124 | | float rope_freq_base_train; |
125 | | float rope_freq_base_train_swa = 10000.0f; |
126 | | float rope_freq_scale_train; |
127 | | float rope_freq_scale_train_swa = 1.0f; |
128 | | float rope_scaling_alpha = 0.0f; // NTK-aware alpha for XDRoPE |
129 | | |
130 | | uint32_t n_ctx_orig_yarn; |
131 | | float rope_yarn_log_mul = 0.0f; |
132 | | |
133 | | float yarn_ext_factor = -1.0f; |
134 | | float yarn_attn_factor = 1.0f; |
135 | | float yarn_beta_fast = 32.0f; |
136 | | float yarn_beta_slow = 1.0f; |
137 | | |
138 | | std::array<int, 4> rope_sections; |
139 | | |
140 | | // Sliding Window Attention (SWA) |
141 | | llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; |
142 | | // the size of the sliding window (0 - no SWA) |
143 | | uint32_t n_swa = 0; |
144 | | |
145 | | // if is_swa_impl[il] == 1, then layer il is SWA |
146 | | // if is_swa_impl[il] == 0, then layer il is dense (i.e. non-SWA) |
147 | | // by default, all layers are dense |
148 | | // note: using uint32_t type for compatibility reason |
149 | | std::array<uint32_t, LLAMA_MAX_LAYERS> is_swa_impl; |
150 | | |
151 | | // for hybrid state space models |
152 | | std::array<uint32_t, LLAMA_MAX_LAYERS> is_recr_impl; |
153 | | |
154 | | // for State Space Models |
155 | | uint32_t ssm_d_conv = 0; |
156 | | uint32_t ssm_d_inner = 0; |
157 | | uint32_t ssm_d_state = 0; |
158 | | uint32_t ssm_dt_rank = 0; |
159 | | uint32_t ssm_n_group = 0; |
160 | | |
161 | | // for Kimi Linear KDA |
162 | | uint32_t n_embd_head_kda = 0; |
163 | | |
164 | | bool ssm_dt_b_c_rms = false; |
165 | | |
166 | | float f_clamp_kqv = 0.0f; |
167 | | float f_max_alibi_bias = 0.0f; |
168 | | float f_logit_scale = 0.0f; |
169 | | |
170 | | // Additional scale factors (Granite/Granite MoE) |
171 | | float f_residual_scale = 0.0f; |
172 | | float f_embedding_scale = 0.0f; |
173 | | float f_attention_scale = 0.0f; |
174 | | |
175 | | // grok-2 |
176 | | float f_attn_out_scale = 0.0f; |
177 | | uint32_t attn_temp_length = 0; |
178 | | |
179 | | float f_attn_value_scale = 0.0f; |
180 | | |
181 | | bool causal_attn = true; |
182 | | bool use_alibi = false; |
183 | | bool attn_soft_cap = false; |
184 | | bool use_kq_norm = false; |
185 | | |
186 | | // for Classifiers |
187 | | uint32_t n_cls_out = 1; |
188 | | |
189 | | // input embedding dimension (0 = use n_embd) |
190 | | uint32_t n_embd_inp_impl = 0; |
191 | | |
192 | | // encoder input embedding dimension (0 = use n_embd_inp()) |
193 | | // e.g. the eagle3 encoder fuses target_layers * target_hidden features |
194 | | uint32_t n_embd_inp_enc_impl = 0; |
195 | | |
196 | | // output embedding dimension (0 = use n_embd) |
197 | | uint32_t n_embd_out_impl = 0; |
198 | | |
199 | | // llama4 smallthinker |
200 | | uint32_t n_moe_layer_step = 0; |
201 | | uint32_t n_no_rope_layer_step = 4; |
202 | | uint32_t n_attn_temp_floor_scale = 0; |
203 | | float f_attn_temp_scale = 0.0f; |
204 | | float f_attn_temp_offset = 0.0f; // offset position index |
205 | | |
206 | | // gemma3n altup |
207 | | uint32_t n_altup = 4; // altup_num_inputs |
208 | | uint32_t i_altup_act = 0; // altup_active_idx |
209 | | uint32_t laurel_rank = 64; |
210 | | uint32_t n_embd_altup = 256; |
211 | | |
212 | | // needed for sentence-transformers dense layers |
213 | | uint32_t dense_2_feat_in = 0; // in_features of the 2_Dense |
214 | | uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense |
215 | | uint32_t dense_3_feat_in = 0; // in_features of the 3_Dense |
216 | | uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense |
217 | | |
218 | | // xIELU |
219 | | std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n; |
220 | | std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p; |
221 | | std::array<float, LLAMA_MAX_LAYERS> xielu_beta; |
222 | | std::array<float, LLAMA_MAX_LAYERS> xielu_eps; |
223 | | |
224 | | // DSA (deepseek sparse attention) |
225 | | uint32_t indexer_n_head = 0; |
226 | | uint32_t indexer_head_size = 0; |
227 | | uint32_t indexer_top_k = 0; |
228 | | |
229 | | // qwen3vl deepstack |
230 | | // When parsed from GGUF, this implies the first N layers consume the first |
231 | | // N deepstack embeddings. Use deepstack_mapping_arr if you need a more |
232 | | // complex mapping. If using deepstack_mapping_arr, also make sure to set |
233 | | // n_deepstack_layers to the number of unique deepstack layers so that |
234 | | // n_embd_imp is accurate (see granite.cpp). |
235 | | // TODO: can be expressed via the `new n_embd_inp_impl` and remove this param |
236 | | uint32_t n_deepstack_layers = 0; |
237 | | |
238 | | // deepstack layer array (Granite4 Vision) |
239 | | // -1 => no deepstack |
240 | | // >=0 => input embedding index for deepstack injection |
241 | | std::array<int32_t, LLAMA_MAX_LAYERS> deepstack_mapping_arr; |
242 | | |
243 | | // gemma4 per-layer embedding |
244 | | uint32_t n_embd_per_layer = 0; |
245 | | |
246 | | // needed by encoder-decoder models (e.g. T5, FLAN-T5) |
247 | | // ref: https://github.com/ggml-org/llama.cpp/pull/8141 |
248 | | llama_token dec_start_token_id = LLAMA_TOKEN_NULL; |
249 | | uint32_t dec_n_layer = 0; |
250 | | |
251 | | enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; |
252 | | enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; |
253 | | enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; |
254 | | |
255 | | |
256 | | // Resolved FFN gated activation flavor for archs that read |
257 | | // `<arch>.hidden_activation` from the GGUF (e.g. ModernBert derivatives). |
258 | | // Defaults to LLM_FFN_NONE (sentinel = 0); the mapping from the GGUF |
259 | | // string to a real op is done at hparam-load time via |
260 | | // llm_ffn_op_type_from_string() in llama-model.cpp, mirroring how |
261 | | // rope_scaling_type_train is handled. |
262 | | enum llm_ffn_op_type llm_ffn_op; |
263 | | |
264 | | // Step35: optional per-layer clamps for (Swi)GLU |
265 | | std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_exp; // clamping for expert FFN |
266 | | std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_shexp; // shared expert |
267 | | |
268 | | // this value n_pattern means that every nth layer is dense (i.e. non-SWA) |
269 | | // dense_first means whether the pattern is start with a dense layer |
270 | | // note that if n_pattern == 0, all layers are SWA |
271 | | // if n_pattern == 1, all layers are dense |
272 | | // example 1: n_pattern = 3, dense_first = false |
273 | | // il == 0: swa |
274 | | // il == 1: swa |
275 | | // il == 2: dense |
276 | | // il == 3: swa |
277 | | // il == 4: swa |
278 | | // il == 5: dense |
279 | | // il == 6: swa |
280 | | // etc ... |
281 | | // example 2: n_pattern = 2, dense_first = true |
282 | | // il == 0: dense |
283 | | // il == 1: swa |
284 | | // il == 2: dense |
285 | | // il == 3: swa |
286 | | // etc ... |
287 | | void set_swa_pattern(uint32_t n_pattern, bool dense_first = false); |
288 | | |
289 | | // return true if one of the layers is SWA |
290 | | bool is_swa_any() const; |
291 | | |
292 | | bool is_swa(uint32_t il) const; |
293 | | |
294 | | void set_recr_pattern(uint32_t n_pattern, bool dense_first = false); |
295 | | |
296 | | // whether or not the given layer is recurrent (for hybrid models) |
297 | | bool is_recr(uint32_t il) const; |
298 | | |
299 | | uint32_t n_head(uint32_t il = 0) const; |
300 | | |
301 | | uint32_t n_head_kv(uint32_t il = 0) const; |
302 | | |
303 | | uint32_t n_ff(uint32_t il = 0) const; |
304 | | |
305 | | uint32_t n_gqa(uint32_t il = 0) const; |
306 | | |
307 | | uint32_t n_rot(uint32_t il = 0) const; |
308 | | |
309 | | // dimension of main + auxiliary input embeddings |
310 | | uint32_t n_embd_inp() const; |
311 | | |
312 | | // dimension of the encoder input embeddings |
313 | | uint32_t n_embd_inp_enc() const; |
314 | | |
315 | | // dimension of output embeddings |
316 | | uint32_t n_embd_out() const; |
317 | | |
318 | | // dimension of key/value embeddings for each head (per layer) |
319 | | uint32_t n_embd_head_k(uint32_t il = 0) const; |
320 | | uint32_t n_embd_head_v(uint32_t il = 0) const; |
321 | | |
322 | | // dimension of key embeddings across all k-v heads |
323 | | uint32_t n_embd_k_gqa(uint32_t il = 0) const; |
324 | | |
325 | | // dimension of value embeddings across all k-v heads |
326 | | uint32_t n_embd_v_gqa(uint32_t il = 0) const; |
327 | | |
328 | | // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa |
329 | | bool is_n_embd_k_gqa_variable() const; |
330 | | bool is_n_embd_v_gqa_variable() const; |
331 | | |
332 | | // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers |
333 | | uint32_t n_embd_k_gqa_max() const; |
334 | | uint32_t n_embd_v_gqa_max() const; |
335 | | |
336 | | // dimension of the rolling state embeddings |
337 | | // corresponds to Mamba's conv_states size or RWKV's token_shift states size |
338 | | uint32_t n_embd_r() const; |
339 | | |
340 | | // dimension of the recurrent state embeddings |
341 | | uint32_t n_embd_s() const; |
342 | | |
343 | | uint32_t n_pos_per_embd() const; |
344 | | |
345 | | // note: currently only support if either all or none of the layers are MLA |
346 | | bool is_mla() const; |
347 | | |
348 | | uint32_t n_embd_head_k_mla() const; |
349 | | uint32_t n_embd_head_v_mla() const; |
350 | | |
351 | | bool has_kv(uint32_t il) const; |
352 | | |
353 | | // number of effective layers (excludes nextn layers) |
354 | | uint32_t n_layer() const; |
355 | | |
356 | | // note that this function uses different SWA parameters from those in the hparams |
357 | | // note: inlined on purpose for performance reasons |
358 | | // TODO: think of a better place for this function |
359 | | // TODO: pack the SWA params in a struct? |
360 | 0 | static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) { |
361 | 0 | assert(p0 >= 0 && p1 >= 0); |
362 | |
|
363 | 0 | switch (swa_type) { |
364 | 0 | case LLAMA_SWA_TYPE_NONE: |
365 | 0 | { |
366 | 0 | } break; |
367 | 0 | case LLAMA_SWA_TYPE_STANDARD: |
368 | 0 | { |
369 | 0 | if (p1 - p0 >= (int32_t) n_swa) { |
370 | 0 | return true; |
371 | 0 | } |
372 | 0 | } break; |
373 | 0 | case LLAMA_SWA_TYPE_CHUNKED: |
374 | 0 | { |
375 | 0 | const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa; |
376 | |
|
377 | 0 | if (p0 < pos_chunk_start) { |
378 | 0 | return true; |
379 | 0 | } |
380 | 0 | } break; |
381 | 0 | case LLAMA_SWA_TYPE_SYMMETRIC: |
382 | 0 | { |
383 | 0 | const int32_t half_n_swa = (int32_t) n_swa / 2; |
384 | 0 | const int32_t pos_diff = p1 - p0; |
385 | | |
386 | | // Mask if outside the symmetric window |
387 | 0 | if (pos_diff < -half_n_swa || pos_diff > half_n_swa) { |
388 | 0 | return true; |
389 | 0 | } |
390 | 0 | } break; |
391 | 0 | } |
392 | | |
393 | 0 | return false; |
394 | 0 | } |
395 | | |
396 | | |
397 | | bool use_mrope() const; |
398 | | }; |
399 | | |
400 | | static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable"); |