/src/llama.cpp/src/models/step35.cpp
Line | Count | Source |
1 | | #include "models.h" |
2 | | |
3 | 0 | void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { |
4 | 0 | ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); |
5 | |
|
6 | 0 | hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; |
7 | | |
8 | | // full_attention layer only use half of the RoPE dimensions |
9 | 0 | hparams.n_rot_full = hparams.n_rot_full / 2; |
10 | | |
11 | | // MoE + SWA parameters |
12 | 0 | ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); |
13 | 0 | ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); |
14 | 0 | ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); |
15 | 0 | ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); |
16 | 0 | ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); |
17 | | |
18 | | // Step35 uses sigmoid gating by default (if not set in GGUF) |
19 | 0 | if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { |
20 | 0 | hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; |
21 | 0 | } |
22 | |
|
23 | 0 | ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); |
24 | 0 | ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); |
25 | |
|
26 | 0 | ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); |
27 | |
|
28 | 0 | ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer(), false); |
29 | 0 | ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer(), false); |
30 | | |
31 | | // NextN/MTP (Step3p5): extra decoder block appended beyond the main stack. |
32 | 0 | ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); |
33 | 0 | GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); |
34 | |
|
35 | 0 | switch (hparams.n_layer()) { |
36 | 0 | case 45: type = LLM_TYPE_196B_A11B; break; |
37 | 0 | default: type = LLM_TYPE_UNKNOWN; |
38 | 0 | } |
39 | 0 | } |
40 | | |
41 | 0 | void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { |
42 | 0 | LLAMA_LOAD_LOCALS; |
43 | |
|
44 | 0 | const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); |
45 | | // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP |
46 | | // tensors live in a separate file (e.g. user split target/draft). Mark |
47 | | // MTP tensors NOT_REQUIRED so the trunk loads cleanly. |
48 | 0 | const std::string mtp_probe = "blk." + std::to_string(n_layer) + ".nextn.eh_proj.weight"; |
49 | 0 | const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr); |
50 | 0 | const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; |
51 | 0 | const int mtp_flags = trunk_only ? TENSOR_NOT_REQUIRED : 0; |
52 | |
|
53 | 0 | tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); |
54 | | |
55 | | // output |
56 | 0 | output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); |
57 | 0 | output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, trunk_flags); |
58 | | |
59 | | // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor |
60 | | // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer. |
61 | 0 | uint32_t n_rot_max = 0; |
62 | 0 | for (int i = 0; i < n_layer; ++i) { |
63 | 0 | n_rot_max = std::max(n_rot_max, hparams.n_rot(i)); |
64 | 0 | } |
65 | 0 | if (n_rot_max == 0) { |
66 | 0 | n_rot_max = n_rot; |
67 | 0 | } |
68 | |
|
69 | 0 | auto load_block_trunk = [&](int i, int flags) { |
70 | 0 | auto & layer = layers[i]; |
71 | |
|
72 | 0 | const uint32_t n_head_l = hparams.n_head(i); |
73 | 0 | const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); |
74 | 0 | const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); |
75 | |
|
76 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); |
77 | 0 | layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); |
78 | 0 | layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); |
79 | | |
80 | | // optional rope factors (llama3) / longrope tensors |
81 | 0 | if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { |
82 | 0 | layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); |
83 | 0 | layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); |
84 | 0 | } else { |
85 | 0 | layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); |
86 | 0 | } |
87 | |
|
88 | 0 | create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, flags); |
89 | 0 | layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, flags); |
90 | | |
91 | | // head-wise attention gate (Step35 self_attn.g_proj) |
92 | 0 | layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED); |
93 | |
|
94 | 0 | layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); |
95 | | |
96 | | // dense MLP (leading dense blocks) |
97 | 0 | layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); |
98 | 0 | layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED); |
99 | 0 | layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); |
100 | | |
101 | | // MoE routed experts + selection bias (router_bias) |
102 | 0 | const int64_t n_ff_exp = hparams.n_ff_exp; |
103 | 0 | layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED); |
104 | 0 | layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); |
105 | 0 | layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED); |
106 | 0 | layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); |
107 | 0 | layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); |
108 | | |
109 | | // shared expert MLP |
110 | 0 | layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); |
111 | 0 | layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); |
112 | 0 | layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); |
113 | 0 | }; |
114 | |
|
115 | 0 | auto load_block_mtp = [&](int i, bool is_first_mtp) { |
116 | 0 | auto & layer = layers[i]; |
117 | |
|
118 | 0 | const uint32_t n_head_l = hparams.n_head(i); |
119 | 0 | const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); |
120 | 0 | const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); |
121 | | |
122 | | // The MTP block is a full Step3p5 decoder layer (mtp_block) plus the |
123 | | // NextN-specific wiring (enorm/hnorm/eh_proj + optional shared head). |
124 | | // `mtp_flags` becomes NOT_REQUIRED when the GGUF is trunk-only. |
125 | | // |
126 | | // Only the FIRST MTP block (i == n_main) is required for the |
127 | | // single-block MTP runtime; trailing MTP blocks are always tolerated |
128 | | // as missing so pruned GGUFs (block 0 only) load cleanly. Override |
129 | | // mtp_flags to NOT_REQUIRED for those. |
130 | 0 | const int eff_mtp_flags = is_first_mtp ? mtp_flags : (mtp_flags | TENSOR_NOT_REQUIRED); |
131 | |
|
132 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, eff_mtp_flags); |
133 | 0 | layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); |
134 | 0 | layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); |
135 | |
|
136 | 0 | if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { |
137 | 0 | layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); |
138 | 0 | layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); |
139 | 0 | } else { |
140 | 0 | layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); |
141 | 0 | } |
142 | |
|
143 | 0 | create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, eff_mtp_flags); |
144 | 0 | layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, eff_mtp_flags); |
145 | |
|
146 | 0 | layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED); |
147 | |
|
148 | 0 | layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, eff_mtp_flags); |
149 | | |
150 | | // dense MLP (leading dense blocks) — present if the MTP block isn't MoE |
151 | 0 | layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); |
152 | 0 | layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED); |
153 | 0 | layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); |
154 | | |
155 | | // MoE routed experts + selection bias (router_bias) |
156 | 0 | const int64_t n_ff_exp = hparams.n_ff_exp; |
157 | 0 | layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED); |
158 | 0 | layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); |
159 | 0 | layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED); |
160 | 0 | layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); |
161 | 0 | layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); |
162 | |
|
163 | 0 | layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); |
164 | 0 | layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); |
165 | 0 | layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); |
166 | | |
167 | | // NextN-specific tensors that define the MTP block. |
168 | 0 | layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, eff_mtp_flags); |
169 | 0 | layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, eff_mtp_flags); |
170 | 0 | layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, eff_mtp_flags); |
171 | 0 | layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); |
172 | 0 | layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); |
173 | 0 | layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); |
174 | 0 | }; |
175 | |
|
176 | 0 | for (int i = 0; i < n_layer; ++i) { |
177 | 0 | load_block_trunk(i, trunk_flags); |
178 | 0 | } |
179 | | // Only the first MTP block (i == n_main) is required at runtime — the |
180 | | // single-block-MTP graph in build_arch_graph always uses that one. |
181 | | // Trailing MTP blocks are loaded if present (so an un-pruned GGUF with |
182 | | // all MTP layers still works) but tolerated when absent via the pruning |
183 | | // path. See scripts/prune_step35_extra_mtp.py for the pruner. |
184 | 0 | for (int i = n_layer; i < n_layer_all; ++i) { |
185 | 0 | load_block_mtp(i, /*is_first_mtp=*/ i == n_layer); |
186 | 0 | } |
187 | 0 | } |
188 | | |
189 | 0 | std::unique_ptr<llm_graph_context> llama_model_step35::build_arch_graph(const llm_graph_params & params) const { |
190 | 0 | if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { |
191 | 0 | return std::make_unique<graph_mtp>(*this, params); |
192 | 0 | } |
193 | 0 | return std::make_unique<graph>(*this, params); |
194 | 0 | } |
195 | | |
196 | 0 | llama_model_step35::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { |
197 | 0 | ggml_tensor * cur; |
198 | 0 | ggml_tensor * inpL; |
199 | |
|
200 | 0 | inpL = build_inp_embd(model.tok_embd); |
201 | 0 | ggml_tensor * inp_pos = build_inp_pos(); |
202 | 0 | auto * inp_attn = build_attn_inp_kv_iswa(); |
203 | 0 | ggml_tensor * inp_out_ids = build_inp_out_ids(); |
204 | | |
205 | | // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. |
206 | 0 | for (int il = 0; il < n_layer; ++il) { |
207 | 0 | ggml_tensor * inpSA = inpL; |
208 | |
|
209 | 0 | const uint32_t n_head_l = hparams.n_head(il); |
210 | 0 | const uint32_t n_head_kv_l = hparams.n_head_kv(il); |
211 | |
|
212 | 0 | const float freq_base_l = model.get_rope_freq_base(cparams, il); |
213 | 0 | const float freq_scale_l = model.get_rope_freq_scale(cparams, il); |
214 | |
|
215 | 0 | cur = inpL; |
216 | | |
217 | | // dump pre-attn RMSNorm input to pinpoint layer boundary issues |
218 | 0 | cb(cur, "attn_norm_in", il); |
219 | | |
220 | | // self-attention |
221 | 0 | { |
222 | 0 | cur = build_norm(cur, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); |
223 | 0 | cb(cur, "attn_norm", il); |
224 | 0 | ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); |
225 | 0 | ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); |
226 | 0 | ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); |
227 | |
|
228 | 0 | cb(Qcur, "Qcur", il); |
229 | 0 | cb(Kcur, "Kcur", il); |
230 | 0 | cb(Vcur, "Vcur", il); |
231 | |
|
232 | 0 | Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens); |
233 | 0 | Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens); |
234 | 0 | Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens); |
235 | | |
236 | | // Q/K per-head RMSNorm (Step35 q_norm / k_norm) |
237 | 0 | if (model.layers[il].attn_q_norm) { |
238 | 0 | Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); |
239 | 0 | cb(Qcur, "Qcur_normed", il); |
240 | 0 | } |
241 | 0 | if (model.layers[il].attn_k_norm) { |
242 | 0 | Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); |
243 | 0 | cb(Kcur, "Kcur_normed", il); |
244 | 0 | } |
245 | | |
246 | | // RoPE (partial rotary factors per layer) |
247 | 0 | const bool is_swa = hparams.is_swa(il); |
248 | 0 | ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il); |
249 | 0 | const int64_t n_rot_l = hparams.n_rot(il); |
250 | 0 | Qcur = ggml_rope_ext( |
251 | 0 | ctx0, Qcur, inp_pos, rope_factors, |
252 | 0 | n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, |
253 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
254 | 0 | ); |
255 | 0 | Kcur = ggml_rope_ext( |
256 | 0 | ctx0, Kcur, inp_pos, rope_factors, |
257 | 0 | n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, |
258 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
259 | 0 | ); |
260 | 0 | cb(Qcur, "Qcur_pos", il); |
261 | 0 | cb(Kcur, "Kcur_pos", il); |
262 | |
|
263 | 0 | const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k)); |
264 | 0 | ggml_tensor * attn_out = build_attn(inp_attn, |
265 | 0 | nullptr, nullptr, nullptr, |
266 | 0 | Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); |
267 | 0 | cb(attn_out, "attn_out", il); |
268 | | // head-wise attention gate: sigmoid(g_proj(x)) in torch |
269 | 0 | if (model.layers[il].wqkv_gate) { |
270 | 0 | ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, cur); // [n_head_l, n_tokens] |
271 | 0 | cb(gate, "attn_gate", il); |
272 | |
|
273 | 0 | gate = ggml_sigmoid(ctx0, gate); |
274 | 0 | cb(gate, "attn_gate_sigmoid", il); |
275 | | |
276 | | // reshape + broadcast to [n_embd_head_v, n_head_l, n_tokens] |
277 | 0 | ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens); |
278 | 0 | ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens); |
279 | 0 | cb(gate_3d, "attn_gate_3d", il); |
280 | |
|
281 | 0 | attn_3d = ggml_mul(ctx0, attn_3d, gate_3d); |
282 | 0 | cb(attn_3d, "attn_gated_3d", il); |
283 | |
|
284 | 0 | attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens); |
285 | 0 | cb(attn_out, "attn_gated", il); |
286 | 0 | } |
287 | | |
288 | | // output projection |
289 | 0 | cur = build_lora_mm(model.layers[il].wo, attn_out, model.layers[il].wo_s); |
290 | 0 | cb(cur, "attn_proj", il); |
291 | 0 | } |
292 | |
|
293 | 0 | if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { |
294 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
295 | 0 | inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
296 | 0 | } |
297 | |
|
298 | 0 | ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
299 | 0 | cb(ffn_inp, "ffn_inp", il); |
300 | |
|
301 | 0 | cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); |
302 | 0 | cb(cur, "ffn_norm", il); |
303 | | |
304 | | // feed-forward |
305 | 0 | if (model.layers[il].ffn_gate_inp == nullptr) { |
306 | | // dense MLP |
307 | 0 | cur = build_ffn(cur, |
308 | 0 | model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, |
309 | 0 | model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr, |
310 | 0 | model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, |
311 | 0 | nullptr, |
312 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
313 | 0 | cb(cur, "ffn_out", il); |
314 | 0 | } else { |
315 | | // MoE routed experts |
316 | 0 | ggml_tensor * moe_out = build_moe_ffn(cur, |
317 | 0 | model.layers[il].ffn_gate_inp, |
318 | 0 | model.layers[il].ffn_up_exps, |
319 | 0 | model.layers[il].ffn_gate_exps, |
320 | 0 | model.layers[il].ffn_down_exps, |
321 | 0 | model.layers[il].ffn_exp_probs_b, |
322 | 0 | n_expert, n_expert_used, |
323 | 0 | LLM_FFN_SILU, hparams.expert_weights_norm, |
324 | 0 | hparams.expert_weights_scale, |
325 | 0 | (llama_expert_gating_func_type) hparams.expert_gating_func, |
326 | 0 | il); |
327 | 0 | cb(moe_out, "ffn_moe_out", il); |
328 | | |
329 | | // shared expert MLP (always added on MoE layers in Step35) |
330 | 0 | ggml_tensor * sh_out = build_ffn(cur, |
331 | 0 | model.layers[il].ffn_up_shexp, nullptr, nullptr, |
332 | 0 | model.layers[il].ffn_gate_shexp, nullptr, nullptr, |
333 | 0 | model.layers[il].ffn_down_shexp, nullptr, nullptr, |
334 | 0 | nullptr, |
335 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
336 | 0 | cb(sh_out, "ffn_shared_out", il); |
337 | |
|
338 | 0 | cur = ggml_add(ctx0, moe_out, sh_out); |
339 | 0 | cb(cur, "ffn_out", il); |
340 | 0 | } |
341 | 0 | cur = ggml_add(ctx0, cur, ffn_inp); |
342 | |
|
343 | 0 | cur = build_cvec(cur, il); |
344 | 0 | cb(cur, "l_out", il); |
345 | | |
346 | | // input for next layer |
347 | 0 | inpL = cur; |
348 | 0 | } |
349 | |
|
350 | 0 | cur = inpL; |
351 | |
|
352 | 0 | cb(cur, "h_nextn", -1); |
353 | 0 | res->t_h_nextn = cur; |
354 | |
|
355 | 0 | if (!cparams.embeddings_nextn_masked && inp_out_ids) { |
356 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
357 | 0 | } |
358 | |
|
359 | 0 | cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); |
360 | 0 | cb(cur, "result_norm", -1); |
361 | 0 | res->t_embd = cur; |
362 | |
|
363 | 0 | cur = build_lora_mm(model.output, cur, model.output_s); |
364 | 0 | cb(cur, "result_output", -1); |
365 | 0 | res->t_logits = cur; |
366 | |
|
367 | 0 | ggml_build_forward_expand(gf, cur); |
368 | 0 | } |
369 | | |
370 | | // LLM_GRAPH_TYPE_DECODER_MTP draft head for Step3p5 (MoE) |
371 | | llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) |
372 | 0 | : llm_graph_context(params) { |
373 | 0 | GGML_ASSERT(hparams.n_layer_nextn > 0 && "STEP35 MTP requires n_layer_nextn > 0"); |
374 | | |
375 | | // Single-block MTP only: always run the first trained MTP block (Qwen |
376 | | // MTP / vLLM single-MTP-layer style). Multi-block round-robin proved to |
377 | | // be a much deeper refactor than this PR justifies; the trailing MTP |
378 | | // blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just |
379 | | // block 0) also work — see load_arch_tensors below and |
380 | | // scripts/prune_step35_extra_mtp.py. |
381 | 0 | const int il = hparams.n_layer(); |
382 | 0 | const auto & layer = model.layers[il]; |
383 | |
|
384 | 0 | GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); |
385 | 0 | GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); |
386 | 0 | GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); |
387 | |
|
388 | 0 | const uint32_t n_head_l = hparams.n_head(il); |
389 | 0 | const uint32_t n_head_kv_l = hparams.n_head_kv(il); |
390 | |
|
391 | 0 | const float freq_base_l = model.get_rope_freq_base(cparams, il); |
392 | 0 | const float freq_scale_l = model.get_rope_freq_scale(cparams, il); |
393 | |
|
394 | 0 | auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd); |
395 | |
|
396 | 0 | inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); |
397 | 0 | ggml_set_input(inp->tokens); |
398 | |
|
399 | 0 | inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); |
400 | 0 | ggml_set_input(inp->embd); |
401 | 0 | ggml_set_name(inp->embd, "mtp_h_input"); |
402 | |
|
403 | 0 | ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; |
404 | |
|
405 | 0 | ggml_tensor * h_input = inp->embd; |
406 | 0 | ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); |
407 | 0 | cb(tok_embd, "mtp_tok_embd", il); |
408 | |
|
409 | 0 | res->add_input(std::move(inp)); |
410 | |
|
411 | 0 | ggml_tensor * inp_pos = build_inp_pos(); |
412 | 0 | auto * inp_attn = build_attn_inp_kv_iswa(); |
413 | |
|
414 | 0 | ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); |
415 | 0 | cb(h_norm, "mtp_hnorm", il); |
416 | |
|
417 | 0 | ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); |
418 | 0 | cb(e_norm, "mtp_enorm", il); |
419 | |
|
420 | 0 | ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); |
421 | 0 | cb(concat, "mtp_concat", il); |
422 | |
|
423 | 0 | ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); |
424 | 0 | cb(cur, "mtp_eh_proj", il); |
425 | |
|
426 | 0 | ggml_tensor * inpSA = cur; |
427 | | |
428 | | // mtp_block: full Step3p5 decoder layer (attention with optional head-wise gate, then MoE/dense FFN) |
429 | 0 | cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); |
430 | 0 | cb(cur, "mtp_attn_norm", il); |
431 | |
|
432 | 0 | ggml_tensor * Qcur = build_lora_mm(layer.wq, cur, layer.wq_s); |
433 | 0 | ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); |
434 | 0 | ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); |
435 | 0 | cb(Qcur, "mtp_Qcur", il); |
436 | 0 | cb(Kcur, "mtp_Kcur", il); |
437 | 0 | cb(Vcur, "mtp_Vcur", il); |
438 | |
|
439 | 0 | Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens); |
440 | 0 | Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens); |
441 | 0 | Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens); |
442 | |
|
443 | 0 | if (layer.attn_q_norm) { |
444 | 0 | Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); |
445 | 0 | cb(Qcur, "mtp_Qcur_normed", il); |
446 | 0 | } |
447 | 0 | if (layer.attn_k_norm) { |
448 | 0 | Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); |
449 | 0 | cb(Kcur, "mtp_Kcur_normed", il); |
450 | 0 | } |
451 | |
|
452 | 0 | const bool is_swa = hparams.is_swa(il); |
453 | 0 | ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il); |
454 | 0 | const int64_t n_rot_l = hparams.n_rot(il); |
455 | |
|
456 | 0 | Qcur = ggml_rope_ext( |
457 | 0 | ctx0, Qcur, inp_pos, rope_factors, |
458 | 0 | n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, |
459 | 0 | ext_factor, attn_factor, beta_fast, beta_slow); |
460 | 0 | Kcur = ggml_rope_ext( |
461 | 0 | ctx0, Kcur, inp_pos, rope_factors, |
462 | 0 | n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, |
463 | 0 | ext_factor, attn_factor, beta_fast, beta_slow); |
464 | 0 | cb(Qcur, "mtp_Qcur_pos", il); |
465 | 0 | cb(Kcur, "mtp_Kcur_pos", il); |
466 | |
|
467 | 0 | const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k)); |
468 | 0 | ggml_tensor * attn_out = build_attn(inp_attn, |
469 | 0 | nullptr, nullptr, nullptr, |
470 | 0 | Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); |
471 | 0 | cb(attn_out, "mtp_attn_out", il); |
472 | | |
473 | | // head-wise attention gate: sigmoid(g_proj(x)) |
474 | 0 | if (layer.wqkv_gate) { |
475 | 0 | ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, cur); // [n_head_l, n_tokens] |
476 | 0 | cb(gate, "mtp_attn_gate", il); |
477 | |
|
478 | 0 | gate = ggml_sigmoid(ctx0, gate); |
479 | 0 | cb(gate, "mtp_attn_gate_sigmoid", il); |
480 | |
|
481 | 0 | ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens); |
482 | 0 | ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens); |
483 | 0 | cb(gate_3d, "mtp_attn_gate_3d", il); |
484 | |
|
485 | 0 | attn_3d = ggml_mul(ctx0, attn_3d, gate_3d); |
486 | 0 | cb(attn_3d, "mtp_attn_gated_3d", il); |
487 | |
|
488 | 0 | attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens); |
489 | 0 | cb(attn_out, "mtp_attn_gated", il); |
490 | 0 | } |
491 | |
|
492 | 0 | cur = build_lora_mm(layer.wo, attn_out, layer.wo_s); |
493 | 0 | cb(cur, "mtp_attn_proj", il); |
494 | |
|
495 | 0 | cur = ggml_add(ctx0, cur, inpSA); |
496 | 0 | cb(cur, "mtp_attn_residual", il); |
497 | |
|
498 | 0 | ggml_tensor * ffn_inp = cur; |
499 | 0 | cur = build_norm(cur, layer.ffn_norm, nullptr, LLM_NORM_RMS, il); |
500 | 0 | cb(cur, "mtp_ffn_norm", il); |
501 | | |
502 | | // FFN: dense MLP or MoE (mirrors trunk path) |
503 | 0 | if (layer.ffn_gate_inp == nullptr) { |
504 | 0 | cur = build_ffn(cur, |
505 | 0 | layer.ffn_up, layer.ffn_up_b, nullptr, |
506 | 0 | layer.ffn_gate, layer.ffn_gate_b, nullptr, |
507 | 0 | layer.ffn_down, layer.ffn_down_b, nullptr, |
508 | 0 | nullptr, |
509 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
510 | 0 | cb(cur, "mtp_ffn_out", il); |
511 | 0 | } else { |
512 | 0 | ggml_tensor * moe_out = build_moe_ffn(cur, |
513 | 0 | layer.ffn_gate_inp, |
514 | 0 | layer.ffn_up_exps, |
515 | 0 | layer.ffn_gate_exps, |
516 | 0 | layer.ffn_down_exps, |
517 | 0 | layer.ffn_exp_probs_b, |
518 | 0 | n_expert, n_expert_used, |
519 | 0 | LLM_FFN_SILU, hparams.expert_weights_norm, |
520 | 0 | hparams.expert_weights_scale, |
521 | 0 | (llama_expert_gating_func_type) hparams.expert_gating_func, |
522 | 0 | il); |
523 | 0 | cb(moe_out, "mtp_ffn_moe_out", il); |
524 | |
|
525 | 0 | ggml_tensor * sh_out = build_ffn(cur, |
526 | 0 | layer.ffn_up_shexp, nullptr, nullptr, |
527 | 0 | layer.ffn_gate_shexp, nullptr, nullptr, |
528 | 0 | layer.ffn_down_shexp, nullptr, nullptr, |
529 | 0 | nullptr, |
530 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
531 | 0 | cb(sh_out, "mtp_ffn_shared_out", il); |
532 | |
|
533 | 0 | cur = ggml_add(ctx0, moe_out, sh_out); |
534 | 0 | cb(cur, "mtp_ffn_out", il); |
535 | 0 | } |
536 | 0 | cur = ggml_add(ctx0, cur, ffn_inp); |
537 | 0 | cb(cur, "mtp_post_ffn", il); |
538 | | |
539 | | // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. |
540 | 0 | cb(cur, "h_nextn", -1); |
541 | 0 | res->t_h_nextn = cur; |
542 | |
|
543 | 0 | ggml_tensor * head_norm_w = layer.nextn.shared_head_norm |
544 | 0 | ? layer.nextn.shared_head_norm |
545 | 0 | : model.output_norm; |
546 | 0 | GGML_ASSERT(head_norm_w && "STEP35 MTP: missing both nextn.shared_head_norm and output_norm"); |
547 | 0 | cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); |
548 | 0 | cb(cur, "mtp_shared_head_norm", -1); |
549 | |
|
550 | 0 | ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; |
551 | 0 | GGML_ASSERT(head_w && "STEP35 MTP: missing LM head (nextn.shared_head_head or model.output)"); |
552 | 0 | cur = build_lora_mm(head_w, cur); |
553 | 0 | cb(cur, "result_output", -1); |
554 | |
|
555 | 0 | res->t_logits = cur; |
556 | 0 | ggml_build_forward_expand(gf, cur); |
557 | 0 | } |