/src/llama.cpp/src/models/qwen35moe.cpp
Line | Count | Source |
1 | | #include "models.h" |
2 | | #include "llama-memory-recurrent.h" |
3 | | |
4 | 0 | void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { |
5 | 0 | ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); |
6 | 0 | ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); |
7 | 0 | ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); |
8 | |
|
9 | 0 | ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); |
10 | | |
11 | | // Load linear attention (gated delta net) parameters |
12 | 0 | ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); |
13 | 0 | ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); |
14 | 0 | ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); |
15 | 0 | ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); |
16 | 0 | ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); |
17 | | |
18 | | // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack |
19 | 0 | ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); |
20 | 0 | GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); |
21 | | |
22 | | // Mark recurrent layers (linear attention layers). MTP layers are dense |
23 | | // attention-only and must be flagged non-recurrent. |
24 | 0 | if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { |
25 | 0 | uint32_t full_attn_interval = 4; |
26 | 0 | ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); |
27 | 0 | for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { |
28 | 0 | hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); |
29 | 0 | } |
30 | 0 | } |
31 | |
|
32 | 0 | switch (hparams.n_layer()) { |
33 | 0 | case 40: type = LLM_TYPE_35B_A3B; break; |
34 | 0 | case 48: type = LLM_TYPE_122B_A10B; break; |
35 | 0 | case 60: type = LLM_TYPE_397B_A17B; break; |
36 | 0 | default: type = LLM_TYPE_UNKNOWN; |
37 | 0 | } |
38 | 0 | } |
39 | | |
40 | 0 | void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { |
41 | 0 | LLAMA_LOAD_LOCALS; |
42 | |
|
43 | 0 | const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); |
44 | 0 | const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; |
45 | |
|
46 | 0 | tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); |
47 | | |
48 | | // output |
49 | 0 | output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); |
50 | 0 | output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); |
51 | | |
52 | | // if output is NULL, init from the input tok embed |
53 | 0 | if (output == NULL) { |
54 | 0 | output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); |
55 | 0 | } |
56 | |
|
57 | 0 | auto load_block_trunk = [&](int il, int flags) { |
58 | 0 | auto & layer = layers[il]; |
59 | |
|
60 | 0 | const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; |
61 | 0 | const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; |
62 | | |
63 | | // Calculate dimensions from hyperparameters |
64 | 0 | const int64_t head_k_dim = hparams.ssm_d_state; |
65 | 0 | const int64_t head_v_dim = hparams.ssm_d_state; |
66 | 0 | const int64_t n_k_heads = hparams.ssm_n_group; |
67 | 0 | const int64_t n_v_heads = hparams.ssm_dt_rank; |
68 | 0 | const int64_t key_dim = head_k_dim * n_k_heads; |
69 | 0 | const int64_t value_dim = head_v_dim * n_v_heads; |
70 | 0 | const int64_t conv_dim = key_dim * 2 + value_dim; |
71 | |
|
72 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); |
73 | 0 | layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); |
74 | |
|
75 | 0 | if (!hparams.is_recr(il)) { |
76 | | // Attention layers |
77 | 0 | create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); |
78 | 0 | layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); |
79 | | |
80 | | // Q/K normalization for attention layers |
81 | 0 | layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags); |
82 | 0 | layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags); |
83 | 0 | } else { |
84 | | // Linear attention (gated delta net) specific tensors |
85 | | // Create tensors with calculated dimensions |
86 | 0 | layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); |
87 | 0 | layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); |
88 | 0 | layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags); |
89 | 0 | layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags); |
90 | 0 | layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags); |
91 | 0 | layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags); |
92 | 0 | layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags); |
93 | 0 | layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags); |
94 | 0 | layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags); |
95 | 0 | } |
96 | | |
97 | | // Routed experts |
98 | 0 | layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, flags); |
99 | 0 | layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, flags); |
100 | 0 | create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, flags); |
101 | | |
102 | | // Shared experts |
103 | 0 | layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, flags); |
104 | 0 | layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); |
105 | 0 | layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); |
106 | 0 | layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, flags); |
107 | 0 | }; |
108 | |
|
109 | 0 | auto load_block_mtp = [&](int il) { |
110 | 0 | auto & layer = layers[il]; |
111 | |
|
112 | 0 | const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; |
113 | 0 | const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; |
114 | | |
115 | | // MTP block looks like a full-attention Qwen3.5 decoder block with MoE FFN. |
116 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0); |
117 | 0 | layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0); |
118 | |
|
119 | 0 | create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); |
120 | 0 | layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0); |
121 | 0 | layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0); |
122 | 0 | layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0); |
123 | | |
124 | | // Routed experts |
125 | 0 | layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, 0); |
126 | 0 | layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, 0); |
127 | 0 | create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, 0); |
128 | | |
129 | | // Shared experts |
130 | 0 | layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, 0); |
131 | 0 | layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0); |
132 | 0 | layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0); |
133 | 0 | layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, 0); |
134 | | |
135 | | // NextN-specific tensors that define the MTP block. |
136 | 0 | layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0); |
137 | 0 | layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0); |
138 | 0 | layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0); |
139 | 0 | layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); |
140 | 0 | layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); |
141 | 0 | layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); |
142 | 0 | }; |
143 | |
|
144 | 0 | for (int i = 0; i < n_layer; ++i) { |
145 | 0 | load_block_trunk(i, trunk_flags); |
146 | 0 | } |
147 | 0 | for (int i = n_layer; i < n_layer_all; ++i) { |
148 | 0 | load_block_mtp(i); |
149 | 0 | } |
150 | 0 | } |
151 | | |
152 | 0 | std::unique_ptr<llm_graph_context> llama_model_qwen35moe::build_arch_graph(const llm_graph_params & params) const { |
153 | 0 | if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { |
154 | 0 | return std::make_unique<graph_mtp>(*this, params); |
155 | 0 | } |
156 | 0 | return std::make_unique<graph>(*this, params); |
157 | 0 | } |
158 | | |
159 | | llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_params & params) : |
160 | 0 | llm_build_delta_net_base(params), model(model) { |
161 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v(); |
162 | |
|
163 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); |
164 | |
|
165 | 0 | int sections[4]; |
166 | 0 | std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); |
167 | |
|
168 | 0 | ggml_tensor * cur; |
169 | 0 | ggml_tensor * inpL; |
170 | |
|
171 | 0 | inpL = build_inp_embd(model.tok_embd); |
172 | |
|
173 | 0 | cb(inpL, "model.input_embed", -1); |
174 | |
|
175 | 0 | auto * inp = build_inp_mem_hybrid(); |
176 | |
|
177 | 0 | ggml_tensor * inp_pos = build_inp_pos(); |
178 | 0 | ggml_tensor * inp_out_ids = build_inp_out_ids(); |
179 | | |
180 | | // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. |
181 | 0 | for (int il = 0; il < n_layer; ++il) { |
182 | 0 | res->t_layer_inp[il] = inpL; |
183 | |
|
184 | 0 | ggml_tensor * inpSA = inpL; |
185 | |
|
186 | 0 | cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); |
187 | 0 | cb(cur, "attn_norm", il); |
188 | |
|
189 | 0 | ggml_build_forward_expand(gf, cur); |
190 | | |
191 | | // Determine layer type and build appropriate attention mechanism |
192 | 0 | if (hparams.is_recr(il)) { |
193 | | // Linear attention layer (gated delta net) |
194 | 0 | cur = build_layer_attn_linear(inp->get_recr(), cur, il); |
195 | 0 | } else { |
196 | | // Full attention layer |
197 | 0 | cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); |
198 | 0 | } |
199 | |
|
200 | 0 | if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { |
201 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
202 | 0 | inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
203 | 0 | } |
204 | | |
205 | | // Residual connection |
206 | 0 | cur = ggml_add(ctx0, cur, inpSA); |
207 | 0 | cb(cur, "attn_residual", il); |
208 | | |
209 | | // Save the tensor before post-attention norm for residual connection |
210 | 0 | ggml_tensor * ffn_residual = cur; |
211 | | |
212 | | // Post-attention norm |
213 | 0 | ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); |
214 | 0 | cb(attn_post_norm, "attn_post_norm", il); |
215 | | |
216 | | // MOE FFN layer |
217 | 0 | cur = build_layer_ffn(attn_post_norm, il); |
218 | 0 | cb(cur, "ffn_out", il); |
219 | | |
220 | | // Residual connection for FFN - add to the tensor from before post_attention_layernorm |
221 | 0 | cur = ggml_add(ctx0, cur, ffn_residual); |
222 | 0 | cb(cur, "post_moe", il); |
223 | |
|
224 | 0 | cur = build_cvec(cur, il); |
225 | 0 | cb(cur, "l_out", il); |
226 | | |
227 | | // Input for next layer |
228 | 0 | inpL = cur; |
229 | 0 | } |
230 | 0 | cur = inpL; |
231 | | |
232 | | // post-norm hidden state feeds both the LM head and the MTP seed below |
233 | 0 | cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); |
234 | |
|
235 | 0 | cb(cur, "h_nextn", -1); |
236 | 0 | res->t_h_nextn = cur; |
237 | |
|
238 | 0 | if (!cparams.embeddings_nextn_masked && inp_out_ids) { |
239 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
240 | 0 | } |
241 | |
|
242 | 0 | cb(cur, "result_norm", -1); |
243 | 0 | res->t_embd = cur; |
244 | | |
245 | | // LM head |
246 | 0 | cur = build_lora_mm(model.output, cur, model.output_s); |
247 | |
|
248 | 0 | cb(cur, "result_output", -1); |
249 | 0 | res->t_logits = cur; |
250 | |
|
251 | 0 | ggml_build_forward_expand(gf, cur); |
252 | 0 | } |
253 | | |
254 | | std::pair<ggml_tensor *, ggml_tensor *> llama_model_qwen35moe::graph::build_qkvz( |
255 | | ggml_tensor * input, |
256 | 0 | int il) { |
257 | 0 | const int64_t n_seqs = ubatch.n_seqs; |
258 | 0 | const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
259 | |
|
260 | 0 | ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s); |
261 | 0 | qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); |
262 | 0 | cb(qkv_mixed, "linear_attn_qkv_mixed", il); |
263 | |
|
264 | 0 | ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s); |
265 | 0 | cb(z, "z", il); |
266 | |
|
267 | 0 | return { qkv_mixed, z }; |
268 | 0 | } |
269 | | |
270 | | ggml_tensor * llama_model_qwen35moe::graph::build_norm_gated( |
271 | | ggml_tensor * input, |
272 | | ggml_tensor * weights, |
273 | | ggml_tensor * gate, |
274 | 0 | int layer) { |
275 | 0 | ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); |
276 | 0 | ggml_tensor * gated_silu = ggml_silu(ctx0, gate); |
277 | |
|
278 | 0 | return ggml_mul(ctx0, normalized, gated_silu); |
279 | 0 | } |
280 | | |
281 | | ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn( |
282 | | llm_graph_input_attn_kv * inp, |
283 | | ggml_tensor * cur, |
284 | | ggml_tensor * inp_pos, |
285 | | int * sections, |
286 | 0 | int il) { |
287 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v(); |
288 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); |
289 | | |
290 | | // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention |
291 | | |
292 | | // Qwen3Next uses a single Q projection that outputs query + gate |
293 | 0 | ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ] |
294 | 0 | cb(Qcur_full, "Qcur_full", il); |
295 | |
|
296 | 0 | ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, |
297 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
298 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0); |
299 | 0 | cb(Qcur, "Qcur_reshaped", il); |
300 | | |
301 | | // Apply Q normalization |
302 | 0 | Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); |
303 | 0 | cb(Qcur, "Qcur_normed", il); |
304 | |
|
305 | 0 | ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s); |
306 | 0 | cb(Kcur, "Kcur", il); |
307 | |
|
308 | 0 | ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s); |
309 | 0 | cb(Vcur, "Vcur", il); |
310 | | |
311 | | // Apply K normalization |
312 | 0 | Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
313 | 0 | Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); |
314 | 0 | cb(Kcur, "Kcur_normed", il); |
315 | |
|
316 | 0 | ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, |
317 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
318 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, |
319 | 0 | ggml_element_size(Qcur_full) * n_embd_head); |
320 | 0 | gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); |
321 | 0 | cb(gate, "gate_reshaped", il); |
322 | |
|
323 | 0 | Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); |
324 | | |
325 | | // Apply IMRoPE |
326 | 0 | Qcur = ggml_rope_multi( |
327 | 0 | ctx0, Qcur, inp_pos, nullptr, |
328 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
329 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
330 | 0 | ); |
331 | |
|
332 | 0 | Kcur = ggml_rope_multi( |
333 | 0 | ctx0, Kcur, inp_pos, nullptr, |
334 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
335 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
336 | 0 | ); |
337 | |
|
338 | 0 | cb(Qcur, "Qcur", il); |
339 | 0 | cb(Kcur, "Kcur", il); |
340 | 0 | cb(Vcur, "Vcur", il); |
341 | | |
342 | | // Attention computation |
343 | 0 | const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; |
344 | |
|
345 | 0 | cur = build_attn(inp, |
346 | 0 | nullptr, nullptr, nullptr, |
347 | 0 | Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); |
348 | 0 | cb(cur, "attn_pregate", il); |
349 | |
|
350 | 0 | ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate); |
351 | 0 | cb(gate_sigmoid, "gate_sigmoid", il); |
352 | |
|
353 | 0 | cur = ggml_mul(ctx0, cur, gate_sigmoid); |
354 | 0 | cb(cur, "attn_gated", il); |
355 | |
|
356 | 0 | cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s); |
357 | 0 | cb(cur, "attn_output", il); |
358 | |
|
359 | 0 | return cur; |
360 | 0 | } |
361 | | |
362 | | ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear( |
363 | | llm_graph_input_rs * inp, |
364 | | ggml_tensor * cur, |
365 | 0 | int il) { |
366 | 0 | const auto * mctx_cur = inp->mctx; |
367 | |
|
368 | 0 | const int64_t d_inner = hparams.ssm_d_inner; |
369 | 0 | const int64_t n_seqs = ubatch.n_seqs; |
370 | 0 | const int64_t head_k_dim = hparams.ssm_d_state; |
371 | 0 | const int64_t num_k_heads = hparams.ssm_n_group; |
372 | 0 | const int64_t num_v_heads = hparams.ssm_dt_rank; |
373 | 0 | const int64_t head_v_dim = d_inner / num_v_heads; |
374 | 0 | const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
375 | |
|
376 | 0 | GGML_ASSERT(n_seqs != 0); |
377 | 0 | GGML_ASSERT(ubatch.equal_seqs()); |
378 | 0 | GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); |
379 | | |
380 | | // Input projections |
381 | 0 | auto qkvz = build_qkvz(cur, il); |
382 | 0 | ggml_tensor * qkv_mixed = qkvz.first; |
383 | 0 | ggml_tensor * z = qkvz.second; |
384 | |
|
385 | 0 | ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s); |
386 | 0 | beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); |
387 | 0 | cb(beta, "beta", il); |
388 | |
|
389 | 0 | beta = ggml_sigmoid(ctx0, beta); |
390 | 0 | cb(beta, "beta_sigmoid", il); |
391 | |
|
392 | 0 | ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s); |
393 | 0 | alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); |
394 | 0 | cb(alpha, "alpha", il); |
395 | |
|
396 | 0 | ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); |
397 | 0 | ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); |
398 | 0 | cb(alpha_softplus, "a_softplus", il); |
399 | |
|
400 | 0 | ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus |
401 | 0 | cb(gate, "gate", il); |
402 | |
|
403 | 0 | gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); |
404 | |
|
405 | 0 | ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); |
406 | 0 | ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); |
407 | |
|
408 | 0 | ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; |
409 | 0 | const int64_t conv_kernel_size = conv_kernel->ne[0]; |
410 | 0 | const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; |
411 | |
|
412 | 0 | ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il); |
413 | |
|
414 | 0 | ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); |
415 | 0 | state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); |
416 | 0 | cb(state, "state_predelta", il); |
417 | |
|
418 | 0 | ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel); |
419 | 0 | cb(conv_output_proper, "conv_output_raw", il); |
420 | |
|
421 | 0 | ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper); |
422 | 0 | cb(conv_output_silu, "conv_output_silu", il); |
423 | |
|
424 | 0 | ggml_tensor * conv_qkv_mix = conv_output_silu; |
425 | | |
426 | | // Calculate the total conv dimension |
427 | 0 | int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads; |
428 | 0 | int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim); |
429 | | |
430 | | // Extract the convolved Q, K, V from conv_output |
431 | 0 | ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, |
432 | 0 | ggml_row_size(conv_qkv_mix->type, head_k_dim), |
433 | 0 | nb1_qkv, |
434 | 0 | nb1_qkv * n_seq_tokens, |
435 | 0 | 0); |
436 | |
|
437 | 0 | ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, |
438 | 0 | ggml_row_size(conv_qkv_mix->type, head_k_dim), |
439 | 0 | nb1_qkv, |
440 | 0 | nb1_qkv * n_seq_tokens, |
441 | 0 | head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); |
442 | |
|
443 | 0 | ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs, |
444 | 0 | ggml_row_size(conv_qkv_mix->type, head_v_dim), |
445 | 0 | nb1_qkv, |
446 | 0 | nb1_qkv * n_seq_tokens, |
447 | 0 | ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads)); |
448 | |
|
449 | 0 | cb(q_conv, "q_conv", il); |
450 | 0 | cb(k_conv, "k_conv", il); |
451 | 0 | cb(v_conv, "v_conv", il); |
452 | |
|
453 | 0 | const float eps_norm = hparams.f_norm_rms_eps; |
454 | |
|
455 | 0 | q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm); |
456 | 0 | k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm); |
457 | | |
458 | | //q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); |
459 | | //k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); |
460 | | //v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); |
461 | | |
462 | | // if head keys and value keys are different, repeat to force tensors into matching shapes |
463 | | // note: need explicit repeat only if we are not using the fused GDN. |
464 | 0 | if (num_k_heads != num_v_heads && (!cparams.fused_gdn_ar || !cparams.fused_gdn_ch)) { |
465 | 0 | GGML_ASSERT(num_v_heads % num_k_heads == 0); |
466 | 0 | q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs); |
467 | 0 | k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs); |
468 | 0 | } |
469 | |
|
470 | 0 | cb(q_conv, "q_conv_predelta", il); |
471 | 0 | cb(k_conv, "k_conv_predelta", il); |
472 | 0 | cb(v_conv, "v_conv_predelta", il); |
473 | |
|
474 | 0 | ggml_tensor * output = build_recurrent_attn(inp, ssm_states_all, q_conv, k_conv, v_conv, gate, beta, state, il); |
475 | | |
476 | | // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] |
477 | 0 | ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); |
478 | | |
479 | | // Apply gated normalization: self.norm(core_attn_out, z) |
480 | 0 | ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il); |
481 | | |
482 | | // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim] |
483 | 0 | ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); |
484 | 0 | cb(final_output, "final_output", il); |
485 | | |
486 | | // Output projection |
487 | 0 | cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s); |
488 | 0 | cb(cur, "linear_attn_out", il); |
489 | | |
490 | | // Reshape back to original dimensions |
491 | 0 | cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs); |
492 | |
|
493 | 0 | return cur; |
494 | 0 | } |
495 | | |
496 | 0 | ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, const int il) { |
497 | | // Check if this is an MoE layer |
498 | 0 | GGML_ASSERT(model.layers[il].ffn_gate_inp != nullptr); |
499 | |
|
500 | 0 | ggml_tensor * moe_out = |
501 | 0 | build_moe_ffn(cur, |
502 | 0 | model.layers[il].ffn_gate_inp, |
503 | 0 | model.layers[il].ffn_up_exps, |
504 | 0 | model.layers[il].ffn_gate_exps, |
505 | 0 | model.layers[il].ffn_down_exps, |
506 | 0 | nullptr, |
507 | 0 | n_expert, n_expert_used, |
508 | 0 | LLM_FFN_SILU, true, |
509 | 0 | hparams.expert_weights_scale, |
510 | 0 | LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, |
511 | 0 | nullptr, model.layers[il].ffn_gate_up_exps, |
512 | 0 | model.layers[il].ffn_up_exps_s, |
513 | 0 | model.layers[il].ffn_gate_exps_s, |
514 | 0 | model.layers[il].ffn_down_exps_s); |
515 | 0 | cb(moe_out, "ffn_moe_out", il); |
516 | | |
517 | | // Add shared experts if present - following Qwen3Next reference implementation |
518 | 0 | if (model.layers[il].ffn_up_shexp != nullptr) { |
519 | 0 | ggml_tensor * ffn_shexp = |
520 | 0 | build_ffn(cur, |
521 | 0 | model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s, |
522 | 0 | model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s, |
523 | 0 | model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s, |
524 | 0 | NULL, |
525 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
526 | 0 | cb(ffn_shexp, "ffn_shexp", il); |
527 | | |
528 | | // Apply shared expert gating as in the reference implementation |
529 | | // The shared expert has its own gate that is sigmoided |
530 | | // Note: ffn_gate_inp_shexp is the shared expert gate (outputs 1 value per token) |
531 | 0 | ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); |
532 | 0 | cb(shared_gate, "shared_expert_gate", il); |
533 | | |
534 | | // Apply sigmoid to the gate |
535 | 0 | shared_gate = ggml_sigmoid(ctx0, shared_gate); |
536 | 0 | cb(shared_gate, "shared_expert_gate_sigmoid", il); |
537 | | |
538 | | |
539 | | // Apply the gate to the shared expert output |
540 | 0 | ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); |
541 | 0 | cb(ffn_shexp, "ffn_shexp_gated", il); |
542 | |
|
543 | 0 | cur = ggml_add(ctx0, moe_out, ffn_shexp); |
544 | 0 | cb(cur, "ffn_out", il); |
545 | 0 | } else { |
546 | 0 | cur = moe_out; |
547 | 0 | } |
548 | |
|
549 | 0 | return cur; |
550 | 0 | } |
551 | | |
552 | | // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE |
553 | | llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) |
554 | 0 | : llm_graph_context(params) { |
555 | 0 | GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35MOE MTP requires n_layer_nextn > 0"); |
556 | 0 | GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35MOE MTP currently only supports a single MTP block"); |
557 | |
|
558 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v(); |
559 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); |
560 | |
|
561 | 0 | const int il = hparams.n_layer(); |
562 | 0 | const auto & layer = model.layers[il]; |
563 | |
|
564 | 0 | GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); |
565 | 0 | GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); |
566 | 0 | GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); |
567 | 0 | GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp"); |
568 | |
|
569 | 0 | int sections[4]; |
570 | 0 | std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); |
571 | | |
572 | | // TODO: extract in a common llm_graph_context::build_inp_embd_h() |
573 | 0 | auto inp = std::make_unique<llm_graph_input_embd_h>(hparams.n_embd); |
574 | |
|
575 | 0 | inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); |
576 | 0 | ggml_set_input(inp->tokens); |
577 | |
|
578 | 0 | inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); |
579 | 0 | ggml_set_input(inp->embd); |
580 | | |
581 | | // TODO: make static using `ggml_build_forward_select()` |
582 | | // see llm_graph_context::build_inp_embd() for reference |
583 | 0 | ggml_tensor * tok_embd; |
584 | 0 | if (ubatch.token) { |
585 | 0 | ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; |
586 | |
|
587 | 0 | tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); |
588 | 0 | } else { |
589 | 0 | tok_embd = inp->embd; |
590 | 0 | } |
591 | 0 | cb(tok_embd, "mtp_tok_embd", il); |
592 | |
|
593 | 0 | inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); |
594 | 0 | ggml_set_input(inp->h); |
595 | 0 | ggml_set_name(inp->h, "mtp_h_input"); |
596 | |
|
597 | 0 | ggml_tensor * h_embd = inp->h; |
598 | |
|
599 | 0 | res->add_input(std::move(inp)); |
600 | |
|
601 | 0 | ggml_tensor * inp_pos = build_inp_pos(); |
602 | 0 | ggml_tensor * inp_out_ids = build_inp_out_ids(); |
603 | |
|
604 | 0 | auto * inp_attn = build_attn_inp_kv(); |
605 | |
|
606 | 0 | ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); |
607 | 0 | cb(h_norm, "mtp_hnorm", il); |
608 | |
|
609 | 0 | ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); |
610 | 0 | cb(e_norm, "mtp_enorm", il); |
611 | |
|
612 | 0 | ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); |
613 | 0 | cb(concat, "mtp_concat", il); |
614 | |
|
615 | 0 | ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); |
616 | 0 | cb(cur, "mtp_eh_proj", il); |
617 | |
|
618 | 0 | ggml_tensor * inpSA = cur; |
619 | |
|
620 | 0 | cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); |
621 | 0 | cb(cur, "mtp_attn_norm", il); |
622 | |
|
623 | 0 | ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); |
624 | 0 | cb(Qcur_full, "mtp_Qcur_full", il); |
625 | |
|
626 | 0 | ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, |
627 | 0 | n_embd_head, n_head, n_tokens, |
628 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
629 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, |
630 | 0 | 0); |
631 | 0 | Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); |
632 | 0 | cb(Qcur, "mtp_Qcur_normed", il); |
633 | |
|
634 | 0 | ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, |
635 | 0 | n_embd_head, n_head, n_tokens, |
636 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
637 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, |
638 | 0 | ggml_element_size(Qcur_full) * n_embd_head); |
639 | 0 | gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); |
640 | 0 | cb(gate, "mtp_gate", il); |
641 | |
|
642 | 0 | ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); |
643 | 0 | Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
644 | 0 | Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); |
645 | 0 | cb(Kcur, "mtp_Kcur_normed", il); |
646 | |
|
647 | 0 | ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); |
648 | 0 | Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); |
649 | 0 | cb(Vcur, "mtp_Vcur", il); |
650 | |
|
651 | 0 | Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, |
652 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
653 | 0 | ext_factor, attn_factor, beta_fast, beta_slow); |
654 | 0 | Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, |
655 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
656 | 0 | ext_factor, attn_factor, beta_fast, beta_slow); |
657 | |
|
658 | 0 | const float kq_scale = hparams.f_attention_scale == 0.0f |
659 | 0 | ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; |
660 | |
|
661 | 0 | cur = build_attn(inp_attn, |
662 | 0 | nullptr, nullptr, nullptr, |
663 | 0 | Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); |
664 | 0 | cb(cur, "mtp_attn_pregate", il); |
665 | |
|
666 | 0 | cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); |
667 | 0 | cur = build_lora_mm(layer.wo, cur, layer.wo_s); |
668 | 0 | cb(cur, "mtp_attn_out", il); |
669 | |
|
670 | 0 | cur = ggml_add(ctx0, cur, inpSA); |
671 | 0 | cb(cur, "mtp_attn_residual", il); |
672 | |
|
673 | 0 | ggml_tensor * ffn_residual = cur; |
674 | 0 | cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); |
675 | 0 | cb(cur, "mtp_attn_post_norm", il); |
676 | | |
677 | | // MoE FFN — routed experts plus gated shared expert (mirrors qwen35moe). |
678 | 0 | ggml_tensor * moe_out = |
679 | 0 | build_moe_ffn(cur, |
680 | 0 | layer.ffn_gate_inp, |
681 | 0 | layer.ffn_up_exps, |
682 | 0 | layer.ffn_gate_exps, |
683 | 0 | layer.ffn_down_exps, |
684 | 0 | nullptr, |
685 | 0 | n_expert, n_expert_used, |
686 | 0 | LLM_FFN_SILU, true, |
687 | 0 | hparams.expert_weights_scale, |
688 | 0 | LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, |
689 | 0 | nullptr, layer.ffn_gate_up_exps, |
690 | 0 | layer.ffn_up_exps_s, |
691 | 0 | layer.ffn_gate_exps_s, |
692 | 0 | layer.ffn_down_exps_s); |
693 | 0 | cb(moe_out, "mtp_ffn_moe_out", il); |
694 | |
|
695 | 0 | if (layer.ffn_up_shexp != nullptr) { |
696 | 0 | ggml_tensor * ffn_shexp = |
697 | 0 | build_ffn(cur, |
698 | 0 | layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, |
699 | 0 | layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, |
700 | 0 | layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, |
701 | 0 | nullptr, |
702 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
703 | 0 | cb(ffn_shexp, "mtp_ffn_shexp", il); |
704 | |
|
705 | 0 | ggml_tensor * shared_gate = build_lora_mm(layer.ffn_gate_inp_shexp, cur); |
706 | 0 | shared_gate = ggml_sigmoid(ctx0, shared_gate); |
707 | 0 | cb(shared_gate, "mtp_shared_expert_gate_sigmoid", il); |
708 | |
|
709 | 0 | ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); |
710 | 0 | cb(ffn_shexp, "mtp_ffn_shexp_gated", il); |
711 | |
|
712 | 0 | cur = ggml_add(ctx0, moe_out, ffn_shexp); |
713 | 0 | } else { |
714 | 0 | cur = moe_out; |
715 | 0 | } |
716 | 0 | cb(cur, "mtp_ffn_out", il); |
717 | |
|
718 | 0 | cur = ggml_add(ctx0, cur, ffn_residual); |
719 | 0 | cb(cur, "mtp_post_ffn", il); |
720 | |
|
721 | 0 | ggml_tensor * head_norm_w = layer.nextn.shared_head_norm |
722 | 0 | ? layer.nextn.shared_head_norm |
723 | 0 | : model.output_norm; |
724 | 0 | GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm"); |
725 | 0 | cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); |
726 | |
|
727 | 0 | cb(cur, "h_nextn", -1); |
728 | 0 | res->t_h_nextn= cur; |
729 | |
|
730 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
731 | 0 | cb(cur, "mtp_shared_head_norm", -1); |
732 | |
|
733 | 0 | ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; |
734 | 0 | ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; |
735 | 0 | GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); |
736 | 0 | cur = build_lora_mm(head_w, cur, head_s); |
737 | 0 | cb(cur, "result_output", -1); |
738 | |
|
739 | 0 | res->t_logits = cur; |
740 | 0 | ggml_build_forward_expand(gf, cur); |
741 | 0 | } |