/src/llama.cpp/src/models/qwen35moe.cpp
Line | Count | Source |
1 | | #include "models.h" |
2 | | #include "llama-memory-recurrent.h" |
3 | | |
4 | 0 | void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { |
5 | 0 | ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); |
6 | 0 | ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); |
7 | 0 | ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); |
8 | |
|
9 | 0 | ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); |
10 | | |
11 | | // Load linear attention (gated delta net) parameters |
12 | 0 | ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); |
13 | 0 | ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); |
14 | 0 | ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); |
15 | 0 | ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); |
16 | 0 | ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); |
17 | | |
18 | | // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack |
19 | 0 | ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); |
20 | 0 | GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); |
21 | | |
22 | | // Mark recurrent layers (linear attention layers). MTP layers are dense |
23 | | // attention-only and must be flagged non-recurrent. |
24 | 0 | if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { |
25 | 0 | uint32_t full_attn_interval = 4; |
26 | 0 | ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); |
27 | 0 | for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { |
28 | 0 | hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); |
29 | 0 | } |
30 | 0 | } |
31 | |
|
32 | 0 | switch (hparams.n_layer()) { |
33 | 0 | case 40: type = LLM_TYPE_35B_A3B; break; |
34 | 0 | case 48: type = LLM_TYPE_122B_A10B; break; |
35 | 0 | case 60: type = LLM_TYPE_397B_A17B; break; |
36 | 0 | default: type = LLM_TYPE_UNKNOWN; |
37 | 0 | } |
38 | 0 | } |
39 | | |
40 | 0 | void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { |
41 | 0 | LLAMA_LOAD_LOCALS; |
42 | |
|
43 | 0 | const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); |
44 | 0 | const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; |
45 | |
|
46 | 0 | tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); |
47 | | |
48 | | // output |
49 | 0 | output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); |
50 | 0 | output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); |
51 | | |
52 | | // if output is NULL, init from the input tok embed |
53 | 0 | if (output == NULL) { |
54 | 0 | output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); |
55 | 0 | } |
56 | |
|
57 | 0 | auto load_block_trunk = [&](int il, int flags) { |
58 | 0 | auto & layer = layers[il]; |
59 | |
|
60 | 0 | const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; |
61 | 0 | const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; |
62 | | |
63 | | // Calculate dimensions from hyperparameters |
64 | 0 | const int64_t head_k_dim = hparams.ssm_d_state; |
65 | 0 | const int64_t head_v_dim = hparams.ssm_d_state; |
66 | 0 | const int64_t n_k_heads = hparams.ssm_n_group; |
67 | 0 | const int64_t n_v_heads = hparams.ssm_dt_rank; |
68 | 0 | const int64_t key_dim = head_k_dim * n_k_heads; |
69 | 0 | const int64_t value_dim = head_v_dim * n_v_heads; |
70 | 0 | const int64_t conv_dim = key_dim * 2 + value_dim; |
71 | |
|
72 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); |
73 | 0 | layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); |
74 | |
|
75 | 0 | if (!hparams.is_recr(il)) { |
76 | | // Attention layers |
77 | 0 | create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); |
78 | 0 | layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); |
79 | | |
80 | | // Q/K normalization for attention layers |
81 | 0 | layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags); |
82 | 0 | layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags); |
83 | 0 | } else { |
84 | | // Linear attention (gated delta net) specific tensors |
85 | | // Create tensors with calculated dimensions |
86 | 0 | layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); |
87 | 0 | layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); |
88 | 0 | layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags); |
89 | 0 | layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags); |
90 | 0 | layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags); |
91 | 0 | layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags); |
92 | 0 | layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags); |
93 | 0 | layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags); |
94 | 0 | layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags); |
95 | 0 | } |
96 | | |
97 | | // Routed experts |
98 | 0 | layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, flags); |
99 | 0 | layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, flags); |
100 | 0 | create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, flags); |
101 | | |
102 | | // Shared experts |
103 | 0 | layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, flags); |
104 | 0 | layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); |
105 | 0 | layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); |
106 | 0 | layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, flags); |
107 | 0 | }; |
108 | |
|
109 | 0 | auto load_block_mtp = [&](int il) { |
110 | 0 | auto & layer = layers[il]; |
111 | |
|
112 | 0 | const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; |
113 | 0 | const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; |
114 | | |
115 | | // MTP block looks like a full-attention Qwen3.5 decoder block with MoE FFN. |
116 | 0 | layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0); |
117 | 0 | layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0); |
118 | |
|
119 | 0 | create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); |
120 | 0 | layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0); |
121 | 0 | layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0); |
122 | 0 | layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0); |
123 | | |
124 | | // Routed experts |
125 | 0 | layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, 0); |
126 | 0 | layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, 0); |
127 | 0 | create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, 0); |
128 | | |
129 | | // Shared experts |
130 | 0 | layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, 0); |
131 | 0 | layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0); |
132 | 0 | layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0); |
133 | 0 | layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, 0); |
134 | | |
135 | | // NextN-specific tensors that define the MTP block. |
136 | 0 | layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0); |
137 | 0 | layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0); |
138 | 0 | layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0); |
139 | 0 | layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); |
140 | 0 | layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); |
141 | 0 | layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); |
142 | 0 | }; |
143 | |
|
144 | 0 | for (int i = 0; i < n_layer; ++i) { |
145 | 0 | load_block_trunk(i, trunk_flags); |
146 | 0 | } |
147 | 0 | for (int i = n_layer; i < n_layer_all; ++i) { |
148 | 0 | load_block_mtp(i); |
149 | 0 | } |
150 | 0 | } |
151 | | |
152 | 0 | std::unique_ptr<llm_graph_context> llama_model_qwen35moe::build_arch_graph(const llm_graph_params & params) const { |
153 | 0 | if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { |
154 | 0 | return std::make_unique<graph_mtp>(*this, params); |
155 | 0 | } |
156 | 0 | return std::make_unique<graph>(*this, params); |
157 | 0 | } |
158 | | |
159 | | llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_params & params) : |
160 | 0 | llm_build_delta_net_base(params), model(model) { |
161 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v(); |
162 | |
|
163 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); |
164 | |
|
165 | 0 | int sections[4]; |
166 | 0 | std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); |
167 | |
|
168 | 0 | ggml_tensor * cur; |
169 | 0 | ggml_tensor * inpL; |
170 | |
|
171 | 0 | inpL = build_inp_embd(model.tok_embd); |
172 | |
|
173 | 0 | cb(inpL, "model.input_embed", -1); |
174 | |
|
175 | 0 | auto * inp = build_inp_mem_hybrid(); |
176 | |
|
177 | 0 | ggml_tensor * inp_pos = build_inp_pos(); |
178 | 0 | ggml_tensor * inp_out_ids = build_inp_out_ids(); |
179 | | |
180 | | // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. |
181 | 0 | for (int il = 0; il < n_layer; ++il) { |
182 | 0 | ggml_tensor * inpSA = inpL; |
183 | |
|
184 | 0 | cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); |
185 | 0 | cb(cur, "attn_norm", il); |
186 | |
|
187 | 0 | ggml_build_forward_expand(gf, cur); |
188 | | |
189 | | // Determine layer type and build appropriate attention mechanism |
190 | 0 | if (hparams.is_recr(il)) { |
191 | | // Linear attention layer (gated delta net) |
192 | 0 | cur = build_layer_attn_linear(inp->get_recr(), cur, il); |
193 | 0 | } else { |
194 | | // Full attention layer |
195 | 0 | cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); |
196 | 0 | } |
197 | |
|
198 | 0 | if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { |
199 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
200 | 0 | inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
201 | 0 | } |
202 | | |
203 | | // Residual connection |
204 | 0 | cur = ggml_add(ctx0, cur, inpSA); |
205 | 0 | cb(cur, "attn_residual", il); |
206 | | |
207 | | // Save the tensor before post-attention norm for residual connection |
208 | 0 | ggml_tensor * ffn_residual = cur; |
209 | | |
210 | | // Post-attention norm |
211 | 0 | ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); |
212 | 0 | cb(attn_post_norm, "attn_post_norm", il); |
213 | | |
214 | | // MOE FFN layer |
215 | 0 | cur = build_layer_ffn(attn_post_norm, il); |
216 | 0 | cb(cur, "ffn_out", il); |
217 | | |
218 | | // Residual connection for FFN - add to the tensor from before post_attention_layernorm |
219 | 0 | cur = ggml_add(ctx0, cur, ffn_residual); |
220 | 0 | cb(cur, "post_moe", il); |
221 | |
|
222 | 0 | cur = build_cvec(cur, il); |
223 | 0 | cb(cur, "l_out", il); |
224 | | |
225 | | // Input for next layer |
226 | 0 | inpL = cur; |
227 | 0 | } |
228 | 0 | cur = inpL; |
229 | | |
230 | | // post-norm hidden state feeds both the LM head and the MTP seed below |
231 | 0 | cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); |
232 | |
|
233 | 0 | cb(cur, "h_nextn", -1); |
234 | 0 | res->t_h_nextn = cur; |
235 | |
|
236 | 0 | if (!cparams.embeddings_nextn_masked && inp_out_ids) { |
237 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
238 | 0 | } |
239 | |
|
240 | 0 | cb(cur, "result_norm", -1); |
241 | 0 | res->t_embd = cur; |
242 | | |
243 | | // LM head |
244 | 0 | cur = build_lora_mm(model.output, cur, model.output_s); |
245 | |
|
246 | 0 | cb(cur, "result_output", -1); |
247 | 0 | res->t_logits = cur; |
248 | |
|
249 | 0 | ggml_build_forward_expand(gf, cur); |
250 | 0 | } |
251 | | |
252 | | std::pair<ggml_tensor *, ggml_tensor *> llama_model_qwen35moe::graph::build_qkvz( |
253 | | ggml_tensor * input, |
254 | 0 | int il) { |
255 | 0 | const int64_t n_seqs = ubatch.n_seqs; |
256 | 0 | const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
257 | |
|
258 | 0 | ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s); |
259 | 0 | qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); |
260 | 0 | cb(qkv_mixed, "linear_attn_qkv_mixed", il); |
261 | |
|
262 | 0 | ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s); |
263 | 0 | cb(z, "z", il); |
264 | |
|
265 | 0 | return { qkv_mixed, z }; |
266 | 0 | } |
267 | | |
268 | | ggml_tensor * llama_model_qwen35moe::graph::build_norm_gated( |
269 | | ggml_tensor * input, |
270 | | ggml_tensor * weights, |
271 | | ggml_tensor * gate, |
272 | 0 | int layer) { |
273 | 0 | ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); |
274 | 0 | ggml_tensor * gated_silu = ggml_silu(ctx0, gate); |
275 | |
|
276 | 0 | return ggml_mul(ctx0, normalized, gated_silu); |
277 | 0 | } |
278 | | |
279 | | ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn( |
280 | | llm_graph_input_attn_kv * inp, |
281 | | ggml_tensor * cur, |
282 | | ggml_tensor * inp_pos, |
283 | | int * sections, |
284 | 0 | int il) { |
285 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v(); |
286 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); |
287 | | |
288 | | // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention |
289 | | |
290 | | // Qwen3Next uses a single Q projection that outputs query + gate |
291 | 0 | ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ] |
292 | 0 | cb(Qcur_full, "Qcur_full", il); |
293 | |
|
294 | 0 | ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, |
295 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
296 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0); |
297 | 0 | cb(Qcur, "Qcur_reshaped", il); |
298 | | |
299 | | // Apply Q normalization |
300 | 0 | Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); |
301 | 0 | cb(Qcur, "Qcur_normed", il); |
302 | |
|
303 | 0 | ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s); |
304 | 0 | cb(Kcur, "Kcur", il); |
305 | |
|
306 | 0 | ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s); |
307 | 0 | cb(Vcur, "Vcur", il); |
308 | | |
309 | | // Apply K normalization |
310 | 0 | Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
311 | 0 | Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); |
312 | 0 | cb(Kcur, "Kcur_normed", il); |
313 | |
|
314 | 0 | ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, |
315 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
316 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, |
317 | 0 | ggml_element_size(Qcur_full) * n_embd_head); |
318 | 0 | gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); |
319 | 0 | cb(gate, "gate_reshaped", il); |
320 | |
|
321 | 0 | Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); |
322 | | |
323 | | // Apply IMRoPE |
324 | 0 | Qcur = ggml_rope_multi( |
325 | 0 | ctx0, Qcur, inp_pos, nullptr, |
326 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
327 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
328 | 0 | ); |
329 | |
|
330 | 0 | Kcur = ggml_rope_multi( |
331 | 0 | ctx0, Kcur, inp_pos, nullptr, |
332 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
333 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
334 | 0 | ); |
335 | |
|
336 | 0 | cb(Qcur, "Qcur", il); |
337 | 0 | cb(Kcur, "Kcur", il); |
338 | 0 | cb(Vcur, "Vcur", il); |
339 | | |
340 | | // Attention computation |
341 | 0 | const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; |
342 | |
|
343 | 0 | cur = build_attn(inp, |
344 | 0 | nullptr, nullptr, nullptr, |
345 | 0 | Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); |
346 | 0 | cb(cur, "attn_pregate", il); |
347 | |
|
348 | 0 | ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate); |
349 | 0 | cb(gate_sigmoid, "gate_sigmoid", il); |
350 | |
|
351 | 0 | cur = ggml_mul(ctx0, cur, gate_sigmoid); |
352 | 0 | cb(cur, "attn_gated", il); |
353 | |
|
354 | 0 | cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s); |
355 | 0 | cb(cur, "attn_output", il); |
356 | |
|
357 | 0 | return cur; |
358 | 0 | } |
359 | | |
360 | | ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear( |
361 | | llm_graph_input_rs * inp, |
362 | | ggml_tensor * cur, |
363 | 0 | int il) { |
364 | 0 | const auto * mctx_cur = inp->mctx; |
365 | |
|
366 | 0 | const int64_t d_inner = hparams.ssm_d_inner; |
367 | 0 | const int64_t n_seqs = ubatch.n_seqs; |
368 | 0 | const int64_t head_k_dim = hparams.ssm_d_state; |
369 | 0 | const int64_t num_k_heads = hparams.ssm_n_group; |
370 | 0 | const int64_t num_v_heads = hparams.ssm_dt_rank; |
371 | 0 | const int64_t head_v_dim = d_inner / num_v_heads; |
372 | 0 | const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
373 | |
|
374 | 0 | GGML_ASSERT(n_seqs != 0); |
375 | 0 | GGML_ASSERT(ubatch.equal_seqs()); |
376 | 0 | GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); |
377 | | |
378 | | // Input projections |
379 | 0 | auto qkvz = build_qkvz(cur, il); |
380 | 0 | ggml_tensor * qkv_mixed = qkvz.first; |
381 | 0 | ggml_tensor * z = qkvz.second; |
382 | |
|
383 | 0 | ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s); |
384 | 0 | beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); |
385 | 0 | cb(beta, "beta", il); |
386 | |
|
387 | 0 | beta = ggml_sigmoid(ctx0, beta); |
388 | 0 | cb(beta, "beta_sigmoid", il); |
389 | |
|
390 | 0 | ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s); |
391 | 0 | alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); |
392 | 0 | cb(alpha, "alpha", il); |
393 | |
|
394 | 0 | ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); |
395 | 0 | ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); |
396 | 0 | cb(alpha_softplus, "a_softplus", il); |
397 | |
|
398 | 0 | ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus |
399 | 0 | cb(gate, "gate", il); |
400 | |
|
401 | 0 | gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); |
402 | |
|
403 | 0 | ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); |
404 | 0 | ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); |
405 | |
|
406 | 0 | ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; |
407 | 0 | const int64_t conv_kernel_size = conv_kernel->ne[0]; |
408 | 0 | const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; |
409 | |
|
410 | 0 | ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il); |
411 | |
|
412 | 0 | ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); |
413 | 0 | state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); |
414 | 0 | cb(state, "state_predelta", il); |
415 | |
|
416 | 0 | ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel); |
417 | 0 | cb(conv_output_proper, "conv_output_raw", il); |
418 | |
|
419 | 0 | ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper); |
420 | 0 | cb(conv_output_silu, "conv_output_silu", il); |
421 | |
|
422 | 0 | ggml_tensor * conv_qkv_mix = conv_output_silu; |
423 | | |
424 | | // Calculate the total conv dimension |
425 | 0 | int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads; |
426 | 0 | int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim); |
427 | | |
428 | | // Extract the convolved Q, K, V from conv_output |
429 | 0 | ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, |
430 | 0 | ggml_row_size(conv_qkv_mix->type, head_k_dim), |
431 | 0 | nb1_qkv, |
432 | 0 | nb1_qkv * n_seq_tokens, |
433 | 0 | 0); |
434 | |
|
435 | 0 | ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, |
436 | 0 | ggml_row_size(conv_qkv_mix->type, head_k_dim), |
437 | 0 | nb1_qkv, |
438 | 0 | nb1_qkv * n_seq_tokens, |
439 | 0 | head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); |
440 | |
|
441 | 0 | ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs, |
442 | 0 | ggml_row_size(conv_qkv_mix->type, head_v_dim), |
443 | 0 | nb1_qkv, |
444 | 0 | nb1_qkv * n_seq_tokens, |
445 | 0 | ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads)); |
446 | |
|
447 | 0 | cb(q_conv, "q_conv", il); |
448 | 0 | cb(k_conv, "k_conv", il); |
449 | 0 | cb(v_conv, "v_conv", il); |
450 | |
|
451 | 0 | const float eps_norm = hparams.f_norm_rms_eps; |
452 | |
|
453 | 0 | q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm); |
454 | 0 | k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm); |
455 | | |
456 | | //q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); |
457 | | //k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); |
458 | | //v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); |
459 | | |
460 | | // if head keys and value keys are different, repeat to force tensors into matching shapes |
461 | | // note: need explicit repeat only if we are not using the fused GDN. |
462 | 0 | if (num_k_heads != num_v_heads && (!cparams.fused_gdn_ar || !cparams.fused_gdn_ch)) { |
463 | 0 | GGML_ASSERT(num_v_heads % num_k_heads == 0); |
464 | 0 | q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs); |
465 | 0 | k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs); |
466 | 0 | } |
467 | |
|
468 | 0 | cb(q_conv, "q_conv_predelta", il); |
469 | 0 | cb(k_conv, "k_conv_predelta", il); |
470 | 0 | cb(v_conv, "v_conv_predelta", il); |
471 | |
|
472 | 0 | ggml_tensor * output = build_recurrent_attn(inp, ssm_states_all, q_conv, k_conv, v_conv, gate, beta, state, il); |
473 | | |
474 | | // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] |
475 | 0 | ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); |
476 | | |
477 | | // Apply gated normalization: self.norm(core_attn_out, z) |
478 | 0 | ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il); |
479 | | |
480 | | // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim] |
481 | 0 | ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); |
482 | 0 | cb(final_output, "final_output", il); |
483 | | |
484 | | // Output projection |
485 | 0 | cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s); |
486 | 0 | cb(cur, "linear_attn_out", il); |
487 | | |
488 | | // Reshape back to original dimensions |
489 | 0 | cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs); |
490 | |
|
491 | 0 | return cur; |
492 | 0 | } |
493 | | |
494 | 0 | ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, const int il) { |
495 | | // Check if this is an MoE layer |
496 | 0 | GGML_ASSERT(model.layers[il].ffn_gate_inp != nullptr); |
497 | |
|
498 | 0 | ggml_tensor * moe_out = |
499 | 0 | build_moe_ffn(cur, |
500 | 0 | model.layers[il].ffn_gate_inp, |
501 | 0 | model.layers[il].ffn_up_exps, |
502 | 0 | model.layers[il].ffn_gate_exps, |
503 | 0 | model.layers[il].ffn_down_exps, |
504 | 0 | nullptr, |
505 | 0 | n_expert, n_expert_used, |
506 | 0 | LLM_FFN_SILU, true, |
507 | 0 | hparams.expert_weights_scale, |
508 | 0 | LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, |
509 | 0 | nullptr, model.layers[il].ffn_gate_up_exps, |
510 | 0 | model.layers[il].ffn_up_exps_s, |
511 | 0 | model.layers[il].ffn_gate_exps_s, |
512 | 0 | model.layers[il].ffn_down_exps_s); |
513 | 0 | cb(moe_out, "ffn_moe_out", il); |
514 | | |
515 | | // Add shared experts if present - following Qwen3Next reference implementation |
516 | 0 | if (model.layers[il].ffn_up_shexp != nullptr) { |
517 | 0 | ggml_tensor * ffn_shexp = |
518 | 0 | build_ffn(cur, |
519 | 0 | model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s, |
520 | 0 | model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s, |
521 | 0 | model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s, |
522 | 0 | NULL, |
523 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
524 | 0 | cb(ffn_shexp, "ffn_shexp", il); |
525 | | |
526 | | // Apply shared expert gating as in the reference implementation |
527 | | // The shared expert has its own gate that is sigmoided |
528 | | // Note: ffn_gate_inp_shexp is the shared expert gate (outputs 1 value per token) |
529 | 0 | ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); |
530 | 0 | cb(shared_gate, "shared_expert_gate", il); |
531 | | |
532 | | // Apply sigmoid to the gate |
533 | 0 | shared_gate = ggml_sigmoid(ctx0, shared_gate); |
534 | 0 | cb(shared_gate, "shared_expert_gate_sigmoid", il); |
535 | | |
536 | | |
537 | | // Apply the gate to the shared expert output |
538 | 0 | ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); |
539 | 0 | cb(ffn_shexp, "ffn_shexp_gated", il); |
540 | |
|
541 | 0 | cur = ggml_add(ctx0, moe_out, ffn_shexp); |
542 | 0 | cb(cur, "ffn_out", il); |
543 | 0 | } else { |
544 | 0 | cur = moe_out; |
545 | 0 | } |
546 | |
|
547 | 0 | return cur; |
548 | 0 | } |
549 | | |
550 | | // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE |
551 | | llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) |
552 | 0 | : llm_graph_context(params) { |
553 | 0 | GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35MOE MTP requires n_layer_nextn > 0"); |
554 | 0 | GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35MOE MTP currently only supports a single MTP block"); |
555 | |
|
556 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v(); |
557 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); |
558 | |
|
559 | 0 | const int il = hparams.n_layer(); |
560 | 0 | const auto & layer = model.layers[il]; |
561 | |
|
562 | 0 | GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); |
563 | 0 | GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); |
564 | 0 | GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); |
565 | 0 | GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp"); |
566 | |
|
567 | 0 | int sections[4]; |
568 | 0 | std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); |
569 | | |
570 | | // TODO: extract in a common llm_graph_context::build_inp_embd_h() |
571 | 0 | auto inp = std::make_unique<llm_graph_input_embd_h>(hparams.n_embd); |
572 | |
|
573 | 0 | inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); |
574 | 0 | ggml_set_input(inp->tokens); |
575 | |
|
576 | 0 | inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); |
577 | 0 | ggml_set_input(inp->embd); |
578 | | |
579 | | // TODO: make static using `ggml_build_forward_select()` |
580 | | // see llm_graph_context::build_inp_embd() for reference |
581 | 0 | ggml_tensor * tok_embd; |
582 | 0 | if (ubatch.token) { |
583 | 0 | ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; |
584 | |
|
585 | 0 | tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); |
586 | 0 | } else { |
587 | 0 | tok_embd = inp->embd; |
588 | 0 | } |
589 | 0 | cb(tok_embd, "mtp_tok_embd", il); |
590 | |
|
591 | 0 | inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); |
592 | 0 | ggml_set_input(inp->h); |
593 | 0 | ggml_set_name(inp->h, "mtp_h_input"); |
594 | |
|
595 | 0 | ggml_tensor * h_embd = inp->h; |
596 | |
|
597 | 0 | res->add_input(std::move(inp)); |
598 | |
|
599 | 0 | ggml_tensor * inp_pos = build_inp_pos(); |
600 | 0 | ggml_tensor * inp_out_ids = build_inp_out_ids(); |
601 | |
|
602 | 0 | auto * inp_attn = build_attn_inp_kv(); |
603 | |
|
604 | 0 | ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); |
605 | 0 | cb(h_norm, "mtp_hnorm", il); |
606 | |
|
607 | 0 | ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); |
608 | 0 | cb(e_norm, "mtp_enorm", il); |
609 | |
|
610 | 0 | ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); |
611 | 0 | cb(concat, "mtp_concat", il); |
612 | |
|
613 | 0 | ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); |
614 | 0 | cb(cur, "mtp_eh_proj", il); |
615 | |
|
616 | 0 | ggml_tensor * inpSA = cur; |
617 | |
|
618 | 0 | cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); |
619 | 0 | cb(cur, "mtp_attn_norm", il); |
620 | |
|
621 | 0 | ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); |
622 | 0 | cb(Qcur_full, "mtp_Qcur_full", il); |
623 | |
|
624 | 0 | ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, |
625 | 0 | n_embd_head, n_head, n_tokens, |
626 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
627 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, |
628 | 0 | 0); |
629 | 0 | Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); |
630 | 0 | cb(Qcur, "mtp_Qcur_normed", il); |
631 | |
|
632 | 0 | ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, |
633 | 0 | n_embd_head, n_head, n_tokens, |
634 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
635 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, |
636 | 0 | ggml_element_size(Qcur_full) * n_embd_head); |
637 | 0 | gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); |
638 | 0 | cb(gate, "mtp_gate", il); |
639 | |
|
640 | 0 | ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); |
641 | 0 | Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
642 | 0 | Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); |
643 | 0 | cb(Kcur, "mtp_Kcur_normed", il); |
644 | |
|
645 | 0 | ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); |
646 | 0 | Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); |
647 | 0 | cb(Vcur, "mtp_Vcur", il); |
648 | |
|
649 | 0 | Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, |
650 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
651 | 0 | ext_factor, attn_factor, beta_fast, beta_slow); |
652 | 0 | Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, |
653 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
654 | 0 | ext_factor, attn_factor, beta_fast, beta_slow); |
655 | |
|
656 | 0 | const float kq_scale = hparams.f_attention_scale == 0.0f |
657 | 0 | ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; |
658 | |
|
659 | 0 | cur = build_attn(inp_attn, |
660 | 0 | nullptr, nullptr, nullptr, |
661 | 0 | Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); |
662 | 0 | cb(cur, "mtp_attn_pregate", il); |
663 | |
|
664 | 0 | cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); |
665 | 0 | cur = build_lora_mm(layer.wo, cur, layer.wo_s); |
666 | 0 | cb(cur, "mtp_attn_out", il); |
667 | |
|
668 | 0 | cur = ggml_add(ctx0, cur, inpSA); |
669 | 0 | cb(cur, "mtp_attn_residual", il); |
670 | |
|
671 | 0 | ggml_tensor * ffn_residual = cur; |
672 | 0 | cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); |
673 | 0 | cb(cur, "mtp_attn_post_norm", il); |
674 | | |
675 | | // MoE FFN — routed experts plus gated shared expert (mirrors qwen35moe). |
676 | 0 | ggml_tensor * moe_out = |
677 | 0 | build_moe_ffn(cur, |
678 | 0 | layer.ffn_gate_inp, |
679 | 0 | layer.ffn_up_exps, |
680 | 0 | layer.ffn_gate_exps, |
681 | 0 | layer.ffn_down_exps, |
682 | 0 | nullptr, |
683 | 0 | n_expert, n_expert_used, |
684 | 0 | LLM_FFN_SILU, true, |
685 | 0 | hparams.expert_weights_scale, |
686 | 0 | LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, |
687 | 0 | nullptr, layer.ffn_gate_up_exps, |
688 | 0 | layer.ffn_up_exps_s, |
689 | 0 | layer.ffn_gate_exps_s, |
690 | 0 | layer.ffn_down_exps_s); |
691 | 0 | cb(moe_out, "mtp_ffn_moe_out", il); |
692 | |
|
693 | 0 | if (layer.ffn_up_shexp != nullptr) { |
694 | 0 | ggml_tensor * ffn_shexp = |
695 | 0 | build_ffn(cur, |
696 | 0 | layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, |
697 | 0 | layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, |
698 | 0 | layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, |
699 | 0 | nullptr, |
700 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
701 | 0 | cb(ffn_shexp, "mtp_ffn_shexp", il); |
702 | |
|
703 | 0 | ggml_tensor * shared_gate = build_lora_mm(layer.ffn_gate_inp_shexp, cur); |
704 | 0 | shared_gate = ggml_sigmoid(ctx0, shared_gate); |
705 | 0 | cb(shared_gate, "mtp_shared_expert_gate_sigmoid", il); |
706 | |
|
707 | 0 | ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); |
708 | 0 | cb(ffn_shexp, "mtp_ffn_shexp_gated", il); |
709 | |
|
710 | 0 | cur = ggml_add(ctx0, moe_out, ffn_shexp); |
711 | 0 | } else { |
712 | 0 | cur = moe_out; |
713 | 0 | } |
714 | 0 | cb(cur, "mtp_ffn_out", il); |
715 | |
|
716 | 0 | cur = ggml_add(ctx0, cur, ffn_residual); |
717 | 0 | cb(cur, "mtp_post_ffn", il); |
718 | |
|
719 | 0 | ggml_tensor * head_norm_w = layer.nextn.shared_head_norm |
720 | 0 | ? layer.nextn.shared_head_norm |
721 | 0 | : model.output_norm; |
722 | 0 | GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm"); |
723 | 0 | cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); |
724 | |
|
725 | 0 | cb(cur, "h_nextn", -1); |
726 | 0 | res->t_h_nextn= cur; |
727 | |
|
728 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
729 | 0 | cb(cur, "mtp_shared_head_norm", -1); |
730 | |
|
731 | 0 | ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; |
732 | 0 | ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; |
733 | 0 | GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); |
734 | 0 | cur = build_lora_mm(head_w, cur, head_s); |
735 | 0 | cb(cur, "result_output", -1); |
736 | |
|
737 | 0 | res->t_logits = cur; |
738 | 0 | ggml_build_forward_expand(gf, cur); |
739 | 0 | } |