/src/llama.cpp/src/models/qwen3next.cpp
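
// Graph builder for Qwen3-Next: a hybrid-attention architecture that interleaves
// full (gated) attention layers with linear-attention layers implementing the
// gated delta rule. The layer type is selected per layer via hparams.is_recurrent(il);
// the FFN is a MoE with an additional sigmoid-gated shared expert.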
#include "models.h"

#include "llama-memory-recurrent.h"

llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) :
    llm_build_delta_net_base(params), model(model) {
    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);
    cb(inpL, "model.embed_tokens", -1);

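    // The hybrid memory input exposes both views of the cache: a KV cache for
    // the full-attention layers (inp->get_attn()) and recurrent-state slots
    // for the linear-attention layers (inp->get_recr())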
    auto * inp = build_inp_mem_hybrid();

    ggml_tensor * inp_pos     = build_inp_pos();
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        ggml_build_forward_expand(gf, cur);

        // Determine layer type and build the appropriate attention mechanism
        if (hparams.is_recurrent(il)) {
            // Linear attention layer (gated delta net)
            cur = build_layer_attn_linear(inp->get_recr(), cur, il);
        } else {
            // Full attention layer
            cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
        }

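        // On the last layer, drop the rows for which no output was requested;
        // the residual input is trimmed the same way so the shapes still match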
        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        // Residual connection
        cur = ggml_add(ctx0, cur, inpSA);
        cb(cur, "attn_residual", il);

        // Save the tensor before post-attention norm for residual connection
        ggml_tensor * ffn_residual = cur;

        // Post-attention norm
        ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
        cb(attn_post_norm, "attn_post_norm", il);

        // FFN layer (MoE or dense) - without residual connection
        cur = build_layer_ffn(attn_post_norm, il);
        cb(cur, "ffn_out", il);

        // Residual connection for FFN - add to the tensor from before post_attention_layernorm
        cur = ggml_add(ctx0, cur, ffn_residual);
        cb(cur, "post_moe", il);

        // Input for next layer
        inpL = cur;
    }
    cur = inpL;

    // Final norm
    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // LM head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}

// utility to get one slice from the third dimension
// input dim:  [x, y, c, b]
// output dim: [x, y, 1, b]
static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
    return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
            t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
}

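// Gated RMS norm used on the linear-attention output:
//   out = RMSNorm(input; weights) * SiLU(gate)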
ggml_tensor * llm_build_qwen3next::build_norm_gated(
        ggml_tensor * input,
        ggml_tensor * weights,
        ggml_tensor * gate,
        int           layer) {
    ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer);
    ggml_tensor * gated_silu = ggml_silu(ctx0, gate);

    return ggml_mul(ctx0, normalized, gated_silu);
}

ggml_tensor * llm_build_qwen3next::build_layer_attn(
        llm_graph_input_attn_kv * inp,
        ggml_tensor             * cur,
        ggml_tensor             * inp_pos,
        int                       il) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention

    // Qwen3Next uses a single Q projection that outputs query + gate
    ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur);
    cb(Qcur_full, "Qcur_full", il);

    Qcur_full = ggml_reshape_4d(ctx0, Qcur_full, n_embd_head * 2, n_head, n_tokens, 1);

    // Split Q projection into query and gate
    // The split should be along dimension 0 (the feature dimension)
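    // Per head, each row of Qcur_full is laid out as [q_0 .. q_{d-1}, g_0 .. g_{d-1}]
    // with d = n_embd_head, so the query is the view at offset 0 and the gate is
    // the view at offset d within each row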
    ggml_tensor * Qcur = ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
            Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], 0);
    cb(Qcur, "Qcur_view", il);

    ggml_tensor * gate =
        ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
                Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], n_embd_head * ggml_element_size(Qcur_full));
    cb(gate, "gate", il);

    ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
    cb(Kcur, "Kcur", il);

    ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
    cb(Vcur, "Vcur", il);

    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

    Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
    cb(Qcur, "Qcur_normed", il);

    Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
    cb(Kcur, "Kcur_normed", il);

    Qcur = ggml_rope_ext(
            ctx0, Qcur, inp_pos, nullptr,
            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow);

    Kcur = ggml_rope_ext(
            ctx0, Kcur, inp_pos, nullptr,
            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow);

    cb(Qcur, "Qcur", il);
    cb(Kcur, "Kcur", il);
    cb(Vcur, "Vcur", il);

    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

    cur = build_attn(inp,
            nullptr, nullptr,
            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
    cb(cur, "attn_pregate", il);

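    // Gated attention: modulate the attention output elementwise with
    // sigmoid(gate) before the output projection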
    // TODO: CUDA is missing non-contiguous unary ops. when implemented: remove this cont
    gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);

    gate = ggml_sigmoid(ctx0, gate);
    cb(gate, "gate_sigmoid", il);

    gate = ggml_reshape_2d(ctx0, gate, n_embd_head * n_head, n_tokens);

    cur = ggml_mul(ctx0, cur, gate);
    cb(cur, "attn_gated", il);

    cur = build_lora_mm(model.layers[il].wo, cur);
    cb(cur, "attn_output", il);

    return cur;
}

std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
        ggml_tensor * input,
        int           il) {
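    // Note: Qwen3Next reuses the SSM hparams fields to store the delta-net
    // geometry: ssm_d_state holds head_k_dim, ssm_n_group holds num_k_heads,
    // and ssm_dt_rank holds num_v_heads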
    const int64_t d_inner      = hparams.ssm_d_inner;
    const int64_t n_seqs       = ubatch.n_seqs;
    const int64_t head_k_dim   = hparams.ssm_d_state;
    const int64_t num_k_heads  = hparams.ssm_n_group;
    const int64_t num_v_heads  = hparams.ssm_dt_rank;
    const int64_t head_v_dim   = d_inner / num_v_heads;
    const int64_t n_seq_tokens = ubatch.n_seq_tokens;

    if (model.layers[il].wqkv) {
        // optimized path
        ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
        qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
        cb(qkv_mixed, "linear_attn_qkv_mixed", il);

        ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
        cb(z, "z", il);

        return { qkv_mixed, z };
    } else {
        // legacy (slower) path
        ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input);
        cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);

        int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
        ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);

        // Split mixed_qkvz into query, key, value, z
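        // Per k-head, each row of mixed_qkvz_reshaped is laid out as
        //   [ q (head_k_dim) | k (head_k_dim) | v (head_v_dim * R) | z (head_v_dim * R) ]
        // with R = num_v_heads / num_k_heads, so the four parts are strided views at
        // increasing offsets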
        int64_t split_sizes_qkvz[4] = {
            head_k_dim,                             // query size
            head_k_dim,                             // key size
            head_v_dim * num_v_heads / num_k_heads, // value size
            head_v_dim * num_v_heads / num_k_heads  // z size
        };

        ggml_tensor * query =
            ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
                    mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
        cb(query, "q", il);

        ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
                mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
                split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped));
        cb(key, "k", il);

        ggml_tensor * value =
            ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
                    mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
                    (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped));
        cb(value, "v", il);

        ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
                mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
                (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped));
        z = ggml_cont(ctx0, z);
        cb(z, "z", il);

        // After creating the query, key, and value views, reshape each to flatten the head dimensions
        // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
        ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
        cb(query_flat, "query_flat", il);

        // key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
        ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
        cb(key_flat, "key_flat", il);

        // value: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
        ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
        cb(value_flat, "value_flat", il);

        // Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
        ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
        qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
        cb(qkv_mixed, "qkv_mixed", il);

        return { qkv_mixed, z };
    }
}

ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
        llm_graph_input_rs * inp,
        ggml_tensor        * cur,
        int                  il) {
    const auto * mctx_cur = inp->mctx;

    const int64_t d_inner      = hparams.ssm_d_inner;
    const int64_t n_seqs       = ubatch.n_seqs;
    const int64_t head_k_dim   = hparams.ssm_d_state;
    const int64_t num_k_heads  = hparams.ssm_n_group;
    const int64_t num_v_heads  = hparams.ssm_dt_rank;
    const int64_t head_v_dim   = d_inner / num_v_heads;
    const int64_t n_seq_tokens = ubatch.n_seq_tokens;

    const auto kv_head = mctx_cur->get_head();

    GGML_ASSERT(n_seqs != 0);
    GGML_ASSERT(ubatch.equal_seqs());
    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

    // Input projections
    auto qkvz = build_qkvz(cur, il);
    ggml_tensor * qkv_mixed = qkvz.first;
    ggml_tensor * z         = qkvz.second;

    ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur);
    cb(mixed_ba, "linear_attn_mixed_ba", il);

    // Reshape mixed_ba to [2 * num_v_heads / num_k_heads, num_k_heads, n_seq_tokens, n_seqs]
    int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
    ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);

    // Split mixed_ba into b and a (beta and alpha parameters)
    int64_t split_sizes_ba[2] = {
        num_v_heads / num_k_heads, // beta size
        num_v_heads / num_k_heads  // alpha size
    };

    ggml_tensor * b = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_seq_tokens, n_seqs,
            mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], 0);
    cb(b, "b", il);

    ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_seq_tokens, n_seqs,
            mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3],
            split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
    cb(a, "a", il);

    // TODO: CUDA is missing non-contiguous unary ops. when implemented: remove this cont
    b = ggml_cont(ctx0, b);

    ggml_tensor * beta = ggml_sigmoid(ctx0, b);

    // Merge the head dimensions of a: [num_v_heads / num_k_heads, num_k_heads, ...] -> [num_v_heads, n_seq_tokens, n_seqs]
    ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);

    ggml_tensor * alpha_biased   = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
    ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
    cb(alpha_softplus, "a_softplus", il);

    ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
    cb(gate, "gate", il);

    beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
    gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs);
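
    // These two per-head scalars drive the delta-rule state update computed in
    // the build_delta_net_* helpers of the base class; schematically, per head:
    //
    //   S_t = exp(gate_t) * S_{t-1} * (I - beta_t * k_t * k_t^T) + beta_t * v_t * k_t^T
    //
    // i.e. exp(gate) in (0, 1) acts as the decay and beta as the write strength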

    // Get convolution states from cache
    ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
    ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);

    // Build the convolution states tensor
    ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
    cb(conv_states, "conv_states", il);

    // Calculate convolution kernel size
    ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
    const int64_t conv_kernel_size = conv_kernel->ne[0];
    const int64_t conv_channels    = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;

    conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
    cb(conv_states, "conv_states_reshaped", il);

    qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
    cb(qkv_mixed, "qkv_mixed_transposed", il);

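    // Prepend the cached tail of the previous window so the causal conv sees the
    // (conv_kernel_size - 1) inputs that precede the current tokens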
    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
    cb(conv_input, "conv_input", il);

    // Update convolution state cache:
    // extract the last (conv_kernel_size - 1) states from conv_input
    ggml_tensor * last_conv_states =
        ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1],
                conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input));
    cb(last_conv_states, "last_conv_states", il);

    ggml_tensor * state_update_target =
        ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs,
                kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
    cb(state_update_target, "state_update_target", il);

    ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));

    ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
    state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
    cb(state, "state_predelta", il);

    ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
    cb(conv_output_proper, "conv_output_raw", il);

    ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
    cb(conv_output_silu, "conv_output_silu", il);

    ggml_tensor * conv_qkv_mix = conv_output_silu;

    // Calculate the total conv dimension
    int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
    int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);

    // Extract the convolved Q, K, V from conv_output
    ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
            ggml_row_size(conv_qkv_mix->type, head_k_dim),
            nb1_qkv,
            nb1_qkv * n_seq_tokens,
            0);

    ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
            ggml_row_size(conv_qkv_mix->type, head_k_dim),
            nb1_qkv,
            nb1_qkv * n_seq_tokens,
            head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));

    ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs,
            ggml_row_size(conv_qkv_mix->type, head_v_dim),
            nb1_qkv,
            nb1_qkv * n_seq_tokens,
            ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads));

    cb(q_conv, "q_conv", il);
    cb(k_conv, "k_conv", il);
    cb(v_conv, "v_conv", il);

    const float eps_norm = hparams.f_norm_rms_eps;

    q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm);
    k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm);

    //q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
    //k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
    //v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);

    // if the key and value head counts differ, repeat the keys/queries to force the tensors into matching shapes
    if (num_k_heads != num_v_heads) {
        GGML_ASSERT(num_v_heads % num_k_heads == 0);
        int64_t repeat_factor = num_v_heads / num_k_heads;

        // repeat interleave: reshape to (repeat part, 1, remaining part), do repeat, then reshape back
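        // e.g. with num_k_heads = 2 and repeat_factor = 2, the head sequence
        // [h0, h1] becomes [h0, h0, h1, h1] (a repeat-interleave, not a tile)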
        ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs);
        ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs);

        // Repeat along the second dimension (the new dimension with size 1)
        ggml_tensor * q_repeated =
            ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1);
        ggml_tensor * k_repeated =
            ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1);

        // Reshape back to merge the head and repeat dimensions:
        // from [head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs]
        // back to [head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs]
        q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs);
        k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs);
    }

    cb(q_conv, "q_conv_predelta", il);
    cb(k_conv, "k_conv_predelta", il);
    cb(v_conv, "v_conv_predelta", il);

    // Choose between build_delta_net_autoregressive and build_delta_net_chunking based on the tokens per sequence
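    // Single-token updates (autoregressive decoding) take the cheap recurrent step;
    // multi-token batches (prompt processing) go through the chunked scan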
    std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
    if (n_seq_tokens == 1) {
        attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
    } else {
        attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
    }
    ggml_tensor * output    = attn_out.first;
    ggml_tensor * new_state = attn_out.second;
    cb(output, "attn_output", il);
    cb(new_state, "new_state", il);

    // Update the recurrent states
    ggml_build_forward_expand(gf,
            ggml_cpy(ctx0, new_state,
                    ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
                            kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));

    // Reshape z to [head_v_dim, num_v_heads, n_seq_tokens, n_seqs] to match the attention output
    ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);

    // Apply gated normalization: self.norm(core_attn_out, z)
    ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);

    // Flatten the heads: [head_v_dim, num_v_heads, n_seq_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_seq_tokens, n_seqs]
    ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
    cb(final_output, "final_output", il);

    // Output projection
    cur = build_lora_mm(model.layers[il].ssm_out, final_output);
    cb(cur, "linear_attn_out", il);

    // Reshape back to original dimensions: [n_embd, n_tokens]
    cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);

    return cur;
}

ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int il) {
    // Check if this is an MoE layer
    if (model.layers[il].ffn_gate_inp != nullptr) {
        // MoE branch
        ggml_tensor * moe_out =
            build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,  model.layers[il].ffn_up_exps,
                    model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
                    nullptr,
                    n_expert, n_expert_used, LLM_FFN_SILU,
                    true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
                    nullptr, model.layers[il].ffn_gate_up_exps);
        cb(moe_out, "ffn_moe_out", il);

        // Add shared experts if present - following the Qwen3Next reference implementation
        if (model.layers[il].ffn_up_shexp != nullptr) {
            ggml_tensor * ffn_shexp =
                build_ffn(cur,
                        model.layers[il].ffn_up_shexp,   NULL, NULL,
                        model.layers[il].ffn_gate_shexp, NULL, NULL,
                        model.layers[il].ffn_down_shexp, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(ffn_shexp, "ffn_shexp", il);

            // Apply shared expert gating as in the reference implementation
            // The shared expert has its own gate that is sigmoided
            // Note: ffn_gate_inp_shexp is the shared expert gate (outputs 1 value per token)
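            // i.e. cur = moe_out + sigmoid(ffn_gate_inp_shexp(x)) * shexp(x)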
            ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
            cb(shared_gate, "shared_expert_gate", il);

            shared_gate = ggml_sigmoid(ctx0, shared_gate);
            cb(shared_gate, "shared_expert_gate_sigmoid", il);

            ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
            cb(ffn_shexp, "ffn_shexp_gated", il);

            cur = ggml_add(ctx0, moe_out, ffn_shexp);
            cb(cur, "ffn_out", il);
        } else {
            cur = moe_out;
        }
    } else {
        // Dense FFN branch (apparently not used by current Qwen3Next models)
        cur = build_ffn(cur,
                model.layers[il].ffn_up,   NULL, NULL,
                model.layers[il].ffn_gate, NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, il);
        cb(cur, "ffn_out", il);
    }
    return cur;
}