/src/llama.cpp/src/models/qwen35.cpp
Line | Count | Source |
1 | | #include "models.h" |
2 | | |
3 | | #include "llama-memory-recurrent.h" |
4 | | |
5 | | llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) : |
6 | 0 | llm_build_delta_net_base(params), model(model) { |
7 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v; |
8 | |
|
9 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
10 | |
|
11 | 0 | int sections[4]; |
12 | 0 | std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); |
13 | |
|
14 | 0 | ggml_tensor * cur; |
15 | 0 | ggml_tensor * inpL; |
16 | |
|
17 | 0 | inpL = build_inp_embd(model.tok_embd); |
18 | |
|
19 | 0 | cb(inpL, "model.input_embed", -1); |
20 | |
|
21 | 0 | auto * inp = build_inp_mem_hybrid(); |
22 | |
|
23 | 0 | ggml_tensor * inp_pos = build_inp_pos(); |
24 | 0 | ggml_tensor * inp_out_ids = build_inp_out_ids(); |
25 | |
|
26 | 0 | for (int il = 0; il < n_layer; ++il) { |
27 | 0 | ggml_tensor * inpSA = inpL; |
28 | |
|
29 | 0 | cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); |
30 | 0 | cb(cur, "attn_norm", il); |
31 | |
|
32 | 0 | ggml_build_forward_expand(gf, cur); |
33 | | |
34 | | // Determine layer type and build appropriate attention mechanism |
35 | 0 | if (hparams.is_recurrent(il)) { |
36 | | // Linear attention layer (gated delta net) |
37 | 0 | cur = build_layer_attn_linear(inp->get_recr(), cur, il); |
38 | 0 | } else { |
39 | | // Full attention layer |
40 | 0 | cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); |
41 | 0 | } |
42 | |
|
43 | 0 | if (il == n_layer - 1 && inp_out_ids) { |
44 | 0 | cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
45 | 0 | inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
46 | 0 | } |
47 | | |
48 | | // Residual connection |
49 | 0 | cur = ggml_add(ctx0, cur, inpSA); |
50 | 0 | cb(cur, "attn_residual", il); |
51 | | |
52 | | // Save the tensor before post-attention norm for residual connection |
53 | 0 | ggml_tensor * ffn_residual = cur; |
54 | | |
55 | | // Post-attention norm |
56 | 0 | ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); |
57 | 0 | cb(attn_post_norm, "attn_post_norm", il); |
58 | | |
59 | | // Dense FFN layer - without residual connection |
60 | 0 | cur = build_layer_ffn(attn_post_norm, il); |
61 | 0 | cb(cur, "ffn_out", il); |
62 | | |
63 | | // Residual connection for FFN - add to the tensor from before post_attention_layernorm |
64 | 0 | cur = ggml_add(ctx0, cur, ffn_residual); |
65 | 0 | cb(cur, "post_ffn", il); |
66 | | |
67 | | // Input for next layer |
68 | 0 | inpL = cur; |
69 | 0 | } |
70 | 0 | cur = inpL; |
71 | | |
72 | | // Final norm |
73 | 0 | cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); |
74 | |
|
75 | 0 | cb(cur, "result_norm", -1); |
76 | 0 | res->t_embd = cur; |
77 | | |
78 | | // LM head |
79 | 0 | cur = build_lora_mm(model.output, cur); |
80 | |
|
81 | 0 | cb(cur, "result_output", -1); |
82 | 0 | res->t_logits = cur; |
83 | |
|
84 | 0 | ggml_build_forward_expand(gf, cur); |
85 | 0 | } |
86 | | |
87 | | std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_qkvz( |
88 | | ggml_tensor * input, |
89 | 0 | int il) { |
90 | 0 | const int64_t n_seqs = ubatch.n_seqs; |
91 | 0 | const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
92 | |
|
93 | 0 | ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input); |
94 | 0 | qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); |
95 | 0 | cb(qkv_mixed, "linear_attn_qkv_mixed", il); |
96 | |
|
97 | 0 | ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input); |
98 | 0 | cb(z, "z", il); |
99 | |
|
100 | 0 | return { qkv_mixed, z }; |
101 | 0 | } |
102 | | |
103 | | ggml_tensor * llm_build_qwen35::build_norm_gated( |
104 | | ggml_tensor * input, |
105 | | ggml_tensor * weights, |
106 | | ggml_tensor * gate, |
107 | 0 | int layer) { |
108 | 0 | ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); |
109 | 0 | ggml_tensor * gated_silu = ggml_silu(ctx0, gate); |
110 | |
|
111 | 0 | return ggml_mul(ctx0, normalized, gated_silu); |
112 | 0 | } |
113 | | |
114 | | ggml_tensor * llm_build_qwen35::build_layer_attn( |
115 | | llm_graph_input_attn_kv * inp, |
116 | | ggml_tensor * cur, |
117 | | ggml_tensor * inp_pos, |
118 | | int * sections, |
119 | 0 | int il) { |
120 | 0 | const int64_t n_embd_head = hparams.n_embd_head_v; |
121 | 0 | GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
122 | | |
123 | | // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention |
124 | | |
125 | | // Qwen3Next uses a single Q projection that outputs query + gate |
126 | 0 | ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ] |
127 | 0 | cb(Qcur_full, "Qcur_full", il); |
128 | |
|
129 | 0 | ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, |
130 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
131 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0); |
132 | 0 | cb(Qcur, "Qcur_reshaped", il); |
133 | | |
134 | | // Apply Q normalization |
135 | 0 | Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); |
136 | 0 | cb(Qcur, "Qcur_normed", il); |
137 | |
|
138 | 0 | ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); |
139 | 0 | cb(Kcur, "Kcur", il); |
140 | |
|
141 | 0 | ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); |
142 | 0 | cb(Vcur, "Vcur", il); |
143 | | |
144 | | // Apply K normalization |
145 | 0 | Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
146 | 0 | Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); |
147 | 0 | cb(Kcur, "Kcur_normed", il); |
148 | |
|
149 | 0 | ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, |
150 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2, |
151 | 0 | ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, |
152 | 0 | ggml_element_size(Qcur_full) * n_embd_head); |
153 | 0 | gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); |
154 | 0 | cb(gate, "gate_reshaped", il); |
155 | |
|
156 | 0 | Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); |
157 | | |
158 | | // Apply MRoPE |
159 | 0 | Qcur = ggml_rope_multi( |
160 | 0 | ctx0, Qcur, inp_pos, nullptr, |
161 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
162 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
163 | 0 | ); |
164 | |
|
165 | 0 | Kcur = ggml_rope_multi( |
166 | 0 | ctx0, Kcur, inp_pos, nullptr, |
167 | 0 | n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
168 | 0 | ext_factor, attn_factor, beta_fast, beta_slow |
169 | 0 | ); |
170 | |
|
171 | 0 | cb(Qcur, "Qcur", il); |
172 | 0 | cb(Kcur, "Kcur", il); |
173 | 0 | cb(Vcur, "Vcur", il); |
174 | | |
175 | | // Attention computation |
176 | 0 | const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; |
177 | |
|
178 | 0 | cur = build_attn(inp, |
179 | 0 | nullptr, nullptr, |
180 | 0 | Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); |
181 | 0 | cb(cur, "attn_pregate", il); |
182 | |
|
183 | 0 | ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate); |
184 | 0 | cb(gate_sigmoid, "gate_sigmoid", il); |
185 | |
|
186 | 0 | cur = ggml_mul(ctx0, cur, gate_sigmoid); |
187 | 0 | cb(cur, "attn_gated", il); |
188 | |
|
189 | 0 | cur = build_lora_mm(model.layers[il].wo, cur); |
190 | 0 | cb(cur, "attn_output", il); |
191 | |
|
192 | 0 | return cur; |
193 | 0 | } |
194 | | |
195 | | ggml_tensor * llm_build_qwen35::build_layer_attn_linear( |
196 | | llm_graph_input_rs * inp, |
197 | | ggml_tensor * cur, |
198 | 0 | int il) { |
199 | 0 | const auto * mctx_cur = inp->mctx; |
200 | |
|
201 | 0 | const int64_t d_inner = hparams.ssm_d_inner; |
202 | 0 | const int64_t n_seqs = ubatch.n_seqs; |
203 | 0 | const int64_t head_k_dim = hparams.ssm_d_state; |
204 | 0 | const int64_t num_k_heads = hparams.ssm_n_group; |
205 | 0 | const int64_t num_v_heads = hparams.ssm_dt_rank; |
206 | 0 | const int64_t head_v_dim = d_inner / num_v_heads; |
207 | 0 | const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
208 | |
|
209 | 0 | const auto kv_head = mctx_cur->get_head(); |
210 | |
|
211 | 0 | GGML_ASSERT(n_seqs != 0); |
212 | 0 | GGML_ASSERT(ubatch.equal_seqs()); |
213 | 0 | GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); |
214 | | |
215 | | // Input projections |
216 | 0 | auto qkvz = build_qkvz(cur, il); |
217 | 0 | ggml_tensor * qkv_mixed = qkvz.first; |
218 | 0 | ggml_tensor * z = qkvz.second; |
219 | |
|
220 | 0 | ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur); |
221 | 0 | beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); |
222 | 0 | cb(beta, "beta", il); |
223 | |
|
224 | 0 | beta = ggml_sigmoid(ctx0, beta); |
225 | |
|
226 | 0 | ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur); |
227 | 0 | alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); |
228 | 0 | cb(alpha, "alpha", il); |
229 | |
|
230 | 0 | ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); |
231 | 0 | ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); |
232 | 0 | cb(alpha_softplus, "a_softplus", il); |
233 | |
|
234 | 0 | ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus |
235 | 0 | cb(gate, "gate", il); |
236 | |
|
237 | 0 | gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); |
238 | | |
239 | | // Get convolution states from cache |
240 | 0 | ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); |
241 | 0 | ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); |
242 | | |
243 | | // Build the convolution states tensor |
244 | 0 | ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); |
245 | 0 | cb(conv_states, "conv_states", il); |
246 | | |
247 | | // Calculate convolution kernel size |
248 | 0 | ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; |
249 | 0 | const int64_t conv_kernel_size = conv_kernel->ne[0]; |
250 | 0 | const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; |
251 | |
|
252 | 0 | conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); |
253 | 0 | cb(conv_states, "conv_states_reshaped", il); |
254 | |
|
255 | 0 | qkv_mixed = ggml_transpose(ctx0, qkv_mixed); |
256 | 0 | cb(qkv_mixed, "qkv_mixed_transposed", il); |
257 | |
|
258 | 0 | ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); |
259 | 0 | cb(conv_input, "conv_input", il); |
260 | | |
261 | | // Update convolution state cache |
262 | | // Extract the last (conv_kernel_size - 1) states from conv_input |
263 | 0 | ggml_tensor * last_conv_states = |
264 | 0 | ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], |
265 | 0 | conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); |
266 | 0 | cb(last_conv_states, "last_conv_states", il); |
267 | |
|
268 | 0 | ggml_tensor * state_update_target = |
269 | 0 | ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs, |
270 | 0 | kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all)); |
271 | 0 | cb(state_update_target, "state_update_target", il); |
272 | |
|
273 | 0 | ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); |
274 | |
|
275 | 0 | ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); |
276 | 0 | state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); |
277 | 0 | cb(state, "state_predelta", il); |
278 | |
|
279 | 0 | ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel); |
280 | 0 | cb(conv_output_proper, "conv_output_raw", il); |
281 | |
|
282 | 0 | ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper); |
283 | 0 | cb(conv_output_silu, "conv_output_silu", il); |
284 | |
|
285 | 0 | ggml_tensor * conv_qkv_mix = conv_output_silu; |
286 | | |
287 | | // Calculate the total conv dimension |
288 | 0 | int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads; |
289 | 0 | int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim); |
290 | | |
291 | | // Extract the convolved Q, K, V from conv_output |
292 | 0 | ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, |
293 | 0 | ggml_row_size(conv_qkv_mix->type, head_k_dim), |
294 | 0 | nb1_qkv, |
295 | 0 | nb1_qkv * n_seq_tokens, |
296 | 0 | 0); |
297 | |
|
298 | 0 | ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, |
299 | 0 | ggml_row_size(conv_qkv_mix->type, head_k_dim), |
300 | 0 | nb1_qkv, |
301 | 0 | nb1_qkv * n_seq_tokens, |
302 | 0 | head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); |
303 | |
|
304 | 0 | ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs, |
305 | 0 | ggml_row_size(conv_qkv_mix->type, head_v_dim), |
306 | 0 | nb1_qkv, |
307 | 0 | nb1_qkv * n_seq_tokens, |
308 | 0 | ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads)); |
309 | |
|
310 | 0 | cb(q_conv, "q_conv", il); |
311 | 0 | cb(k_conv, "k_conv", il); |
312 | 0 | cb(v_conv, "v_conv", il); |
313 | |
|
314 | 0 | const float eps_norm = hparams.f_norm_rms_eps; |
315 | |
|
316 | 0 | q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm); |
317 | 0 | k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm); |
318 | | |
319 | | //q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); |
320 | | //k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); |
321 | | //v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); |
322 | | |
323 | | // if head keys and value keys are different, repeat to force tensors into matching shapes |
324 | 0 | if (num_k_heads != num_v_heads) { |
325 | 0 | GGML_ASSERT(num_v_heads % num_k_heads == 0); |
326 | | // TODO: try to avoid these explicit repeats by utilizing op broadcast |
327 | 0 | q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs); |
328 | 0 | k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs); |
329 | 0 | } |
330 | |
|
331 | 0 | cb(q_conv, "q_conv_predelta", il); |
332 | 0 | cb(k_conv, "k_conv_predelta", il); |
333 | 0 | cb(v_conv, "v_conv_predelta", il); |
334 | | |
335 | | // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens |
336 | 0 | std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state) |
337 | 0 | if (n_seq_tokens == 1) { |
338 | 0 | attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il); |
339 | 0 | } else { |
340 | 0 | attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il); |
341 | 0 | } |
342 | 0 | ggml_tensor * output = attn_out.first; |
343 | 0 | ggml_tensor * new_state = attn_out.second; |
344 | 0 | cb(output, "attn_output", il); |
345 | 0 | cb(new_state, "new_state", il); |
346 | | |
347 | | // Update the recurrent states |
348 | 0 | ggml_build_forward_expand(gf, |
349 | 0 | ggml_cpy(ctx0, new_state, |
350 | 0 | ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, |
351 | 0 | kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); |
352 | | |
353 | | // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] |
354 | 0 | ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); |
355 | | |
356 | | // Apply gated normalization: self.norm(core_attn_out, z) |
357 | 0 | ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il); |
358 | | |
359 | | // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim] |
360 | 0 | ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); |
361 | 0 | cb(final_output, "final_output", il); |
362 | | |
363 | | // Output projection |
364 | 0 | cur = build_lora_mm(model.layers[il].ssm_out, final_output); |
365 | 0 | cb(cur, "linear_attn_out", il); |
366 | | |
367 | | // Reshape back to original dimensions |
368 | 0 | cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs); |
369 | |
|
370 | 0 | return cur; |
371 | 0 | } |
372 | | |
373 | 0 | ggml_tensor * llm_build_qwen35::build_layer_ffn(ggml_tensor * cur, const int il) { |
374 | | // Qwen3.5 does not use MoE FFN |
375 | 0 | GGML_ASSERT(model.layers[il].ffn_gate_inp == nullptr); |
376 | |
|
377 | 0 | cur = build_ffn(cur, |
378 | 0 | model.layers[il].ffn_up, NULL, NULL, |
379 | 0 | model.layers[il].ffn_gate, NULL, NULL, |
380 | 0 | model.layers[il].ffn_down, NULL, NULL, |
381 | 0 | NULL, |
382 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
383 | 0 | cb(cur, "ffn_out", il); |
384 | |
|
385 | 0 | return cur; |
386 | 0 | } |