/src/llama.cpp/src/models/qwen35.cpp
Line | Count | Source |
1 | | #include "models.h" |
2 | | |
3 | | #include "llama-memory-recurrent.h" |
4 | | |
// Graph builder for Qwen3.5: a hybrid stack that interleaves full
// (MRoPE'd) attention layers with recurrent gated-delta-net layers,
// chosen per layer via hparams.is_recurrent(il).
llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
    llm_build_delta_net_base(params), model(model) {
    const int64_t n_embd_head = hparams.n_embd_head_v();

    // this architecture requires equal K and V head widths
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    // MRoPE section sizes (4 entries), consumed by the full-attention layers
    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    cb(inpL, "model.input_embed", -1);

    // hybrid memory input: carries both the attention KV cache and the
    // recurrent state used by the linear-attention layers
    auto * inp = build_inp_mem_hybrid();

    ggml_tensor * inp_pos = build_inp_pos();
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        ggml_build_forward_expand(gf, cur);

        // Determine layer type and build appropriate attention mechanism
        if (hparams.is_recurrent(il)) {
            // Linear attention layer (gated delta net)
            cur = build_layer_attn_linear(inp->get_recr(), cur, il);
        } else {
            // Full attention layer
            cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
        }

        // on the last layer, keep only the rows whose outputs are requested
        if (il == n_layer - 1 && inp_out_ids) {
            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        // Residual connection
        cur = ggml_add(ctx0, cur, inpSA);
        cb(cur, "attn_residual", il);

        // Save the tensor before post-attention norm for residual connection
        ggml_tensor * ffn_residual = cur;

        // Post-attention norm
        ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
        cb(attn_post_norm, "attn_post_norm", il);

        // Dense FFN layer - without residual connection
        cur = build_layer_ffn(attn_post_norm, il);
        cb(cur, "ffn_out", il);

        // Residual connection for FFN - add to the tensor from before post_attention_layernorm
        cur = ggml_add(ctx0, cur, ffn_residual);
        cb(cur, "post_ffn", il);

        // apply control vector (if present) and hand off to the next layer
        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // Input for next layer
        inpL = cur;
    }
    cur = inpL;

    // Final norm
    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // LM head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
89 | | |
90 | | std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_qkvz( |
91 | | ggml_tensor * input, |
92 | 0 | int il) { |
93 | 0 | const int64_t n_seqs = ubatch.n_seqs; |
94 | 0 | const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
95 | |
|
96 | 0 | ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s); |
97 | 0 | qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); |
98 | 0 | cb(qkv_mixed, "linear_attn_qkv_mixed", il); |
99 | |
|
100 | 0 | ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s); |
101 | 0 | cb(z, "z", il); |
102 | |
|
103 | 0 | return { qkv_mixed, z }; |
104 | 0 | } |
105 | | |
106 | | ggml_tensor * llm_build_qwen35::build_norm_gated( |
107 | | ggml_tensor * input, |
108 | | ggml_tensor * weights, |
109 | | ggml_tensor * gate, |
110 | 0 | int layer) { |
111 | 0 | ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); |
112 | 0 | ggml_tensor * gated_silu = ggml_silu(ctx0, gate); |
113 | |
|
114 | 0 | return ggml_mul(ctx0, normalized, gated_silu); |
115 | 0 | } |
116 | | |
// Full (quadratic) attention layer with MRoPE and sigmoid output gating.
//
// The single Q projection (wq) emits an interleaved [query | gate] pair per
// head: the two ggml_view_3d calls below pick out the even head-sized slice
// (query, offset 0) and the odd slice (gate, offset n_embd_head), both with
// a per-head stride of 2 * n_embd_head. The gate modulates the attention
// output before the final wo projection.
ggml_tensor * llm_build_qwen35::build_layer_attn(
        llm_graph_input_attn_kv * inp,
        ggml_tensor * cur,
        ggml_tensor * inp_pos,
        int * sections,
        int il) {
    const int64_t n_embd_head = hparams.n_embd_head_v();
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention

    // Qwen3Next uses a single Q projection that outputs query + gate
    ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ]
    cb(Qcur_full, "Qcur_full", il);

    // query half of the interleaved projection (offset 0)
    ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
            ggml_element_size(Qcur_full) * n_embd_head * 2,
            ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0);
    cb(Qcur, "Qcur_reshaped", il);

    // Apply Q normalization
    Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
    cb(Qcur, "Qcur_normed", il);

    ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
    cb(Kcur, "Kcur", il);

    ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
    cb(Vcur, "Vcur", il);

    // Apply K normalization
    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
    Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
    cb(Kcur, "Kcur_normed", il);

    // gate half of the interleaved projection (offset n_embd_head),
    // flattened to 2D for the element-wise gating below
    ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
            ggml_element_size(Qcur_full) * n_embd_head * 2,
            ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
            ggml_element_size(Qcur_full) * n_embd_head);
    gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
    cb(gate, "gate_reshaped", il);

    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

    // Apply MRoPE
    Qcur = ggml_rope_multi(
            ctx0, Qcur, inp_pos, nullptr,
            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow
            );

    Kcur = ggml_rope_multi(
            ctx0, Kcur, inp_pos, nullptr,
            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow
            );

    cb(Qcur, "Qcur", il);
    cb(Kcur, "Kcur", il);
    cb(Vcur, "Vcur", il);

    // Attention computation
    // f_attention_scale == 0 selects the default 1/sqrt(head_dim) scaling
    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

    cur = build_attn(inp,
            nullptr, nullptr,
            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
    cb(cur, "attn_pregate", il);

    // sigmoid gate applied element-wise to the attention output
    ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
    cb(gate_sigmoid, "gate_sigmoid", il);

    cur = ggml_mul(ctx0, cur, gate_sigmoid);
    cb(cur, "attn_gated", il);

    cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
    cb(cur, "attn_output", il);

    return cur;
}
197 | | |
// Linear-attention layer (gated delta net) operating on the recurrent
// state cache. The delta-net head geometry is carried in the (repurposed)
// SSM hyperparameters:
//   head_k_dim  = ssm_d_state, num_k_heads = ssm_n_group,
//   num_v_heads = ssm_dt_rank, head_v_dim  = d_inner / num_v_heads
// The layer: projects QKV + gate z, runs a causal short conv over the
// mixed QKV (stitching in the cached conv tail), applies the delta-net
// recurrence, writes both conv and SSM states back to the cache, then
// gated-normalizes and projects the output.
ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
        llm_graph_input_rs * inp,
        ggml_tensor * cur,
        int il) {
    const auto * mctx_cur = inp->mctx;

    const int64_t d_inner = hparams.ssm_d_inner;
    const int64_t n_seqs = ubatch.n_seqs;
    const int64_t head_k_dim = hparams.ssm_d_state;
    const int64_t num_k_heads = hparams.ssm_n_group;
    const int64_t num_v_heads = hparams.ssm_dt_rank;
    const int64_t head_v_dim = d_inner / num_v_heads;
    const int64_t n_seq_tokens = ubatch.n_seq_tokens;

    // cache slot that this ubatch's updated states are written back to
    const auto kv_head = mctx_cur->get_head();

    GGML_ASSERT(n_seqs != 0);
    GGML_ASSERT(ubatch.equal_seqs());
    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

    // Input projections
    auto qkvz = build_qkvz(cur, il);
    ggml_tensor * qkv_mixed = qkvz.first;
    ggml_tensor * z = qkvz.second;

    // beta: per-head update strength, squashed to (0, 1) by the sigmoid below
    ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s);
    beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
    cb(beta, "beta", il);

    beta = ggml_sigmoid(ctx0, beta);

    // alpha feeds the decay gate: -A_log.exp() * softplus(alpha + dt_bias)
    ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s);
    alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
    cb(alpha, "alpha", il);

    ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
    ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
    cb(alpha_softplus, "a_softplus", il);

    ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
    cb(gate, "gate", il);

    gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs);

    // Get convolution states from cache
    ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
    ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);

    // Build the convolution states tensor
    ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
    cb(conv_states, "conv_states", il);

    // Calculate convolution kernel size
    ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
    const int64_t conv_kernel_size = conv_kernel->ne[0];
    const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;

    conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
    cb(conv_states, "conv_states_reshaped", il);

    // transpose so tokens lie along dim 0, then prepend the cached tail so
    // the causal conv sees the previous (conv_kernel_size - 1) positions
    qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
    cb(qkv_mixed, "qkv_mixed_transposed", il);

    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
    cb(conv_input, "conv_input", il);

    // Update convolution state cache
    // Extract the last (conv_kernel_size - 1) states from conv_input
    ggml_tensor * last_conv_states =
        ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1],
                     conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input));
    cb(last_conv_states, "last_conv_states", il);

    // destination view inside the whole-cache tensor, at slot kv_head
    ggml_tensor * state_update_target =
        ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs,
                     kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
    cb(state_update_target, "state_update_target", il);

    ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));

    // previous delta-net state: [head_v_dim, head_v_dim, num_v_heads, n_seqs]
    ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
    state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
    cb(state, "state_predelta", il);

    ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
    cb(conv_output_proper, "conv_output_raw", il);

    ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
    cb(conv_output_silu, "conv_output_silu", il);

    ggml_tensor * conv_qkv_mix = conv_output_silu;

    // Calculate the total conv dimension
    int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
    int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);

    // Extract the convolved Q, K, V from conv_output
    // (each row is laid out [q | k | v]: offsets 0, head_k_dim*num_k_heads,
    //  and 2*head_k_dim*num_k_heads respectively)
    ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
                                        ggml_row_size(conv_qkv_mix->type, head_k_dim),
                                        nb1_qkv,
                                        nb1_qkv * n_seq_tokens,
                                        0);

    ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
                                        ggml_row_size(conv_qkv_mix->type, head_k_dim),
                                        nb1_qkv,
                                        nb1_qkv * n_seq_tokens,
                                        head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));

    ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs,
                                        ggml_row_size(conv_qkv_mix->type, head_v_dim),
                                        nb1_qkv,
                                        nb1_qkv * n_seq_tokens,
                                        ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads));

    cb(q_conv, "q_conv", il);
    cb(k_conv, "k_conv", il);
    cb(v_conv, "v_conv", il);

    // L2-normalize Q and K per head before the delta-net recurrence
    const float eps_norm = hparams.f_norm_rms_eps;

    q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm);
    k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm);

    //q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
    //k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
    //v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);

    // if head keys and value keys are different, repeat to force tensors into matching shapes
    // note: need explicit repeat only if we are not using the fused GDN
    if (num_k_heads != num_v_heads && (!cparams.fused_gdn_ar || !cparams.fused_gdn_ch)) {
        GGML_ASSERT(num_v_heads % num_k_heads == 0);
        q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
        k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
    }

    cb(q_conv, "q_conv_predelta", il);
    cb(k_conv, "k_conv_predelta", il);
    cb(v_conv, "v_conv_predelta", il);

    // delta-net recurrence: returns {attention output, updated state}
    auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il);

    ggml_tensor * output = attn_out.first;
    ggml_tensor * new_state = attn_out.second;
    cb(output, "attn_output", il);
    cb(new_state, "new_state", il);

    // Update the recurrent states
    ggml_build_forward_expand(gf,
        ggml_cpy(ctx0, new_state,
            ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
                kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));

    // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
    ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);

    // Apply gated normalization: self.norm(core_attn_out, z)
    ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);

    // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
    ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
    cb(final_output, "final_output", il);

    // Output projection
    cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s);
    cb(cur, "linear_attn_out", il);

    // Reshape back to original dimensions
    cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);

    return cur;
}
370 | | |
371 | 0 | ggml_tensor * llm_build_qwen35::build_layer_ffn(ggml_tensor * cur, const int il) { |
372 | | // Qwen3.5 does not use MoE FFN |
373 | 0 | GGML_ASSERT(model.layers[il].ffn_gate_inp == nullptr); |
374 | |
|
375 | 0 | cur = build_ffn(cur, |
376 | 0 | model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s, |
377 | 0 | model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s, |
378 | 0 | model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s, |
379 | 0 | NULL, |
380 | 0 | LLM_FFN_SILU, LLM_FFN_PAR, il); |
381 | 0 | cb(cur, "ffn_out", il); |
382 | |
|
383 | 0 | return cur; |
384 | 0 | } |