Coverage Report

Created: 2026-06-22 06:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/src/models/deepseek32.cpp
Line
Count
Source
1
#include "models.h"
2
3
#include "llama-kv-cache.h"
4
#include "llama-kv-cache-dsa.h"
5
6
0
void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) {
7
0
    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
8
0
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
9
0
    hparams.f_norm_eps = 1e-6;  // eps for layer norm
10
0
    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
11
12
    // MoE parameters
13
0
    ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
14
0
    ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
15
0
    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
16
0
    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
17
0
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
18
0
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
19
20
    // deepseek MLA parameters
21
0
    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,      hparams.n_lora_q);
22
0
    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
23
0
    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
24
0
    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
25
0
    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
26
0
    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
27
28
    // DSA parameters
29
0
    ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
30
0
    ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
31
0
    ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,      hparams.indexer_top_k);
32
33
    // Expert gating function
34
0
    ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
35
36
0
    if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
37
        // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
38
        // cancel the factor from the convert script
39
0
        hparams.rope_yarn_log_mul /= 0.1f;
40
0
    }
41
42
    // NextN/MTP parameters
43
0
    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
44
0
    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer");
45
46
0
    switch (hparams.n_layer()) {
47
0
        case 62: type = LLM_TYPE_685B_A37B; break;
48
0
        default: type = LLM_TYPE_UNKNOWN;
49
0
    }
50
0
}
51
52
0
void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) {
53
0
    LLAMA_LOAD_LOCALS;
54
0
    const bool is_mla = hparams.is_mla();
55
0
    if (!is_mla) {
56
0
        throw std::runtime_error("DEEPSEEK32 architecture requires MLA");
57
0
    }
58
59
    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
60
0
    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
61
0
    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
62
63
0
    const int64_t n_embd_head_qk_rope = hparams.n_rot();
64
0
    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
65
66
0
    const int64_t q_lora_rank  = hparams.n_lora_q;
67
0
    const int64_t kv_lora_rank = hparams.n_lora_kv;
68
69
0
    const int64_t n_ff_exp        = hparams.n_ff_exp;
70
0
    const int64_t n_expert_shared = hparams.n_expert_shared;
71
72
0
    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
73
74
    // output
75
0
    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
76
    // try to load output.weight, if not found, use token_embd (tied embeddings)
77
0
    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
78
0
    if (!output) {
79
0
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
80
0
    }
81
82
0
    for (int i = 0; i < n_layer_all; ++i) {
83
0
        int flags = 0;
84
0
        if (i >= n_layer) {
85
            // skip all tensors in the NextN layers
86
            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
87
0
            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
88
0
        }
89
90
0
        auto & layer = layers[i];
91
92
0
        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
93
0
        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
94
0
        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
95
96
0
        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
97
0
        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
98
99
0
        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
100
101
        // note: only old legacy GGUF files have the unsplit wkv_b tensor; the split wk_b / wv_b tensors are loaded here
102
0
        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
103
0
        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
104
105
0
        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
106
107
0
        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
108
109
        // DSA indexer
110
0
        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
111
0
        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
112
0
        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
113
0
        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
114
0
        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
115
0
        if (i < (int) hparams.n_layer_dense_lead) {
116
0
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
117
0
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
118
0
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
119
0
        } else {
120
0
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
121
0
            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
122
123
0
            if (n_expert == 0) {
124
0
                throw std::runtime_error("n_expert must be > 0");
125
0
            }
126
0
            if (n_expert_used == 0) {
127
0
                throw std::runtime_error("n_expert_used must be > 0");
128
0
            }
129
130
            // MoE branch
131
0
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
132
0
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
133
0
            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
134
135
            // Shared expert branch
136
0
            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
137
0
            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, flags);
138
0
            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
139
0
        }
140
141
        // NextN/MTP tensors (preserved but unused) - only created for the trailing NextN layers (i >= n_layer)
142
0
        if (i >= n_layer) {
143
0
            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
144
0
            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
145
0
            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
146
147
            // Optional tensors
148
0
            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
149
0
            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
150
0
            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
151
0
        }
152
0
    }
153
0
}
154
155
0
std::unique_ptr<llm_graph_context> llama_model_deepseek32::build_arch_graph(const llm_graph_params & params) const {
156
0
    return std::make_unique<graph>(*this, params);
157
0
}
158
159
llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_params & params) :
160
0
    llm_graph_context(params) {
161
0
    const bool is_mla = hparams.is_mla();
162
0
    GGML_ASSERT(is_mla);
163
164
    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
165
0
    const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
166
0
    const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
167
0
    GGML_UNUSED(n_embd_head_v);
168
169
0
    const int64_t n_embd_head_qk_rope = hparams.n_rot();
170
0
    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
171
172
0
    const int64_t n_indexer_head = hparams.indexer_n_head;
173
0
    const int64_t n_embd_indexer_head = hparams.indexer_head_size;
174
0
    const int64_t n_embd_indexer_head_rope = hparams.n_rot();
175
0
    const int64_t n_embd_indexer_head_nope = n_embd_indexer_head - n_embd_indexer_head_rope;
176
0
    const uint32_t n_indexer_top_k = hparams.indexer_top_k;
177
178
0
    const uint32_t kv_lora_rank = hparams.n_lora_kv;
179
180
    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
181
    // See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
182
    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
183
184
    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
185
0
    GGML_ASSERT(ext_factor >= 0.0f);
186
0
    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
187
188
    // use the original attn_factor to pre-scale the kq_scale
189
0
    const float mscale   = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
190
0
    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
191
192
0
    ggml_tensor * cur;
193
0
    ggml_tensor * inpL;
194
195
    // {n_embd, n_tokens}
196
0
    inpL = build_inp_embd(model.tok_embd);
197
198
    // inp_pos - contains the positions
199
0
    ggml_tensor * inp_pos = build_inp_pos();
200
201
0
    llm_graph_input_attn_k_dsa * inp_attn_dsa = build_attn_inp_k_dsa();
202
203
0
    ggml_tensor * inp_out_ids = build_inp_out_ids();
204
205
0
    for (int il = 0; il < n_layer; ++il) {
206
0
        ggml_tensor * inpSA = inpL;
207
208
        // norm
209
0
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
210
0
        cb(cur, "attn_norm", il);
211
212
        // self_attention
213
0
        {
214
0
            ggml_tensor * qr = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
215
0
            cb(qr, "qr", il);
216
217
0
            qr = build_norm(qr, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
218
0
            cb(qr, "qr", il);
219
220
0
            ggml_tensor * top_k = nullptr;
221
222
            // lightning indexer
223
0
            {
224
0
                ggml_tensor * indexer_q = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_q_b, qr);
225
0
                cb(indexer_q, "indexer_q", il);
226
227
                // split into {n_embd_indexer_head_rope, n_indexer_head, n_tokens}
228
0
                ggml_tensor * indexer_q_pe =
229
0
                    ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_rope, n_indexer_head, n_tokens,
230
0
                                 ggml_row_size(indexer_q->type, n_embd_indexer_head),
231
0
                                 ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head, 0);
232
0
                cb(indexer_q_pe, "indexer_q_pe", il);
233
234
                // and {n_embd_indexer_head_nope, n_indexer_head, n_tokens}
235
0
                ggml_tensor * indexer_q_nope =
236
0
                    ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_nope, n_indexer_head, n_tokens,
237
0
                                 ggml_row_size(indexer_q->type, n_embd_indexer_head),
238
0
                                 ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head,
239
0
                                 ggml_row_size(indexer_q->type, n_embd_indexer_head_nope));
240
0
                cb(indexer_q_nope, "indexer_q_nope", il);
241
242
0
                indexer_q_pe = ggml_rope_ext(ctx0, indexer_q_pe, inp_pos, nullptr, n_rot,
243
0
                                     LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
244
0
                                     ext_factor, attn_factor, beta_fast, beta_slow);
245
0
                cb(indexer_q_pe, "indexer_q_pe", il);
246
247
                // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, n_indexer_head, n_tokens}
248
0
                indexer_q = ggml_concat(ctx0, indexer_q_pe, indexer_q_nope, 0);
249
0
                cb(indexer_q, "indexer_q", il);
250
251
0
                ggml_tensor * indexer_k = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_k, cur);
252
0
                cb(indexer_k, "indexer_k", il);
253
254
0
                indexer_k = build_norm(indexer_k, model.layers[il].indexer_k_norm, model.layers[il].indexer_k_norm_b, LLM_NORM, il);
255
0
                cb(indexer_k, "indexer_k", il);
256
257
                // split into {n_embd_indexer_head_rope, 1, n_tokens}
258
0
                ggml_tensor * indexer_k_pe =
259
0
                    ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_rope, 1, n_tokens,
260
0
                                 ggml_row_size(indexer_k->type, n_embd_indexer_head),
261
0
                                 ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1, 0);
262
0
                cb(indexer_k_pe, "indexer_k_pe", il);
263
264
                // and {n_embd_indexer_head_nope, 1, n_tokens}
265
0
                ggml_tensor * indexer_k_nope =
266
0
                    ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_nope, 1, n_tokens,
267
0
                                 ggml_row_size(indexer_k->type, n_embd_indexer_head),
268
0
                                 ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1,
269
0
                                 ggml_row_size(indexer_k->type, n_embd_indexer_head_nope));
270
0
                cb(indexer_k_nope, "indexer_k_nope", il);
271
272
0
                indexer_k_pe = ggml_rope_ext(ctx0, indexer_k_pe, inp_pos, nullptr, n_rot,
273
0
                                     LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
274
0
                                     ext_factor, attn_factor, beta_fast, beta_slow);
275
0
                cb(indexer_k_pe, "indexer_k_pe", il);
276
277
                // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, 1, n_tokens}
278
0
                indexer_k = ggml_concat(ctx0, indexer_k_pe, indexer_k_nope, 0);
279
0
                cb(indexer_k, "indexer_k", il);
280
281
                // perform Hadamard transform on indexer q and k
282
0
                indexer_q = ggml_mul_mat(ctx0, inp_attn_dsa->self_k_rot_lid, indexer_q);
283
0
                cb(indexer_q, "indexer_q", il);
284
0
                indexer_k = ggml_mul_mat(ctx0, inp_attn_dsa->self_k_rot_lid, indexer_k);
285
0
                cb(indexer_k, "indexer_k", il);
286
287
                // store indexer keys to KV cache
288
0
                const auto * mctx_lid = inp_attn_dsa->mctx->get_lid();
289
0
                const auto & k_idxs_lid = inp_attn_dsa->get_k_idxs_lid();
290
0
                ggml_build_forward_expand(gf, mctx_lid->cpy_k(ctx0, indexer_k, k_idxs_lid, il));
291
292
                // prepare indexer weights
293
0
                ggml_tensor * indexer_weights = ggml_mul_mat(ctx0, model.layers[il].indexer_proj, cur);
294
0
                cb(indexer_weights, "indexer_weights", il);
295
296
                // get cached indexer keys
297
0
                indexer_k = mctx_lid->get_k(ctx0, il);
298
299
                // split the batch into streams if needed
300
0
                const auto n_stream = indexer_k->ne[3];
301
0
                indexer_q = ggml_view_4d(ctx0, indexer_q, indexer_q->ne[0], indexer_q->ne[1], indexer_q->ne[2]/n_stream, n_stream, indexer_q->nb[1], indexer_q->nb[2], indexer_q->nb[3]/n_stream, 0);
302
0
                indexer_weights = ggml_view_4d(ctx0, indexer_weights, indexer_weights->ne[0], indexer_weights->ne[1]/n_stream, indexer_weights->ne[2], n_stream, indexer_weights->nb[1], indexer_weights->nb[2]/n_stream, indexer_weights->nb[3]/n_stream, 0);
303
304
                // calculate indexer kq
305
0
                indexer_q = ggml_permute(ctx0, indexer_q, 0, 2, 1, 3);
306
0
                cb(indexer_q, "indexer_q", il);
307
0
                indexer_k = ggml_permute(ctx0, indexer_k, 0, 2, 1, 3);
308
0
                cb(indexer_k, "indexer_k", il);
309
310
0
                ggml_tensor * indexer_kq = ggml_mul_mat(ctx0, indexer_k, indexer_q);
311
0
                cb(indexer_kq, "indexer_kq", il);
312
313
                // ReLU requires contiguous tensors
314
0
                indexer_kq = ggml_cont(ctx0, ggml_permute(ctx0, indexer_kq, 2, 1, 0, 3));
315
0
                cb(indexer_kq, "indexer_kq", il);
316
317
                // apply ReLU
318
0
                ggml_tensor * indexer_score = ggml_relu(ctx0, indexer_kq);
319
0
                cb(indexer_score, "indexer_score", il);
320
321
                // pre-scale weights to avoid scaling operations on huge indexer_score tensor
322
0
                indexer_weights = ggml_scale(ctx0, indexer_weights, 1.0f / sqrtf(float(n_embd_indexer_head * n_indexer_head)));
323
0
                cb(indexer_weights, "indexer_weights", il);
324
325
                // multiply scores by indexer weights
326
0
                indexer_score = ggml_mul(ctx0, indexer_score, indexer_weights);
327
0
                cb(indexer_score, "indexer_score", il);
328
329
                // sum over the n_indexer_head dimension of q
330
0
                indexer_score = ggml_sum_rows(ctx0, indexer_score);
331
0
                cb(indexer_score, "indexer_score", il);
332
333
                // permute result to match KQ mask
334
0
                indexer_score = ggml_cont(ctx0, ggml_permute(ctx0, indexer_score, 2, 1, 0, 3));
335
0
                cb(indexer_score, "indexer_score", il);
336
337
                // mask indexer scores
338
0
                ggml_tensor * indexer_kq_mask = inp_attn_dsa->get_kq_mask_lid();
339
0
                indexer_score = ggml_add(ctx0, indexer_score, indexer_kq_mask);
340
0
                cb(indexer_score, "indexer_score", il);
341
342
                // get indices of top k indexer scores
343
0
                uint32_t n_top_k = indexer_score->ne[0] < n_indexer_top_k ? indexer_score->ne[0] : n_indexer_top_k;
344
0
                top_k = ggml_cont(ctx0, ggml_top_k(ctx0, indexer_score, n_top_k));
345
0
                cb(top_k, "top_k", il);
346
0
            }
347
348
0
            ggml_tensor * q = ggml_mul_mat(ctx0, model.layers[il].wq_b, qr);
349
0
            cb(q, "q", il);
350
351
            // split into {n_embd_head_qk_nope, n_head, n_tokens}
352
0
            ggml_tensor * q_nope =
353
0
                ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
354
0
                             ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
355
0
            cb(q_nope, "q_nope", il);
356
357
            // and {n_embd_head_qk_rope, n_head, n_tokens}
358
0
            ggml_tensor * q_pe = ggml_view_3d(
359
0
                ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
360
0
                ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope));
361
0
            cb(q_pe, "q_pe", il);
362
363
0
            ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
364
0
            cb(kv_cmpr_pe, "kv_cmpr_pe", il);
365
366
            // split into {kv_lora_rank, n_tokens}
367
0
            ggml_tensor * kv_cmpr =
368
0
                ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
369
0
                             ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
370
0
            cb(kv_cmpr, "kv_cmpr", il);
371
372
            // and {n_embd_head_qk_rope, 1, n_tokens}
373
0
            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
374
0
                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
375
0
                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
376
0
                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
377
0
            cb(k_pe, "k_pe", il);
378
379
0
            q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
380
0
                                 ext_factor, attn_factor, beta_fast, beta_slow);
381
0
            cb(q_pe, "q_pe", il);
382
383
0
            k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
384
0
                                 ext_factor, attn_factor, beta_fast, beta_slow);
385
0
            cb(k_pe, "k_pe", il);
386
387
0
            kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
388
0
            cb(kv_cmpr, "kv_cmpr", il);
389
390
            // MLA attention
391
0
            {
392
                // {n_embd_head_qk_nope, n_tokens, n_head}
393
0
                q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
394
0
                cb(q_nope, "q_nope_perm", il);
395
396
                // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
397
0
                ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
398
0
                cb(q_nope_absorbed, "q_nope_absorbed", il);
399
400
                // {kv_lora_rank, n_head, n_tokens}
401
0
                q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
402
0
                cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
403
404
                // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
405
                // note: rope must go first for in-place context shifting in build_rope_shift() -- NOTE(review): the concat below places q_nope_absorbed first; confirm which order is intended
406
0
                ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
407
0
                cb(Qcur, "Qcur", il);
408
409
0
                kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
410
0
                cb(kv_cmpr, "kv_cmpr_reshape", il);
411
412
                // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
413
0
                ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
414
0
                cb(Kcur, "Kcur", il);
415
416
                // {kv_lora_rank, 1, n_tokens}
417
0
                ggml_tensor * Vcur = kv_cmpr;
418
0
                cb(Vcur, "Vcur", il);
419
420
                // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
421
0
                cur = build_attn(inp_attn_dsa,
422
0
                        model.layers[il].wo, NULL, model.layers[il].wo_s,
423
0
                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il);
424
0
            }
425
0
        }
426
0
        if (il == n_layer - 1 && inp_out_ids) {
427
0
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
428
0
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
429
0
        }
430
0
        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
431
0
        cb(ffn_inp, "ffn_inp", il);
432
433
0
        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
434
0
        cb(cur, "ffn_norm", il);
435
436
0
        if ((uint32_t) il < hparams.n_layer_dense_lead) {
437
0
            cur = build_ffn(cur,
438
0
                model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
439
0
                model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
440
0
                model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
441
0
                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
442
0
            cb(cur, "ffn_out", il);
443
0
        } else {
444
            // MoE branch
445
0
            ggml_tensor * moe_out = build_moe_ffn(cur,
446
0
                model.layers[il].ffn_gate_inp,
447
0
                model.layers[il].ffn_up_exps,
448
0
                model.layers[il].ffn_gate_exps,
449
0
                model.layers[il].ffn_down_exps,
450
0
                model.layers[il].ffn_exp_probs_b,
451
0
                n_expert, n_expert_used,
452
0
                LLM_FFN_SILU, hparams.expert_weights_norm,
453
0
                hparams.expert_weights_scale,
454
0
                (llama_expert_gating_func_type) hparams.expert_gating_func,
455
0
                il,
456
0
                nullptr,
457
0
                model.layers[il].ffn_gate_up_exps,
458
0
                model.layers[il].ffn_up_exps_s,
459
0
                model.layers[il].ffn_gate_exps_s,
460
0
                model.layers[il].ffn_down_exps_s);
461
0
            cb(moe_out, "ffn_moe_out", il);
462
463
            // FFN shared expert
464
0
            {
465
0
                ggml_tensor * ffn_shexp =
466
0
                    build_ffn(cur,
467
0
                        model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
468
0
                        model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s,
469
0
                        model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
470
0
                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
471
0
                cb(ffn_shexp, "ffn_shexp", il);
472
473
0
                cur = ggml_add(ctx0, moe_out, ffn_shexp);
474
0
                cb(cur, "ffn_out", il);
475
0
            }
476
0
        }
477
0
        cur = ggml_add(ctx0, cur, ffn_inp);
478
479
0
        cur = build_cvec(cur, il);
480
0
        cb(cur, "l_out", il);
481
482
        // input for next layer
483
0
        inpL = cur;
484
0
    }
485
0
    cur = inpL;
486
487
0
    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
488
489
0
    cb(cur, "result_norm", -1);
490
0
    res->t_embd = cur;
491
492
    // lm_head
493
0
    cur = ggml_mul_mat(ctx0, model.output, cur);
494
495
0
    cb(cur, "result_output", -1);
496
0
    res->t_logits = cur;
497
498
0
    ggml_build_forward_expand(gf, cur);
499
0
}