Coverage Report

Created: 2026-04-12 06:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/src/llama-model.cpp
Line
Count
Source
1
#include "llama-model.h"
2
3
#include "llama-arch.h"
4
#include "llama-hparams.h"
5
#include "llama-impl.h"
6
#include "llama-mmap.h"
7
#include "llama-cparams.h"
8
#include "llama-model-loader.h"
9
10
#include "llama-kv-cache.h"
11
#include "llama-kv-cache-iswa.h"
12
#include "llama-memory-hybrid.h"
13
#include "llama-memory-hybrid-iswa.h"
14
#include "llama-memory-recurrent.h"
15
16
#include "models/models.h"
17
18
#include "ggml.h"
19
#include "ggml-cpp.h"
20
21
// TODO: tmp until the ggml meta backend matures and becomes public
22
#include "../src/ggml-ext.h"
23
24
#include <algorithm>
25
#include <cassert>
26
#include <cfloat>
27
#include <cstdint>
28
#include <cstring>
29
#include <cmath>
30
#include <functional>
31
#include <map>
32
#include <numeric>
33
#include <regex>
34
#include <sstream>
35
#include <stdexcept>
36
#include <string>
37
#include <vector>
38
39
0
struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const struct ggml_tensor * tensor, void * userdata) {
40
0
    const llama_meta_device_get_split_state_userdata * ud = (const llama_meta_device_get_split_state_userdata *) userdata;
41
0
    const llama_hparams & hparams = ud->model->hparams;
42
0
    const std::string tensor_name = tensor->name;
43
44
0
    const std::regex pattern_q_weight        ("blk\\.\\d*\\.attn_q.weight");
45
0
    const std::regex pattern_kv_weight       ("blk\\.\\d*\\.attn_(k|v).weight");
46
0
    const std::regex pattern_qkv_weight      ("blk\\.\\d*\\.attn_qkv.weight");
47
0
    const std::regex pattern_q_bias          ("blk\\.\\d*\\.attn_q\\.bias");
48
0
    const std::regex pattern_kv_bias         ("blk\\.\\d*\\.attn_(k|v)\\.bias");
49
0
    const std::regex pattern_qkv_bias        ("blk\\.\\d*\\.attn_qkv.bias");
50
0
    const std::regex pattern_qk_norm         ("blk\\.\\d*\\.attn_(q|k)_norm\\.weight");
51
0
    const std::regex pattern_kv_cache        ("cache_(k|v)_l\\d*");
52
0
    const std::regex pattern_attn_sinks      ("blk\\.\\d*\\.attn_sinks.weight");
53
0
    const std::regex pattern_attn_out_weight ("blk\\.\\d*\\.attn_output.weight");
54
0
    const std::regex pattern_attn_out_bias   ("blk\\.\\d*\\.attn_output.bias");
55
0
    const std::regex pattern_attn_gate_weight("blk\\.\\d*\\.attn_gate.weight");
56
57
0
    const std::regex pattern_ssm_dt          ("blk\\.\\d*\\.ssm_dt.bias");
58
0
    const std::regex pattern_ssm_a           ("blk\\.\\d*\\.ssm_a");
59
0
    const std::regex pattern_ssm_alpha       ("blk\\.\\d*\\.ssm_alpha.weight");
60
0
    const std::regex pattern_ssm_beta        ("blk\\.\\d*\\.ssm_beta.weight");
61
0
    const std::regex pattern_ssm_beta_alpha  ("blk\\.\\d*\\.ssm_ba.weight");
62
0
    const std::regex pattern_r_cache         ("cache_r_l\\d*");
63
0
    const std::regex pattern_s_cache         ("cache_s_l\\d*");
64
0
    const std::regex pattern_ssm_conv1d      ("blk\\.\\d*\\.ssm_conv1d.weight");
65
0
    const std::regex pattern_ssm_out_weight  ("blk\\.\\d*\\.ssm_out.weight");
66
67
0
    const std::regex pattern_ffn_up_gate_weight("blk\\.\\d*\\.ffn_(up|gate)(_exps)?.weight");
68
0
    const std::regex pattern_ffn_up_gate_bias  ("blk\\.\\d*\\.ffn_(up|gate)(_exps)?.bias");
69
0
    const std::regex pattern_ffn_gate_up_weight("blk\\.\\d*\\.ffn_gate_up(_exps)?.weight");
70
0
    const std::regex pattern_ffn_down_weight   ("blk\\.\\d*\\.ffn_down(_exps)?.weight");
71
0
    const std::regex pattern_ffn_down_bias     ("blk\\.\\d*\\.ffn_down.bias");
72
0
    const std::regex pattern_ffn_down_exps_bias("blk\\.\\d*\\.ffn_down_exps.bias");
73
74
0
    const std::regex pattern_output_weight("output\\.weight");
75
0
    const std::regex pattern_output_bias  ("output\\.bias");
76
77
0
    struct tensor_config {
78
0
        ggml_backend_meta_split_axis axis;
79
80
0
        const ggml_tensor * tensor_axis_0;
81
82
0
        uint32_t il;
83
0
        size_t   rotation;
84
0
    };
85
86
0
    auto get_tensor_config_impl = [&](
87
0
                const ggml_backend_meta_split_axis axis, const std::string & suffix = "", const std::string & suffix_fallback = "") -> tensor_config {
88
0
        uint32_t il;
89
0
        std::string prefix;
90
0
        size_t rotation;
91
0
        if (tensor_name.substr(0, 4) == "blk.") {
92
0
            const size_t length_prefix = tensor_name.find('.', 4);
93
0
            GGML_ASSERT(length_prefix != std::string::npos);
94
0
            prefix = tensor_name.substr(0, length_prefix + 1);
95
0
            il = std::stoull(tensor_name.substr(4, length_prefix));
96
0
            rotation = il % ud->n_devices;
97
0
        } else if (tensor_name.substr(0, 6) == "cache_") {
98
0
            const size_t layer_index_start = tensor_name.find("_l", 6);
99
0
            GGML_ASSERT(layer_index_start != std::string::npos);
100
0
            il = std::stoull(tensor_name.substr(layer_index_start + 2));
101
0
            prefix = "blk." + std::to_string(il) + ".";
102
0
            rotation = il % ud->n_devices;
103
0
        } else {
104
0
            il = 0;
105
0
            rotation = hparams.n_layer % ud->n_devices;
106
0
        }
107
0
        const ggml_tensor * tensor_axis_0 = suffix.empty() ? tensor : ud->model->get_tensor((prefix + suffix).c_str());
108
0
        if (tensor_axis_0 == nullptr) {
109
0
            GGML_ASSERT(!suffix_fallback.empty());
110
0
            tensor_axis_0 = ud->model->get_tensor((prefix + suffix_fallback).c_str());
111
0
        }
112
0
        GGML_ASSERT(tensor_axis_0 != nullptr);
113
0
        return {axis, tensor_axis_0, il, rotation};
114
0
    };
115
116
0
    auto get_tensor_config = [&]() -> tensor_config {
117
        // standard attention
118
0
        if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_kv_weight)) {
119
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
120
0
        }
121
0
        if (std::regex_match(tensor_name, pattern_q_bias) || std::regex_match(tensor_name, pattern_kv_bias)) {
122
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight");
123
0
        }
124
0
        if (std::regex_match(tensor_name, pattern_qkv_weight)) {
125
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
126
0
        }
127
0
        if ( std::regex_match(tensor_name, pattern_qkv_bias)) {
128
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
129
0
        }
130
0
        if (std::regex_match(tensor_name, pattern_qk_norm)) {
131
0
            return get_tensor_config_impl(tensor->ne[1] == 1 ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
132
0
        }
133
0
        if (std::regex_match(tensor_name, pattern_kv_cache) || std::regex_match(tensor_name, pattern_attn_sinks)) {
134
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight");
135
0
        }
136
0
        if (std::regex_match(tensor_name, pattern_attn_out_weight)) {
137
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
138
0
        }
139
0
        if (std::regex_match(tensor_name, pattern_attn_out_bias)) {
140
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED);
141
0
        }
142
143
0
        if (std::regex_match(tensor_name, pattern_attn_gate_weight)) {
144
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
145
0
        }
146
0
        if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a)) {
147
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight");
148
0
        }
149
0
        if (std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta) ||
150
0
                std::regex_match(tensor_name, pattern_ssm_beta_alpha)) {
151
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ssm_out.weight");
152
0
        }
153
0
        if (std::regex_match(tensor_name, pattern_r_cache) || std::regex_match(tensor_name, pattern_s_cache)) {
154
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight");
155
0
        }
156
0
        if (std::regex_match(tensor_name, pattern_ssm_conv1d)) {
157
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ssm_out.weight");
158
0
        }
159
0
        if (std::regex_match(tensor_name, pattern_ssm_out_weight)) {
160
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
161
0
        }
162
163
        // FFN
164
0
        if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight)) {
165
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ffn_down.weight", "ffn_down_exps.weight");
166
0
        }
167
0
        if (std::regex_match(tensor_name, pattern_ffn_up_gate_bias)) {
168
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ffn_down.weight", "ffn_down_exps.weight");
169
0
        }
170
0
        if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) {
171
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ffn_down.weight", "ffn_down_exps.weight");
172
0
        }
173
0
        if (std::regex_match(tensor_name, pattern_ffn_down_weight)) {
174
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ffn_down.weight", "ffn_down_exps.weight");
175
0
        }
176
0
        if (std::regex_match(tensor_name, pattern_ffn_down_bias)) {
177
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED);
178
0
        }
179
0
        if (std::regex_match(tensor_name, pattern_ffn_down_exps_bias)) {
180
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_PARTIAL);
181
0
        }
182
183
        // output
184
0
        if (std::regex_match(tensor_name, pattern_output_weight)) {
185
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
186
0
        }
187
0
        if (std::regex_match(tensor_name, pattern_output_bias)) {
188
0
            const ggml_tensor * output_weight = ud->model->get_tensor("output.weight");
189
0
            GGML_ASSERT(output_weight != nullptr);
190
0
            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
191
0
        }
192
193
        // everything else
194
0
        return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED);
195
0
    };
196
197
0
    auto get_split_segments = [&](int axis, uint32_t il) -> std::vector<int64_t> {
198
0
        if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) {
199
0
            const int64_t head_k_dim = hparams.ssm_d_state;
200
0
            const int64_t head_v_dim = hparams.ssm_d_state;
201
0
            const int64_t n_k_heads  = hparams.ssm_n_group;
202
0
            const int64_t n_v_heads  = hparams.ssm_dt_rank;
203
0
            const int64_t key_dim    = head_k_dim * n_k_heads;
204
0
            const int64_t value_dim  = head_v_dim * n_v_heads;
205
206
            // both Qwen 3 Next and Qwen 3.5 support n_v_heads > n_k_heads but the broadcasting pattern is different:
207
            //   - Qwen 3 Next: [k0_v0, k0_v1, k1_v2, k1_v3] (this is the default split pattern)
208
            //   - Qwen 3.5:    [k0_v0, k1_v1, k0_v2, k1_v3] (needs segmenting of V on the scale of K to get the correct pattern)
209
0
            if (ud->model->arch == LLM_ARCH_QWEN3NEXT) {
210
0
                if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) {
211
0
                    GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim);
212
0
                    return {key_dim, key_dim, value_dim};
213
0
                }
214
0
            } else {
215
0
                const int64_t head_ratio = n_v_heads / n_k_heads;
216
0
                if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) {
217
0
                    GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim);
218
0
                    return std::vector<int64_t>(2 + head_ratio, key_dim);
219
0
                }
220
0
                if (std::regex_match(tensor_name, pattern_attn_gate_weight) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
221
0
                    return std::vector<int64_t>(head_ratio, key_dim);
222
0
                }
223
0
                if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) ||
224
0
                        std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) {
225
0
                    return std::vector<int64_t>(head_ratio, n_k_heads);
226
0
                }
227
0
                if (std::regex_match(tensor_name, pattern_r_cache)) {
228
0
                    return std::vector<int64_t>(2 + head_ratio, key_dim * (hparams.ssm_d_conv - 1));
229
0
                }
230
0
                if (std::regex_match(tensor_name, pattern_s_cache)) {
231
0
                    return std::vector<int64_t>(head_ratio, n_k_heads * head_v_dim * head_v_dim);
232
0
                }
233
0
            }
234
235
            // the FFN is the same for Qwen 3 Next and Qwen 3.5:
236
0
            if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) {
237
0
                const int64_t n_ff_exp = hparams.n_ff_exp;
238
0
                GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp);
239
0
                return {n_ff_exp, n_ff_exp};
240
0
            }
241
0
            return {tensor->ne[axis]};
242
0
        }
243
244
0
        if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) {
245
0
            const int64_t n_embd      = hparams.n_embd;
246
0
            const int64_t n_embd_gqa  = hparams.n_embd_v_gqa(il);
247
0
            GGML_ASSERT(hparams.n_embd_k_gqa() == n_embd_gqa);
248
0
            GGML_ASSERT(tensor->ne[axis] == n_embd + 2*n_embd_gqa);
249
0
            return {n_embd, n_embd_gqa, n_embd_gqa};
250
0
        }
251
0
        if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) {
252
0
            const int64_t n_ff_exp = hparams.n_ff_exp;
253
0
            GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp);
254
0
            return {n_ff_exp, n_ff_exp};
255
0
        }
256
0
        return {tensor->ne[axis]};
257
0
    };
258
259
0
    auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector<int64_t> & segments) -> std::vector<int64_t> {
260
0
        if (hparams.is_recurrent(il)) {
261
            // linear attention
262
0
            const int64_t head_dim  = hparams.ssm_d_state;
263
0
            const int64_t granularity_qkv = std::lcm(blck_size, head_dim);
264
0
            if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_attn_gate_weight) ||
265
0
                    std::regex_match(tensor_name, pattern_ssm_conv1d) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
266
0
                return std::vector<int64_t>(segments.size(), granularity_qkv);
267
0
            }
268
0
            if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) ||
269
0
                    std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) {
270
0
                return std::vector<int64_t>(segments.size(), granularity_qkv / head_dim);
271
0
            }
272
0
            if (std::regex_match(tensor_name, pattern_ssm_beta_alpha)) {
273
0
                return std::vector<int64_t>(segments.size(), 2 * (granularity_qkv / head_dim));
274
0
            }
275
0
            if (std::regex_match(tensor_name, pattern_r_cache)) {
276
0
                return std::vector<int64_t>(segments.size(), granularity_qkv * (hparams.ssm_d_conv - 1));
277
0
            }
278
0
            if (std::regex_match(tensor_name, pattern_s_cache)) {
279
0
                return std::vector<int64_t>(segments.size(), granularity_qkv * head_dim);
280
0
            }
281
0
        } else {
282
            // regular attention
283
0
            const uint32_t n_gqa    = hparams.n_gqa(il);
284
0
            const uint32_t n_embd_q = n_gqa * hparams.n_embd_head_k(il);
285
0
            if (std::regex_match(tensor_name, pattern_attn_sinks)) {
286
0
                GGML_ASSERT(segments.size() == 1);
287
0
                return {std::lcm(n_embd_q, blck_size)/n_embd_q * n_gqa};
288
0
            }
289
290
0
            const int64_t granularity_q = std::lcm(n_embd_q, blck_size);
291
0
            if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_q_bias)) {
292
0
                GGML_ASSERT(segments.size() == 1);
293
                // some models have Q gate tensors, for those cases the granularity needs to be doubled:
294
0
                if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) {
295
0
                    return {std::lcm(2*n_embd_q, blck_size)};
296
0
                }
297
0
                return {granularity_q};
298
0
            }
299
0
            if (std::regex_match(tensor_name, pattern_attn_out_weight)) {
300
0
                GGML_ASSERT(segments.size() == 1);
301
0
                return {granularity_q};
302
0
            }
303
304
0
            const int64_t granularity_kv = granularity_q / n_gqa;
305
0
            if (std::regex_match(tensor_name, pattern_kv_weight) ||
306
0
                std::regex_match(tensor_name, pattern_kv_bias) ||
307
0
                std::regex_match(tensor_name, pattern_kv_cache)) {
308
0
                GGML_ASSERT(segments.size() == 1);
309
0
                return {granularity_kv};
310
0
            }
311
0
            if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) {
312
0
                GGML_ASSERT(segments.size() == 3);
313
0
                return {granularity_q, granularity_kv, granularity_kv};
314
0
            }
315
0
        }
316
317
        // FFN
318
0
        if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight) || std::regex_match(tensor_name, pattern_ffn_up_gate_bias) ||
319
0
                std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) {
320
0
            GGML_ASSERT(segments.size() <= 2);
321
0
            return std::vector<int64_t>(segments.size(), blck_size);
322
0
        }
323
324
        // everything else
325
0
        GGML_ASSERT(segments.size() == 1);
326
0
        return {1};
327
0
    };
328
329
0
    ggml_backend_meta_split_state split_state;
330
0
    memset(&split_state, 0, sizeof(split_state));
331
0
    tensor_config tc = get_tensor_config();
332
0
    split_state.axis = tc.axis;
333
0
    if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) {
334
0
        const int64_t ne_full = tensor->ne[split_state.axis];
335
0
        const int64_t blck_size = ggml_blck_size(tc.tensor_axis_0->type);
336
0
        const float * tensor_split = ud->model->tensor_split();
337
0
        std::vector<float> tensor_split_scan;
338
0
        tensor_split_scan.reserve(ud->n_devices);
339
0
        for (size_t j = 0; j < ud->n_devices; j++) {
340
0
            tensor_split_scan.push_back(tensor_split == nullptr ? 0.0f : tensor_split[(j + tc.rotation) % ud->n_devices]);
341
0
            if (j > 0) {
342
0
                tensor_split_scan[j] += tensor_split_scan[j - 1];
343
0
            }
344
0
        }
345
0
        const std::vector<int64_t> segments = get_split_segments(split_state.axis, tc.il);
346
0
        const std::vector<int64_t> granularity = get_split_granularity(blck_size, tc.il, segments);
347
0
        for (size_t is = 0; is < segments.size(); is++) {
348
0
            const int64_t ne_s = segments[is];
349
0
            const int64_t g_s = granularity[is];
350
0
            GGML_ASSERT(ne_full % g_s == 0);
351
0
            int64_t low = 0;
352
0
            size_t j = 0;
353
0
            for (; j < ud->n_devices - 1; j++) {
354
0
                int64_t high = tensor_split_scan.back() == 0.0f ?
355
0
                    ne_s * (j+1)/ud->n_devices : ne_s * tensor_split_scan[j]/tensor_split_scan.back();
356
0
                if (high % g_s != 0) {
357
0
                    high -= high % g_s;
358
0
                }
359
0
                split_state.ne[is*ud->n_devices + (j + tc.rotation) % ud->n_devices] = high - low;
360
0
                low = high;
361
0
            }
362
0
            split_state.ne[is*ud->n_devices + (j + tc.rotation) % ud->n_devices] = ne_s - low;
363
0
        }
364
0
        split_state.n_segments = segments.size();
365
0
    } else {
366
0
        memset(split_state.ne, 0, sizeof(split_state.ne));
367
0
        split_state.n_segments = 1;
368
0
    }
369
0
    return split_state;
370
0
    GGML_UNUSED(userdata);
371
0
}
372
373
0
// Returns the human-readable size label for a model type, or "?B" if the type has no label.
const char * llm_type_name(llm_type type) {
    struct type_label {
        llm_type     id;
        const char * label;
    };

    static const type_label labels[] = {
        { LLM_TYPE_14M,           "14M"                 },
        { LLM_TYPE_17M,           "17M"                 },
        { LLM_TYPE_22M,           "22M"                 },
        { LLM_TYPE_33M,           "33M"                 },
        { LLM_TYPE_47M,           "47M"                 },
        { LLM_TYPE_60M,           "60M"                 },
        { LLM_TYPE_70M,           "70M"                 },
        { LLM_TYPE_80M,           "80M"                 },
        { LLM_TYPE_109M,          "109M"                },
        { LLM_TYPE_137M,          "137M"                },
        { LLM_TYPE_140M,          "140M"                },
        { LLM_TYPE_149M,          "149M"                },
        { LLM_TYPE_160M,          "160M"                },
        { LLM_TYPE_190M,          "190M"                },
        { LLM_TYPE_220M,          "220M"                },
        { LLM_TYPE_250M,          "250M"                },
        { LLM_TYPE_256M,          "256M"                },
        { LLM_TYPE_270M,          "270M"                },
        { LLM_TYPE_335M,          "335M"                },
        { LLM_TYPE_350M,          "350M"                },
        { LLM_TYPE_360M,          "360M"                },
        { LLM_TYPE_395M,          "395M"                },
        { LLM_TYPE_410M,          "410M"                },
        { LLM_TYPE_450M,          "450M"                },
        { LLM_TYPE_475M,          "475M"                },
        { LLM_TYPE_558M,          "558M"                },
        { LLM_TYPE_700M,          "700M"                },
        { LLM_TYPE_770M,          "770M"                },
        { LLM_TYPE_780M,          "780M"                },
        { LLM_TYPE_950M,          "950M"                },
        { LLM_TYPE_0_3B,          "0.3B"                },
        { LLM_TYPE_0_5B,          "0.5B"                },
        { LLM_TYPE_0_6B,          "0.6B"                },
        { LLM_TYPE_0_8B,          "0.8B"                },
        { LLM_TYPE_1B,            "1B"                  },
        { LLM_TYPE_1_2B,          "1.2B"                },
        { LLM_TYPE_1_3B,          "1.3B"                },
        { LLM_TYPE_1_4B,          "1.4B"                },
        { LLM_TYPE_1_5B,          "1.5B"                },
        { LLM_TYPE_1_6B,          "1.6B"                },
        { LLM_TYPE_1_7B,          "1.7B"                },
        { LLM_TYPE_1_8B,          "1.8B"                },
        { LLM_TYPE_2B,            "2B"                  },
        { LLM_TYPE_2_6B,          "2.6B"                },
        { LLM_TYPE_2_8B,          "2.8B"                },
        { LLM_TYPE_2_9B,          "2.9B"                },
        { LLM_TYPE_3B,            "3B"                  },
        { LLM_TYPE_4B,            "4B"                  },
        { LLM_TYPE_6B,            "6B"                  },
        { LLM_TYPE_6_9B,          "6.9B"                },
        { LLM_TYPE_7B,            "7B"                  },
        { LLM_TYPE_8B,            "8B"                  },
        { LLM_TYPE_9B,            "9B"                  },
        { LLM_TYPE_11B,           "11B"                 },
        { LLM_TYPE_12B,           "12B"                 },
        { LLM_TYPE_13B,           "13B"                 },
        { LLM_TYPE_14B,           "14B"                 },
        { LLM_TYPE_15B,           "15B"                 },
        { LLM_TYPE_16B,           "16B"                 },
        { LLM_TYPE_20B,           "20B"                 },
        { LLM_TYPE_26B,           "26B"                 },
        { LLM_TYPE_27B,           "27B"                 },
        { LLM_TYPE_30B,           "30B"                 },
        { LLM_TYPE_32B,           "32B"                 },
        { LLM_TYPE_34B,           "34B"                 },
        { LLM_TYPE_35B,           "35B"                 },
        { LLM_TYPE_36B,           "36B"                 },
        { LLM_TYPE_40B,           "40B"                 },
        { LLM_TYPE_65B,           "65B"                 },
        { LLM_TYPE_70B,           "70B"                 },
        { LLM_TYPE_120B,          "120B"                },
        { LLM_TYPE_142B,          "142B"                },
        { LLM_TYPE_236B,          "236B"                },
        { LLM_TYPE_290B,          "290B"                },
        { LLM_TYPE_314B,          "314B"                },
        { LLM_TYPE_405B,          "405B"                },
        { LLM_TYPE_671B,          "671B"                },
        { LLM_TYPE_SMALL,         "0.1B"                },
        { LLM_TYPE_MEDIUM,        "0.4B"                },
        { LLM_TYPE_LARGE,         "0.8B"                },
        { LLM_TYPE_XL,            "1.5B"                },
        { LLM_TYPE_A1_7B,         "A1.7B"               },
        { LLM_TYPE_A2_7B,         "A2.7B"               },
        { LLM_TYPE_8x7B,          "8x7B"                },
        { LLM_TYPE_8x22B,         "8x22B"               },
        { LLM_TYPE_16x12B,        "16x12B"              },
        { LLM_TYPE_16x3_8B,       "16x3.8B"             },
        { LLM_TYPE_10B_128x3_66B, "10B+128x3.66B"       },
        { LLM_TYPE_57B_A14B,      "57B.A14B"            },
        { LLM_TYPE_17B_16E,       "17Bx16E (Scout)"     },
        { LLM_TYPE_17B_128E,      "17Bx128E (Maverick)" },
        { LLM_TYPE_A13B,          "A13B"                },
        { LLM_TYPE_7B_A1B,        "7B.A1B"              },
        { LLM_TYPE_8B_A1B,        "8B.A1B"              },
        { LLM_TYPE_16B_A1B,       "16B.A1B"             },
        { LLM_TYPE_21B_A3B,       "21B.A3B"             },
        { LLM_TYPE_24B_A2B,       "24B.A2B"             },
        { LLM_TYPE_30B_A3B,       "30B.A3B"             },
        { LLM_TYPE_31B_A3_5B,     "31B.A3.5B"           },
        { LLM_TYPE_35B_A3B,       "35B.A3B"             },
        { LLM_TYPE_48B_A3B,       "48B.A3B"             },
        { LLM_TYPE_80B_A3B,       "80B.A3B"             },
        { LLM_TYPE_100B_A6B,      "100B.A6B"            },
        { LLM_TYPE_102B_A12B,     "102B.A12B"           },
        { LLM_TYPE_106B_A12B,     "106B.A12B"           },
        { LLM_TYPE_120B_A12B,     "120B.A12B"           },
        { LLM_TYPE_122B_A10B,     "122B.A10B"           },
        { LLM_TYPE_196B_A11B,     "196B.A11B"           },
        { LLM_TYPE_230B_A10B,     "230B.A10B"           },
        { LLM_TYPE_235B_A22B,     "235B.A22B"           },
        { LLM_TYPE_300B_A47B,     "300B.A47B"           },
        { LLM_TYPE_310B_A15B,     "310B.A15B"           },
        { LLM_TYPE_355B_A32B,     "355B.A32B"           },
        { LLM_TYPE_397B_A17B,     "397B.A17B"           },
        { LLM_TYPE_744B_A40B,     "744B.A40B"           },
        { LLM_TYPE_E2B,           "E2B"                 },
        { LLM_TYPE_E4B,           "E4B"                 },
    };

    for (const type_label & entry : labels) {
        if (entry.id == type) {
            return entry.label;
        }
    }

    // types without an entry in the table
    return "?B";
}
495
496
0
// Returns the display name of an expert gating function: "softmax", "sigmoid",
// or "unknown" for every other value.
static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    if (type == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
        return "softmax";
    }
    if (type == LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID) {
        return "sigmoid";
    }
    return "unknown";
}
503
504
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
505
    { LLAMA_ROPE_SCALING_TYPE_NONE,       "none"       },
506
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,     "linear"     },
507
    { LLAMA_ROPE_SCALING_TYPE_YARN,       "yarn"       },
508
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE,   "longrope"   },
509
};
510
511
0
// Returns the display name of a RoPE scaling type.
// Throws std::out_of_range (from std::map::at) if the type has no entry in LLAMA_ROPE_SCALING_TYPES.
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    const char * label = LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
    return std::string(label);
}
514
515
0
// Reverse lookup in LLAMA_ROPE_SCALING_TYPES: returns the scaling type whose display name equals `name`,
// or LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED if no entry matches.
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    const auto match = std::find_if(
        LLAMA_ROPE_SCALING_TYPES.begin(), LLAMA_ROPE_SCALING_TYPES.end(),
        [&name](const auto & entry) { return entry.second == name; });

    return match != LLAMA_ROPE_SCALING_TYPES.end() ? match->first : LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
524
525
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
526
0
// Builds the ordered list of (device, buffer type) candidates for tensors kept on the CPU side.
//   devices         - devices searched for a host buffer type
//   use_extra_bufts - also add the extra buffer types exposed by the CPU backend
//   no_host         - skip the host buffer type
// Throws std::runtime_error if use_extra_bufts is set and no CPU device exists.
static buft_list_t make_cpu_buft_list(const std::vector<llama_device> & devices, bool use_extra_bufts, bool no_host) {
    buft_list_t result;

    // add ACCEL buffer types
    for (size_t idx = 0; idx < ggml_backend_dev_count(); ++idx) {
        ggml_backend_dev_t accel_dev = ggml_backend_dev_get(idx);
        if (ggml_backend_dev_type(accel_dev) != GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            continue;
        }
        auto * accel_buft = ggml_backend_dev_buffer_type(accel_dev);
        // skip ACCEL devices that just hand out the plain CPU buffer type
        if (accel_buft == ggml_backend_cpu_buffer_type()) {
            continue;
        }
        result.emplace_back(accel_dev, accel_buft);
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    if (!no_host) {
        for (const auto & device : devices) {
            ggml_backend_buffer_type_t host_buft = ggml_backend_dev_host_buffer_type(device.dev);
            if (!host_buft) {
                continue;
            }
            // only the first device that offers a host buffer type is used
            result.emplace_back(device.dev, host_buft);
            break;
        }
    }

    // add extra buffer types
    if (use_extra_bufts) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }

        // the getter is optional, it is looked up by name in the CPU backend registry
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto get_extra_bufts = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        if (get_extra_bufts) {
            // the returned array is walked until a null entry is found
            for (ggml_backend_buffer_type_t * cur = get_extra_bufts(cpu_dev); cur && *cur; ++cur) {
                result.emplace_back(cpu_dev, *cur);
            }
        }
    }

    // add the CPU buffer type
    for (size_t idx = 0; idx < ggml_backend_dev_count(); ++idx) {
        ggml_backend_dev_t cpu_candidate = ggml_backend_dev_get(idx);
        if (ggml_backend_dev_type(cpu_candidate) != GGML_BACKEND_DEVICE_TYPE_CPU) {
            continue;
        }
        result.emplace_back(cpu_candidate, ggml_backend_dev_buffer_type(cpu_candidate));
    }

    return result;
}
586
587
// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
588
0
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
589
0
    buft_list_t buft_list;
590
591
    // add the device split buffer type if requested and available
592
0
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
593
0
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
594
0
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
595
0
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
596
0
        if (ggml_backend_split_buffer_type_fn) {
597
0
            size_t dev_index = [&]() {
598
0
                auto * reg = ggml_backend_dev_backend_reg(dev);
599
0
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
600
0
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
601
0
                        return i;
602
0
                    }
603
0
                }
604
0
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
605
0
            }();
606
0
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
607
0
            if (buft != nullptr) {
608
0
                buft_list.emplace_back(dev, buft);
609
0
            }
610
0
        }
611
0
    }
612
613
    // add the device default buffer type
614
0
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
615
616
    // add the device extra buffer type (if any)
617
0
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
618
0
    if (reg) {
619
0
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
620
0
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
621
622
0
        if (ggml_backend_dev_get_extra_bufts_fn) {
623
0
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
624
0
            while (extra_bufts && *extra_bufts) {
625
0
                buft_list.emplace_back(dev, *extra_bufts);
626
0
                ++extra_bufts;
627
0
            }
628
0
        }
629
0
    }
630
631
0
    return buft_list;
632
0
}
633
634
// Private implementation state of llama_model (held through llama_model::pimpl).
// NOTE: member declaration order is kept as-is on purpose, since it determines destruction order.
struct llama_model::impl {
    impl() = default;
    ~impl() = default;

    // total number of elements, copied from the model loader in load_stats()
    uint64_t n_elements = 0;

    // total size in bytes, copied from the model loader in load_stats()
    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;

    // buffer type lists: one for the CPU and one per device
    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    // device paired with a non-owning pointer to a buffer type list
    struct layer_dev {
        ggml_backend_dev_t dev = nullptr;
        buft_list_t * buft_list = nullptr;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    // set by the llama_model constructor: true when a non-empty list of tensor buffer type overrides was passed
    bool has_tensor_overrides = false;
};
668
669
4.15k
// Stores a copy of the model params, allocates the private implementation and
// records whether the caller supplied at least one tensor buffer type override.
llama_model::llama_model(const llama_model_params & params)
    : params(params),
      pimpl(std::make_unique<impl>()) {
    // the override list counts as present when the pointer is set and its first entry has a pattern
    const auto * overrides = params.tensor_buft_overrides;
    pimpl->has_tensor_overrides = overrides != nullptr && overrides[0].pattern != nullptr;
}
672
673
3.90k
// Deletes every LoRA adapter still registered in `loras`.
llama_model::~llama_model() {
    std::for_each(loras.begin(), loras.end(), [](auto * adapter) {
        delete adapter;
    });
}
678
679
0
// Copies the element and byte totals from the model loader into the private implementation.
void llama_model::load_stats(llama_model_loader & ml) {
    impl & stats = *pimpl;

    stats.n_elements = ml.n_elements;
    stats.n_bytes    = ml.n_bytes;
}
683
684
649
// Reads the architecture from the model loader into `arch`.
//
// Throws std::runtime_error when the architecture is unknown, or when the first
// device is a meta device and the architecture does not support tensor split mode.
void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();

    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }

    // only the first device is inspected to decide whether the meta device is in use
    const bool first_dev_is_meta = !devices.empty() && devices[0].is_meta;

    if (first_dev_is_meta && !llm_arch_supports_sm_tensor(arch)) {
        throw std::runtime_error(std::string("LLAMA_SPLIT_MODE_TENSOR not implemented for architecture '") + llm_arch_name(arch) + "'");
    }
}
693
694
178
void llama_model::load_hparams(llama_model_loader & ml) {
695
178
    const gguf_context * ctx = ml.metadata;
696
697
    // get metadata as string
698
1.62k
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
699
1.44k
        gguf_type type = gguf_get_kv_type(ctx, i);
700
1.44k
        if (type == GGUF_TYPE_ARRAY) {
701
26
            continue;
702
26
        }
703
1.41k
        const char * name = gguf_get_key(ctx, i);
704
1.41k
        const std::string value = gguf_kv_to_str(ctx, i);
705
1.41k
        gguf_kv.emplace(name, value);
706
1.41k
    }
707
708
    // get general kv
709
178
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
710
711
    // everything past this point is not vocab-related
712
    // for CLIP models, we only need to load tensors, no hparams
713
178
    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
714
1
        return;
715
1
    }
716
717
177
    ml.get_key(LLM_KV_CONTEXT_LENGTH,          hparams.n_ctx_train);
718
177
    ml.get_key(LLM_KV_EMBEDDING_LENGTH,        hparams.n_embd);
719
177
    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT,    hparams.n_embd_out_impl, false);
720
177
    ml.get_key(LLM_KV_ATTENTION_CAUSAL,        hparams.causal_attn,     false);
721
177
    ml.get_key(LLM_KV_POOLING_TYPE,            hparams.pooling_type,    false);
722
177
    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer);
723
177
    ml.get_key(LLM_KV_EXPERT_COUNT,            hparams.n_expert,        false);
724
177
    ml.get_key(LLM_KV_EXPERT_USED_COUNT,       hparams.n_expert_used,   false);
725
177
    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
726
177
    ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);
727
728
177
    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
729
0
        ml.get_key(LLM_KV_FEATURES_LENGTH,  hparams.n_embd);
730
0
        ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
731
732
0
        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
733
0
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);
734
735
0
        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
736
0
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
737
0
    }
738
739
177
    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
740
177
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
741
177
    if (hparams.n_expert > 0) {
742
0
        GGML_ASSERT(hparams.n_expert_used > 0);
743
0
        GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
744
0
        if (hparams.n_expert_groups > 1) {
745
0
            GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
746
0
            GGML_ASSERT(hparams.n_group_used > 0);
747
0
            GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
748
0
        }
749
177
    } else {
750
177
        GGML_ASSERT(hparams.n_expert_used == 0);
751
177
        GGML_ASSERT(hparams.n_expert_groups == 0);
752
177
    }
753
754
177
    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
755
177
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
756
177
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
757
177
    std::fill(
758
177
        hparams.recurrent_layer_arr.begin(),
759
177
        hparams.recurrent_layer_arr.end(),
760
177
        llm_arch_is_recurrent(ml.get_arch()));
761
762
177
    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
763
177
    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
764
765
177
    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
766
177
    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
767
177
    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
768
177
    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
769
177
    std::fill(hparams.swiglu_clamp_exp.begin(),   hparams.swiglu_clamp_exp.end(),   0.0f);
770
177
    std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
771
772
177
    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
773
177
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
774
775
    // n_head_kv is optional, default to n_head
776
177
    hparams.n_head_kv_arr = hparams.n_head_arr;
777
778
177
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
779
780
177
    bool rope_finetuned = false;
781
177
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
782
177
    hparams.rope_finetuned = rope_finetuned;
783
784
177
    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
785
177
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
786
787
    // rope_freq_base (optional)
788
177
    hparams.rope_freq_base_train = 10000.0f;
789
177
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
790
791
177
    std::string rope_scaling("linear");
792
177
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
793
177
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
794
177
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
795
796
    // TODO: Handle SWA metadata similarly when models start implementing it
797
    // rope_freq_scale (inverse of the kv) is optional
798
177
    float ropescale = 0.0f;
799
177
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
800
        // try the old key name
801
0
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
802
0
    }
803
177
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
804
805
177
    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
806
807
    // non-transformer models do not have attention heads
808
177
    if (hparams.n_head() > 0) {
809
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
810
        // gpt-j n_rot = rotary_dim
811
812
0
        hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head();
813
0
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false);
814
815
0
        hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head();
816
0
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false);
817
818
        // sanity check for n_rot (optional)
819
0
        hparams.n_rot_full = hparams.n_embd_head_k_full;
820
821
0
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false);
822
823
0
        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
824
0
            if (hparams.n_rot_full != hparams.n_embd_head_k_full) {
825
0
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full));
826
0
            }
827
0
        }
828
177
    } else {
829
177
        hparams.n_rot_full = 0;
830
177
        hparams.n_embd_head_k_full = 0;
831
177
        hparams.n_embd_head_v_full = 0;
832
177
    }
833
834
    // head size and n_rot for SWA layers
835
177
    {
836
177
        hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full;
837
177
        hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full;
838
177
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false);
839
177
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false);
840
841
177
        hparams.n_rot_swa = hparams.n_rot_full;
842
177
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
843
177
    }
844
845
    // for differentiating model types
846
177
    uint32_t n_vocab = 0;
847
177
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
848
849
    // for classifier models
850
177
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
851
177
    if (!classifier_labels.empty()) {
852
0
        hparams.n_cls_out = classifier_labels.size();
853
0
    }
854
855
    // arch-specific KVs
856
177
    switch (arch) {
857
0
        case LLM_ARCH_LLAMA:
858
0
        case LLM_ARCH_LLAMA_EMBED:
859
0
            {
860
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
861
862
0
                if (hparams.n_expert == 8) {
863
0
                    switch (hparams.n_layer) {
864
0
                        case 32: type = LLM_TYPE_8x7B; break;
865
0
                        case 56: type = LLM_TYPE_8x22B; break;
866
0
                        default: type = LLM_TYPE_UNKNOWN;
867
0
                    }
868
0
                } else {
869
0
                    switch (hparams.n_layer) {
870
0
                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
871
0
                        case 22: type = LLM_TYPE_1B; break;
872
0
                        case 26: type = LLM_TYPE_3B; break;
873
0
                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
874
0
                        case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
875
                        // granite uses a vocab with len 49152
876
0
                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
877
0
                        case 36: type = LLM_TYPE_8B; break; // granite
878
0
                        case 40: type = LLM_TYPE_13B; break;
879
0
                        case 48: type = LLM_TYPE_34B; break;
880
0
                        case 60: type = LLM_TYPE_30B; break;
881
0
                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
882
0
                        default: type = LLM_TYPE_UNKNOWN;
883
0
                    }
884
0
                }
885
0
            } break;
886
0
        case LLM_ARCH_LLAMA4:
887
0
            {
888
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
889
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
890
0
                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
891
892
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
893
0
                if (found_swa && hparams.n_swa == 0) {
894
0
                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
895
0
                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
896
0
                } else {
897
0
                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
898
0
                    hparams.n_swa                   = 8192;
899
0
                    hparams.n_attn_temp_floor_scale = 8192;
900
0
                    hparams.f_attn_temp_scale       = 0.1f;
901
0
                    hparams.f_attn_temp_offset      = 1.0f;
902
0
                    uint32_t swa_period             = 4; // pattern: 3 chunked - 1 full
903
0
                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
904
0
                    hparams.set_swa_pattern(swa_period);
905
906
0
                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
907
0
                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
908
0
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
909
0
                }
910
911
0
                switch (hparams.n_expert) {
912
0
                    case 0: {
913
                        // MobileLLM (no MoE)
914
0
                        switch (hparams.n_embd) {
915
0
                            case 2048: type = LLM_TYPE_140M; break;
916
0
                            case 4096: type = LLM_TYPE_360M; break;
917
0
                            case 6144: type = LLM_TYPE_950M; break;
918
0
                            default:   type = LLM_TYPE_UNKNOWN;
919
0
                        }
920
0
                    } break;
921
0
                    case 16:  type = LLM_TYPE_17B_16E; break;
922
0
                    case 128: type = LLM_TYPE_17B_128E; break;
923
0
                    default:  type = LLM_TYPE_UNKNOWN;
924
0
                }
925
926
0
                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
927
0
            } break;
928
0
        case LLM_ARCH_ARCEE:
929
0
            {
930
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
931
932
                // Arcee uses the same structure as Llama
933
0
                switch (hparams.n_layer) {
934
0
                    case 36: type = LLM_TYPE_4B; break;
935
0
                    default: type = LLM_TYPE_UNKNOWN;
936
0
                }
937
0
            } break;
938
0
        case LLM_ARCH_AFMOE:
939
0
            {
940
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
941
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
942
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
943
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
944
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
945
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
946
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
947
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
948
949
                // Set up interleaved sliding window attention (ISWA)
950
                // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
951
0
                if (hparams.n_swa > 0) {
952
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
953
0
                    uint32_t swa_period = 4;
954
0
                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
955
0
                    hparams.set_swa_pattern(swa_period);
956
957
0
                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
958
0
                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
959
0
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
960
0
                } else {
961
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
962
0
                }
963
964
                // Default to sigmoid if not set
965
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
966
0
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
967
0
                }
968
969
0
                switch (hparams.n_layer) {
970
0
                    case 56: type = LLM_TYPE_6B; break;
971
0
                    case 32: type = LLM_TYPE_26B; break;
972
0
                    default: type = LLM_TYPE_UNKNOWN;
973
0
                }
974
0
            } break;
975
0
        case LLM_ARCH_DECI:
976
0
            {
977
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
978
0
                switch (hparams.n_layer) {
979
0
                    case 32: type = LLM_TYPE_7B; break;
980
0
                    case 80: type = LLM_TYPE_70B; break;
981
0
                    case 162: type = LLM_TYPE_405B; break;
982
0
                    default: type = LLM_TYPE_UNKNOWN;
983
0
                }
984
0
            } break;
985
0
        case LLM_ARCH_MINICPM:
986
0
            {
987
                // Backward-compatible defaults for older MiniCPM GGUFs
988
0
                hparams.f_embedding_scale = 12.0f;
989
0
                hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer));
990
0
                hparams.f_logit_scale     = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
991
992
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
993
994
                // Optional KV reads, override defaults if present in newer GGUF exports
995
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
996
0
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
997
0
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
998
999
                // MiniCPM uses rope by default, unlike Granite which uses it as a switch
1000
0
                hparams.rope_finetuned = true;
1001
1002
0
                switch (hparams.n_layer) {
1003
0
                    case 52: type = LLM_TYPE_1B; break;
1004
0
                    case 40: type = LLM_TYPE_2B; break;
1005
0
                    default: type = LLM_TYPE_UNKNOWN;
1006
0
                }
1007
0
            } break;
1008
0
        case LLM_ARCH_MINICPM3:
1009
0
            {
1010
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1011
0
                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
1012
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
1013
1014
0
                switch (hparams.n_layer) {
1015
0
                    case 62: type = LLM_TYPE_4B; break;
1016
0
                    default: type = LLM_TYPE_UNKNOWN;
1017
0
                }
1018
0
            } break;
1019
0
        case LLM_ARCH_GROK:
1020
0
            {
1021
                // defaults for old GGUFs
1022
0
                hparams.yarn_beta_fast = 8.0f;
1023
0
                hparams.f_logit_scale = 0.5773502691896257f;
1024
0
                hparams.f_embedding_scale = 78.38367176906169f;
1025
0
                hparams.f_attn_out_scale = 0.08838834764831845f;
1026
0
                hparams.f_attn_logit_softcapping = 30.0f;
1027
0
                hparams.f_router_logit_softcapping = 30.0f;
1028
                // no final_logit_softcapping in grok-1
1029
0
                hparams.f_final_logit_softcapping = 0.0f;
1030
1031
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
1032
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp, false);
1033
0
                ml.get_key(LLM_KV_LOGIT_SCALE,                  hparams.f_logit_scale, false);
1034
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE,              hparams.f_embedding_scale, false);
1035
0
                ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE,       hparams.f_attn_out_scale, false);
1036
0
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,       hparams.f_attn_logit_softcapping, false);
1037
0
                ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING,     hparams.f_router_logit_softcapping, false);
1038
0
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,      hparams.f_final_logit_softcapping, false);
1039
1040
0
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH,  hparams.attn_temp_length, false);
1041
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,  hparams.yarn_ext_factor, false);
1042
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
1043
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
1044
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
1045
1046
0
                switch (hparams.n_layer) {
1047
0
                    case 64: type = LLM_TYPE_314B; break;
1048
0
                    default: type = LLM_TYPE_UNKNOWN;
1049
0
                }
1050
0
            } break;
1051
0
        case LLM_ARCH_FALCON:
1052
0
            {
1053
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1054
1055
0
                switch (hparams.n_layer) {
1056
0
                    case 32: type = LLM_TYPE_7B; break;
1057
0
                    case 60: type = LLM_TYPE_40B; break;
1058
0
                    default: type = LLM_TYPE_UNKNOWN;
1059
0
                }
1060
0
            } break;
1061
0
        case LLM_ARCH_BAICHUAN:
1062
0
            {
1063
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1064
0
                switch (hparams.n_layer) {
1065
0
                    case 32: type = LLM_TYPE_7B; break;
1066
0
                    case 40: type = LLM_TYPE_13B; break;
1067
0
                    default: type = LLM_TYPE_UNKNOWN;
1068
0
                }
1069
1070
0
                if (type == LLM_TYPE_13B) {
1071
                    // TODO: become GGUF KV parameter
1072
0
                    hparams.f_max_alibi_bias = 8.0f;
1073
0
                }
1074
0
            } break;
1075
0
        case LLM_ARCH_STARCODER:
1076
0
            {
1077
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1078
0
                switch (hparams.n_layer) {
1079
0
                    case 24: type = LLM_TYPE_1B; break;
1080
0
                    case 36: type = LLM_TYPE_3B; break;
1081
0
                    case 42: type = LLM_TYPE_7B; break;
1082
0
                    case 40: type = LLM_TYPE_15B; break;
1083
0
                    default: type = LLM_TYPE_UNKNOWN;
1084
0
                }
1085
0
            } break;
1086
0
        case LLM_ARCH_REFACT:
1087
0
            {
1088
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1089
0
                switch (hparams.n_layer) {
1090
0
                    case 32: type = LLM_TYPE_1B; break;
1091
0
                    default: type = LLM_TYPE_UNKNOWN;
1092
0
                }
1093
1094
                // TODO: become GGUF KV parameter
1095
0
                hparams.f_max_alibi_bias = 8.0f;
1096
0
            } break;
1097
0
        case LLM_ARCH_BERT:
1098
0
            {
1099
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
1100
1101
0
                switch (hparams.n_layer) {
1102
0
                    case 3:
1103
0
                        type = LLM_TYPE_17M; break; // bge-micro
1104
0
                    case 6:
1105
0
                        type = LLM_TYPE_22M; break; // MiniLM-L6
1106
0
                    case 12:
1107
0
                        switch (hparams.n_embd) {
1108
0
                            case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
1109
0
                            case 768: type = LLM_TYPE_109M; break; // bge-base
1110
0
                            default: type = LLM_TYPE_UNKNOWN;
1111
0
                        } break;
1112
0
                    case 24:
1113
0
                        type = LLM_TYPE_335M; break; // bge-large
1114
0
                    default: type = LLM_TYPE_UNKNOWN;
1115
0
                }
1116
0
            } break;
1117
0
        case LLM_ARCH_MODERN_BERT:
1118
0
            {
1119
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1120
0
                if (found_swa && hparams.n_swa > 0) {
1121
0
                    hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
1122
0
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1123
0
                    uint32_t swa_period = 3;
1124
0
                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1125
0
                    hparams.set_swa_pattern(swa_period, true);
1126
0
                } else {
1127
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1128
0
                }
1129
1130
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1131
1132
0
                switch (hparams.n_layer) {
1133
0
                    case 12:
1134
0
                        type = LLM_TYPE_47M; break; // granite-embedding-small
1135
0
                    case 22:
1136
0
                        type = LLM_TYPE_149M; break; // modern-bert-base
1137
0
                    case 28:
1138
0
                        type = LLM_TYPE_395M; break; // modern-bert-large
1139
0
                    default: type = LLM_TYPE_UNKNOWN;
1140
0
                }
1141
0
            } break;
1142
0
        case LLM_ARCH_JINA_BERT_V2:
1143
0
            {
1144
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
1145
0
                hparams.f_max_alibi_bias = 8.0f;
1146
1147
0
                switch (hparams.n_layer) {
1148
0
                    case 4:  type = LLM_TYPE_33M;  break; // jina-embeddings-small
1149
0
                    case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
1150
0
                    default: type = LLM_TYPE_UNKNOWN;
1151
0
                }
1152
0
            } break;
1153
0
        case LLM_ARCH_JINA_BERT_V3:
1154
0
            {
1155
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
1156
1157
0
                switch (hparams.n_layer) {
1158
0
                    case 24:
1159
0
                        type = LLM_TYPE_558M; break;
1160
0
                    default: type = LLM_TYPE_UNKNOWN;
1161
0
                }
1162
0
            } break;
1163
0
        case LLM_ARCH_NOMIC_BERT:
1164
0
        case LLM_ARCH_NOMIC_BERT_MOE:
1165
0
            {
1166
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
1167
0
                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
1168
1169
0
                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
1170
0
                    if (arch == LLM_ARCH_NOMIC_BERT) {
1171
0
                        type = LLM_TYPE_137M;
1172
0
                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
1173
0
                        type = LLM_TYPE_475M;
1174
0
                    }
1175
0
                }
1176
0
            } break;
1177
0
        case LLM_ARCH_NEO_BERT:
1178
0
            {
1179
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1180
1181
0
                if (hparams.n_layer == 28) {
1182
0
                    type = LLM_TYPE_250M;
1183
0
                }
1184
0
            } break;
1185
0
        case LLM_ARCH_EUROBERT:
1186
0
            {
1187
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1188
1189
0
                if (hparams.n_layer == 12) {
1190
0
                    type = LLM_TYPE_SMALL;  // 0.2B
1191
0
                }
1192
0
            } break;
1193
0
        case LLM_ARCH_BLOOM:
1194
0
            {
1195
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1196
1197
0
                switch (hparams.n_layer) {
1198
0
                    case 24: type = LLM_TYPE_1B; break;
1199
0
                    case 30:
1200
0
                        switch (hparams.n_embd) {
1201
0
                            case 2560: type = LLM_TYPE_3B; break;
1202
0
                            case 4096: type = LLM_TYPE_7B; break;
1203
0
                            default: type = LLM_TYPE_UNKNOWN;
1204
0
                        } break;
1205
0
                    default: type = LLM_TYPE_UNKNOWN;
1206
0
                }
1207
1208
                // TODO: become GGUF KV parameter
1209
0
                hparams.f_max_alibi_bias = 8.0f;
1210
0
            } break;
1211
0
        case LLM_ARCH_MPT:
1212
0
            {
1213
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
1214
0
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
1215
0
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
1216
1217
0
                switch (hparams.n_layer) {
1218
0
                    case 32: type = LLM_TYPE_7B; break;
1219
0
                    case 48: type = LLM_TYPE_30B; break;
1220
0
                    default: type = LLM_TYPE_UNKNOWN;
1221
0
                }
1222
0
            } break;
1223
0
        case LLM_ARCH_STABLELM:
1224
0
            {
1225
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1226
1227
0
                switch (hparams.n_layer) {
1228
0
                    case 24: type = LLM_TYPE_1B; break;
1229
0
                    case 32: type = LLM_TYPE_3B; break;
1230
0
                    case 40: type = LLM_TYPE_12B; break;
1231
0
                    default: type = LLM_TYPE_UNKNOWN;
1232
0
               }
1233
0
            } break;
1234
0
        case LLM_ARCH_QWEN:
1235
0
            {
1236
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1237
1238
0
                switch (hparams.n_layer) {
1239
0
                    case 32: type = LLM_TYPE_7B; break;
1240
0
                    case 40: type = LLM_TYPE_13B; break;
1241
0
                    default: type = LLM_TYPE_UNKNOWN;
1242
0
                }
1243
0
            } break;
1244
0
        case LLM_ARCH_QWEN2VL:
1245
0
            {
1246
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1247
0
            }
1248
            // fall through
1249
0
        case LLM_ARCH_QWEN2:
1250
0
            {
1251
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1252
0
                switch (hparams.n_layer) {
1253
0
                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
1254
0
                    case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
1255
0
                    case 32: type = LLM_TYPE_7B; break;
1256
0
                    case 36: type = LLM_TYPE_3B; break;
1257
0
                    case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
1258
0
                    case 48: type = LLM_TYPE_14B; break;
1259
0
                    case 64: type = LLM_TYPE_32B; break;
1260
0
                    case 80: type = LLM_TYPE_70B; break;
1261
0
                    default: type = LLM_TYPE_UNKNOWN;
1262
0
                }
1263
0
            } break;
1264
0
        case LLM_ARCH_DREAM:
1265
0
            {
1266
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1267
                // Dream models are primarily 7B with 28 layers
1268
0
                switch (hparams.n_layer) {
1269
0
                    case 28:
1270
0
                        type = LLM_TYPE_7B;
1271
0
                        break;
1272
0
                    default:
1273
0
                        type = LLM_TYPE_UNKNOWN;
1274
0
                }
1275
                // Set non-causal attention for diffusion models
1276
0
                hparams.causal_attn = false;
1277
0
            }
1278
0
            break;
1279
0
        case LLM_ARCH_LLADA:
1280
0
            {
1281
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1282
                // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
1283
0
                switch (hparams.n_layer) {
1284
0
                    case 32:
1285
0
                        type = LLM_TYPE_8B;
1286
0
                        break;
1287
0
                    default:
1288
0
                        type = LLM_TYPE_UNKNOWN;
1289
0
                }
1290
                // Set non-causal attention for diffusion models
1291
0
                hparams.causal_attn = false;
1292
0
            }
1293
0
            break;
1294
0
        case LLM_ARCH_LLADA_MOE:
1295
0
            {
1296
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1297
1298
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1299
                // diffusion language model uses non-causal attention
1300
0
                hparams.causal_attn = false;
1301
0
                switch (hparams.n_layer) {
1302
0
                    case 16: type = LLM_TYPE_A1_7B; break;
1303
0
                    default: type = LLM_TYPE_UNKNOWN;
1304
0
                }
1305
0
            } break;
1306
0
        case LLM_ARCH_RND1:
1307
0
            {
1308
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1309
1310
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1311
0
                switch (hparams.n_layer) {
1312
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
1313
0
                    default: type = LLM_TYPE_UNKNOWN;
1314
0
                }
1315
                // Set non-causal attention for diffusion models
1316
0
                hparams.causal_attn = false;
1317
0
            } break;
1318
0
        case LLM_ARCH_QWEN2MOE:
1319
0
            {
1320
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
1321
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1322
1323
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1324
0
                switch (hparams.n_layer) {
1325
0
                    case 24: type = LLM_TYPE_A2_7B; break;
1326
0
                    case 28: type = LLM_TYPE_57B_A14B; break;
1327
0
                    default: type = LLM_TYPE_UNKNOWN;
1328
0
                }
1329
0
            } break;
1330
0
        case LLM_ARCH_QWEN3:
1331
0
            {
1332
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1333
0
                switch (hparams.n_layer) {
1334
0
                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
1335
0
                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1336
0
                    case 40: type = LLM_TYPE_14B; break;
1337
0
                    case 64: type = LLM_TYPE_32B; break;
1338
0
                    default: type = LLM_TYPE_UNKNOWN;
1339
0
                }
1340
0
            } break;
1341
0
        case LLM_ARCH_MAINCODER:
1342
0
            {
1343
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1344
0
                switch (hparams.n_layer) {
1345
0
                    case 32: type = LLM_TYPE_1B; break;
1346
0
                    default: type = LLM_TYPE_UNKNOWN;
1347
0
                }
1348
0
            } break;
1349
0
        case LLM_ARCH_QWEN3VL:
1350
0
            {
1351
0
                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
1352
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1353
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1354
0
                switch (hparams.n_layer) {
1355
0
                    case 28: type = LLM_TYPE_1_7B; break;
1356
0
                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1357
0
                    case 64: type = LLM_TYPE_32B; break;
1358
0
                    default: type = LLM_TYPE_UNKNOWN;
1359
0
                }
1360
0
            } break;
1361
0
        case LLM_ARCH_QWEN3MOE:
1362
0
            {
1363
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
1364
1365
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1366
0
                switch (hparams.n_layer) {
1367
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
1368
0
                    case 94: type = LLM_TYPE_235B_A22B; break;
1369
0
                    default: type = LLM_TYPE_UNKNOWN;
1370
0
                }
1371
0
            } break;
1372
0
        case LLM_ARCH_QWEN3VLMOE:
1373
0
            {
1374
0
                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
1375
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1376
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1377
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1378
0
                switch (hparams.n_layer) {
1379
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
1380
0
                    case 94: type = LLM_TYPE_235B_A22B; break;
1381
0
                    default: type = LLM_TYPE_UNKNOWN;
1382
0
                }
1383
0
            } break;
1384
0
        case LLM_ARCH_PHI2:
1385
0
            {
1386
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1387
1388
0
                switch (hparams.n_layer) {
1389
0
                    case 24: type = LLM_TYPE_1B; break;
1390
0
                    case 32: type = LLM_TYPE_3B; break;
1391
0
                    default: type = LLM_TYPE_UNKNOWN;
1392
0
                }
1393
0
            } break;
1394
0
        case LLM_ARCH_PHI3:
1395
0
            {
1396
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1397
1398
0
                switch (hparams.n_layer) {
1399
0
                    case 24: type = LLM_TYPE_1B; break;
1400
0
                    case 32: type = LLM_TYPE_3B; break;
1401
0
                    case 40: type = LLM_TYPE_14B; break;
1402
0
                    default: type = LLM_TYPE_UNKNOWN;
1403
0
                }
1404
1405
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1406
1407
0
                if (found_swa && hparams.n_swa > 0) {
1408
0
                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
1409
0
                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
1410
1411
                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
1412
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1413
1414
0
                    hparams.n_swa         = 0;
1415
0
                    hparams.set_swa_pattern(1);
1416
0
                }
1417
0
            } break;
1418
0
        case LLM_ARCH_PHIMOE:
1419
0
            {
1420
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1421
1422
0
                switch (hparams.n_layer) {
1423
0
                    case 32: type = LLM_TYPE_16x3_8B; break;
1424
0
                    default: type = LLM_TYPE_UNKNOWN;
1425
0
                }
1426
0
            } break;
1427
0
        case LLM_ARCH_PLAMO:
1428
0
            {
1429
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1430
1431
0
                switch (hparams.n_layer) {
1432
0
                    case 40: type = LLM_TYPE_13B; break;
1433
0
                    default: type = LLM_TYPE_UNKNOWN;
1434
0
               }
1435
0
            } break;
1436
0
        case LLM_ARCH_PLAMO2:
1437
0
            {
1438
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1439
1440
                // Load Mamba SSM parameters
1441
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1442
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1443
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1444
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1445
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1446
1447
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1448
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1449
0
                }
1450
1451
0
                switch (hparams.n_layer) {
1452
0
                    case 16: type = LLM_TYPE_1B; break;
1453
0
                    case 32:
1454
0
                        if (hparams.n_embd == 2048) {
1455
0
                            type = LLM_TYPE_2B;
1456
0
                        } else if (hparams.n_embd == 4096) {
1457
0
                            type = LLM_TYPE_8B;
1458
0
                        }
1459
0
                        break;
1460
0
                    default: type = LLM_TYPE_UNKNOWN;
1461
0
                }
1462
0
            } break;
1463
0
        case LLM_ARCH_PLAMO3:
1464
0
            {
1465
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1466
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1467
0
                if (found_swa && hparams.n_swa > 0) {
1468
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1469
0
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1470
0
                    uint32_t swa_period = 8;
1471
0
                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1472
0
                    hparams.set_swa_pattern(swa_period);
1473
0
                } else {
1474
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1475
0
                }
1476
1477
0
                switch (hparams.n_layer) {
1478
0
                    case 24: type = LLM_TYPE_2B; break;
1479
0
                    default: type = LLM_TYPE_UNKNOWN;
1480
0
                }
1481
0
            } break;
1482
0
        case LLM_ARCH_GPT2:
1483
0
            {
1484
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1485
0
                switch (hparams.n_layer) {
1486
0
                    case 12: type = LLM_TYPE_SMALL; break;
1487
0
                    case 24: type = LLM_TYPE_MEDIUM; break;
1488
0
                    case 36: type = LLM_TYPE_LARGE; break;
1489
0
                    case 48: type = LLM_TYPE_XL; break;
1490
0
                    default: type = LLM_TYPE_UNKNOWN;
1491
0
                }
1492
0
            } break;
1493
0
        case LLM_ARCH_CODESHELL:
1494
0
            {
1495
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1496
0
                switch (hparams.n_layer) {
1497
0
                    case 42: type = LLM_TYPE_7B; break;
1498
0
                    default: type = LLM_TYPE_UNKNOWN;
1499
0
                }
1500
0
            } break;
1501
0
        case LLM_ARCH_ORION:
1502
0
            {
1503
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1504
1505
0
                switch (hparams.n_layer) {
1506
0
                    case 40: type = LLM_TYPE_14B; break;
1507
0
                    default: type = LLM_TYPE_UNKNOWN;
1508
0
                }
1509
0
            } break;
1510
0
        case LLM_ARCH_INTERNLM2:
1511
0
            {
1512
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1513
0
                switch (hparams.n_layer) {
1514
0
                    case 32: type = LLM_TYPE_7B; break;
1515
0
                    case 48: type = LLM_TYPE_20B; break;
1516
0
                    default: type = LLM_TYPE_UNKNOWN;
1517
0
                }
1518
0
            } break;
1519
0
        case LLM_ARCH_GEMMA:
1520
0
            {
1521
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1522
1523
0
                switch (hparams.n_layer) {
1524
0
                    case 18: type = LLM_TYPE_2B; break;
1525
0
                    case 28: type = LLM_TYPE_7B; break;
1526
0
                    default: type = LLM_TYPE_UNKNOWN;
1527
0
               }
1528
0
            } break;
1529
0
        case LLM_ARCH_GEMMA2:
1530
0
            {
1531
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1532
0
                hparams.n_swa = 4096; // default value of gemma 2
1533
0
                uint32_t swa_period = 2;
1534
0
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1535
0
                hparams.set_swa_pattern(swa_period);
1536
0
                hparams.attn_soft_cap = true;
1537
0
                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
1538
0
                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1539
1540
0
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
1541
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
1542
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1543
0
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
1544
0
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
1545
1546
0
                switch (hparams.n_layer) {
1547
0
                    case 26: type = LLM_TYPE_2B; break;
1548
0
                    case 42: type = LLM_TYPE_9B; break;
1549
0
                    case 46: type = LLM_TYPE_27B; break;
1550
0
                    default: type = LLM_TYPE_UNKNOWN;
1551
0
               }
1552
1553
                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
1554
0
                hparams.f_attention_scale = type == LLM_TYPE_27B
1555
0
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1556
0
                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
1557
0
            } break;
1558
0
        case LLM_ARCH_GEMMA3:
1559
0
            {
1560
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1561
0
                if (found_swa && hparams.n_swa > 0) {
1562
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1563
0
                    uint32_t swa_period = 6;
1564
0
                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1565
0
                    hparams.set_swa_pattern(swa_period);
1566
1567
0
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1568
0
                } else {
1569
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1570
0
                }
1571
1572
0
                hparams.f_final_logit_softcapping = 0.0f;
1573
0
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
1574
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1575
1576
0
                switch (hparams.n_layer) {
1577
0
                    case 18: type = LLM_TYPE_270M; break;
1578
0
                    case 26: type = LLM_TYPE_1B; break;
1579
0
                    case 32: type = LLM_TYPE_8B; break; // Rnj-1
1580
0
                    case 34: type = LLM_TYPE_4B; break;
1581
0
                    case 48: type = LLM_TYPE_12B; break;
1582
0
                    case 62: type = LLM_TYPE_27B; break;
1583
0
                    default: type = LLM_TYPE_UNKNOWN;
1584
0
                }
1585
1586
                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
1587
0
                hparams.f_attention_scale = type == LLM_TYPE_27B
1588
0
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1589
0
                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
1590
0
            } break;
1591
0
        case LLM_ARCH_GEMMA3N:
1592
0
            {
1593
0
                uint32_t swa_period = 5;
1594
0
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1595
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1596
0
                hparams.set_swa_pattern(swa_period);
1597
1598
0
                hparams.n_layer_kv_from_start     = 20;
1599
0
                hparams.f_attention_scale         = 1.0f;
1600
1601
0
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
1602
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
1603
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1604
1605
0
                switch (hparams.n_layer) {
1606
0
                    case 30: type = LLM_TYPE_E2B; break;
1607
0
                    case 35: type = LLM_TYPE_E4B; break;
1608
0
                    default: type = LLM_TYPE_UNKNOWN;
1609
0
                }
1610
0
            } break;
1611
0
        case LLM_ARCH_GEMMA4:
1612
0
            {
1613
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1614
0
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
1615
1616
0
                uint32_t n_kv_shared_layers = 0;
1617
0
                ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
1618
1619
0
                hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers;
1620
0
                hparams.f_attention_scale     = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling)
1621
1622
0
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
1623
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
1624
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
1625
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1626
0
                ml.get_key(LLM_KV_EMBEDDING_LENGTH_PER_LAYER,  hparams.n_embd_per_layer);
1627
0
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA,    hparams.n_embd_head_k_swa);
1628
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA,  hparams.n_embd_head_v_swa);
1629
0
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
1630
1631
0
                switch (hparams.n_layer) {
1632
0
                    case 35: type = LLM_TYPE_E2B; break;
1633
0
                    case 42: type = LLM_TYPE_E4B; break; // to confirm: E4B or E5B?
1634
0
                    default: type = LLM_TYPE_UNKNOWN;
1635
0
                }
1636
0
            } break;
1637
0
        case LLM_ARCH_GEMMA_EMBEDDING:
1638
0
            {
1639
0
                hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
1640
0
                uint32_t swa_period = 6;
1641
0
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1642
0
                hparams.set_swa_pattern(swa_period);
1643
1644
0
                hparams.causal_attn = false; // embeddings do not use causal attention
1645
1646
0
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1647
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1648
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1649
1650
                //applied only if model converted with --sentence-transformers-dense-modules
1651
0
                ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
1652
0
                ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
1653
0
                ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
1654
0
                ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
1655
1656
0
                GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
1657
0
                GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
1658
1659
0
                switch (hparams.n_layer) {
1660
0
                    case 24: type = LLM_TYPE_0_3B; break;
1661
0
                    default: type = LLM_TYPE_UNKNOWN;
1662
0
                }
1663
0
                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
1664
1665
0
            } break;
1666
0
        case LLM_ARCH_STARCODER2:
1667
0
            {
1668
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1669
0
                switch (hparams.n_layer) {
1670
0
                    case 30: type = LLM_TYPE_3B; break;
1671
0
                    case 32: type = LLM_TYPE_7B; break;
1672
0
                    case 40: type = LLM_TYPE_15B; break;
1673
0
                    case 52: type = LLM_TYPE_20B; break; // granite
1674
0
                    case 88: type = LLM_TYPE_34B; break; // granite
1675
0
                    default: type = LLM_TYPE_UNKNOWN;
1676
0
                }
1677
0
            } break;
1678
0
        case LLM_ARCH_MAMBA:
1679
0
            {
1680
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1681
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1682
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1683
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1684
0
                ml.get_key(LLM_KV_SSM_DT_B_C_RMS,     hparams.ssm_dt_b_c_rms, false);
1685
1686
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1687
1688
0
                switch (hparams.n_layer) {
1689
0
                    case 24:
1690
0
                        switch (hparams.n_embd) {
1691
0
                            case 768: type = LLM_TYPE_SMALL; break;
1692
0
                            default: type = LLM_TYPE_UNKNOWN;
1693
0
                        } break;
1694
0
                    case 48:
1695
0
                        switch (hparams.n_embd) {
1696
0
                            case 1024: type = LLM_TYPE_MEDIUM; break;
1697
0
                            case 1536: type = LLM_TYPE_LARGE; break;
1698
0
                            case 2048: type = LLM_TYPE_XL; break;
1699
0
                            default:   type = LLM_TYPE_UNKNOWN;
1700
0
                        } break;
1701
0
                    case 64:
1702
0
                        switch (hparams.n_embd) {
1703
0
                            case 2560: type = LLM_TYPE_3B; break;
1704
0
                            default: type = LLM_TYPE_UNKNOWN;
1705
0
                        } break;
1706
0
                    default: type = LLM_TYPE_UNKNOWN;
1707
0
                }
1708
0
            } break;
1709
0
        case LLM_ARCH_MAMBA2:
1710
0
            {
1711
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1712
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1713
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1714
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1715
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1716
1717
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1718
1719
0
                switch (hparams.n_layer) {
1720
0
                    case 24:
1721
0
                        switch (hparams.n_embd) {
1722
0
                            case 768: type = LLM_TYPE_SMALL; break;
1723
0
                            default: type = LLM_TYPE_UNKNOWN;
1724
0
                        } break;
1725
0
                    case 48:
1726
0
                        switch (hparams.n_embd) {
1727
0
                            case 1024: type = LLM_TYPE_MEDIUM; break;
1728
0
                            case 1536: type = LLM_TYPE_LARGE; break;
1729
0
                            case 2048: type = LLM_TYPE_XL; break;
1730
0
                            default: type = LLM_TYPE_UNKNOWN;
1731
0
                        } break;
1732
0
                    case 64:
1733
0
                        switch (hparams.n_embd) {
1734
0
                            case 2560: type = LLM_TYPE_3B; break;
1735
0
                            case 4096: type = LLM_TYPE_7B; break;
1736
0
                            default: type = LLM_TYPE_UNKNOWN;
1737
0
                        } break;
1738
0
                    default: type = LLM_TYPE_UNKNOWN;
1739
0
                }
1740
0
            } break;
1741
0
        case LLM_ARCH_JAMBA:
1742
0
            {
1743
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1744
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1745
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1746
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1747
1748
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1749
1750
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1751
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1752
0
                }
1753
1754
0
                switch (hparams.n_layer) {
1755
                    // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
1756
0
                    case 12: // 900M  8x???M
1757
0
                    case 32: // 51B  16x?B
1758
0
                    default: type = LLM_TYPE_UNKNOWN;
1759
0
                }
1760
0
            } break;
1761
0
        case LLM_ARCH_XVERSE:
1762
0
            {
1763
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1764
0
                switch (hparams.n_layer) {
1765
0
                    case 32: type = LLM_TYPE_7B; break;
1766
0
                    case 40: type = LLM_TYPE_13B; break;
1767
0
                    case 80: type = LLM_TYPE_65B; break;
1768
0
                    default: type = LLM_TYPE_UNKNOWN;
1769
0
                }
1770
0
            } break;
1771
0
        case LLM_ARCH_COMMAND_R:
1772
0
            {
1773
0
                ml.get_key(LLM_KV_LOGIT_SCALE,             hparams.f_logit_scale, false);
1774
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1775
0
                switch (hparams.n_layer) {
1776
0
                    case 40: type = LLM_TYPE_35B; break;
1777
0
                    default: type = LLM_TYPE_UNKNOWN;
1778
0
                }
1779
0
            } break;
1780
0
        case LLM_ARCH_COHERE2:
1781
0
            {
1782
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1783
0
                uint32_t swa_period = 4;
1784
0
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1785
0
                hparams.set_swa_pattern(swa_period);
1786
0
                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
1787
0
                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1788
1789
0
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,       hparams.rope_freq_base_train_swa, false);
1790
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1791
0
                ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
1792
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
1793
0
                switch (hparams.n_layer) {
1794
0
                    case 32: type = LLM_TYPE_8B; break;
1795
0
                    default: type = LLM_TYPE_UNKNOWN;
1796
0
                }
1797
0
            } break;
1798
0
        case LLM_ARCH_DBRX:
1799
0
        {
1800
0
            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1801
0
            ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
1802
1803
0
            switch (hparams.n_layer) {
1804
0
                case 40: type = LLM_TYPE_16x12B; break;
1805
0
                default: type = LLM_TYPE_UNKNOWN;
1806
0
            }
1807
0
        } break;
1808
0
        case LLM_ARCH_OLMO:
1809
0
            {
1810
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1811
0
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);
1812
1813
0
                switch (hparams.n_layer) {
1814
0
                    case 22: type = LLM_TYPE_1B; break;
1815
0
                    case 32: type = LLM_TYPE_7B; break;
1816
0
                    case 80: type = LLM_TYPE_70B; break;
1817
0
                    default: type = LLM_TYPE_UNKNOWN;
1818
0
                }
1819
0
            } break;
1820
0
        case LLM_ARCH_OLMO2:
1821
0
            {
1822
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1823
1824
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1825
0
                if (found_swa && hparams.n_swa > 0) {
1826
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1827
0
                    uint32_t swa_period = 4;
1828
0
                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1829
0
                    hparams.set_swa_pattern(swa_period);
1830
1831
0
                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
1832
0
                    hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
1833
0
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1834
0
                } else {
1835
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1836
0
                }
1837
1838
0
                switch (hparams.n_layer) {
1839
0
                    case 16: type = LLM_TYPE_1B; break;
1840
0
                    case 32: type = LLM_TYPE_7B; break;
1841
0
                    case 40: type = LLM_TYPE_13B; break;
1842
0
                    case 64: type = LLM_TYPE_32B; break;
1843
0
                    default: type = LLM_TYPE_UNKNOWN;
1844
0
                }
1845
0
            } break;
1846
0
        case LLM_ARCH_SEED_OSS:
1847
0
            {
1848
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1849
0
                switch (hparams.n_layer) {
1850
0
                    case 64: type = LLM_TYPE_36B; break;
1851
0
                    default: type = LLM_TYPE_UNKNOWN;
1852
0
                }
1853
0
            } break;
1854
0
        case LLM_ARCH_OLMOE:
1855
0
            {
1856
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1857
0
                switch (hparams.n_layer) {
1858
0
                    case 16: type = LLM_TYPE_A1_7B; break;
1859
0
                    default: type = LLM_TYPE_UNKNOWN;
1860
0
                }
1861
0
            } break;
1862
0
        case LLM_ARCH_OPENELM:
1863
0
            {
1864
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1865
1866
0
                switch (hparams.n_layer) {
1867
0
                case 16: type = LLM_TYPE_270M; break;
1868
0
                case 20: type = LLM_TYPE_450M; break;
1869
0
                case 28: type = LLM_TYPE_1B; break;
1870
0
                case 36: type = LLM_TYPE_3B; break;
1871
0
                default: type = LLM_TYPE_UNKNOWN;
1872
0
                }
1873
0
            } break;
1874
0
        case LLM_ARCH_GPTNEOX:
1875
0
            {
1876
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1877
0
                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL,   hparams.use_par_res);
1878
0
                switch (hparams.n_layer) {
1879
0
                    case 6:
1880
0
                        switch (hparams.n_ff()) {
1881
0
                            case 512:  type = LLM_TYPE_14M; break;
1882
0
                            case 2048: type = LLM_TYPE_70M; break;
1883
0
                            default:   type = LLM_TYPE_UNKNOWN;
1884
0
                        } break;
1885
0
                    case 12:
1886
0
                        switch (hparams.n_ff()) {
1887
0
                            case 3072: type = LLM_TYPE_160M; break;
1888
0
                            default: type = LLM_TYPE_UNKNOWN;
1889
0
                        } break;
1890
0
                    case 16:
1891
0
                        switch (hparams.n_ff()) {
1892
0
                            case 8192: type = LLM_TYPE_1B; break;
1893
0
                            default: type = LLM_TYPE_UNKNOWN;
1894
0
                        } break;
1895
0
                    case 24:
1896
0
                        switch (hparams.n_ff()) {
1897
0
                            case 4096: type = LLM_TYPE_410M; break;
1898
0
                            case 8192: type = LLM_TYPE_1_4B; break;
1899
0
                            default: type = LLM_TYPE_UNKNOWN;
1900
0
                        } break;
1901
0
                    case 32:
1902
0
                        switch (hparams.n_ff()) {
1903
0
                            case 10240: type = LLM_TYPE_2_8B; break;
1904
0
                            case 16384: type = LLM_TYPE_6_9B; break;
1905
0
                            default: type = LLM_TYPE_UNKNOWN;
1906
0
                        } break;
1907
0
                    case 36:
1908
0
                        switch (hparams.n_ff()) {
1909
0
                            case 20480: type = LLM_TYPE_12B; break;
1910
0
                            default: type = LLM_TYPE_UNKNOWN;
1911
0
                        } break;
1912
0
                    case 44:
1913
0
                        switch (hparams.n_ff()) {
1914
0
                            case 24576: type = LLM_TYPE_20B; break;
1915
0
                            default: type = LLM_TYPE_UNKNOWN;
1916
0
                        } break;
1917
0
                    default: type = LLM_TYPE_UNKNOWN;
1918
0
                }
1919
0
            } break;
1920
0
        case LLM_ARCH_ARCTIC:
1921
0
            {
1922
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1923
1924
0
                if (hparams.n_expert == 128) {
1925
0
                    switch (hparams.n_layer) {
1926
0
                        case 35: type = LLM_TYPE_10B_128x3_66B; break;
1927
0
                        default: type = LLM_TYPE_UNKNOWN;
1928
0
                    }
1929
0
                } else {
1930
0
                    type = LLM_TYPE_UNKNOWN;
1931
0
                }
1932
0
            } break;
1933
0
        case LLM_ARCH_DEEPSEEK:
1934
0
            {
1935
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1936
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
1937
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
1938
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
1939
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
1940
1941
0
                switch (hparams.n_ff_exp) {
1942
0
                    case 1408: type = LLM_TYPE_16B; break;
1943
0
                    case 1792: type = LLM_TYPE_20B; break;
1944
0
                    default: type = LLM_TYPE_UNKNOWN;
1945
0
                }
1946
0
            } break;
1947
0
        case LLM_ARCH_DEEPSEEK2:
1948
0
        case LLM_ARCH_MISTRAL4:
1949
0
            {
1950
                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
1951
0
                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
1952
1953
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1954
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
1955
0
                if (!is_lite) {
1956
0
                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
1957
0
                }
1958
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
1959
0
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
1960
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
1961
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1962
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
1963
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale, false);
1964
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,        hparams.expert_weights_norm, false);
1965
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,         hparams.expert_gating_func, false);
1966
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1967
                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
1968
                    // that have no expert_gating_func model parameter set
1969
0
                    if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
1970
                        // GLM 4.7 Lite
1971
0
                        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1972
0
                    } else {
1973
0
                        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
1974
0
                    }
1975
0
                }
1976
1977
0
                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
1978
                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
1979
                    // cancel the factor from the convert script
1980
0
                    hparams.rope_yarn_log_mul /= 0.1f;
1981
0
                }
1982
1983
                // (optional) temperature tuning - used by mistral-large
1984
0
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
1985
0
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); // FIXME why not use temperature_length?
1986
1987
0
                hparams.f_attn_temp_offset = 0.0f;
1988
1989
0
                switch (hparams.n_layer) {
1990
0
                    case 27: type = LLM_TYPE_16B; break;
1991
0
                    case 47: type = LLM_TYPE_30B_A3B; break;
1992
0
                    case 60: type = LLM_TYPE_236B; break;
1993
0
                    case 61: type = LLM_TYPE_671B; break;
1994
0
                    default: type = LLM_TYPE_UNKNOWN;
1995
0
                }
1996
0
            } break;
1997
0
        case LLM_ARCH_DEEPSEEK2OCR:
1998
0
            {
1999
                // similar to deepseek2, but without MLA
2000
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2001
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
2002
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2003
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
2004
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
2005
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
2006
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2007
2008
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
2009
0
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
2010
0
                }
2011
2012
0
                switch (hparams.n_layer) {
2013
0
                    case 12: type = LLM_TYPE_3B; break;
2014
0
                    default: type = LLM_TYPE_UNKNOWN;
2015
0
                }
2016
0
            } break;
2017
0
        case LLM_ARCH_PLM:
2018
0
            {
2019
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2020
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
2021
0
                switch (hparams.n_layer) {
2022
0
                    case 32: type = LLM_TYPE_1_8B; break;
2023
0
                    default: type = LLM_TYPE_UNKNOWN;
2024
0
                }
2025
0
            } break;
2026
0
        case LLM_ARCH_CHATGLM:
2027
0
            {
2028
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2029
0
                switch (hparams.n_layer) {
2030
0
                    case 28: {
2031
0
                        if (hparams.n_head(0) == 16) {
2032
0
                            type = LLM_TYPE_1_5B;
2033
0
                        } else {
2034
0
                            type = LLM_TYPE_6B;
2035
0
                        }
2036
0
                    } break;
2037
0
                    case 40: {
2038
0
                        if (hparams.n_head(0) == 24) {
2039
0
                            type = LLM_TYPE_4B;
2040
0
                        } else {
2041
0
                            type = LLM_TYPE_9B;
2042
0
                        }
2043
0
                    } break;
2044
0
                    default: type = LLM_TYPE_UNKNOWN;
2045
0
                }
2046
0
            } break;
2047
0
        case LLM_ARCH_GLM4:
2048
0
            {
2049
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
2050
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
2051
2052
                // NextN/MTP parameters (GLM-OCR)
2053
0
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
2054
0
                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
2055
2056
                // TODO: when MTP is implemented, this should probably be updated if needed
2057
0
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
2058
2059
0
                switch (hparams.n_layer) {
2060
0
                    case 17: type = LLM_TYPE_1B; break; // GLM-OCR
2061
0
                    case 40: type = LLM_TYPE_9B; break;
2062
0
                    case 61: type = LLM_TYPE_32B; break;
2063
0
                    default: type = LLM_TYPE_UNKNOWN;
2064
0
                }
2065
0
            } break;
2066
0
        case LLM_ARCH_GLM4_MOE:
2067
0
            {
2068
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
2069
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
2070
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
2071
2072
                // MoE parameters
2073
0
                ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
2074
0
                ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
2075
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
2076
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
2077
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
2078
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
2079
2080
                // Expert gating function (GLM-4.5 uses sigmoid)
2081
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2082
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
2083
0
                    hparams.expert_gating_func =  LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
2084
0
                }
2085
2086
                // NextN/MTP parameters
2087
0
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
2088
0
                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
2089
2090
                // TODO: when MTP is implemented, this should probably be updated if needed
2091
0
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
2092
2093
0
                switch (hparams.n_layer) {
2094
0
                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
2095
0
                    case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
2096
0
                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
2097
0
                    default: type = LLM_TYPE_UNKNOWN;
2098
0
                }
2099
0
            } break;
2100
0
        case LLM_ARCH_GLM_DSA:
2101
0
            {
2102
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
2103
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
2104
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
2105
2106
                // MoE parameters
2107
0
                ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
2108
0
                ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
2109
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
2110
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
2111
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
2112
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
2113
2114
                // deepseek MLA parameters
2115
0
                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,      hparams.n_lora_q);
2116
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
2117
0
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
2118
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
2119
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2120
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
2121
2122
                // DSA parameters
2123
0
                ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
2124
0
                ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
2125
0
                ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,      hparams.indexer_top_k);
2126
2127
                // Expert gating function (GLM-4.5 uses sigmoid)
2128
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2129
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
2130
0
                    hparams.expert_gating_func =  LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
2131
0
                }
2132
2133
                // NextN/MTP parameters
2134
0
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
2135
0
                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
2136
2137
                // TODO: when MTP is implemented, this should probably be updated if needed
2138
0
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
2139
2140
0
                switch (hparams.n_layer) {
2141
0
                    case 79: type = LLM_TYPE_744B_A40B; break;
2142
0
                    default: type = LLM_TYPE_UNKNOWN;
2143
0
                }
2144
0
            } break;
2145
0
        case LLM_ARCH_BITNET:
2146
0
            {
2147
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2148
2149
0
                switch (hparams.n_layer) {
2150
0
                    case 26: type = LLM_TYPE_3B; break;
2151
0
                    default: type = LLM_TYPE_UNKNOWN;
2152
0
                }
2153
0
            } break;
2154
0
        case LLM_ARCH_T5:
2155
0
            {
2156
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      hparams.f_norm_rms_eps);
2157
0
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
2158
2159
0
                uint32_t dec_start_token_id;
2160
0
                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
2161
0
                    hparams.dec_start_token_id = dec_start_token_id;
2162
0
                }
2163
2164
0
                hparams.dec_n_layer = hparams.n_layer;
2165
0
                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
2166
2167
0
                switch (hparams.n_layer) {
2168
0
                    case 6:  type = LLM_TYPE_60M;  break; // t5-small
2169
0
                    case 8:  type = LLM_TYPE_80M;  break; // flan-t5-small
2170
0
                    case 12:
2171
0
                        switch (hparams.n_ff()) {
2172
0
                            case 3072: type = LLM_TYPE_220M; break; // t5-base
2173
0
                            case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
2174
0
                            default: type = LLM_TYPE_UNKNOWN;
2175
0
                        } break;
2176
0
                    case 24:
2177
0
                        switch (hparams.n_ff()) {
2178
0
                            case 4096:  type = LLM_TYPE_770M; break; // t5-large
2179
0
                            case 2816:  type = LLM_TYPE_780M; break; // flan-t5-large
2180
0
                            case 16384: type = LLM_TYPE_3B;   break; // t5-3b
2181
0
                            case 5120:  type = LLM_TYPE_3B;   break; // flan-t5-xl
2182
0
                            case 65536: type = LLM_TYPE_11B;  break; // t5-11b
2183
0
                            case 10240: type = LLM_TYPE_11B;  break; // flan-t5-xxl
2184
0
                            default: type = LLM_TYPE_UNKNOWN;
2185
0
                        } break;
2186
0
                    default: type = LLM_TYPE_UNKNOWN;
2187
0
               }
2188
0
            } break;
2189
0
        case LLM_ARCH_T5ENCODER:
2190
0
            {
2191
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2192
0
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
2193
0
                type = LLM_TYPE_UNKNOWN;
2194
0
            } break;
2195
0
        case LLM_ARCH_JAIS:
2196
0
            {
2197
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2198
0
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
2199
2200
0
                switch (hparams.n_layer) {
2201
0
                    case 24: type = LLM_TYPE_1_3B; break;
2202
0
                    case 40: type = LLM_TYPE_13B; break;
2203
                    /* TODO: add variants */
2204
0
                    default: type = LLM_TYPE_UNKNOWN;
2205
0
                }
2206
0
            } break;
2207
0
        case LLM_ARCH_JAIS2:
2208
0
            {
2209
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2210
2211
0
                switch (hparams.n_layer) {
2212
0
                    case 32: type = LLM_TYPE_8B; break;
2213
0
                    case 68: type = LLM_TYPE_70B; break;
2214
0
                    default: type = LLM_TYPE_UNKNOWN;
2215
0
                }
2216
0
            } break;
2217
0
        case LLM_ARCH_NEMOTRON:
2218
0
            {
2219
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2220
0
                switch (hparams.n_layer) {
2221
0
                    case 32: type = LLM_TYPE_4B; break;
2222
0
                    default: type = LLM_TYPE_UNKNOWN;
2223
0
                }
2224
0
            } break;
2225
0
        case LLM_ARCH_NEMOTRON_H:
2226
0
        case LLM_ARCH_NEMOTRON_H_MOE:
2227
0
            {
2228
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2229
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2230
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2231
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2232
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2233
2234
                // A layer is recurrent IFF the n_head_kv value is set to 0 and
2235
                // the n_ff value is set to 0
2236
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2237
0
                    hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
2238
0
                }
2239
2240
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2241
2242
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp,        false);
2243
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp,      false);
2244
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
2245
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
2246
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
2247
0
                ml.get_key(LLM_KV_MOE_LATENT_SIZE,                   hparams.moe_latent_size, false);
2248
2249
0
                switch (hparams.n_layer) {
2250
0
                    case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
2251
0
                    case 56: type = LLM_TYPE_9B; break;
2252
0
                    case 88: type = LLM_TYPE_120B_A12B; break;
2253
0
                    default: type = LLM_TYPE_UNKNOWN;
2254
0
                }
2255
0
            } break;
2256
0
        case LLM_ARCH_EXAONE:
2257
0
            {
2258
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2259
2260
0
                switch (hparams.n_layer) {
2261
0
                    case 32: type = LLM_TYPE_8B; break;
2262
0
                    default: type = LLM_TYPE_UNKNOWN;
2263
0
                }
2264
0
            } break;
2265
0
        case LLM_ARCH_EXAONE4:
2266
0
            {
2267
0
                if (hparams.n_layer == 64) {    // 32B
2268
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2269
0
                    hparams.n_swa = 4096;
2270
0
                    uint32_t swa_period = 4;
2271
0
                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
2272
0
                    hparams.set_swa_pattern(swa_period);
2273
2274
0
                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
2275
0
                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
2276
0
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
2277
0
                }
2278
2279
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
2280
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2281
2282
0
                switch (hparams.n_layer) {
2283
0
                    case 30: type = LLM_TYPE_1_2B; break;
2284
0
                    case 64: type = LLM_TYPE_32B; break;
2285
0
                    default: type = LLM_TYPE_UNKNOWN;
2286
0
                }
2287
0
            } break;
2288
0
        case LLM_ARCH_EXAONE_MOE:
2289
0
            {
2290
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2291
0
                hparams.n_swa = 128;
2292
0
                uint32_t swa_period = 4;
2293
0
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
2294
0
                hparams.set_swa_pattern(swa_period);
2295
0
                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
2296
0
                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
2297
2298
0
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,                hparams.rope_freq_base_train_swa, false);
2299
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
2300
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2301
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
2302
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2303
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2304
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
2305
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
2306
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
2307
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
2308
2309
0
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
2310
0
                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
2311
2312
0
                switch (hparams.n_layer) {
2313
0
                    case 32: type = LLM_TYPE_30B_A3B; break;
2314
0
                    case 48:
2315
0
                    case 49: type = LLM_TYPE_235B_A22B; break;
2316
0
                    default: type = LLM_TYPE_UNKNOWN;
2317
0
                }
2318
0
            } break;
2319
0
        case LLM_ARCH_RWKV6:
2320
0
        case LLM_ARCH_RWKV6QWEN2:
2321
0
            {
2322
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps, false);
2323
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
2324
0
                ml.get_key(LLM_KV_WKV_HEAD_SIZE,               hparams.wkv_head_size);
2325
0
                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM,          hparams.time_mix_extra_dim);
2326
0
                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM,        hparams.time_decay_extra_dim);
2327
0
                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
2328
0
                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);
2329
2330
0
                switch (hparams.n_layer) {
2331
0
                    case 24: type = LLM_TYPE_1_6B; break;
2332
0
                    case 32:
2333
0
                        switch (hparams.n_embd) {
2334
0
                            case 2560: type = LLM_TYPE_3B; break;
2335
0
                            case 4096: type = LLM_TYPE_7B; break;
2336
0
                            default: type = LLM_TYPE_UNKNOWN;
2337
0
                        } break;
2338
0
                    case 61: type = LLM_TYPE_14B; break;
2339
0
                    case 64: type = LLM_TYPE_32B; break;
2340
0
                    default: type = LLM_TYPE_UNKNOWN;
2341
0
                }
2342
0
            } break;
2343
0
        case LLM_ARCH_RWKV7:
2344
0
        case LLM_ARCH_ARWKV7:
2345
0
            {
2346
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,                hparams.f_norm_eps, false);
2347
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,            hparams.f_norm_rms_eps, false);
2348
0
                ml.get_key(LLM_KV_WKV_HEAD_SIZE,                          hparams.wkv_head_size);
2349
0
                ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK,              hparams.n_lora_decay);
2350
0
                ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK,               hparams.n_lora_iclr);
2351
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
2352
0
                ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
2353
0
                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);
2354
2355
0
                switch (hparams.n_layer) {
2356
0
                    case 12:
2357
0
                        switch (hparams.n_embd) {
2358
0
                            case 768: type = LLM_TYPE_190M; break;
2359
0
                            default: type = LLM_TYPE_UNKNOWN;
2360
0
                        } break;
2361
0
                    case 24:
2362
0
                        switch (hparams.n_embd) {
2363
0
                            case 1024: type = LLM_TYPE_450M; break;
2364
0
                            case 2048: type = LLM_TYPE_1_5B; break;
2365
0
                            default: type = LLM_TYPE_UNKNOWN;
2366
0
                        } break;
2367
0
                    case 28:
2368
0
                        switch (hparams.n_embd) {
2369
0
                            case 1536: type = LLM_TYPE_1_5B; break;
2370
0
                            case 3584: type = LLM_TYPE_7B; break;
2371
0
                            default: type = LLM_TYPE_UNKNOWN;
2372
0
                        } break;
2373
0
                    case 32:
2374
0
                        switch (hparams.n_embd) {
2375
0
                            case 2560: type = LLM_TYPE_2_9B; break;
2376
0
                            case 4096: type = LLM_TYPE_7B; break;
2377
0
                            default: type = LLM_TYPE_UNKNOWN;
2378
0
                        } break;
2379
0
                    case 61:
2380
0
                        switch (hparams.n_embd) {
2381
0
                            case 4096: type = LLM_TYPE_14B; break;
2382
0
                            default: type = LLM_TYPE_UNKNOWN;
2383
0
                        } break;
2384
0
                    default: type = LLM_TYPE_UNKNOWN;
2385
0
                }
2386
0
            } break;
2387
0
        case LLM_ARCH_GRANITE:
2388
0
        case LLM_ARCH_GRANITE_MOE:
2389
0
            {
2390
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2391
0
                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
2392
0
                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, false);
2393
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, false);
2394
0
                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, false);
2395
2396
                // Granite uses rope_finetuned as a switch for rope, so default to true
2397
0
                bool rope_finetuned = true;
2398
0
                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
2399
0
                hparams.rope_finetuned = rope_finetuned;
2400
2401
0
                switch (hparams.n_layer) {
2402
0
                    case 32: type = LLM_TYPE_3B; break;
2403
0
                    case 40: type = LLM_TYPE_3B; break;
2404
                    // Add additional layer/vocab/etc checks here for other model sizes
2405
0
                    default: type = LLM_TYPE_UNKNOWN;
2406
0
                }
2407
2408
                // For Granite MoE Shared
2409
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
2410
0
            } break;
2411
0
        case LLM_ARCH_GRANITE_HYBRID:
2412
0
            {
2413
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2414
0
                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale, /* required */ false);
2415
0
                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, /* required */ false);
2416
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, /* required */ false);
2417
0
                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, /* required */ false);
2418
2419
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2420
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2421
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2422
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2423
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2424
2425
                // Granite uses rope_finetuned as a switch for rope, so default to true
2426
0
                bool rope_finetuned = true;
2427
0
                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
2428
0
                hparams.rope_finetuned = rope_finetuned;
2429
2430
                // A layer is recurrent IFF the n_head_kv value is set to 0
2431
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2432
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
2433
0
                }
2434
2435
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2436
2437
0
                switch (hparams.n_embd) {
2438
0
                    case 768: type = LLM_TYPE_350M; break;
2439
0
                    case 1536: type = (hparams.n_ff() == 512 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break;
2440
0
                    case 2048: case 2560: type = LLM_TYPE_3B; break;
2441
0
                    case 4096: type = LLM_TYPE_32B; break;
2442
0
                    default: type = LLM_TYPE_UNKNOWN;
2443
0
                }
2444
2445
                // For Granite MoE Shared
2446
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
2447
0
            } break;
2448
0
        case LLM_ARCH_CHAMELEON:
2449
0
            {
2450
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2451
0
                hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
2452
0
                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
2453
2454
0
                switch (hparams.n_layer) {
2455
0
                    case 32: type = LLM_TYPE_7B; break;
2456
0
                    case 48: type = LLM_TYPE_34B; break;
2457
0
                    default: type = LLM_TYPE_UNKNOWN;
2458
0
               }
2459
0
            } break;
2460
0
        case LLM_ARCH_WAVTOKENIZER_DEC:
2461
0
            {
2462
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
2463
0
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS,    hparams.f_norm_group_eps);
2464
0
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
2465
0
            } break;
2466
0
        case LLM_ARCH_BAILINGMOE:
2467
0
            {
2468
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2469
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
2470
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2471
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
2472
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
2473
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
2474
2475
0
                switch (hparams.n_layer) {
2476
0
                    case 28: type = LLM_TYPE_16B; break;
2477
0
                    case 88: type = LLM_TYPE_290B; break;
2478
0
                    default: type = LLM_TYPE_UNKNOWN;
2479
0
                }
2480
0
            } break;
2481
0
        case LLM_ARCH_BAILINGMOE2:
2482
0
            {
2483
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2484
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
2485
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2486
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2487
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
2488
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
2489
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
2490
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
2491
0
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
2492
0
                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
2493
2494
                // TODO: when MTP is implemented, this should probably be updated if needed
2495
0
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
2496
2497
0
                switch (hparams.n_layer) {
2498
0
                    case 20: type = LLM_TYPE_16B_A1B; break;
2499
0
                    case 21: type = LLM_TYPE_16B_A1B; break;
2500
0
                    case 32: type = LLM_TYPE_100B_A6B; break;
2501
0
                    case 33: type = LLM_TYPE_100B_A6B; break;
2502
0
                    default: type = LLM_TYPE_UNKNOWN;
2503
0
                }
2504
0
            } break;
2505
0
        case LLM_ARCH_DOTS1:
2506
0
            {
2507
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2508
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
2509
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2510
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
2511
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
2512
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
2513
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2514
0
                switch (hparams.n_layer) {
2515
0
                    case 62: type = LLM_TYPE_142B; break;
2516
0
                    default: type = LLM_TYPE_UNKNOWN;
2517
0
                }
2518
0
            } break;
2519
0
        case LLM_ARCH_ERNIE4_5:
2520
0
        case LLM_ARCH_ERNIE4_5_MOE:
2521
0
        case LLM_ARCH_PADDLEOCR:
2522
0
            {
2523
                // paddleocr need mrope_section
2524
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
2525
2526
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2527
0
                if (arch == LLM_ARCH_ERNIE4_5_MOE) {
2528
0
                    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2529
0
                    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2530
0
                    ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         hparams.n_moe_layer_step);
2531
0
                    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
2532
0
                }
2533
2534
0
                switch (hparams.n_layer) {
2535
0
                    case 18: type = LLM_TYPE_0_3B; break;
2536
0
                    case 28: type = LLM_TYPE_21B_A3B; break;
2537
0
                    case 54: type = LLM_TYPE_300B_A47B; break;
2538
0
                    default: type = LLM_TYPE_UNKNOWN;
2539
0
                }
2540
0
            } break;
2541
0
        case LLM_ARCH_FALCON_H1:
2542
0
            {
2543
                // Common parameters
2544
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2545
2546
                // SSM parameters
2547
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2548
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2549
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2550
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2551
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2552
2553
0
                std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
2554
2555
0
                switch (hparams.n_layer) {
2556
0
                    case 36:
2557
0
                        type = LLM_TYPE_0_5B; break;
2558
0
                    case 24:
2559
0
                        type = LLM_TYPE_1_5B; break;
2560
0
                    case 66:
2561
0
                        type = LLM_TYPE_1B; break;
2562
0
                    case 32:
2563
0
                        type = LLM_TYPE_3B; break;
2564
0
                    case 44:
2565
0
                        type = LLM_TYPE_7B; break;
2566
0
                    case 72:
2567
0
                        type = LLM_TYPE_34B; break;
2568
0
                    default:
2569
0
                        type = LLM_TYPE_UNKNOWN;
2570
0
                }
2571
0
            } break;
2572
0
        case LLM_ARCH_HUNYUAN_MOE:
2573
0
            {
2574
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2575
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2576
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2577
2578
0
                switch (hparams.n_layer) {
2579
0
                    case 32: type = LLM_TYPE_A13B; break;
2580
0
                    default: type = LLM_TYPE_UNKNOWN;
2581
0
                }
2582
0
            } break;
2583
0
        case LLM_ARCH_HUNYUAN_DENSE:
2584
0
            {
2585
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2586
2587
0
                switch (hparams.n_embd) {
2588
0
                    case 1024: type = LLM_TYPE_0_5B; break;
2589
0
                    case 2048: type = LLM_TYPE_1_8B; break;
2590
0
                    case 3072: type = LLM_TYPE_4B; break;
2591
0
                    case 4096: type = LLM_TYPE_7B; break;
2592
0
                    default: type = LLM_TYPE_UNKNOWN;
2593
0
                }
2594
0
            } break;
2595
0
        case LLM_ARCH_SMOLLM3:
2596
0
            {
2597
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2598
0
                hparams.n_no_rope_layer_step = 4;
2599
2600
0
                switch (hparams.n_layer) {
2601
0
                    case 36: type = LLM_TYPE_3B; break;
2602
0
                    default: type = LLM_TYPE_UNKNOWN;
2603
0
                }
2604
0
            } break;
2605
0
        case LLM_ARCH_OPENAI_MOE:
2606
0
            {
2607
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2608
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2609
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
2610
2611
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2612
0
                uint32_t swa_period = 2;
2613
0
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
2614
0
                hparams.set_swa_pattern(swa_period);
2615
2616
0
                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
2617
0
                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
2618
0
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
2619
2620
0
                switch (hparams.n_layer) {
2621
0
                    case 24: type = LLM_TYPE_20B; break;
2622
0
                    case 36: type = LLM_TYPE_120B; break;
2623
0
                    default: type = LLM_TYPE_UNKNOWN;
2624
0
                }
2625
0
            } break;
2626
0
        case LLM_ARCH_LFM2:
2627
0
            {
2628
0
                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
2629
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2630
0
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2631
0
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2632
0
                }
2633
0
                hparams.n_layer_dense_lead = hparams.n_layer;
2634
0
                switch (hparams.n_ff()) {
2635
0
                    case  4608: type = LLM_TYPE_350M; break;
2636
0
                    case  6912: type = LLM_TYPE_700M; break;
2637
0
                    case  8192: type = LLM_TYPE_1_2B; break;
2638
0
                    case 10752: type = LLM_TYPE_2_6B; break;
2639
0
                    default:    type = LLM_TYPE_UNKNOWN;
2640
0
                }
2641
0
                if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
2642
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2643
0
                    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2644
0
                        hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il];
2645
0
                    }
2646
0
                }
2647
0
            } break;
2648
0
        case LLM_ARCH_LFM2MOE:
2649
0
            {
2650
0
                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
2651
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2652
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
2653
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2654
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
2655
2656
0
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2657
0
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2658
0
                }
2659
2660
0
                switch (hparams.n_layer) {
2661
0
                    case 24: type = LLM_TYPE_8B_A1B;  break;
2662
0
                    case 40: type = LLM_TYPE_24B_A2B; break;
2663
0
                    default: type = LLM_TYPE_UNKNOWN;
2664
0
                }
2665
0
            } break;
2666
0
        case LLM_ARCH_SMALLTHINKER:
2667
0
            {
2668
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
2669
2670
0
                if (found_swa && hparams.n_swa > 0) {
2671
0
                    hparams.swa_type    = LLAMA_SWA_TYPE_STANDARD;
2672
0
                    hparams.n_swa       = 4096;
2673
0
                    uint32_t swa_period = 4;
2674
0
                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
2675
0
                    hparams.set_swa_pattern(swa_period, true);
2676
2677
0
                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
2678
0
                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
2679
0
                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
2680
0
                } else {
2681
0
                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
2682
0
                    hparams.n_no_rope_layer_step = hparams.n_layer;
2683
0
                }
2684
2685
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
2686
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2687
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2688
2689
0
                switch (hparams.n_layer) {
2690
0
                    case 32: type = LLM_TYPE_4B;  break;
2691
0
                    case 52: type = LLM_TYPE_20B; break;
2692
0
                    default: type = LLM_TYPE_UNKNOWN;
2693
0
                }
2694
0
            } break;
2695
0
        case LLM_ARCH_GROVEMOE:
2696
0
            {
2697
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2698
0
                ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  hparams.n_ff_chexp, false);
2699
0
                ml.get_key(LLM_KV_EXPERT_GROUP_SCALE,                hparams.expert_group_scale);
2700
0
                ml.get_key(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
2701
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2702
2703
0
                switch (hparams.n_layer) {
2704
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
2705
0
                    default: type = LLM_TYPE_UNKNOWN;
2706
0
                }
2707
0
            } break;
2708
0
        case LLM_ARCH_APERTUS:
2709
0
            {
2710
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2711
0
                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N,        hparams.xielu_alpha_n, hparams.n_layer);
2712
0
                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P,        hparams.xielu_alpha_p, hparams.n_layer);
2713
0
                ml.get_key_or_arr(LLM_KV_XIELU_BETA,           hparams.xielu_beta,    hparams.n_layer);
2714
0
                ml.get_key_or_arr(LLM_KV_XIELU_EPS,            hparams.xielu_eps,     hparams.n_layer);
2715
2716
0
                switch (hparams.n_layer) {
2717
0
                    case 32: type = LLM_TYPE_8B; break;
2718
0
                    default: type = LLM_TYPE_UNKNOWN;
2719
0
                }
2720
0
            } break;
2721
0
        case LLM_ARCH_MINIMAX_M2:
2722
0
            {
2723
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
2724
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp);
2725
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,           hparams.expert_gating_func, false);
2726
2727
0
                switch (hparams.n_layer) {
2728
0
                    case 62: type = LLM_TYPE_230B_A10B; break;
2729
0
                    default: type = LLM_TYPE_UNKNOWN;
2730
0
                }
2731
0
            } break;
2732
0
        case LLM_ARCH_COGVLM:
2733
0
            {
2734
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2735
0
                switch (hparams.n_layer) {
2736
0
                    case 32: type = LLM_TYPE_13B; break;
2737
0
                    default: type = LLM_TYPE_UNKNOWN;
2738
0
                }
2739
0
            } break;
2740
0
        case LLM_ARCH_PANGU_EMBED:
2741
0
            {
2742
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2743
0
                switch (hparams.n_layer) {
2744
0
                    case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
2745
0
                    case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
2746
0
                    default: type = LLM_TYPE_UNKNOWN;
2747
0
                }
2748
0
            } break;
2749
0
        case LLM_ARCH_QWEN3NEXT:
2750
0
            {
2751
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
2752
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2753
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2754
2755
                // Load linear attention (gated delta net) parameters
2756
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2757
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2758
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2759
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2760
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2761
2762
                // Mark recurrent layers (linear attention layers)
2763
0
                {
2764
0
                    uint32_t full_attn_interval = 4;
2765
0
                    ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2766
0
                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2767
0
                        hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2768
0
                    }
2769
0
                }
2770
2771
0
                switch (hparams.n_layer) {
2772
0
                    case 48: type = LLM_TYPE_80B_A3B; break;
2773
0
                    default: type = LLM_TYPE_UNKNOWN;
2774
0
                }
2775
0
            } break;
2776
0
        case LLM_ARCH_QWEN35:
2777
0
            {
2778
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2779
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS,    hparams.rope_sections, 4, true);
2780
2781
                // Load linear attention (gated delta net) parameters
2782
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2783
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2784
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2785
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2786
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2787
2788
                // Mark recurrent layers (linear attention layers)
2789
0
                {
2790
0
                    uint32_t full_attn_interval = 4;
2791
0
                    ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2792
0
                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2793
0
                        hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2794
0
                    }
2795
0
                }
2796
2797
0
                switch (hparams.n_layer) {
2798
0
                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
2799
0
                    case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
2800
0
                    case 64: type = LLM_TYPE_27B; break;
2801
0
                    default: type = LLM_TYPE_UNKNOWN;
2802
0
                }
2803
0
            } break;
2804
0
        case LLM_ARCH_QWEN35MOE:
2805
0
            {
2806
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
2807
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2808
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2809
2810
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS,    hparams.rope_sections, 4, true);
2811
2812
                // Load linear attention (gated delta net) parameters
2813
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2814
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2815
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2816
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2817
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2818
2819
                // Mark recurrent layers (linear attention layers)
2820
0
                {
2821
0
                    uint32_t full_attn_interval = 4;
2822
0
                    ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2823
0
                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2824
0
                        hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2825
0
                    }
2826
0
                }
2827
2828
0
                switch (hparams.n_layer) {
2829
0
                    case 40: type = LLM_TYPE_35B_A3B; break;
2830
0
                    case 48: type = LLM_TYPE_122B_A10B; break;
2831
0
                    case 60: type = LLM_TYPE_397B_A17B; break;
2832
0
                    default: type = LLM_TYPE_UNKNOWN;
2833
0
                }
2834
0
            } break;
2835
0
        case LLM_ARCH_MISTRAL3:
2836
0
            {
2837
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2838
0
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
2839
2840
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast,    false);
2841
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow,    false);
2842
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, 0.0f);
2843
2844
0
                hparams.f_attn_temp_offset = 0.0f;
2845
2846
                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
2847
0
                if (hparams.f_attn_temp_scale != 0.0f) {
2848
0
                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
2849
0
                    if (hparams.n_attn_temp_floor_scale == 0) {
2850
0
                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
2851
0
                    }
2852
0
                }
2853
2854
0
                switch (hparams.n_layer) {
2855
0
                    case 26: type = LLM_TYPE_3B; break;
2856
0
                    case 34: type = LLM_TYPE_8B; break;
2857
0
                    case 40: type = LLM_TYPE_14B; break;
2858
0
                    default: type = LLM_TYPE_UNKNOWN;
2859
0
                }
2860
0
            } break;
2861
0
        case LLM_ARCH_MIMO2:
2862
0
            {
2863
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2864
2865
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2866
2867
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2868
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,   hparams.n_swa);
2869
0
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,         hparams.rope_freq_base_train_swa, false);
2870
0
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
2871
2872
0
                switch (hparams.n_layer) {
2873
0
                    case 48: type = LLM_TYPE_310B_A15B; break;
2874
0
                    default: type = LLM_TYPE_UNKNOWN;
2875
0
                }
2876
0
            } break;
2877
0
        case LLM_ARCH_KIMI_LINEAR:
2878
0
            {
2879
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2880
0
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,    hparams.n_embd_head_k_mla_impl);
2881
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA,  hparams.n_embd_head_v_mla_impl);
2882
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
2883
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,             hparams.ssm_d_conv);
2884
0
                ml.get_key(LLM_KV_KDA_HEAD_DIM,                hparams.n_embd_head_kda);
2885
2886
                // MLA qk_rope_head_dim (for reference)
2887
                // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
2888
2889
                // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
2890
                // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
2891
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2892
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;  // KDA layers are recurrent
2893
0
                }
2894
2895
                // MoE parameters - Kimi uses moe_intermediate_size = 1024
2896
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2897
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
2898
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
2899
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
2900
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
2901
2902
0
                switch (hparams.n_layer) {
2903
0
                    case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
2904
0
                    default: type = LLM_TYPE_UNKNOWN;
2905
0
                }
2906
0
            } break;
2907
0
        case LLM_ARCH_STEP35:
2908
0
            {
2909
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2910
2911
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2912
2913
                // full_attention layers only use half of the RoPE dimensions
2914
0
                hparams.n_rot_full = hparams.n_rot_full / 2;
2915
2916
                // MoE + SWA parameters
2917
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2918
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2919
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func, false);
2920
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
2921
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
2922
2923
                // Step35 uses sigmoid gating by default (if not set in GGUF)
2924
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
2925
0
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
2926
0
                }
2927
2928
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,  hparams.n_swa);
2929
0
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,        hparams.rope_freq_base_train_swa, false);
2930
0
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
2931
0
                ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP,   hparams.swiglu_clamp_exp,   hparams.n_layer, false);
2932
0
                ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
2933
2934
0
                switch (hparams.n_layer) {
2935
0
                    case 45: type = LLM_TYPE_196B_A11B; break;
2936
0
                    default: type = LLM_TYPE_UNKNOWN;
2937
0
                }
2938
0
            } break;
2939
0
        default: throw std::runtime_error("unsupported model architecture: " + arch_name());
2940
177
    }
2941
2942
0
    pimpl->n_bytes = ml.n_bytes;
2943
2944
0
    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
2945
2946
0
    if (hparams.f_max_alibi_bias > 0.0f) {
2947
0
        hparams.use_alibi = true;
2948
0
    }
2949
2950
0
    hparams.rope_type = llama_model_rope_type(this);
2951
0
}
2952
2953
0
// Load the model vocabulary through the given loader.
// An LLM_KV object constructed from this model's `arch` is handed to vocab.load()
// together with the loader.
void llama_model::load_vocab(llama_model_loader & ml) {
    vocab.load(ml, LLM_KV(arch));
}
2958
2959
0
bool llama_model::load_tensors(llama_model_loader & ml) {
2960
0
    const auto & split_mode   = params.split_mode;
2961
0
    const auto & use_mlock    = params.use_mlock;
2962
0
    const auto & tensor_split = params.tensor_split;
2963
2964
0
    const int n_layer      = hparams.n_layer;
2965
0
    const int n_gpu_layers = this->n_gpu_layers();
2966
2967
0
    const bool use_mmap_buffer = true;
2968
2969
0
    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
2970
0
        __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
2971
2972
    // build a list of buffer types for the CPU and GPU devices
2973
0
    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
2974
0
    for (const auto & dev : devices) {
2975
0
        buft_list_t buft_list = make_gpu_buft_list(dev.dev, split_mode, tensor_split);
2976
        // add CPU buffer types as a fallback
2977
0
        buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
2978
0
        pimpl->gpu_buft_list.emplace(dev.dev, std::move(buft_list));
2979
0
    }
2980
2981
0
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2982
0
    if (cpu_dev == nullptr) {
2983
0
        throw std::runtime_error(format("%s: no CPU backend found", __func__));
2984
0
    }
2985
2986
    // calculate the split points
2987
0
    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
2988
0
    std::vector<float> splits(n_devices());
2989
0
    if (all_zero) {
2990
        // default split, by free memory
2991
0
        for (size_t i = 0; i < n_devices(); ++i) {
2992
0
            ggml_backend_dev_t dev = devices[i].dev;
2993
0
            size_t total;
2994
0
            size_t free;
2995
0
            ggml_backend_dev_memory(dev, &free, &total);
2996
2997
            // devices can return 0 bytes for free and total memory if they do not
2998
            // have any to report. in this case, we will use the host memory as a fallback
2999
            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
3000
0
            if (free == 0 && total == 0) {
3001
0
                ggml_backend_dev_memory(cpu_dev, &free, &total);
3002
0
            }
3003
0
            splits[i] = free;
3004
0
        }
3005
0
    } else {
3006
0
        std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
3007
0
    }
3008
3009
    // sum and normalize the splits to get the split points
3010
0
    float split_sum = 0.0f;
3011
0
    for (size_t i = 0; i < n_devices(); ++i) {
3012
0
        split_sum += splits[i];
3013
0
        splits[i] = split_sum;
3014
0
    }
3015
0
    for (size_t i = 0; i < n_devices(); ++i) {
3016
0
        splits[i] /= split_sum;
3017
0
    }
3018
3019
0
    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
3020
0
    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
3021
0
    // Resolve the device and buffer-type list for layer index `il`.
    // Layers outside the window [i_gpu_start, i_gpu_start + act_gpu_layers) are assigned to the CPU;
    // the others are mapped to a device by looking up their relative position in `splits`.
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
        // il can be equal to n_layer (used below for the output layer), so only query SWA for real layers
        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);

        const int  i_off      = il - i_gpu_start;
        const bool is_offload = il >= i_gpu_start && i_off < act_gpu_layers;
        if (!is_offload) {
            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
            return {cpu_dev, &pimpl->cpu_buft_list};
        }

        // relative position of the layer inside the offloaded window
        const float pos = float(i_off)/act_gpu_layers;

        // first entry of `splits` that is strictly greater than pos
        const auto it_split  = std::upper_bound(splits.begin(), splits.begin() + n_devices(), pos);
        const int  layer_gpu = it_split - splits.begin();

        auto * dev = devices.at(layer_gpu).dev;
        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
        return {dev, &pimpl->gpu_buft_list.at(dev)};
    };
3032
3033
    // assign the input layer
3034
    // there is very little benefit to offloading the input layer, so always keep it on the CPU
3035
0
    pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
3036
3037
    // assign the repeating layers to the devices according to the splits
3038
0
    pimpl->dev_layer.resize(n_layer);
3039
0
    for (int il = 0; il < n_layer; ++il) {
3040
0
        pimpl->dev_layer[il] = get_layer_buft_list(il);
3041
0
    }
3042
3043
    // assign the output layer
3044
0
    pimpl->dev_output = get_layer_buft_list(n_layer);
3045
3046
0
    const auto TENSOR_DUPLICATED      = llama_model_loader::TENSOR_DUPLICATED;
3047
0
    const auto TENSOR_NOT_REQUIRED    = llama_model_loader::TENSOR_NOT_REQUIRED;
3048
0
    const auto TENSOR_SKIP            = llama_model_loader::TENSOR_SKIP;
3049
0
    const auto TENSOR_SKIP_IF_VIRTUAL = llama_model_loader::TENSOR_SKIP_IF_VIRTUAL;
3050
3051
    // create tensors for the weights
3052
0
    {
3053
        // note: cast to int64_t since we will use these for the tensor dimensions
3054
0
        const int64_t n_head        = hparams.n_head();
3055
0
        const int64_t n_head_kv     = hparams.n_head_kv();
3056
0
        const int64_t n_embd        = hparams.n_embd;
3057
0
        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
3058
0
        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
3059
0
        const int64_t n_embd_head_k = hparams.n_embd_head_k();
3060
0
        const int64_t n_embd_head_v = hparams.n_embd_head_v();
3061
0
        const int64_t n_ff          = hparams.n_ff();
3062
0
        const int64_t n_embd_gqa    = n_embd_v_gqa;
3063
0
        const int64_t n_vocab       = vocab.n_tokens();
3064
0
        const int64_t n_token_types = vocab.n_token_types();
3065
0
        const int64_t n_rot         = hparams.n_rot();
3066
0
        const int64_t n_expert      = hparams.n_expert;
3067
0
        const int64_t n_expert_used = hparams.n_expert_used;
3068
0
        const int64_t n_ctx_train   = hparams.n_ctx_train;
3069
3070
0
        if (n_expert > 0 && hparams.n_expert_used == 0) {
3071
0
            throw std::runtime_error("model has expert layers but no expert layers are used");
3072
0
        }
3073
3074
0
        // Thin wrapper around ml.create_tensor() that supplies the model's buffer-type lists.
        // A tensor name with bid == -1 gets no per-layer list (nullptr); otherwise the list of
        // the layer at index tn.bid is used.
        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
            const buft_list_t * buft_list_layer = nullptr;
            if (tn.bid != -1) {
                buft_list_layer = pimpl->dev_layer.at(tn.bid).buft_list;
            }

            return ml.create_tensor(
                hparams,
                &pimpl->cpu_buft_list,
                pimpl->dev_input.buft_list,
                pimpl->dev_output.buft_list,
                buft_list_layer,
                tn, ne, flags);
        };
3080
3081
0
        layers.resize(n_layer);
3082
3083
        // TODO: move to a separate function
3084
0
        const auto tn = LLM_TN(arch);
3085
3086
        // helper: try merged gate_up_exps first, fall back to separate gate and up
3087
0
        auto create_tensor_gate_up_exps = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) {
3088
0
            layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED);
3089
0
            if (layer.ffn_gate_up_exps == nullptr) {
3090
0
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
3091
0
                layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
3092
0
            }
3093
0
        };
3094
0
        switch (arch) {
3095
0
            case LLM_ARCH_LLAMA:
3096
0
            case LLM_ARCH_REFACT:
3097
0
            case LLM_ARCH_MINICPM:
3098
0
            case LLM_ARCH_GRANITE:
3099
0
            case LLM_ARCH_GRANITE_MOE:
3100
0
            case LLM_ARCH_MISTRAL3:
3101
0
            case LLM_ARCH_LLAMA_EMBED:
3102
0
                {
3103
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3104
3105
                    // output
3106
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3107
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3108
3109
                    // if output is NULL, init from the input tok embed
3110
0
                    if (output == NULL) {
3111
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3112
0
                    }
3113
3114
0
                    for (int i = 0; i < n_layer; ++i) {
3115
0
                        auto & layer = layers[i];
3116
3117
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3118
3119
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3120
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3121
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3122
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3123
3124
                        // optional bias tensors
3125
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
3126
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3127
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3128
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
3129
3130
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3131
3132
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
3133
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3134
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3135
0
                        }
3136
0
                        else {
3137
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3138
0
                        }
3139
3140
0
                        if (n_expert == 0) {
3141
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3142
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3143
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3144
3145
                            // optional MLP bias
3146
0
                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3147
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3148
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3149
0
                        } else {
3150
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
3151
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
3152
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
3153
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
3154
3155
                            // For Granite MoE Shared
3156
0
                            if (hparams.n_ff_shexp > 0) {
3157
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
3158
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
3159
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
3160
0
                            }
3161
0
                        }
3162
0
                    }
3163
0
                } break;
3164
0
            case LLM_ARCH_LLADA:
3165
0
                {
3166
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
3167
3168
                    // output
3169
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
3170
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
3171
3172
                    // if output is NULL, init from the input tok embed
3173
0
                    if (output == NULL) {
3174
0
                        output =
3175
0
                            create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
3176
0
                    }
3177
3178
0
                    for (int i = 0; i < n_layer; ++i) {
3179
0
                        auto & layer = layers[i];
3180
3181
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
3182
3183
                        // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
3184
0
                        layer.wq =
3185
0
                            create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
3186
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
3187
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
3188
                        // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
3189
0
                        layer.wo =
3190
0
                            create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
3191
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
3192
3193
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
3194
3195
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
3196
0
                                                         TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3197
3198
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
3199
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
3200
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
3201
3202
                        // optional MLP bias
3203
0
                        layer.ffn_gate_b =
3204
0
                            create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
3205
0
                        layer.ffn_down_b =
3206
0
                            create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
3207
0
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
3208
0
                    }
3209
0
                }
3210
0
                break;
3211
0
            case LLM_ARCH_LLADA_MOE:
3212
0
                {
3213
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3214
3215
                    // output
3216
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3217
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3218
3219
0
                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
3220
0
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
3221
3222
0
                    for (int i = 0; i < n_layer; ++i) {
3223
0
                        auto & layer = layers[i];
3224
3225
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3226
3227
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3228
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3229
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3230
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3231
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3232
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3233
3234
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3235
3236
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3237
3238
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3239
3240
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3241
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
3242
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3243
0
                    }
3244
0
                } break;
3245
0
            case LLM_ARCH_LLAMA4:
3246
0
                {
3247
0
                    if (n_expert == 0) {
3248
0
                        throw std::runtime_error(arch_name() + " model cannot have zero experts");
3249
0
                    }
3250
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3251
3252
                    // output
3253
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3254
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3255
3256
                    // if output is NULL, init from the input tok embed
3257
0
                    if (output == NULL) {
3258
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3259
0
                    }
3260
3261
0
                    for (int i = 0; i < n_layer; ++i) {
3262
0
                        const bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
3263
3264
0
                        auto & layer = layers[i];
3265
3266
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3267
3268
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3269
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3270
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3271
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3272
3273
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3274
3275
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3276
3277
0
                        if (is_moe_layer) {
3278
0
                            const int64_t n_ff_exp = hparams.n_ff_exp;
3279
3280
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
3281
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
3282
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
3283
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
3284
3285
                            // Shared expert
3286
0
                            const int64_t n_ff_shexp = n_ff_exp;
3287
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
3288
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd    }, 0);
3289
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
3290
0
                        } else {
3291
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3292
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3293
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3294
0
                        }
3295
0
                    }
3296
0
                } break;
3297
0
            case LLM_ARCH_DECI:
3298
0
                {
3299
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3300
3301
                    // output
3302
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3303
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3304
3305
                    // if output is NULL, init from the input tok embed
3306
0
                    if (output == NULL) {
3307
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3308
0
                    }
3309
3310
0
                    for (int i = 0; i < n_layer; ++i) {
3311
0
                        auto & layer = layers[i];
3312
0
                        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
3313
0
                        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
3314
0
                        const int64_t n_embd_gqa    = hparams.n_embd_v_gqa(i);
3315
0
                        const int64_t n_ff          = hparams.n_ff(i);
3316
0
                        const int64_t n_head        = hparams.n_head(i);
3317
0
                        const int64_t n_head_kv     = hparams.n_head_kv(i);
3318
3319
0
                        if (n_head_kv == 0 && n_head > 0) {
3320
                            // linear attention for DeciLMCausalModel
3321
0
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3322
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3323
0
                        }
3324
0
                        else if (n_head_kv > 0) {
3325
0
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3326
3327
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3328
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3329
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3330
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3331
0
                        }
3332
3333
                        // optional bias tensors
3334
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
3335
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3336
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3337
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
3338
3339
0
                        if (n_ff > 0) {
3340
0
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3341
0
                        }
3342
3343
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
3344
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3345
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3346
0
                        }
3347
0
                        else {
3348
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3349
0
                        }
3350
3351
0
                        if (n_ff > 0) {
3352
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3353
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3354
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3355
0
                        }
3356
3357
                        // optional MLP bias
3358
0
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3359
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3360
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3361
0
                    }
3362
0
                } break;
3363
0
            case LLM_ARCH_MINICPM3:
3364
0
                {
3365
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
3366
0
                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
3367
3368
0
                    const int64_t q_lora_rank  = hparams.n_lora_q;
3369
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
3370
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3371
3372
                    // output
3373
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3374
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3375
3376
                    // if output is NULL, init from the input tok embed
3377
0
                    if (output == NULL) {
3378
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3379
0
                    }
3380
3381
0
                    for (int i = 0; i < n_layer; ++i) {
3382
0
                        auto & layer = layers[i];
3383
3384
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3385
0
                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
3386
3387
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
3388
3389
0
                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
3390
0
                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
3391
3392
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
3393
0
                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
3394
0
                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
3395
3396
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3397
3398
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3399
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3400
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3401
3402
0
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3403
0
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3404
0
                    }
3405
0
                } break;
3406
0
            case LLM_ARCH_GROK:
3407
0
                {
3408
0
                    if (n_expert == 0) {
3409
0
                        throw std::runtime_error(arch_name() + " model cannot have zero experts");
3410
0
                    }
3411
3412
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3413
3414
                    // output
3415
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3416
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3417
3418
                    // if output is NULL, init from the input tok embed
3419
0
                    if (output == NULL) {
3420
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3421
0
                    }
3422
3423
0
                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
3424
0
                    for (int i = 0; i < n_layer; ++i) {
3425
0
                        auto & layer = layers[i];
3426
3427
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3428
3429
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3430
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3431
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3432
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3433
3434
0
                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3435
3436
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3437
3438
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
3439
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff,   n_embd}, TENSOR_NOT_REQUIRED);
3440
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
3441
3442
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
3443
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
3444
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd,   n_expert}, 0);
3445
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
3446
3447
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3448
0
                        if (!layer.ffn_post_norm) {
3449
0
                            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3450
0
                        }
3451
0
                    }
3452
0
                } break;
3453
0
            case LLM_ARCH_DBRX:
3454
0
                {
3455
0
                    if (n_expert == 0) {
3456
0
                        throw std::runtime_error("DBRX model cannot have zero experts");
3457
0
                    }
3458
3459
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3460
3461
                    // output
3462
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3463
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3464
3465
0
                    for (int i = 0; i < n_layer; ++i) {
3466
0
                        auto & layer = layers[i];
3467
3468
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3469
3470
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3471
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3472
3473
0
                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3474
3475
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
3476
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
3477
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
3478
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
3479
0
                    }
3480
0
                } break;
3481
0
            case LLM_ARCH_BAICHUAN:
3482
0
                {
3483
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3484
0
                    {
3485
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3486
0
                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3487
0
                    }
3488
3489
0
                    for (int i = 0; i < n_layer; ++i) {
3490
0
                        auto & layer = layers[i];
3491
3492
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3493
3494
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3495
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3496
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3497
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3498
3499
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3500
3501
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3502
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3503
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3504
0
                    }
3505
0
                } break;
3506
0
            case LLM_ARCH_FALCON:
3507
0
                {
3508
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3509
3510
                    // output
3511
0
                    {
3512
0
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3513
0
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3514
3515
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3516
0
                        if (!output) {
3517
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
3518
0
                        }
3519
0
                    }
3520
3521
0
                    for (int i = 0; i < n_layer; ++i) {
3522
0
                        auto & layer = layers[i];
3523
3524
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3525
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3526
3527
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3528
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3529
3530
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3531
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3532
3533
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3534
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3535
0
                    }
3536
0
                } break;
3537
0
            case LLM_ARCH_STARCODER:
3538
0
                {
3539
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3540
0
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
3541
3542
                    // output
3543
0
                    {
3544
0
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3545
0
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3546
0
                        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3547
0
                        if (!output) {
3548
                            // needs to be on GPU
3549
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3550
0
                        }
3551
3552
0
                    }
3553
3554
0
                    for (int i = 0; i < n_layer; ++i) {
3555
0
                        auto & layer = layers[i];
3556
3557
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3558
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3559
3560
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3561
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
3562
3563
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3564
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
3565
3566
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3567
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
3568
3569
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3570
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
3571
3572
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
3573
0
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
3574
0
                    }
3575
0
                } break;
3576
0
            case LLM_ARCH_BERT:
3577
0
            case LLM_ARCH_NOMIC_BERT:
3578
0
            case LLM_ARCH_NOMIC_BERT_MOE:
3579
0
            case LLM_ARCH_JINA_BERT_V3:
3580
0
                {
3581
0
                    if (n_token_types == 0) {
3582
0
                        throw std::runtime_error(arch_name() + " model needs to define token type count");
3583
0
                    }
3584
0
                    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
3585
0
                    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
3586
3587
0
                    if (arch == LLM_ARCH_BERT) {
3588
0
                        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
3589
3590
0
                        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3591
0
                        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
3592
3593
0
                        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3594
0
                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3595
0
                    }
3596
3597
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
3598
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
3599
3600
0
                    for (int i = 0; i < n_layer; ++i) {
3601
0
                        auto & layer = layers[i];
3602
3603
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3604
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3605
3606
0
                        if (!layer.wqkv) {
3607
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3608
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i),   {n_embd}, 0);
3609
3610
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3611
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i),   {n_embd_gqa}, 0);
3612
3613
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3614
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i),   {n_embd_gqa}, 0);
3615
0
                        }
3616
3617
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_embd, n_embd}, 0);
3618
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3619
3620
0
                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3621
0
                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
3622
3623
0
                        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
3624
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
3625
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
3626
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
3627
0
                        } else {
3628
0
                            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3629
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
3630
0
                            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3631
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3632
3633
0
                            if (arch == LLM_ARCH_NOMIC_BERT) {
3634
0
                                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3635
0
                            }
3636
0
                        }
3637
3638
0
                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3639
0
                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
3640
0
                    }
3641
0
                } break;
3642
0
            case LLM_ARCH_MODERN_BERT:
3643
0
                {
3644
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3645
0
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
3646
3647
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3648
3649
0
                    for(int i = 0; i < n_layer; ++i) {
3650
0
                        auto& layer = layers[i];
3651
3652
0
                        if ( i != 0 ) {
3653
0
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3654
0
                        } else{
3655
                            // layer 0 uses identity
3656
0
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3657
0
                        }
3658
3659
3660
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
3661
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,   "weight", i), {n_embd, n_embd}, 0);
3662
3663
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff}, 0);
3664
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3665
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3666
0
                    }
3667
3668
0
                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3669
0
                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3670
0
                    cls       = create_tensor(tn(LLM_TENSOR_CLS,      "weight"), {n_embd, n_embd},            TENSOR_NOT_REQUIRED);
3671
0
                    cls_norm  = create_tensor(tn(LLM_TENSOR_CLS_NORM, "weight"), {n_embd},                    TENSOR_NOT_REQUIRED);
3672
3673
0
                } break;
3674
0
            case LLM_ARCH_NEO_BERT:
3675
0
                {
3676
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
3677
3678
0
                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3679
0
                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
3680
3681
0
                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3682
0
                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3683
3684
0
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
3685
3686
0
                    for (int i = 0; i < n_layer; ++i) {
3687
0
                        auto & layer = layers[i];
3688
3689
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3690
3691
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3692
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3693
3694
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3695
3696
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff*2}, 0);
3697
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3698
0
                    }
3699
0
                } break;
3700
0
            case LLM_ARCH_EUROBERT:
3701
0
                {
3702
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3703
3704
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3705
3706
0
                    for (int i = 0; i < n_layer; ++i) {
3707
0
                        auto & layer = layers[i];
3708
3709
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3710
3711
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3712
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3713
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3714
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3715
3716
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3717
3718
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3719
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3720
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3721
0
                    }
3722
0
                } break;
3723
0
            case LLM_ARCH_JINA_BERT_V2:
3724
0
                {
3725
0
                    tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
3726
0
                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
3727
3728
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0); // LayerNorm
3729
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0); // LayerNorm bias
3730
3731
0
                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
3732
0
                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {1},         TENSOR_NOT_REQUIRED);
3733
0
                    for (int i = 0; i < n_layer; ++i) {
3734
0
                        auto & layer = layers[i]; // JinaBertLayer
3735
3736
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3737
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
3738
3739
0
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3740
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3741
3742
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3743
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
3744
3745
0
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3746
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3747
3748
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3749
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
3750
3751
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
3752
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0); //output_dens
3753
3754
0
                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
3755
0
                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);
3756
3757
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3758
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3759
3760
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
3761
3762
0
                        const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
3763
0
                        ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
3764
0
                        const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
3765
3766
0
                        GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
3767
0
                        layer.ffn_up   = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
3768
0
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
3769
3770
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3771
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
3772
3773
0
                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3774
0
                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd}, 0);
3775
0
                    }
3776
0
                } break;
3777
0
            case LLM_ARCH_BLOOM:
3778
0
                {
3779
0
                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
3780
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
3781
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
3782
3783
                    // output
3784
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3785
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3786
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3787
3788
                    // if output is NULL, init from the input tok embed
3789
0
                    if (output == NULL) {
3790
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3791
0
                    }
3792
3793
0
                    for (int i = 0; i < n_layer; ++i) {
3794
0
                        auto & layer = layers[i];
3795
3796
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3797
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
3798
3799
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3800
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, 0);
3801
3802
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3803
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);
3804
3805
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3806
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
3807
3808
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3809
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
3810
3811
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3812
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, 0);
3813
0
                    }
3814
0
                } break;
3815
0
            case LLM_ARCH_MPT:
3816
0
                {
3817
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3818
0
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
3819
3820
                    // output
3821
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3822
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
3823
3824
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3825
0
                    if (!output) {
3826
0
                        output    = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
3827
0
                    }
3828
3829
0
                    for (int i = 0; i < n_layer; ++i) {
3830
0
                        auto & layer = layers[i];
3831
3832
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3833
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3834
3835
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3836
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3837
3838
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3839
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3840
3841
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3842
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3843
3844
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3845
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3846
3847
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3848
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
3849
3850
                        // FIXME test-llama-archs crashes if q_norm is created
3851
0
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
3852
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
3853
3854
0
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3855
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3856
3857
                        // AWQ ScaleActivation layer
3858
0
                        layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
3859
0
                    }
3860
0
                } break;
3861
0
            case LLM_ARCH_STABLELM:
3862
0
                {
3863
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3864
3865
                    // output
3866
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3867
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3868
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3869
3870
0
                    for (int i = 0; i < n_layer; ++i) {
3871
0
                        auto & layer = layers[i];
3872
3873
0
                        layer.attn_norm =   create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3874
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3875
3876
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3877
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3878
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3879
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3880
3881
                        // optional bias tensors, present in Stable LM 2 1.6B
3882
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
3883
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3884
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3885
3886
                        // optional q and k layernorms, present in StableLM 2 12B
3887
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    TENSOR_NOT_REQUIRED);
3888
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
3889
3890
                        // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
3891
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3892
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3893
3894
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3895
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3896
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3897
0
                    }
3898
0
                } break;
3899
0
            case LLM_ARCH_QWEN:
3900
0
                {
3901
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3902
3903
                    // output
3904
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3905
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3906
3907
0
                    for (int i = 0; i < n_layer; ++i) {
3908
0
                        auto & layer = layers[i];
3909
3910
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3911
3912
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
3913
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd*3}, 0);
3914
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3915
3916
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3917
3918
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
3919
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
3920
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2}, 0);
3921
0
                    }
3922
0
                } break;
3923
0
            case LLM_ARCH_QWEN2:
3924
0
            case LLM_ARCH_QWEN2VL:
3925
0
            case LLM_ARCH_DREAM:
3926
0
                {
3927
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3928
3929
                    // output
3930
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3931
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3932
0
                    output_b    = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, TENSOR_NOT_REQUIRED);
3933
                    // if output is NULL, init from the input tok embed
3934
0
                    if (output == NULL) {
3935
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3936
0
                    }
3937
3938
0
                    for (int i = 0; i < n_layer; ++i) {
3939
0
                        auto & layer = layers[i];
3940
3941
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3942
3943
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3944
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3945
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3946
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3947
3948
                        // optional bias tensors
3949
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3950
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3951
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3952
3953
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3954
3955
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3956
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3957
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3958
0
                    }
3959
0
                } break;
3960
0
            case LLM_ARCH_QWEN2MOE:
3961
0
                {
3962
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3963
3964
                    // output
3965
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3966
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3967
3968
0
                    for (int i = 0; i < n_layer; ++i) {
3969
0
                        auto & layer = layers[i];
3970
3971
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3972
3973
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3974
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3975
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3976
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3977
3978
                        // optional bias tensors
3979
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3980
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3981
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3982
3983
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3984
3985
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3986
3987
0
                        if (n_expert == 0) {
3988
0
                            throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
3989
0
                        }
3990
0
                        if (n_expert_used == 0) {
3991
0
                            throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
3992
0
                        }
3993
3994
                        // MoE branch
3995
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3996
3997
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3998
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
3999
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4000
4001
                        // Shared expert branch
4002
0
                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
4003
4004
0
                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
4005
0
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
4006
0
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd}, 0);
4007
0
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
4008
0
                    }
4009
0
                } break;
4010
0
            case LLM_ARCH_QWEN3:
4011
0
            case LLM_ARCH_QWEN3VL:
4012
0
                {
4013
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4014
4015
                    // output
4016
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4017
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4018
                    // if output is NULL, init from the input tok embed
4019
0
                    if (output == NULL) {
4020
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4021
0
                    }
4022
4023
                    // output rerank head
4024
0
                    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
4025
4026
0
                    for (int i = 0; i < n_layer; ++i) {
4027
0
                        auto & layer = layers[i];
4028
4029
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4030
4031
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4032
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4033
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4034
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4035
4036
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4037
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4038
4039
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4040
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4041
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4042
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4043
0
                    }
4044
0
                } break;
4045
0
            case LLM_ARCH_QWEN3MOE:
4046
0
            case LLM_ARCH_QWEN3VLMOE:
4047
0
            case LLM_ARCH_RND1:
4048
0
                {
4049
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4050
4051
                    // output
4052
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4053
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4054
                    // if output is NULL, init from the input tok embed
4055
0
                    if (output == NULL) {
4056
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4057
0
                    }
4058
4059
0
                    for (int i = 0; i < n_layer; ++i) {
4060
0
                        auto & layer = layers[i];
4061
4062
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4063
4064
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4065
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4066
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4067
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4068
4069
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4070
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4071
4072
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4073
4074
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4075
4076
0
                        if (n_expert == 0) {
4077
0
                            throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
4078
0
                        }
4079
0
                        if (n_expert_used == 0) {
4080
0
                            throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
4081
0
                        }
4082
4083
                        // MoE branch
4084
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
4085
4086
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4087
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
4088
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4089
0
                    }
4090
0
                } break;
4091
0
            case LLM_ARCH_PHI2:
4092
0
                {
4093
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4094
4095
                    // output
4096
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4097
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4098
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4099
0
                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);
4100
4101
0
                    for (int i = 0; i < n_layer; ++i) {
4102
0
                        auto & layer = layers[i];
4103
4104
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4105
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4106
4107
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4108
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4109
4110
0
                        if (layer.wqkv == nullptr) {
4111
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4112
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
4113
4114
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4115
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa}, 0);
4116
4117
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4118
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa}, 0);
4119
0
                        }
4120
4121
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4122
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4123
4124
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4125
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4126
4127
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4128
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
4129
0
                    }
4130
0
                } break;
4131
0
            case LLM_ARCH_PHI3:
4132
0
                {
4133
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4134
4135
                    // output
4136
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4137
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4138
4139
                    // if output is NULL, init from the input tok embed
4140
0
                    if (output == NULL) {
4141
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4142
0
                    }
4143
4144
0
                    for (int i = 0; i < n_layer; ++i) {
4145
0
                        auto & layer = layers[i];
4146
4147
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
4148
4149
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
4150
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
4151
4152
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
4153
4154
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
4155
0
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
4156
4157
0
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4158
0
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4159
0
                    }
4160
0
                } break;
4161
0
            case LLM_ARCH_PHIMOE:
4162
0
                {
4163
0
                    const int64_t n_embd_head = n_embd / n_head;
4164
4165
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4166
4167
                    // output
4168
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4169
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4170
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
4171
0
                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);
4172
4173
0
                    for (int i = 0; i < n_layer; ++i) {
4174
0
                        auto & layer = layers[i];
4175
4176
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
4177
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);
4178
4179
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
4180
0
                        if (layer.wqkv == nullptr) {
4181
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4182
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd}, 0);
4183
4184
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4185
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
4186
4187
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4188
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
4189
0
                        }
4190
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
4191
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);
4192
4193
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
4194
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);
4195
4196
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert},         0);
4197
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
4198
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
4199
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
4200
4201
0
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4202
0
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4203
0
                     }
4204
0
                } break;
4205
0
            case LLM_ARCH_PLAMO:
4206
0
                {
4207
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4208
4209
                    // output
4210
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4211
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4212
4213
0
                    for (int i = 0; i < n_layer; ++i) {
4214
0
                        auto & layer = layers[i];
4215
4216
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4217
4218
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4219
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4220
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4221
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4222
4223
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4224
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4225
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4226
0
                    }
4227
0
                } break;
4228
0
            case LLM_ARCH_PLAMO2:
4229
0
                {
4230
                    // mamba parameters
4231
0
                    const uint32_t d_conv             = hparams.ssm_d_conv;
4232
0
                    const uint32_t d_state            = hparams.ssm_d_state;
4233
0
                    const uint32_t num_heads          = hparams.ssm_dt_rank;
4234
0
                    const uint32_t intermediate_size  = hparams.ssm_d_inner;
4235
0
                    const int64_t dt_dim              = std::max(64, int(hparams.n_embd / 16));
4236
4237
                    // attention parameters
4238
0
                    const uint32_t qk_dim = hparams.n_embd_head_k();
4239
0
                    const uint32_t v_dim  = hparams.n_embd_head_v();
4240
4241
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4242
4243
                    // output
4244
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4245
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4246
                    // if output is NULL, init from the input tok embed
4247
0
                    if (output == NULL) {
4248
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4249
0
                    }
4250
4251
0
                    for (int i = 0; i < n_layer; ++i) {
4252
0
                        auto & layer = layers[i];
4253
0
                        bool is_mamba_layer = hparams.is_recurrent(i);
4254
4255
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4256
4257
0
                        if (is_mamba_layer) {
4258
0
                            layer.ssm_in       = create_tensor(tn(LLM_TENSOR_SSM_IN,     "weight", i), {n_embd, 2 * intermediate_size}, 0);
4259
0
                            layer.ssm_conv1d   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
4260
4261
0
                            layer.ssm_x    = create_tensor(tn(LLM_TENSOR_SSM_X,  "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
4262
0
                            layer.ssm_dt   = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
4263
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
4264
4265
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
4266
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
4267
4268
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
4269
4270
0
                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
4271
0
                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
4272
0
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
4273
0
                        } else {
4274
0
                            const int64_t num_attention_heads = hparams.n_head(i);
4275
0
                            const int64_t q_num_heads         = num_attention_heads;
4276
0
                            const int64_t num_key_value_heads = hparams.n_head_kv(i);
4277
0
                            const int64_t k_num_heads         = num_key_value_heads;
4278
0
                            const int64_t v_num_heads         = num_key_value_heads;
4279
0
                            const int64_t q_proj_dim          = q_num_heads * qk_dim;
4280
0
                            const int64_t k_proj_dim          = k_num_heads * qk_dim;
4281
0
                            const int64_t v_proj_dim          = v_num_heads * v_dim;
4282
4283
0
                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
4284
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
4285
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
4286
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
4287
0
                        }
4288
4289
                        // All layers have post-attention norm, FFN norm, and FFN tensors
4290
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
4291
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4292
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4293
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
4294
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
4295
0
                    }
4296
0
                } break;
4297
0
            case LLM_ARCH_PLAMO3:
4298
0
                {
4299
0
                    const int64_t head_dim_q = hparams.n_embd_head_k();
4300
0
                    const int64_t head_dim_v = hparams.n_embd_head_v();
4301
4302
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4303
4304
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4305
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4306
0
                    if (output == NULL) {
4307
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4308
0
                    }
4309
4310
0
                    for (int i = 0; i < n_layer; ++i) {
4311
0
                        auto & layer = layers[i];
4312
4313
0
                        const int64_t num_attention_heads = hparams.n_head(i);
4314
0
                        const int64_t num_key_value_heads = hparams.n_head_kv(i);
4315
0
                        const int64_t q_proj_dim = num_attention_heads * head_dim_q;
4316
0
                        const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
4317
0
                        const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
4318
0
                        const int64_t n_ff_cur   = hparams.n_ff(i);
4319
4320
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4321
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
4322
0
                                {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
4323
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
4324
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
4325
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
4326
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
4327
4328
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4329
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
4330
4331
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff_cur * 2}, 0);
4332
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
4333
0
                    }
4334
0
                } break;
4335
0
            case LLM_ARCH_GPT2:
4336
0
                {
4337
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4338
0
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
4339
4340
                    // output
4341
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4342
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4343
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4344
4345
                    // if output is NULL, init from the input tok embed
4346
0
                    if (output == NULL) {
4347
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4348
0
                    }
4349
4350
0
                    for (int i = 0; i < n_layer; ++i) {
4351
0
                        auto & layer = layers[i];
4352
4353
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
4354
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
4355
4356
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4357
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
4358
4359
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4360
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4361
4362
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4363
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4364
4365
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4366
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4367
4368
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4369
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
4370
0
                    }
4371
0
                } break;
4372
0
            case LLM_ARCH_CODESHELL:
4373
0
                {
4374
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4375
4376
                    // if tok embd is NULL, init from output
4377
0
                    if (tok_embd == NULL) {
4378
0
                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4379
0
                    }
4380
4381
                    // output
4382
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4383
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4384
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4385
4386
0
                    for (int i = 0; i < n_layer; ++i) {
4387
0
                        auto & layer = layers[i];
4388
4389
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4390
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4391
4392
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4393
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
4394
4395
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4396
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4397
4398
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4399
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4400
4401
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4402
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4403
4404
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
4405
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
4406
0
                    }
4407
0
                } break;
4408
0
            case LLM_ARCH_ORION:
4409
0
                {
4410
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4411
4412
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4413
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4414
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4415
4416
0
                    for (int i = 0; i < n_layer; ++i) {
4417
0
                        auto & layer = layers[i];
4418
4419
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4420
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4421
4422
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4423
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4424
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4425
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4426
4427
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4428
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4429
4430
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4431
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4432
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4433
0
                    }
4434
0
                } break;
4435
0
            case LLM_ARCH_INTERNLM2:
4436
0
                {
4437
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4438
4439
                    // output
4440
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4441
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4442
4443
0
                    for (int i = 0; i < n_layer; ++i) {
4444
0
                        auto & layer = layers[i];
4445
4446
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4447
                        // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4448
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4449
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4450
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4451
4452
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4453
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4454
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4455
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4456
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4457
0
                    }
4458
0
                } break;
4459
0
            case LLM_ARCH_GEMMA:
4460
0
                {
4461
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4462
4463
                    // output
4464
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4465
0
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
4466
4467
0
                    for (int i = 0; i < n_layer; ++i) {
4468
0
                        auto & layer = layers[i];
4469
4470
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4471
4472
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4473
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4474
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4475
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4476
4477
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4478
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4479
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4480
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4481
0
                    }
4482
0
                } break;
4483
0
            case LLM_ARCH_GEMMA2:
4484
0
                {
4485
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4486
4487
                    // output
4488
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4489
0
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
4490
4491
0
                    for (int i = 0; i < n_layer; ++i) {
4492
0
                        auto & layer = layers[i];
4493
4494
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4495
4496
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4497
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4498
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4499
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4500
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4501
4502
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4503
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4504
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4505
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4506
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4507
0
                    }
4508
0
                } break;
4509
0
            case LLM_ARCH_GEMMA3:
4510
0
            case LLM_ARCH_GEMMA_EMBEDDING:
4511
0
                {
4512
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4513
4514
                    // output
4515
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4516
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4517
4518
                    // if output is NULL, init from the input tok embed
4519
0
                    if (output == NULL) {
4520
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,   "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4521
0
                    }
4522
4523
                    // Dense linear weights
4524
0
                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
4525
0
                    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
4526
4527
4528
0
                    for (int i = 0; i < n_layer; ++i) {
4529
0
                        auto & layer = layers[i];
4530
4531
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4532
4533
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4534
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4535
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4536
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4537
4538
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4539
0
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
4540
0
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
4541
4542
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4543
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4544
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4545
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4546
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4547
0
                    }
4548
0
                } break;
4549
0
            case LLM_ARCH_GEMMA3N:
4550
0
                {
4551
0
                    const int64_t n_altup      = hparams.n_altup;
4552
0
                    const int64_t laurel_rank  = hparams.laurel_rank;
4553
0
                    const int64_t n_embd_altup = hparams.n_embd_altup;
4554
4555
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4556
                    // if output is NULL, init from the input tok embed
4557
0
                    if (output == NULL) {
4558
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4559
0
                    }
4560
4561
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4562
4563
0
                    altup_proj        = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ,        "weight"), {n_embd, n_embd, n_altup - 1}, 0);
4564
0
                    altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
4565
4566
0
                    per_layer_tok_embd   = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
4567
0
                    per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight", 0), {n_embd, n_embd_altup * n_layer}, 0);
4568
0
                    per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM,  "weight", 0), {n_embd_altup}, 0);
4569
4570
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4571
4572
0
                    for (int i = 0; i < n_layer; ++i) {
4573
0
                        auto & layer = layers[i];
4574
4575
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4576
4577
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4578
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4579
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4580
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4581
4582
0
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
4583
0
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
4584
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4585
4586
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4587
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4588
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4589
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4590
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4591
4592
                        // altup & laurel
4593
0
                        layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE,  "weight", i), {n_embd, n_embd_altup}, 0);
4594
0
                        layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ,      "weight", i), {n_embd_altup, n_embd}, 0);
4595
0
                        layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
4596
0
                        layer.altup_correct_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF,  "weight", i), {n_altup, n_altup}, 0);
4597
0
                        layer.altup_correct_scale  = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
4598
0
                        layer.altup_predict_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF,  "weight", i), {n_altup, n_altup * n_altup}, 0);
4599
0
                        layer.altup_router         = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER,        "weight", i), {n_embd, n_altup}, 0);
4600
0
                        layer.altup_router_norm    = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM,   "weight", i), {n_embd}, 0);
4601
0
                        layer.laurel_l             = create_tensor(tn(LLM_TENSOR_LAUREL_L,            "weight", i), {n_embd, laurel_rank}, 0);
4602
0
                        layer.laurel_r             = create_tensor(tn(LLM_TENSOR_LAUREL_R,            "weight", i), {laurel_rank, n_embd}, 0);
4603
0
                        layer.laurel_post_norm     = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM,    "weight", i), {n_embd}, 0);
4604
0
                    }
4605
0
                } break;
4606
0
            case LLM_ARCH_GEMMA4:
4607
0
                {
4608
0
                    const uint32_t n_embd_per_layer = hparams.n_embd_per_layer;
4609
0
                    const int64_t  n_ff_exp         = hparams.n_ff_exp;
4610
4611
0
                    if (n_embd_head_k != n_embd_head_v) {
4612
0
                        throw std::runtime_error("Gemma 4 requires n_embd_head_k == n_embd_head_v");
4613
0
                    }
4614
0
                    if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) {
4615
0
                        throw std::runtime_error("Gemma 4 requires n_embd_head_k_swa == n_embd_head_v_swa");
4616
0
                    }
4617
4618
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4619
                    // if output is NULL, init from the input tok embed
4620
0
                    if (output == NULL) {
4621
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4622
0
                    }
4623
4624
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4625
4626
0
                    if (n_embd_per_layer > 0) {
4627
0
                        per_layer_tok_embd   = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"),    {n_embd_per_layer * n_layer, n_vocab}, 0);
4628
0
                        per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight", 0), {n_embd, n_embd_per_layer * n_layer}, 0);
4629
0
                        per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM,  "weight", 0), {n_embd_per_layer}, 0);
4630
0
                    }
4631
4632
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4633
4634
0
                    int rope_freqs_flag = 0;
4635
4636
0
                    for (int i = 0; i < n_layer; ++i) {
4637
0
                        auto & layer = layers[i];
4638
0
                        const int64_t n_head      = hparams.n_head(i);
4639
0
                        const int64_t n_embd_head = hparams.n_embd_head_k(i);
4640
0
                        const int64_t n_embd_k    = hparams.n_embd_k_gqa(i);
4641
0
                        const int64_t n_embd_v    = hparams.n_embd_v_gqa(i);
4642
0
                        const int     kv_flags    = hparams.has_kv(i) ? 0 : TENSOR_NOT_REQUIRED;
4643
4644
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4645
4646
                        // note: use_alternative_attention (v_proj is optional, if it's not present, use k_proj)
4647
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head * n_head}, 0);
4648
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k}, kv_flags);
4649
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v}, TENSOR_NOT_REQUIRED);
4650
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head * n_head, n_embd}, 0);
4651
4652
0
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head}, 0);
4653
0
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head}, kv_flags);
4654
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4655
4656
0
                        layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), {1u}, TENSOR_NOT_REQUIRED);
4657
4658
0
                        if (!hparams.is_swa(i)) {
4659
                            // full_attention layers use rope_freqs for proportional rope
4660
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_embd_head/2}, rope_freqs_flag);
4661
0
                            rope_freqs_flag = TENSOR_DUPLICATED;
4662
0
                        }
4663
4664
                        // handle use_double_wide_mlp
4665
0
                        int64_t n_ff_cur = hparams.n_ff(i);
4666
4667
                        // for expert layers, we use normal FFN as shared expert (same as python code)
4668
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4669
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff_cur}, 0);
4670
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff_cur}, 0);
4671
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
4672
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4673
4674
                        // MoE router
4675
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
4676
0
                        bool has_expert = layer.ffn_gate_inp != nullptr;
4677
4678
                        // norm
4679
0
                        if (has_expert) {
4680
0
                            layer.ffn_gate_inp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "scale", i), {n_embd}, 0);
4681
4682
0
                            layer.ffn_pre_norm_2  = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM_2,  "weight", i), {n_embd}, 0);
4683
0
                            layer.ffn_post_norm_1 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_1, "weight", i), {n_embd}, 0);
4684
0
                            layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0);
4685
4686
                            // MoE FFN
4687
0
                            layer.ffn_gate_up_exps  = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS,  "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0);
4688
0
                            layer.ffn_down_exps     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS,     "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
4689
4690
                            // per-expert scale will be loaded as down_exps_s at the end of the current switch case
4691
0
                        }
4692
4693
                        // per-layer embeddings
4694
0
                        if (n_embd_per_layer > 0) {
4695
0
                            layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE,  "weight", i), {n_embd, n_embd_per_layer}, 0);
4696
0
                            layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ,      "weight", i), {n_embd_per_layer, n_embd}, 0);
4697
0
                            layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
4698
0
                        }
4699
0
                    }
4700
0
                } break;
4701
0
            case LLM_ARCH_STARCODER2:
4702
0
                {
4703
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4704
4705
                    // output
4706
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4707
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4708
4709
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4710
                    // if output is NULL, init from the input tok embed
4711
0
                    if (output == NULL) {
4712
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4713
0
                    }
4714
4715
0
                    for (int i = 0; i < n_layer; ++i) {
4716
0
                        auto & layer = layers[i];
4717
4718
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4719
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4720
4721
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4722
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4723
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4724
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4725
4726
                        // attention bias tensors (created with flags 0, i.e. not marked TENSOR_NOT_REQUIRED)
4727
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
4728
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
4729
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
4730
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
4731
4732
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4733
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4734
4735
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4736
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4737
4738
                        // FFN bias tensors (created with flags 0, i.e. not marked TENSOR_NOT_REQUIRED)
4739
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
4740
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff}, 0);
4741
0
                    }
4742
0
                } break;
4743
0
            case LLM_ARCH_MAMBA:
4744
0
                {
4745
0
                    const int64_t d_conv  = hparams.ssm_d_conv;
4746
0
                    const int64_t d_inner = hparams.ssm_d_inner;
4747
0
                    const int64_t d_state = hparams.ssm_d_state;
4748
0
                    const int64_t dt_rank = hparams.ssm_dt_rank;
4749
4750
                    // only an expansion factor of 2 is supported for now
4751
0
                    if (2 * n_embd != d_inner) {
4752
0
                        throw std::runtime_error("only an expansion factor of 2 is supported for now");
4753
0
                    }
4754
4755
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4756
4757
                    // output
4758
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4759
4760
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4761
                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
4762
0
                    if (output == NULL) {
4763
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4764
0
                    }
4765
4766
0
                    for (int i = 0; i < n_layer; ++i) {
4767
0
                        auto & layer = layers[i];
4768
4769
                        // norm
4770
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4771
4772
0
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
4773
4774
0
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
4775
0
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
4776
4777
0
                        layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
4778
4779
0
                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
4780
0
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
4781
4782
                        // no "weight" suffix for these
4783
0
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
4784
0
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
4785
4786
                        // out_proj
4787
0
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4788
0
                    }
4789
0
                } break;
4790
0
            case LLM_ARCH_MAMBA2:
4791
0
                {
4792
0
                    const int64_t d_conv  = hparams.ssm_d_conv;
4793
0
                    const int64_t d_inner = hparams.ssm_d_inner;
4794
0
                    const int64_t d_state = hparams.ssm_d_state;
4795
0
                    const int64_t n_head  = hparams.ssm_dt_rank;
4796
0
                    const int64_t n_group = hparams.ssm_n_group;
4797
0
                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
4798
4799
                    // only an expansion factor of 2 is supported for now
4800
0
                    GGML_ASSERT(2 * n_embd == d_inner);
4801
4802
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4803
4804
                    // output
4805
0
                    {
4806
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4807
4808
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4809
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
4810
0
                        if (output == NULL) {
4811
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4812
0
                        }
4813
0
                    }
4814
4815
0
                    for (int i = 0; i < n_layer; ++i) {
4816
0
                        auto & layer = layers[i];
4817
4818
                        // norm
4819
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4820
4821
0
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4822
4823
0
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
4824
0
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
4825
4826
0
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
4827
4828
                        // no "weight" suffix for these
4829
0
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
4830
0
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
4831
4832
0
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4833
4834
                        // out_proj
4835
0
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4836
0
                    }
4837
0
                } break;
4838
0
            case LLM_ARCH_JAMBA:
4839
0
                {
4840
0
                    const int64_t d_conv  = hparams.ssm_d_conv;
4841
0
                    const int64_t d_inner = hparams.ssm_d_inner;
4842
0
                    const int64_t d_state = hparams.ssm_d_state;
4843
0
                    const int64_t dt_rank = hparams.ssm_dt_rank;
4844
4845
                    // only an expansion factor of 2 is supported for now
4846
0
                    GGML_ASSERT(2 * n_embd == d_inner);
4847
4848
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4849
4850
                    // output
4851
0
                    {
4852
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4853
4854
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4855
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
4856
0
                        if (output == NULL) {
4857
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4858
0
                        }
4859
0
                    }
4860
4861
0
                    for (int i = 0; i < n_layer; ++i) {
4862
0
                        const int64_t n_head_kv = hparams.n_head_kv(i);
4863
0
                        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
4864
4865
0
                        auto & layer = layers[i];
4866
4867
                        // norm
4868
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4869
4870
0
                        if (n_head_kv == 0) {
4871
                            // Mamba layer
4872
0
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
4873
4874
0
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
4875
0
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
4876
4877
0
                            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
4878
4879
0
                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
4880
4881
0
                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
4882
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
4883
4884
0
                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
4885
0
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
4886
4887
                            // no "weight" suffix for these
4888
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
4889
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
4890
4891
                            // out_proj
4892
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4893
0
                        } else {
4894
                            // Attention layers
4895
4896
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4897
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4898
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4899
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4900
0
                        }
4901
4902
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4903
4904
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
4905
4906
0
                        if (layer.ffn_gate_inp) {
4907
                            // MoE
4908
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4909
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
4910
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
4911
0
                        } else {
4912
                            // FFN (no MoE)
4913
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4914
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4915
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4916
0
                        }
4917
0
                    }
4918
0
                } break;
4919
0
            case LLM_ARCH_GRANITE_HYBRID:
4920
0
                {
4921
                    // mamba2 Mixer SSM params
4922
                    // NOTE: int64_t for tensor dimensions
4923
0
                    const int64_t d_conv     = hparams.ssm_d_conv;
4924
0
                    const int64_t d_inner    = hparams.ssm_d_inner;
4925
0
                    const int64_t d_state    = hparams.ssm_d_state;
4926
0
                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
4927
0
                    const int64_t n_group    = hparams.ssm_n_group;
4928
0
                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
4929
4930
                    // only an expansion factor of 2 is supported for now
4931
0
                    GGML_ASSERT(2 * n_embd == d_inner);
4932
4933
                    // embeddings
4934
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4935
4936
                    // output
4937
0
                    {
4938
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4939
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4940
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
4941
0
                        if (output == NULL) {
4942
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4943
0
                        }
4944
0
                    }
4945
4946
0
                    for (int i = 0; i < n_layer; ++i) {
4947
0
                        auto & layer = layers[i];
4948
4949
                        // norm
4950
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4951
4952
0
                        if (hparams.is_recurrent(i)) {
4953
                            // ssm layers
4954
0
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4955
4956
0
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
4957
0
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
4958
4959
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
4960
4961
                            // no "weight" suffix for these
4962
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
4963
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
4964
4965
0
                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4966
4967
                            // out_proj
4968
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4969
0
                        } else {
4970
                            // attention layers (with optional bias)
4971
0
                            const int64_t n_head_i = hparams.n_head(i);
4972
0
                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
4973
0
                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
4974
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
4975
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
4976
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
4977
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
4978
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},         TENSOR_NOT_REQUIRED);
4979
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
4980
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
4981
0
                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},         TENSOR_NOT_REQUIRED);
4982
0
                        }
4983
4984
                        // feed forward (w/ optional biases)
4985
0
                        if (n_expert > 0) {
4986
                            // MoE FFN
4987
0
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4988
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4989
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
4990
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
4991
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
4992
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
4993
4994
                            // For Granite MoE Shared
4995
0
                            if (hparams.n_ff_shexp > 0) {
4996
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4997
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4998
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
4999
0
                            }
5000
0
                        } else {
5001
0
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5002
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5003
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5004
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5005
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5006
0
                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
5007
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5008
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
5009
0
                        }
5010
0
                    }
5011
0
                } break;
5012
0
            case LLM_ARCH_XVERSE:
5013
0
                {
5014
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5015
5016
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5017
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5018
5019
0
                    for (int i = 0; i < n_layer; ++i) {
5020
0
                        auto & layer = layers[i];
5021
5022
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5023
5024
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5025
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5026
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5027
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5028
5029
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5030
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5031
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5032
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5033
0
                    }
5034
0
                } break;
5035
0
            case LLM_ARCH_COMMAND_R:
5036
0
                {
5037
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5038
5039
                    // output
5040
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5041
                    // init output from the input tok embed
5042
0
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5043
5044
0
                    for (int i = 0; i < n_layer; ++i) {
5045
0
                        auto & layer = layers[i];
5046
5047
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5048
5049
0
                        if (n_layer >= 64){
5050
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
5051
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
5052
0
                        }
5053
5054
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5055
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5056
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5057
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5058
5059
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5060
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5061
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5062
0
                    }
5063
0
                } break;
5064
0
            case LLM_ARCH_COHERE2:
5065
0
                {
5066
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
5067
5068
                    // output
5069
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
5070
                    // init output from the input tok embed
5071
0
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
5072
0
                                                      TENSOR_DUPLICATED);
5073
5074
0
                    for (int i = 0; i < n_layer; ++i) {
5075
0
                        auto & layer = layers[i];
5076
5077
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
5078
5079
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
5080
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
5081
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
5082
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
5083
5084
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
5085
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
5086
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
5087
0
                    }
5088
0
                }
5089
0
                break;
5090
0
            case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed
5091
0
                {
5092
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5093
5094
                    // output
5095
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5096
                    // if output is NULL, init from the input tok embed
5097
0
                    if (output == NULL) {
5098
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5099
0
                    }
5100
5101
0
                    for (int i = 0; i < n_layer; ++i) {
5102
0
                        auto & layer = layers[i];
5103
5104
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5105
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5106
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5107
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5108
5109
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5110
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5111
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5112
0
                    }
5113
0
                } break;
5114
0
            case LLM_ARCH_OLMO2:
5115
0
                {
5116
0
                    const int64_t n_embd_head = n_embd / n_head;
5117
5118
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5119
5120
                    // output
5121
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5122
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5123
5124
0
                    for (int i = 0; i < n_layer; ++i) {
5125
0
                        auto & layer = layers[i];
5126
5127
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5128
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5129
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5130
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5131
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
5132
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
5133
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5134
5135
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5136
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5137
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5138
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5139
0
                    }
5140
0
                } break;
5141
0
            case LLM_ARCH_SEED_OSS:
5142
0
                {
5143
0
                    const uint32_t head_dim             = hparams.n_embd_head_k();
5144
0
                    const int64_t n_qo_dim              = n_head * head_dim;
5145
0
                    const int64_t n_kv_dim              = n_head_kv * head_dim;
5146
5147
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5148
5149
                    // output
5150
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5151
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5152
                    // if output is NULL, init from the input tok embed
5153
0
                    if (output == NULL) {
5154
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5155
0
                    }
5156
5157
0
                    for (int i = 0; i < n_layer; ++i) {
5158
0
                        auto & layer = layers[i];
5159
5160
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_qo_dim}, 0);
5161
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_kv_dim}, 0);
5162
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_kv_dim}, 0);
5163
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
5164
5165
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_qo_dim},   TENSOR_NOT_REQUIRED);
5166
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_kv_dim},   TENSOR_NOT_REQUIRED);
5167
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_kv_dim},   TENSOR_NOT_REQUIRED);
5168
5169
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5170
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5171
5172
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5173
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5174
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5175
0
                    }
5176
0
                } break;
5177
5178
0
            case LLM_ARCH_OLMOE:
5179
0
                {
5180
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5181
5182
                    // output
5183
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5184
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5185
5186
0
                    for (int i = 0; i < n_layer; ++i) {
5187
0
                        auto & layer = layers[i];
5188
5189
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5190
5191
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5192
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5193
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5194
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5195
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
5196
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
5197
5198
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5199
5200
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5201
5202
0
                        if (n_expert == 0) {
5203
0
                            throw std::runtime_error("n_expert must be > 0");
5204
0
                        }
5205
0
                        if (n_expert_used == 0) {
5206
0
                            throw std::runtime_error("n_expert_used must be > 0");
5207
0
                        }
5208
5209
                        // MoE branch
5210
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
5211
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
5212
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
5213
0
                    }
5214
0
                } break;
5215
0
            case LLM_ARCH_OPENELM:
5216
0
                {
5217
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5218
5219
                    // output
5220
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5221
                    // init output from the input tok embed
5222
0
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5223
5224
0
                    for (int i = 0; i < n_layer; ++i) {
5225
0
                        const int64_t n_head      =   hparams.n_head(i);
5226
0
                        const int64_t n_head_qkv  = 2*hparams.n_head_kv(i) + n_head;
5227
0
                        const int64_t n_ff        =   hparams.n_ff(i);
5228
5229
0
                        auto & layer = layers[i];
5230
5231
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5232
5233
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
5234
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5235
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5236
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
5237
5238
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5239
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5240
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5241
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
5242
0
                    }
5243
0
                } break;
5244
0
            case LLM_ARCH_GPTNEOX:
5245
0
                {
5246
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5247
5248
                    // output
5249
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5250
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
5251
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5252
5253
0
                    for (int i = 0; i < n_layer; ++i) {
5254
0
                        auto & layer = layers[i];
5255
5256
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5257
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
5258
5259
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
5260
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
5261
5262
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5263
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
5264
5265
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5266
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
5267
5268
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5269
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
5270
5271
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
5272
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
5273
0
                    }
5274
0
                } break;
5275
0
            case LLM_ARCH_ARCTIC:
5276
0
                {
5277
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5278
5279
                    // output
5280
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5281
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5282
5283
                    // if output is NULL, init from the input tok embed
5284
0
                    if (output == NULL) {
5285
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5286
0
                    }
5287
5288
0
                    for (int i = 0; i < n_layer; ++i) {
5289
0
                        auto & layer = layers[i];
5290
5291
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5292
5293
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5294
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5295
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5296
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5297
5298
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5299
5300
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
5301
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
5302
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);
5303
5304
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5305
0
                        layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
5306
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
5307
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
5308
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
5309
0
                    }
5310
0
                } break;
5311
0
            case LLM_ARCH_DEEPSEEK:
5312
0
                {
5313
5314
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
5315
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5316
5317
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5318
5319
                    // output
5320
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5321
                    // try to load output.weight, if not found, use token_embd (tied embeddings)
5322
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5323
0
                    if (!output) {
5324
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5325
0
                    }
5326
5327
0
                    for (int i = 0; i < n_layer; ++i) {
5328
0
                        auto & layer = layers[i];
5329
5330
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5331
5332
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5333
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5334
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5335
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5336
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5337
5338
0
                        if (i < (int) hparams.n_layer_dense_lead) {
5339
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5340
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5341
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5342
0
                        } else {
5343
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5344
5345
0
                            if (n_expert == 0) {
5346
0
                                throw std::runtime_error("n_expert must be > 0");
5347
0
                            }
5348
0
                            if (n_expert_used == 0) {
5349
0
                                throw std::runtime_error("n_expert_used must be > 0");
5350
0
                            }
5351
5352
                            // MoE branch
5353
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5354
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
5355
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
5356
5357
                            // Shared expert branch
5358
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5359
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
5360
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5361
0
                        }
5362
0
                    }
5363
0
                } break;
5364
0
            case LLM_ARCH_DEEPSEEK2:
5365
0
            case LLM_ARCH_MISTRAL4:
5366
0
                {
5367
0
                    const bool is_mla = hparams.is_mla();
5368
5369
                    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
5370
0
                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
5371
0
                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
5372
5373
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
5374
0
                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
5375
0
                    GGML_ASSERT(n_embd_head_qk_nope >= 1);
5376
5377
0
                    const int64_t q_lora_rank  = hparams.n_lora_q;
5378
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
5379
5380
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
5381
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5382
5383
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5384
5385
                    // output
5386
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5387
                    // try to load output.weight, if not found, use token_embd (tied embeddings)
5388
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5389
0
                    if (!output) {
5390
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5391
0
                    }
5392
5393
0
                    for (int i = 0; i < n_layer; ++i) {
5394
0
                        auto & layer = layers[i];
5395
5396
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5397
0
                        if (q_lora_rank > 0) {
5398
0
                            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
5399
0
                        }
5400
5401
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
5402
5403
0
                        if (q_lora_rank > 0) {
5404
0
                            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
5405
0
                            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
5406
0
                        } else {
5407
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
5408
0
                        }
5409
5410
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
5411
5412
                        // note: only old legacy GGUF files will have the unsplit wkv_b tensor
5413
0
                        if (is_mla) {
5414
0
                            layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
5415
0
                            layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
5416
0
                        } else {
5417
0
                            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
5418
0
                        }
5419
5420
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
5421
5422
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5423
5424
0
                        if (i < (int) hparams.n_layer_dense_lead) {
5425
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5426
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5427
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5428
0
                        } else {
5429
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5430
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5431
5432
0
                            if (n_expert == 0) {
5433
0
                                throw std::runtime_error("n_expert must be > 0");
5434
0
                            }
5435
0
                            if (n_expert_used == 0) {
5436
0
                                throw std::runtime_error("n_expert_used must be > 0");
5437
0
                            }
5438
5439
                            // MoE branch
5440
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
5441
0
                            create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
5442
5443
                            // Shared expert branch
5444
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5445
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
5446
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5447
0
                        }
5448
0
                    }
5449
0
                } break;
5450
0
            case LLM_ARCH_DEEPSEEK2OCR:
5451
0
                {
5452
                    // similar to deepseek2, but without MLA
5453
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
5454
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5455
5456
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5457
5458
                    // output
5459
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5460
                    // try to load output.weight, if not found, use token_embd (tied embeddings)
5461
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5462
0
                    if (!output) {
5463
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5464
0
                    }
5465
5466
0
                    for (int i = 0; i < n_layer; ++i) {
5467
0
                        auto & layer = layers[i];
5468
5469
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
5470
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd}, 0);
5471
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd}, 0);
5472
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5473
5474
                        // norm
5475
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5476
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5477
5478
0
                        if (i < (int) hparams.n_layer_dense_lead) {
5479
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5480
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5481
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5482
0
                        } else {
5483
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5484
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5485
5486
0
                            if (n_expert == 0) {
5487
0
                                throw std::runtime_error("n_expert must be > 0");
5488
0
                            }
5489
0
                            if (n_expert_used == 0) {
5490
0
                                throw std::runtime_error("n_expert_used must be > 0");
5491
0
                            }
5492
5493
                            // MoE branch
5494
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
5495
0
                            create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
5496
5497
                            // Shared expert branch
5498
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5499
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
5500
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5501
0
                        }
5502
0
                    }
5503
0
                } break;
5504
0
            case LLM_ARCH_PLM:
5505
0
                {
5506
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
5507
0
                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
5508
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
5509
5510
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5511
5512
                    // output
5513
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5514
                    // output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5515
0
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5516
5517
0
                    for (int i = 0; i < n_layer; ++i) {
5518
0
                        auto & layer = layers[i];
5519
5520
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5521
5522
0
                        layer.wq        = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5523
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
5524
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
5525
0
                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
5526
0
                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
5527
5528
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5529
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5530
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5531
0
                    }
5532
0
                } break;
5533
0
            case LLM_ARCH_BITNET:
5534
0
                {
5535
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5536
5537
                    // output
5538
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5539
5540
0
                    for (int i = 0; i < n_layer; ++i) {
5541
0
                        auto & layer = layers[i];
5542
5543
0
                        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
5544
0
                        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
5545
5546
0
                        layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5547
0
                        layer.wq_s     = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
5548
0
                        layer.wk       = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5549
0
                        layer.wk_s     = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
5550
0
                        layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5551
0
                        layer.wv_s     = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
5552
0
                        layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5553
0
                        layer.wo_s     = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
5554
5555
0
                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);
5556
0
                        layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
5557
5558
0
                        layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5559
0
                        layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
5560
0
                        layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5561
0
                        layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
5562
0
                        layer.ffn_up         = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
5563
0
                        layer.ffn_up_s   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
5564
0
                    }
5565
0
                } break;
5566
0
            case LLM_ARCH_T5:
5567
0
                {
5568
0
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
5569
5570
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5571
5572
                    // output
5573
0
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
5574
0
                    output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
5575
5576
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5577
                    // if output is NULL, init from the input tok embed
5578
0
                    if (output == NULL) {
5579
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5580
0
                    }
5581
5582
                    // n_layer:     number of encoder_layers
5583
                    // dec_n_layer: number of decoder_layers
5584
0
                    const int dec_n_layer = hparams.dec_n_layer;
5585
0
                    if (dec_n_layer > n_layer) {
5586
0
                        layers.resize(dec_n_layer);
5587
0
                    }
5588
5589
                    // load encoder layers
5590
0
                    for (int i = 0; i < n_layer; ++i) {
5591
0
                        auto & layer = layers[i];
5592
5593
0
                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
5594
0
                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
5595
5596
0
                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5597
0
                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5598
0
                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5599
0
                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
5600
5601
0
                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
5602
0
                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
5603
0
                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5604
0
                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5605
0
                    }
5606
5607
                    // load decoder layers
5608
0
                    for (int i = 0; i < dec_n_layer; ++i) {
5609
0
                        auto & layer = layers[i];
5610
5611
0
                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
5612
0
                        layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
5613
5614
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5615
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5616
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5617
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
5618
5619
0
                        layer.attn_norm_cross  = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd}, 0);
5620
                        // this tensor seems to be unused in HF transformers implementation
5621
0
                        layer.attn_rel_b_cross = create_tensor(
5622
0
                            tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
5623
5624
0
                        layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5625
0
                        layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5626
0
                        layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5627
0
                        layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
5628
5629
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
5630
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
5631
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5632
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5633
0
                    }
5634
0
                } break;
5635
0
            case LLM_ARCH_T5ENCODER:
5636
0
                {
5637
0
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
5638
5639
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5640
5641
                    // output
5642
0
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
5643
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5644
                    // if output is NULL, init from the input tok embed
5645
0
                    if (output == NULL) {
5646
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5647
0
                    }
5648
5649
0
                    for (int i = 0; i < n_layer; ++i) {
5650
0
                        auto & layer = layers[i];
5651
5652
0
                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
5653
0
                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
5654
5655
0
                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5656
0
                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5657
0
                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5658
0
                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
5659
5660
0
                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
5661
0
                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
5662
0
                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5663
0
                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5664
0
                    }
5665
0
                } break;
5666
0
            case LLM_ARCH_JAIS:
5667
0
                {
5668
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5669
5670
                    // output
5671
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5672
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
5673
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
5674
5675
0
                    for (int i = 0; i < n_layer; ++i) {
5676
0
                        auto & layer = layers[i];
5677
5678
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
5679
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
5680
5681
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
5682
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
5683
5684
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5685
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
5686
5687
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5688
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
5689
5690
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5691
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
5692
5693
0
                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff}, 0);
5694
0
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff}, 0);
5695
5696
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
5697
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
5698
0
                    }
5699
0
                } break;
5700
0
            case LLM_ARCH_JAIS2:
5701
0
                {
5702
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5703
5704
                    // output
5705
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5706
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
5707
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5708
0
                    if (!output) {
5709
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5710
0
                    }
5711
5712
0
                    for (int i = 0; i < n_layer; ++i) {
5713
0
                        auto & layer = layers[i];
5714
5715
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5716
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
5717
5718
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5719
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5720
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5721
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5722
5723
                        // attention biases - all have shape n_embd (output dimension of projections)
5724
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
5725
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
5726
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
5727
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
5728
5729
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5730
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
5731
5732
                        // Jais-2 uses simple MLP (no gate) with biases
5733
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
5734
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
5735
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5736
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
5737
0
                    }
5738
0
                } break;
5739
0
            case LLM_ARCH_CHATGLM:
5740
0
                {
5741
0
                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
5742
5743
                    // output
5744
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5745
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5746
                    // if output is NULL, init from the input tok embed
5747
0
                    if (output == NULL) {
5748
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5749
0
                    }
5750
5751
0
                    for (int i = 0; i < n_layer; ++i) {
5752
0
                        auto & layer = layers[i];
5753
5754
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5755
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5756
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5757
5758
0
                        if (layer.wqkv == nullptr) {
5759
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5760
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5761
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5762
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5763
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5764
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5765
0
                        }
5766
5767
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5768
5769
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5770
5771
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
5772
5773
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5774
0
                    }
5775
0
                } break;
5776
0
            case LLM_ARCH_GLM4:
5777
0
                {
5778
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5779
5780
                    // output
5781
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5782
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5783
                    // if output is NULL, init from the input tok embed
5784
0
                    if (output == NULL) {
5785
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5786
0
                    }
5787
5788
0
                    for (int i = 0; i < n_layer; ++i) {
5789
0
                        int flags = 0;
5790
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5791
                            // skip all tensors in the NextN layers
5792
0
                            flags |= TENSOR_SKIP;
5793
0
                        }
5794
5795
0
                        auto & layer = layers[i];
5796
5797
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5798
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
5799
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
5800
5801
0
                        if (layer.wqkv == nullptr) {
5802
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, flags);
5803
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, flags);
5804
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, flags);
5805
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
5806
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
5807
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
5808
0
                        }
5809
5810
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
5811
5812
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
5813
5814
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5815
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
5816
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, flags);
5817
5818
0
                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
5819
5820
                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5821
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5822
0
                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5823
0
                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5824
0
                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5825
5826
                            // Optional tensors
5827
0
                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5828
0
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5829
0
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
5830
0
                        }
5831
0
                    }
5832
0
                } break;
5833
0
            case LLM_ARCH_GLM4_MOE:
5834
0
                {
5835
0
                    const int64_t n_expert        = hparams.n_expert;
5836
0
                    const int64_t n_expert_used   = hparams.n_expert_used;
5837
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5838
5839
0
                    GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
5840
0
                    GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
5841
5842
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
5843
5844
                    // output
5845
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
5846
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
5847
                    // if output is NULL, init from the input tok embed
5848
0
                    if (output == NULL) {
5849
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
5850
0
                    }
5851
5852
                    // Load ALL tensors including NextN layer to satisfy total tensor count
5853
                    // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
5854
0
                    for (int i = 0; i < n_layer; ++i) {
5855
0
                        int flags = 0;
5856
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5857
                            // skip all tensors in the NextN layers
5858
0
                            flags |= TENSOR_SKIP;
5859
0
                        }
5860
5861
0
                        auto & layer = layers[i];
5862
5863
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
5864
5865
                        // GLM-style attention with bias terms
5866
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
5867
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
5868
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
5869
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
5870
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
5871
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
5872
5873
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
5874
5875
                        // K/Q norm tensors (optional for GLM-4.5 355B variant)
5876
0
                        layer.attn_q_norm = create_tensor(
5877
0
                            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
5878
0
                        layer.attn_k_norm = create_tensor(
5879
0
                            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
5880
5881
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
5882
5883
                        // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
5884
                        // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
5885
0
                        const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
5886
5887
0
                        if (use_moe) {
5888
                            // MoE layers
5889
0
                            layer.ffn_gate_inp =
5890
0
                                create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
5891
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
5892
5893
                            // MoE branch
5894
0
                            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
5895
5896
0
                            layer.ffn_gate_exps = create_tensor(
5897
0
                                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
5898
0
                            layer.ffn_down_exps = create_tensor(
5899
0
                                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
5900
0
                            layer.ffn_up_exps = create_tensor(
5901
0
                                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
5902
5903
                            // Shared expert
5904
0
                            if (n_expert_shared > 0) {
5905
0
                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
5906
0
                                layer.ffn_gate_shexp = create_tensor(
5907
0
                                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
5908
0
                                layer.ffn_down_shexp = create_tensor(
5909
0
                                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
5910
0
                                layer.ffn_up_shexp = create_tensor(
5911
0
                                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
5912
0
                            }
5913
0
                        } else {
5914
                            // Dense layers (first k layers) - GLM uses separate gate/up projections
5915
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
5916
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
5917
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, flags);
5918
0
                        }
5919
5920
                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5921
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5922
0
                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5923
0
                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5924
0
                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5925
5926
                            // Optional tensors
5927
0
                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5928
0
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5929
0
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
5930
0
                        }
5931
0
                    }
5932
0
                }
5933
0
                break;
5934
0
            case LLM_ARCH_GLM_DSA:
5935
0
                {
5936
0
                    const bool is_mla = hparams.is_mla();
5937
0
                    if (!is_mla) {
5938
0
                        throw std::runtime_error("GLM_DSA architecture requires MLA");
5939
0
                    }
5940
5941
                    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
5942
0
                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
5943
0
                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
5944
5945
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
5946
0
                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
5947
5948
0
                    const int64_t q_lora_rank  = hparams.n_lora_q;
5949
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
5950
5951
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
5952
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5953
5954
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5955
5956
                    // output
5957
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5958
                    // try to load output.weight, if not found, use token_embd (tied embeddings)
5959
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5960
0
                    if (!output) {
5961
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5962
0
                    }
5963
5964
0
                    for (int i = 0; i < n_layer; ++i) {
5965
0
                        int flags = 0;
5966
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5967
                            // skip all tensors in the NextN layers
5968
                            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
5969
0
                            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
5970
0
                        }
5971
5972
0
                        auto & layer = layers[i];
5973
5974
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5975
0
                        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
5976
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
5977
5978
0
                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
5979
0
                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
5980
5981
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
5982
5983
                        // note: only old legacy GGUF files contain the unsplit wkv_b tensor; the split wk_b / wv_b tensors are loaded here
5984
0
                        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
5985
0
                        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
5986
5987
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
5988
5989
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5990
5991
                        // DSA indexer
5992
0
                        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
5993
0
                        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
5994
0
                        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
5995
0
                        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
5996
0
                        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
5997
0
                        if (i < (int) hparams.n_layer_dense_lead) {
5998
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
5999
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
6000
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
6001
0
                        } else {
6002
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
6003
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
6004
6005
0
                            if (n_expert == 0) {
6006
0
                                throw std::runtime_error("n_expert must be > 0");
6007
0
                            }
6008
0
                            if (n_expert_used == 0) {
6009
0
                                throw std::runtime_error("n_expert_used must be > 0");
6010
0
                            }
6011
6012
                            // MoE branch
6013
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
6014
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
6015
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
6016
6017
                            // Shared expert branch
6018
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
6019
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, flags);
6020
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
6021
0
                        }
6022
6023
                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
6024
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
6025
0
                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
6026
0
                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
6027
0
                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
6028
6029
                            // Optional tensors
6030
0
                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
6031
0
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
6032
0
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
6033
0
                        }
6034
0
                    }
6035
0
                } break;
6036
0
            case LLM_ARCH_NEMOTRON:
6037
0
                {
6038
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6039
6040
                    // output
6041
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6042
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
6043
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
6044
6045
0
                    for (int i = 0; i < n_layer; ++i) {
6046
0
                        auto & layer = layers[i];
6047
6048
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6049
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
6050
6051
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
6052
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
6053
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
6054
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
6055
6056
                        // optional bias tensors
6057
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
6058
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
6059
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
6060
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
6061
6062
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6063
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
6064
6065
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6066
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6067
6068
                        // optional MLP bias
6069
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
6070
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
6071
0
                    }
6072
0
                } break;
6073
0
            case LLM_ARCH_NEMOTRON_H:
6074
0
            case LLM_ARCH_NEMOTRON_H_MOE:
6075
0
                {
6076
                    // mamba2 Mixer SSM params
6077
                    // NOTE: int64_t for tensor dimensions
6078
0
                    const int64_t d_conv     = hparams.ssm_d_conv;
6079
0
                    const int64_t d_inner    = hparams.ssm_d_inner;
6080
0
                    const int64_t d_state    = hparams.ssm_d_state;
6081
0
                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
6082
0
                    const int64_t n_group    = hparams.ssm_n_group;
6083
0
                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
6084
0
                    const int64_t moe_n_embd = hparams.moe_latent_size > 0 ? hparams.moe_latent_size : n_embd;
6085
6086
                    // embeddings
6087
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6088
6089
                    // output
6090
0
                    {
6091
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6092
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6093
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
6094
0
                        if (output == NULL) {
6095
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6096
0
                        }
6097
0
                    }
6098
6099
0
                    for (int i = 0; i < n_layer; ++i) {
6100
0
                        auto & layer = layers[i];
6101
6102
                        // all blocks use the attn norm
6103
0
                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6104
6105
0
                        if (hparams.is_recurrent(i)) {
6106
                            // ssm layers
6107
0
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
6108
6109
0
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
6110
0
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
6111
6112
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
6113
6114
                            // no "weight" suffix for these
6115
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
6116
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
6117
6118
0
                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
6119
6120
                            // out_proj
6121
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
6122
0
                        } else if (hparams.n_ff(i) == 0) {
6123
                            // attention layers (with optional bias)
6124
0
                            const int64_t n_head_i = hparams.n_head(i);
6125
0
                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
6126
0
                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
6127
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
6128
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
6129
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
6130
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
6131
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias",   i), {n_embd},         TENSOR_NOT_REQUIRED);
6132
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias",   i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
6133
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias",   i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
6134
0
                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd},         TENSOR_NOT_REQUIRED);
6135
0
                        }  else {
6136
0
                            if (n_expert != 0) {
6137
0
                                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
6138
0
                                const int64_t n_ff_shexp = hparams.n_ff_shexp;
6139
6140
0
                                layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert}, 0);
6141
0
                                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert         }, 0);
6142
6143
                                // MoE branch
6144
0
                                layer.ffn_latent_down = create_tensor(tn(LLM_TENSOR_FFN_LATENT_DOWN, "weight", i), {n_embd, moe_n_embd}, TENSOR_NOT_REQUIRED);
6145
0
                                layer.ffn_latent_up   = create_tensor(tn(LLM_TENSOR_FFN_LATENT_UP,   "weight", i), {moe_n_embd, n_embd}, TENSOR_NOT_REQUIRED);
6146
6147
0
                                layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   moe_n_embd, n_expert}, 0);
6148
0
                                layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {moe_n_embd, n_ff_exp, n_expert}, 0);
6149
6150
                                // Shared expert branch
6151
0
                                layer.ffn_down_shexp  = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
6152
0
                                layer.ffn_up_shexp    = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
6153
6154
0
                            } else {
6155
                                // mlp layers
6156
0
                                layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  hparams.n_ff(i), n_embd}, 0);
6157
0
                                layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   hparams.n_ff(i)}, 0);
6158
0
                                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
6159
0
                                layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias",   i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
6160
0
                            }
6161
0
                        }
6162
0
                    }
6163
0
                } break;
6164
0
            case LLM_ARCH_EXAONE:
6165
0
                {
6166
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6167
6168
                    // output
6169
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6170
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6171
6172
                    // if output is NULL, init from the input tok embed
6173
0
                    if (output == NULL) {
6174
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6175
0
                    }
6176
6177
0
                    for (int i = 0; i < n_layer; ++i) {
6178
0
                        auto & layer = layers[i];
6179
6180
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6181
6182
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6183
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6184
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6185
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6186
6187
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM,   "weight", i), {n_embd}, 0);
6188
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6189
0
                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd,   n_ff}, 0);
6190
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN,   "weight", i), {  n_ff, n_embd}, 0);
6191
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,     "weight", i), {n_embd,   n_ff}, 0);
6192
0
                    }
6193
0
                } break;
6194
0
            case LLM_ARCH_EXAONE4:
6195
0
                {
6196
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6197
6198
                    // output
6199
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6200
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6201
6202
                    // if output is NULL, init from the input tok embed
6203
0
                    if (output == NULL) {
6204
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6205
0
                    }
6206
6207
0
                    for (int i = 0; i < n_layer; ++i) {
6208
0
                        auto & layer = layers[i];
6209
6210
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6211
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6212
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6213
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
6214
6215
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6216
6217
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
6218
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6219
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6220
6221
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6222
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6223
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6224
0
                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
6225
0
                    }
6226
0
                } break;
6227
0
            case LLM_ARCH_EXAONE_MOE:
6228
0
                {
6229
0
                    const int64_t n_ff_exp       = hparams.n_ff_exp;
6230
0
                    const int64_t n_expert       = hparams.n_expert;
6231
0
                    const int64_t n_expert_used  = hparams.n_expert_used;
6232
0
                    const int64_t n_ff_shexp     = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
6233
0
                    const int64_t head_dim       = hparams.n_embd_head_k();
6234
0
                    const int64_t n_qo_dim       = n_head * head_dim;
6235
0
                    const int64_t n_kv_dim       = n_head_kv * head_dim;
6236
6237
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6238
6239
                    // output
6240
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6241
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6242
6243
0
                    if (output == NULL) {
6244
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6245
0
                    }
6246
6247
0
                    for (int i = 0; i < n_layer; ++i) {
6248
0
                        int flags = 0;
6249
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
6250
                            // skip all tensors in the NextN layers
6251
0
                            flags |= TENSOR_SKIP;
6252
0
                        }
6253
6254
0
                        auto & layer = layers[i];
6255
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_qo_dim}, flags);
6256
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_kv_dim}, flags);
6257
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_kv_dim}, flags);
6258
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
6259
6260
0
                        layer.rope_freqs   = create_tensor(tn(LLM_TENSOR_ROPE_FREQS,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
6261
6262
0
                        layer.attn_norm    = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, flags);
6263
0
                        layer.attn_q_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
6264
0
                        layer.attn_k_norm  = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
6265
6266
0
                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, flags);
6267
6268
                        // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
6269
0
                        if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
6270
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
6271
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
6272
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
6273
0
                        } else {
6274
0
                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, flags);
6275
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
6276
6277
0
                            if (n_expert == 0) {
6278
0
                                throw std::runtime_error("n_expert must be > 0");
6279
0
                            }
6280
0
                            if (n_expert_used == 0) {
6281
0
                                throw std::runtime_error("n_expert_used must be > 0");
6282
0
                            }
6283
6284
0
                            layer.ffn_gate_exps  = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS,  "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
6285
0
                            layer.ffn_down_exps  = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS,  "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
6286
0
                            layer.ffn_up_exps    = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,    "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
6287
6288
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
6289
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
6290
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
6291
0
                        }
6292
6293
                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
6294
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
6295
0
                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
6296
0
                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), {n_embd}, flags);
6297
0
                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), {n_embd}, flags);
6298
6299
0
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
6300
0
                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
6301
0
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
6302
0
                        }
6303
0
                    }
6304
0
                } break;
6305
0
            case LLM_ARCH_RWKV6:
6306
0
                {
6307
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6308
6309
                    // Block 0, LN0
6310
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
6311
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
6312
6313
                    // output
6314
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6315
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
6316
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
6317
6318
0
                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
6319
0
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
6320
0
                    const int head_size = hparams.wkv_head_size;
6321
0
                    const int attn_hidden_size = n_embd;
6322
0
                    const int ffn_size = hparams.n_ff_arr[0];
6323
6324
0
                    for (int i = 0; i < n_layer; ++i) {
6325
0
                        auto & layer = layers[i];
6326
6327
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6328
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
6329
6330
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
6331
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
6332
6333
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
6334
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
6335
6336
0
                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
6337
0
                        layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
6338
0
                        layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
6339
0
                        layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
6340
0
                        layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
6341
0
                        layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
6342
0
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
6343
0
                        GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
6344
6345
0
                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
6346
0
                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
6347
0
                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
6348
0
                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
6349
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
6350
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
6351
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
6352
0
                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
6353
6354
0
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
6355
0
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
6356
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
6357
6358
0
                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
6359
0
                        layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
6360
6361
0
                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
6362
0
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
6363
0
                        layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
6364
0
                    }
6365
6366
0
                } break;
6367
0
            case LLM_ARCH_RWKV6QWEN2:
6368
0
                {
6369
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6370
6371
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6372
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
6373
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
6374
6375
0
                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
6376
0
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
6377
0
                    const int head_size = hparams.wkv_head_size;
6378
0
                    const int attn_hidden_size = n_embd;
6379
0
                    const int n_head_kv = hparams.n_head_kv();
6380
0
                    int attn_key_value_size;
6381
0
                    if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
6382
0
                        attn_key_value_size = attn_hidden_size;
6383
0
                    } else {
6384
0
                        attn_key_value_size = n_head_kv * head_size;
6385
0
                    }
6386
6387
0
                    for (int i = 0; i < n_layer; ++i) {
6388
0
                        auto & layer = layers[i];
6389
6390
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6391
6392
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
6393
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
6394
6395
0
                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
6396
0
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
6397
6398
0
                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
6399
0
                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
6400
0
                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
6401
0
                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
6402
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
6403
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
6404
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
6405
0
                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
6406
                        // optional bias tensors
6407
0
                        layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
6408
0
                        layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
6409
0
                        layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
6410
6411
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
6412
6413
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6414
6415
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6416
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6417
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6418
0
                    }
6419
0
                } break;
6420
0
            case LLM_ARCH_RWKV7:
6421
0
                {
6422
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6423
6424
                    // Block 0, LN0
6425
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
6426
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
6427
6428
                    // output
6429
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6430
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
6431
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
6432
6433
0
                    const int n_lora_decay = hparams.n_lora_decay;
6434
0
                    const int n_lora_iclr = hparams.n_lora_iclr;
6435
0
                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
6436
0
                    const int n_lora_gate = hparams.n_lora_gate;
6437
0
                    const int attn_hidden_size = n_embd;
6438
0
                    const int ffn_size = hparams.n_ff_arr[0];
6439
6440
0
                    for (int i = 0; i < n_layer; ++i) {
6441
0
                        auto & layer = layers[i];
6442
6443
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6444
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
6445
6446
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
6447
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
6448
6449
0
                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
6450
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
6451
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
6452
6453
0
                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
6454
0
                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
6455
0
                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
6456
6457
0
                        if (i == 0) {
6458
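                            // layer 0: time_mix_v0/v1/v2 are created but actually not used (v1/v2 are sized with n_lora_iclr here, not n_lora_value_res_mix)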
6459
0
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
6460
0
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
6461
0
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
6462
0
                        } else {
6463
0
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
6464
0
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
6465
0
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
6466
0
                        }
6467
6468
0
                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
6469
0
                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
6470
6471
0
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
6472
6473
0
                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
6474
0
                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
6475
0
                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
6476
6477
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
6478
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
6479
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
6480
6481
0
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
6482
0
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
6483
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
6484
6485
0
                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
6486
6487
0
                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
6488
0
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
6489
0
                    }
6490
6491
0
                } break;
6492
0
            case LLM_ARCH_ARWKV7:
6493
0
                {
6494
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6495
6496
                    // output
6497
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6498
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
6499
6500
0
                    const int n_lora_decay = hparams.n_lora_decay;
6501
0
                    const int n_lora_iclr = hparams.n_lora_iclr;
6502
0
                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
6503
0
                    const int n_lora_gate = hparams.n_lora_gate;
6504
0
                    const int attn_hidden_size = n_embd;
6505
6506
0
                    for (int i = 0; i < n_layer; ++i) {
6507
0
                        auto & layer = layers[i];
6508
6509
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6510
6511
0
                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
6512
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
6513
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
6514
6515
0
                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
6516
0
                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
6517
0
                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
6518
6519
0
                        if (i == 0) {
6520
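                            // layer 0: time_mix_v0/v1/v2 are created but actually not used (v1/v2 are sized with n_lora_iclr here, not n_lora_value_res_mix)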
6521
0
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
6522
0
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
6523
0
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
6524
0
                        } else {
6525
0
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
6526
0
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
6527
0
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
6528
0
                        }
6529
6530
0
                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
6531
0
                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
6532
6533
0
                        try {
6534
0
                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
6535
0
                        } catch(std::runtime_error & e) {
6536
                            // ARWKV models may not have gate tensors: if creating the 6-slice fused lerp tensor throws, retry with 5 slices
6537
0
                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
6538
0
                        }
6539
6540
0
                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
6541
0
                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
6542
0
                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
6543
6544
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
6545
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
6546
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
6547
6548
0
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
6549
0
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
6550
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
6551
6552
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6553
6554
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6555
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6556
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6557
0
                    }
6558
6559
0
                } break;
6560
0
            case LLM_ARCH_CHAMELEON:
6561
0
                {
6562
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6563
6564
                    // output
6565
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6566
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6567
                    // if output is NULL, init from the input tok embed
6568
0
                    if (output == NULL) {
6569
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6570
0
                    }
6571
6572
0
                    for (int i = 0; i < n_layer; ++i) {
6573
0
                        auto & layer = layers[i];
6574
6575
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6576
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
6577
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
6578
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
6579
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
6580
6581
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
6582
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
6583
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
6584
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
6585
6586
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6587
6588
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6589
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6590
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6591
0
                    }
6592
0
                } break;
6593
0
            case LLM_ARCH_WAVTOKENIZER_DEC:
6594
0
                {
6595
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd, n_vocab}, 0);
6596
6597
0
                    conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight", 0), {7, hparams.n_embd, hparams.posnet.n_embd}, 0);
6598
0
                    conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias",   0), {1, hparams.posnet.n_embd}, 0);
6599
6600
                    // posnet
6601
0
                    {
6602
0
                        const int64_t n_embd = hparams.posnet.n_embd;
6603
6604
0
                        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
6605
0
                            auto & layer = layers[i].posnet;
6606
6607
                            // posnet:
6608
                            //
6609
                            //  - resnet
6610
                            //  - resnet
6611
                            //  - attn
6612
                            //  - resnet
6613
                            //  - resnet
6614
                            //  - norm
6615
                            //
6616
0
                            switch (i) {
6617
0
                                case 0:
6618
0
                                case 1:
6619
0
                                case 3:
6620
0
                                case 4:
6621
0
                                    {
6622
0
                                        layer.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
6623
0
                                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   i), {1, n_embd}, 0);
6624
6625
0
                                        layer.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
6626
0
                                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   i), {1, n_embd}, 0);
6627
6628
0
                                        layer.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
6629
0
                                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   i), {1, n_embd}, 0);
6630
6631
0
                                        layer.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
6632
0
                                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   i), {1, n_embd}, 0);
6633
0
                                    } break;
6634
0
                                case 2:
6635
0
                                    {
6636
0
                                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
6637
0
                                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
6638
6639
0
                                        layer.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "weight", i), {1, n_embd, n_embd}, 0);
6640
0
                                        layer.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "bias",   i), {1, n_embd}, 0);
6641
6642
0
                                        layer.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "weight", i), {1, n_embd, n_embd}, 0);
6643
0
                                        layer.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "bias",   i), {1, n_embd}, 0);
6644
6645
0
                                        layer.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "weight", i), {1, n_embd, n_embd}, 0);
6646
0
                                        layer.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "bias",   i), {1, n_embd}, 0);
6647
6648
0
                                        layer.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "weight", i), {1, n_embd, n_embd}, 0);
6649
0
                                        layer.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "bias",   i), {1, n_embd}, 0);
6650
0
                                    } break;
6651
0
                                case 5:
6652
0
                                    {
6653
0
                                        layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
6654
0
                                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
6655
0
                                    } break;
6656
0
                                default: GGML_ABORT("unknown posnet layer");
6657
0
                            };
6658
0
                        }
6659
0
                    }
6660
6661
0
                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
6662
6663
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {hparams.posnet.n_embd}, 0);
6664
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {hparams.posnet.n_embd}, 0);
6665
6666
                    // convnext
6667
0
                    {
6668
0
                        const int64_t n_embd = hparams.convnext.n_embd;
6669
6670
0
                        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
6671
0
                            auto & layer = layers[i].convnext;
6672
6673
0
                            layer.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", i), {7, 1, n_embd}, 0);
6674
0
                            layer.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   i), {1, n_embd}, 0);
6675
6676
0
                            layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", i), {n_embd}, 0);
6677
0
                            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   i), {n_embd}, 0);
6678
6679
0
                            layer.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", i), {n_embd, n_ff}, 0);
6680
0
                            layer.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   i), {n_ff}, 0);
6681
6682
0
                            layer.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", i), {n_ff, n_embd}, 0);
6683
0
                            layer.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   i), {n_embd}, 0);
6684
6685
0
                            layer.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
6686
0
                        }
6687
6688
                        // output
6689
0
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6690
0
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
6691
0
                    }
6692
6693
0
                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, hparams.n_embd_out()}, 0);
6694
0
                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {hparams.n_embd_out()}, 0);
6695
0
                } break;
6696
0
            case LLM_ARCH_BAILINGMOE:
6697
0
                {
6698
0
                    const int64_t n_ff_exp            = hparams.n_ff_exp;
6699
0
                    const int64_t n_expert_shared     = hparams.n_expert_shared;
6700
6701
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6702
6703
                    // output
6704
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6705
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6706
6707
0
                    for (int i = 0; i < n_layer; ++i) {
6708
0
                        auto & layer = layers[i];
6709
6710
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6711
6712
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_head * n_rot}, 0);
6713
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6714
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6715
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
6716
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6717
6718
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6719
6720
0
                        if (n_expert == 0) {
6721
0
                            throw std::runtime_error("n_expert must be > 0");
6722
0
                        }
6723
0
                        if (n_expert_used == 0) {
6724
0
                            throw std::runtime_error("n_expert_used must be > 0");
6725
0
                        }
6726
6727
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6728
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
6729
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6730
6731
0
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
6732
0
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
6733
0
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
6734
0
                    }
6735
0
                } break;
6736
0
            case LLM_ARCH_BAILINGMOE2:
6737
0
                {
6738
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
6739
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
6740
6741
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6742
6743
                    // output
6744
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6745
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6746
6747
0
                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
6748
0
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
6749
6750
0
                    for (int i = 0; i < n_layer; ++i) {
6751
0
                        int flags = 0;
6752
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
6753
                            // skip all tensors in the NextN layers
6754
0
                            flags |= TENSOR_SKIP;
6755
0
                        }
6756
6757
0
                        auto & layer = layers[i];
6758
6759
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
6760
6761
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
6762
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
6763
6764
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
6765
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
6766
6767
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
6768
6769
0
                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
6770
0
                            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
6771
6772
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
6773
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
6774
6775
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
6776
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
6777
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
6778
6779
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
6780
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
6781
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
6782
0
                        } else { // Dense layers
6783
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
6784
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
6785
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
6786
0
                        }
6787
6788
                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
6789
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
6790
0
                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
6791
0
                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
6792
0
                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
6793
0
                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
6794
0
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
6795
0
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
6796
0
                            layer.layer_out_norm         = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
6797
0
                        }
6798
0
                    }
6799
0
                } break;
6800
0
            case LLM_ARCH_DOTS1:
6801
0
                {
6802
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
6803
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
6804
6805
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6806
6807
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6808
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6809
6810
0
                    for (int i = 0; i < n_layer; ++i) {
6811
0
                        auto & layer = layers[i];
6812
6813
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6814
6815
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6816
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6817
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6818
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6819
6820
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6821
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6822
6823
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6824
6825
0
                        if (i < (int) hparams.n_layer_dense_lead) {
6826
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6827
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6828
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6829
0
                        } else {
6830
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6831
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
6832
6833
0
                            if (n_expert == 0) {
6834
0
                                throw std::runtime_error("n_expert must be > 0");
6835
0
                            }
6836
0
                            if (n_expert_used == 0) {
6837
0
                                throw std::runtime_error("n_expert_used must be > 0");
6838
0
                            }
6839
6840
                            // MoE branch
6841
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6842
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
6843
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6844
6845
                            // Shared expert branch
6846
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
6847
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
6848
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
6849
0
                        }
6850
0
                    }
6851
0
                } break;
6852
0
            case LLM_ARCH_ARCEE:
6853
0
                {
6854
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6855
6856
                    // output
6857
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6858
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6859
6860
                    // if output is NULL, init from the input tok embed
6861
0
                    if (output == NULL) {
6862
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6863
0
                    }
6864
6865
0
                    for (int i = 0; i < n_layer; ++i) {
6866
0
                        auto & layer = layers[i];
6867
6868
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6869
6870
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6871
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6872
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6873
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6874
6875
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6876
6877
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6878
6879
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6880
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6881
0
                    }
6882
0
                } break;
6883
0
            case LLM_ARCH_AFMOE:
6884
0
                {
6885
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6886
6887
                    // output
6888
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6889
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6890
6891
                    // if output is NULL, init from the input tok embed
6892
0
                    if (output == NULL) {
6893
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6894
0
                    }
6895
6896
0
                    const int64_t n_ff_exp = hparams.n_ff_exp;
6897
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
6898
6899
0
                    for (int i = 0; i < n_layer; ++i) {
6900
0
                        auto & layer = layers[i];
6901
6902
                        // dual attention normalization
6903
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
6904
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
6905
6906
                        // attention projections
6907
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6908
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6909
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6910
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6911
6912
                        // Q/K normalization
6913
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6914
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6915
6916
                        // attention gating
6917
0
                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6918
6919
                        // dual ffn normalization
6920
0
                        layer.ffn_norm      = create_tensor(tn(LLM_TENSOR_FFN_NORM,      "weight", i), {n_embd}, 0);
6921
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
6922
6923
0
                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
6924
                            // MoE layers
6925
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6926
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6927
6928
                            // grouped expert weights
6929
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
6930
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
6931
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
6932
6933
                            // shared expert
6934
0
                            if (n_expert_shared > 0) {
6935
0
                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
6936
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
6937
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
6938
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
6939
0
                            }
6940
0
                        } else {
6941
                            // Dense layers
6942
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6943
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
6944
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
6945
0
                        }
6946
0
                    }
6947
0
                } break;
6948
0
            case LLM_ARCH_ERNIE4_5:
6949
0
            case LLM_ARCH_ERNIE4_5_MOE:
6950
0
            case LLM_ARCH_PADDLEOCR:
6951
0
                {
6952
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6953
6954
                    // output
6955
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6956
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6957
                    // if output is NULL, init from the input tok embed
6958
0
                    if (output == NULL) {
6959
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6960
0
                    }
6961
6962
0
                    for (int i = 0; i < n_layer; ++i) {
6963
0
                        auto & layer = layers[i];
6964
6965
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6966
6967
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6968
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
6969
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
6970
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6971
6972
                        // optional bias tensors
6973
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
6974
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
6975
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
6976
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
6977
6978
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6979
6980
0
                        if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
6981
0
                            int n_ff_exp = hparams.n_ff_exp;
6982
6983
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
6984
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
6985
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
6986
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
6987
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
6988
6989
                            // Shared expert (if present)
6990
0
                            if (hparams.n_ff_shexp > 0) {
6991
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
6992
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd    }, 0);
6993
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
6994
0
                            }
6995
0
                        } else { // Dense layers
6996
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6997
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6998
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6999
0
                        }
7000
0
                    }
7001
0
                } break;
7002
0
            case LLM_ARCH_FALCON_H1:
7003
0
                {
7004
                    // Common
7005
0
                    const int64_t hidden_size = hparams.n_embd; // hidden_size
7006
7007
                    // mamba2 Mixer SSM params
7008
0
                    const int64_t ssm_conv_kernel_size  = hparams.ssm_d_conv; // ssm_conv_kernel_size
7009
0
                    const int64_t ssm_n_groups          = hparams.ssm_n_group; // ssm_n_groups
7010
0
                    const int64_t ssm_state_size        = hparams.ssm_d_state; // ssm_state_size
7011
0
                    const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
7012
0
                    const int64_t ssm_num_heads         = hparams.ssm_dt_rank; // ssm_num_heads
7013
0
                    const int64_t ssm_conv_dim          = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
7014
0
                    const int64_t ssm_projection_size   = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
7015
7016
                    // attn params
7017
0
                    const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
7018
0
                    const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
7019
7020
                    // ffn params
7021
0
                    const int64_t ffn_intermediate_size = hparams.n_ff(0);
7022
7023
                    // embeddings
7024
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
7025
7026
                    // output
7027
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
7028
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
7029
7030
                    // if output is NULL, init from the input tok embed
7031
0
                    if (output == NULL) {
7032
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
7033
0
                    }
7034
7035
0
                    for (int i = 0; i < n_layer; ++i) {
7036
0
                        auto & layer = layers[i];
7037
7038
                        /*SSM LAYERS*/
7039
                        // ssm in
7040
0
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
7041
                        // ssm 1d conv
7042
0
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
7043
0
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
7044
                        // ssm_dt
7045
0
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
7046
                        // no "weight" suffix for these
7047
0
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
7048
0
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
7049
                        // ssm_norm
7050
0
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
7051
                        // out_proj
7052
0
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
7053
7054
                        /*ATTENTION LAYERS*/
7055
                        // attention layers (with optional bias)
7056
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
7057
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
7058
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
7059
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
7060
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
7061
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
7062
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
7063
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
7064
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
7065
7066
7067
                        // feed forward (w/ optional biases)
7068
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
7069
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7070
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
7071
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  ffn_intermediate_size, hidden_size}, 0);
7072
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
7073
7074
0
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
7075
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
7076
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
7077
0
                    }
7078
0
                } break;
7079
0
            case LLM_ARCH_HUNYUAN_MOE:
7080
0
                {
7081
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7082
7083
                    // output
7084
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7085
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7086
                    // if output is NULL, init from the input tok embed
7087
0
                    if (output == NULL) {
7088
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7089
0
                    }
7090
7091
0
                    for (int i = 0; i < n_layer; ++i) {
7092
0
                        auto & layer = layers[i];
7093
0
                        const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
7094
7095
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7096
7097
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
7098
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
7099
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
7100
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7101
7102
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
7103
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
7104
7105
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7106
7107
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
7108
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, 0);
7109
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
7110
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
7111
7112
0
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
7113
0
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
7114
0
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
7115
0
                    }
7116
0
                } break;
7117
0
            case LLM_ARCH_HUNYUAN_DENSE:
7118
0
                {
7119
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7120
7121
                    // output
7122
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7123
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7124
                    // if output is NULL, init from the input tok embed
7125
0
                    if (output == NULL) {
7126
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7127
0
                    }
7128
7129
0
                    for (int i = 0; i < n_layer; ++i) {
7130
0
                        auto & layer = layers[i];
7131
7132
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7133
7134
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
7135
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
7136
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
7137
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7138
7139
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
7140
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
7141
7142
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7143
7144
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7145
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7146
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7147
7148
0
                    }
7149
0
                } break;
7150
0
            case LLM_ARCH_SMOLLM3:
7151
0
                {
7152
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7153
7154
                    // output
7155
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7156
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7157
7158
                    // if output is NULL, init from the input tok embed
7159
0
                    if (output == NULL) {
7160
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7161
0
                    }
7162
7163
0
                    for (int i = 0; i < n_layer; ++i) {
7164
0
                        auto & layer = layers[i];
7165
7166
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7167
7168
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
7169
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
7170
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
7171
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7172
7173
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7174
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7175
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7176
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7177
0
                    }
7178
0
                } break;
7179
0
            case LLM_ARCH_OPENAI_MOE:
7180
0
                {
7181
0
                    const int64_t n_ff_exp = hparams.n_ff_exp;
7182
7183
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7184
7185
                    // output
7186
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7187
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
7188
7189
0
                    for (int i = 0; i < n_layer; ++i) {
7190
0
                        auto & layer = layers[i];
7191
7192
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
7193
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
7194
7195
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_head * n_rot}, 0);
7196
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
7197
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
7198
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
7199
7200
0
                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
7201
7202
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {  n_embd, n_expert}, 0);
7203
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
7204
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
7205
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
7206
7207
                        // bias
7208
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_head * n_rot}, 0);
7209
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_head_kv * n_rot}, 0);
7210
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_head_kv * n_rot}, 0);
7211
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
7212
7213
0
                        layer.ffn_gate_inp_b  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
7214
0
                        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
7215
0
                        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0);
7216
0
                        layer.ffn_up_exps_b   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0);
7217
0
                    }
7218
0
                } break;
7219
0
            case LLM_ARCH_LFM2:
7220
0
            case LLM_ARCH_LFM2MOE:
7221
0
                {
7222
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7223
7224
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
7225
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,           "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7226
7227
0
                    if (output == NULL) {
7228
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7229
0
                    }
7230
7231
0
                    for (int i = 0; i < n_layer; ++i) {
7232
0
                        auto & layer = layers[i];
7233
7234
0
                        const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
7235
7236
                        // the FFN/MoE block is the same for transformer and conv layers
7237
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7238
0
                        if (is_moe_layer) {
7239
0
                            GGML_ASSERT(n_expert && n_expert_used);
7240
0
                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),  {n_embd, n_expert}, 0);
7241
0
                            layer.ffn_gate_exps   = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
7242
0
                            layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp,   n_embd, n_expert}, 0);
7243
0
                            layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),   {n_embd, hparams.n_ff_exp, n_expert}, 0);
7244
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
7245
0
                        } else {  // dense
7246
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7247
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7248
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7249
0
                        }
7250
7251
                        // for operator_norm
7252
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7253
7254
0
                        if (!hparams.is_recurrent(i)) {
7255
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
7256
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
7257
0
                            GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
7258
7259
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
7260
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
7261
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
7262
7263
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
7264
0
                        } else {
7265
0
                            layer.shortconv.conv     = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV,    "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
7266
0
                            layer.shortconv.in_proj  = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ,  "weight", i), {n_embd, 3 * n_embd}, 0);
7267
0
                            layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
7268
0
                        }
7269
0
                    }
7270
7271
                    // for LFM2-ColBert-350M
7272
0
                    dense_2_out_layers   = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
7273
0
                    dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"),   {hparams.n_embd_out()        }, TENSOR_NOT_REQUIRED);
7274
0
                } break;
7275
0
            case LLM_ARCH_SMALLTHINKER:
7276
0
                {
7277
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
7278
7279
                    // output
7280
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
7281
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7282
7283
                    // if output is NULL, init from the input tok embed
7284
0
                    if (output == NULL) {
7285
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7286
0
                    }
7287
7288
0
                    for (int i = 0; i < n_layer; ++i) {
7289
0
                        auto & layer = layers[i];
7290
7291
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
7292
7293
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
7294
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
7295
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
7296
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
7297
7298
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
7299
7300
0
                        GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
7301
0
                        GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
7302
7303
                        // MoE branch
7304
0
                        const int64_t n_ff_exp = hparams.n_ff_exp;
7305
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
7306
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
7307
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
7308
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
7309
0
                    }
7310
0
                } break;
7311
0
            case LLM_ARCH_GROVEMOE:
7312
0
                {
7313
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7314
7315
                    // output
7316
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7317
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7318
                    // if output is NULL, init from the input tok embed
7319
0
                    if (output == NULL) {
7320
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7321
0
                    }
7322
7323
0
                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
7324
0
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
7325
0
                    GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
7326
7327
0
                    for (int i = 0; i < n_layer; ++i) {
7328
0
                        auto & layer = layers[i];
7329
7330
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7331
7332
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
7333
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
7334
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
7335
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7336
7337
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
7338
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
7339
7340
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7341
7342
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
7343
7344
                        // MoE branch
7345
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
7346
0
                        const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
7347
0
                        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
7348
7349
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
7350
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
7351
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
7352
7353
0
                        layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
7354
0
                        layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp,   n_embd, n_chunk_expert}, 0);
7355
0
                        layer.ffn_up_chexps   = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS,   "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
7356
0
                    }
7357
0
                } break;
7358
0
            case LLM_ARCH_APERTUS:
7359
0
                {
7360
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
7361
7362
                    // output
7363
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
7364
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
7365
7366
0
                    for (int i = 0; i < n_layer; ++i) {
7367
0
                        auto & layer = layers[i];
7368
7369
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
7370
7371
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
7372
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7373
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7374
0
                        } else {
7375
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7376
0
                        }
7377
7378
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
7379
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_gqa }, 0);
7380
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_gqa }, 0);
7381
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
7382
7383
                        // optional bias tensors
7384
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);
7385
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
7386
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
7387
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);
7388
7389
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
7390
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
7391
0
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
7392
7393
                        // Q and K layernorms for Apertus
7394
0
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
7395
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
7396
0
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
7397
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
7398
0
                    }
7399
0
                } break;
7400
0
            case LLM_ARCH_MINIMAX_M2:
7401
0
                {
7402
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7403
7404
                    // output
7405
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7406
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
7407
7408
0
                    for (int i = 0; i < n_layer; ++i) {
7409
0
                        auto & layer = layers[i];
7410
7411
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
7412
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
7413
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
7414
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
7415
7416
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7417
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
7418
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
7419
7420
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7421
7422
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
7423
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
7424
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
7425
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
7426
0
                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
7427
0
                    }
7428
0
                } break;
7429
0
            case LLM_ARCH_KIMI_LINEAR:
7430
0
                {
7431
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7432
7433
                    // output
7434
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7435
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
7436
7437
0
                    for (int i = 0; i < n_layer; ++i) {
7438
0
                        auto & layer = layers[i];
7439
7440
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7441
7442
                        // The layer type is selected by hparams.is_recurrent(i) below:
7443
                        // recurrent layers load the KDA tensors, all other layers load the MLA tensors
7444
7445
                        // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
7446
0
                        const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda;
7447
0
                        const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
7448
0
                        const int64_t ssm_d_conv = hparams.ssm_d_conv;
7449
7450
0
                        if (hparams.is_recurrent(i)) {
7451
                            // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
7452
                            // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
7453
0
                            layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
7454
0
                            if (!layer.ssm_q_conv) {
7455
0
                                layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
7456
0
                            }
7457
7458
                             // KDA Layer - Conv1d weights may be 3D or 4D
7459
0
                             layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
7460
0
                             if (!layer.ssm_k_conv) {
7461
0
                                 layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
7462
0
                             }
7463
0
                             layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
7464
0
                             if (!layer.ssm_v_conv) {
7465
0
                                 layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0);
7466
0
                             }
7467
7468
                             // q, k, v projections
7469
                             // Python: q_proj, k_proj, v_proj
7470
0
                             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
7471
0
                             layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
7472
0
                             layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_v_kda * n_head}, 0);
7473
7474
                             // KDA specific projections
7475
                             // f_a_proj, f_b_proj
7476
0
                             layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim
7477
0
                             layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size
7478
7479
                             // b_proj (beta mixing coefficient)
7480
0
                             layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0);
7481
7482
                             // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization). Note: -exp(A_log) is applied in convert_hf_to_gguf.py
7483
0
                             layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED);
7484
0
                             if (!layer.ssm_a) {
7485
0
                                 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
7486
0
                             }
7487
7488
                             // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096]
7489
0
                             layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0);
7490
7491
                             // g_a_proj, g_b_proj (output gate)
7492
0
                             layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0);
7493
0
                             layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0);
7494
7495
                             // o_norm (reusing SSM_NORM)
7496
0
                             layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated
7497
7498
                             // o_proj
7499
0
                             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0);
7500
7501
0
                        } else {
7502
                             // MLA Layer - use MLA-specific head dimensions
7503
0
                             const int64_t q_lora_rank  = hparams.n_lora_q;
7504
0
                             const int64_t kv_lora_rank = hparams.n_lora_kv;
7505
0
                             const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
7506
0
                             const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
7507
7508
0
                             layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
7509
0
                             layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
7510
7511
0
                             if (layer.attn_q_a_norm) {
7512
0
                                 layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
7513
0
                                 layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
7514
0
                             } else {
7515
                                 // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla]
7516
0
                                 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
7517
0
                             }
7518
7519
                             // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
7520
                             // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
7521
0
                             const int64_t qk_rope_head_dim = hparams.n_rot();  // From config: qk_rope_head_dim
7522
0
                             layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
7523
                             // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
7524
0
                             layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
7525
0
                                {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
7526
0
                             if (!layer.wkv_b) { // MLA KV cache enabled
7527
0
                                 layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
7528
0
                                 layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
7529
0
                             }
7530
0
                             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
7531
0
                        }
7532
7533
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7534
7535
                        // MoE intermediate size (different from dense FFN)
7536
0
                        const int64_t n_ff_exp = hparams.n_ff_exp;
7537
7538
                        // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
7539
                        // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
7540
0
                        if (i < (int) hparams.n_layer_dense_lead) {
7541
                            // Dense FFN layer - use normal n_ff
7542
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
7543
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
7544
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
7545
0
                        } else {
7546
                            // MoE layer - use n_ff_exp (1024) instead of n_ff (9216)
7547
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
7548
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
7549
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
7550
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
7551
7552
                            // Shared experts use moe_intermediate_size * num_shared_experts
7553
                            // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024
7554
                            // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd]
7555
0
                            const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? hparams.n_expert_shared : 1);
7556
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
7557
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED);
7558
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
7559
7560
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
7561
0
                        }
7562
0
                    }
7563
0
                } break;
7564
0
            // CogVLM: creates the token embedding, the output norm/head and, for every layer,
            // the attention and FFN weights plus a parallel set of LLM_TENSOR_VISEXP_* tensors
            // that use the same shapes.
            case LLM_ARCH_COGVLM:
7565
0
                {
7566
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7567
7568
                    // output norm and output head (the head is requested with TENSOR_NOT_REQUIRED)
7569
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7570
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7571
7572
                    // if output is NULL, init from the input tok embed
7573
0
                    if (output == NULL) {
7574
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7575
0
                    }
7576
7577
0
                    for (int i = 0; i < n_layer; ++i) {
7578
0
                        auto & layer = layers[i];
7579
                        // attention: one fused QKV weight (second dim = n_embd_head_k * n_head * 3) and the output projection
7580
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7581
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
7582
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7583
                        // "visexp" attention tensors: same shapes as wqkv / wo above
                        // NOTE(review): presumably the vision-expert branch of CogVLM - confirm against the graph builder
7584
0
                        layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
7585
0
                        layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7586
                        // rope frequencies: not required; layers after the first request the tensor with TENSOR_DUPLICATED
7587
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7588
                        // FFN norm followed by the gate / down / up projections
7589
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7590
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7591
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7592
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7593
                        // "visexp" FFN tensors: same shapes as the gate / down / up projections above
7594
0
                        layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7595
0
                        layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7596
0
                        layer.visexp_ffn_up   = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7597
0
                    }
7598
0
                } break;
7599
0
            // Pangu Embed: creates the token embedding, the output norm/head and, for every layer,
            // separate Q/K/V/O weights with a bias for each, rope factor tensors and the FFN weights.
            case LLM_ARCH_PANGU_EMBED:
7600
0
                {
7601
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7602
7603
                    // output norm and output head (the head is requested with TENSOR_NOT_REQUIRED)
7604
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7605
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7606
7607
                    // if output is NULL, init from the input tok embed
7608
0
                    if (output == NULL) {
7609
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7610
0
                    }
7611
7612
0
                    for (int i = 0; i < n_layer; ++i) {
7613
0
                        auto & layer = layers[i];
7614
7615
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7616
7617
                        // weight tensors (Q and O are sized with n_embd_head_k * n_head, K and V with n_embd_k_gqa / n_embd_v_gqa)
7618
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
7619
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
7620
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
7621
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7622
7623
                        // bias tensors (all created with flags = 0, i.e. without TENSOR_NOT_REQUIRED)
                        // NOTE(review): the K and V biases are both sized with n_embd_gqa while the weights above
                        // use n_embd_k_gqa / n_embd_v_gqa - presumably these are equal for this architecture; confirm
7624
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd_head_k * n_head}, 0);
7625
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
7626
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
7627
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
7628
7629
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7630
                        // rope factors: long/short tensors when the trained scaling type is LONGROPE, a single
                        // rope_freqs tensor otherwise; all are not required, and layers after the first request
                        // them with TENSOR_DUPLICATED
7631
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
7632
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7633
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7634
0
                        } else {
7635
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7636
0
                        }
7637
                        // FFN gate / down / up projections
7638
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7639
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7640
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7641
0
                    }
7642
0
                } break;
7643
0
            // Qwen3-Next: throws if the model has no experts, then creates the embedding/output tensors
            // and, for every layer, either attention tensors or linear-attention (SSM-named) tensors
            // depending on hparams.is_recurrent(i), followed by routed-expert and shared-expert FFN tensors.
            case LLM_ARCH_QWEN3NEXT:
7644
0
                {
7645
0
                    if (n_expert == 0) {
7646
0
                        throw std::runtime_error(arch_name() + " model cannot have zero experts");
7647
0
                    }
7648
7649
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
7650
7651
                    // output norm and output head (the head is requested with TENSOR_NOT_REQUIRED)
7652
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
7653
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
7654
7655
                    // if output is NULL, init from the input tok embed
7656
0
                    if (output == NULL) {
7657
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
7658
0
                    }
7659
                    // per-expert FFN width: hparams.n_ff_exp when it is non-zero, otherwise n_ff / n_expert_used
7660
0
                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
7661
7662
                    // Calculate dimensions from hyperparameters
                    // (the linear-attention sizes are read from the ssm_* hparams: head dims from ssm_d_state,
                    //  key head count from ssm_n_group, value head count from ssm_dt_rank)
7663
0
                    const int64_t head_k_dim = hparams.ssm_d_state;
7664
0
                    const int64_t head_v_dim = hparams.ssm_d_state;
7665
0
                    const int64_t n_k_heads  = hparams.ssm_n_group;
7666
0
                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
7667
0
                    const int64_t key_dim    = head_k_dim * n_k_heads;
7668
0
                    const int64_t value_dim  = head_v_dim * n_v_heads;
7669
0
                    const int64_t conv_dim   = key_dim * 2 + value_dim;
7670
7671
                    // Calculate projection sizes
7672
0
                    const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
7673
0
                    const int64_t ba_dim   = n_v_heads * 2;
7674
7675
0
                    for (int i = 0; i < n_layer; ++i) {
7676
0
                        auto & layer = layers[i];
7677
0
                        const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
7678
7679
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
7680
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
7681
7682
0
                        if (!hparams.is_recurrent(i)) {
7683
                            // Attention layers
                            // NOTE(review): wq is twice as wide as wo's input (n_embd_head_k * n_head * 2) -
                            // presumably Q is stored together with a gate; confirm against the graph builder
7684
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
7685
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_k_gqa }, 0);
7686
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_v_gqa }, 0);
7687
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
7688
7689
                            // Q/K normalization for attention layers
7690
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
7691
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
7692
0
                        } else {
7693
                            // Linear attention (gated delta net) specific tensors
7694
                            // Create tensors with calculated dimensions
7695
                            // note: ssm_in is used by legacy GGUF
                            // ssm_in, wqkv and wqkv_gate are all requested with TENSOR_NOT_REQUIRED
                            // NOTE(review): presumably a file provides either ssm_in or wqkv + wqkv_gate - confirm
7696
0
                            layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN,         "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
7697
0
                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
7698
0
                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
7699
0
                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
7700
0
                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
7701
0
                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
7702
0
                            layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
7703
0
                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
7704
0
                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
7705
0
                        }
7706
                        // routed experts: router weight and down projection are created here; the gate/up expert
                        // tensors are delegated to create_tensor_gate_up_exps (defined outside this view)
7707
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
7708
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
7709
0
                        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
7710
7711
                        // Shared experts (sized with the per-layer n_ff_shexp computed at the top of the loop)
7712
0
                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
7713
0
                        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, n_ff_shexp }, 0);
7714
0
                        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, n_ff_shexp }, 0);
7715
0
                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { n_ff_shexp, n_embd }, 0);
7716
0
                    }
7717
0
                } break;
7718
0
            // Qwen3.5 MoE: creates the embedding/output tensors and, for every layer, either attention
            // tensors or linear-attention (SSM-named) tensors depending on hparams.is_recurrent(i),
            // followed by routed-expert and shared-expert FFN tensors.
            // NOTE(review): unlike the LLM_ARCH_QWEN3NEXT case there is no n_expert == 0 check here - confirm intended
            case LLM_ARCH_QWEN35MOE:
7719
0
                {
7720
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
7721
7722
                    // output norm and output head (the head is requested with TENSOR_NOT_REQUIRED)
7723
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
7724
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
7725
7726
                    // if output is NULL, init from the input tok embed
7727
0
                    if (output == NULL) {
7728
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
7729
0
                    }
7730
                    // per-expert FFN width: hparams.n_ff_exp when it is non-zero, otherwise n_ff / n_expert_used
7731
0
                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
7732
7733
                    // Calculate dimensions from hyperparameters
                    // (the linear-attention sizes are read from the ssm_* hparams: head dims from ssm_d_state,
                    //  key head count from ssm_n_group, value head count from ssm_dt_rank)
7734
0
                    const int64_t head_k_dim = hparams.ssm_d_state;
7735
0
                    const int64_t head_v_dim = hparams.ssm_d_state;
7736
0
                    const int64_t n_k_heads  = hparams.ssm_n_group;
7737
0
                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
7738
0
                    const int64_t key_dim    = head_k_dim * n_k_heads;
7739
0
                    const int64_t value_dim  = head_v_dim * n_v_heads;
7740
0
                    const int64_t conv_dim   = key_dim * 2 + value_dim;
7741
7742
0
                    for (int i = 0; i < n_layer; ++i) {
7743
0
                        auto & layer = layers[i];
7744
7745
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
7746
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
7747
7748
0
                        if (!hparams.is_recurrent(i)) {
7749
                            // Attention layers
                            // NOTE(review): wq is twice as wide as wo's input (n_embd_head_k * n_head * 2) -
                            // presumably Q is stored together with a gate; confirm against the graph builder
7750
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
7751
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_k_gqa }, 0);
7752
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_v_gqa }, 0);
7753
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
7754
7755
                            // Q/K normalization for attention layers
7756
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
7757
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
7758
0
                        } else {
7759
                            // Linear attention (gated delta net) specific tensors
7760
                            // Create tensors with calculated dimensions
                            // (wqkv and wqkv_gate are requested with TENSOR_NOT_REQUIRED; beta and alpha are separate tensors here)
7761
0
                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
7762
0
                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
7763
0
                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
7764
0
                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
7765
0
                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
7766
0
                            layer.ssm_beta       = create_tensor(tn(LLM_TENSOR_SSM_BETA,       "weight", i), { n_embd, n_v_heads }, 0);
7767
0
                            layer.ssm_alpha      = create_tensor(tn(LLM_TENSOR_SSM_ALPHA,      "weight", i), { n_embd, n_v_heads }, 0);
7768
0
                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
7769
0
                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
7770
0
                        }
7771
                        // routed experts: router weight and down projection are created here; the gate/up expert
                        // tensors are delegated to create_tensor_gate_up_exps (defined outside this view)
7772
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
7773
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
7774
0
                        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
7775
7776
                        // Shared experts: width is hparams.n_ff_shexp when non-zero, otherwise n_ff
7777
0
                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
7778
7779
0
                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
7780
0
                        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, n_ff_shexp }, 0);
7781
0
                        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, n_ff_shexp }, 0);
7782
0
                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { n_ff_shexp, n_embd }, 0);
7783
0
                    }
7784
0
                } break;
7785
0
            // Qwen3.5 (dense): creates the embedding/output tensors and, for every layer, either attention
            // tensors or linear-attention (SSM-named) tensors depending on hparams.is_recurrent(i),
            // followed by a dense gate / down / up FFN.
            case LLM_ARCH_QWEN35:
7786
0
                {
7787
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
7788
7789
                    // output norm and output head (the head is requested with TENSOR_NOT_REQUIRED)
7790
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
7791
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
7792
7793
                    // if output is NULL, init from the input tok embed
7794
0
                    if (output == NULL) {
7795
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
7796
0
                    }
7797
7798
                    // Calculate dimensions from hyperparameters
                    // (the linear-attention sizes are read from the ssm_* hparams: head dims from ssm_d_state,
                    //  key head count from ssm_n_group, value head count from ssm_dt_rank)
7799
0
                    const int64_t head_k_dim = hparams.ssm_d_state;
7800
0
                    const int64_t head_v_dim = hparams.ssm_d_state;
7801
0
                    const int64_t n_k_heads  = hparams.ssm_n_group;
7802
0
                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
7803
0
                    const int64_t key_dim    = head_k_dim * n_k_heads;
7804
0
                    const int64_t value_dim  = head_v_dim * n_v_heads;
7805
0
                    const int64_t conv_dim   = key_dim * 2 + value_dim;
7806
7807
0
                    for (int i = 0; i < n_layer; ++i) {
7808
0
                        auto & layer = layers[i];
7809
7810
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
7811
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
7812
7813
0
                        if (!hparams.is_recurrent(i)) {
7814
                            // Attention layers
                            // NOTE(review): wq is twice as wide as wo's input (n_embd_head_k * n_head * 2) -
                            // presumably Q is stored together with a gate; confirm against the graph builder
7815
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
7816
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_k_gqa }, 0);
7817
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_v_gqa }, 0);
7818
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
7819
7820
                            // Q/K normalization for attention layers
7821
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
7822
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
7823
0
                        } else {
7824
                            // Linear attention (gated delta net) specific tensors
7825
                            // Create tensors with calculated dimensions
                            // (wqkv and wqkv_gate are requested with TENSOR_NOT_REQUIRED; beta and alpha are separate tensors here)
7826
0
                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
7827
0
                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
7828
0
                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
7829
0
                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
7830
0
                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
7831
0
                            layer.ssm_beta       = create_tensor(tn(LLM_TENSOR_SSM_BETA,       "weight", i), { n_embd, n_v_heads }, 0);
7832
0
                            layer.ssm_alpha      = create_tensor(tn(LLM_TENSOR_SSM_ALPHA,      "weight", i), { n_embd, n_v_heads }, 0);
7833
0
                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
7834
0
                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
7835
0
                        }
7836
                        // dense FFN gate / down / up projections (created for every layer, recurrent or not)
7837
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7838
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7839
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7840
0
                    }
7841
0
                } break;
7842
0
            // MiMo2: creates the embedding and output tensors (the output head is created with flags = 0,
            // with no fallback to the token embedding) and, for every layer, attention tensors sized from
            // per-layer hparams plus both a dense FFN set and a MoE set, each requested with TENSOR_NOT_REQUIRED.
            case LLM_ARCH_MIMO2:
7843
0
                {
7844
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7845
7846
                    // output norm and output head
7847
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7848
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
7849
7850
0
                    for (int i = 0; i < n_layer; ++i) {
7851
0
                        auto & layer = layers[i];
                        // per-layer sizes read from hparams for layer i
7852
0
                        uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
7853
0
                        uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
7854
0
                        uint32_t n_head = hparams.n_head(i);
7855
                        // attention projections: Q uses n_embd_head_k * n_head, the output projection uses n_embd_head_v * n_head
7856
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
7857
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
7858
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
7859
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
7860
                        // attention norm and per-head attention sinks (sinks are not required)
7861
0
                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM,  "weight", i), {n_embd}, 0);
7862
0
                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
7863
7864
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7865
7866
                        // non-MoE branch (all three tensors requested with TENSOR_NOT_REQUIRED)
                        // NOTE(review): presumably a layer carries either these or the MoE tensors below - confirm
7867
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
7868
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, TENSOR_NOT_REQUIRED);
7869
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
7870
7871
                        // MoE branch (router, expert weights sized with hparams.n_ff_exp, and the expert-probs bias; all not required)
7872
0
                        int64_t n_ff_exp = hparams.n_ff_exp;
7873
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7874
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
7875
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7876
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
7877
0
                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
7878
0
                    }
7879
0
                } break;
7880
0
            // Step3.5: creates the embedding and output tensors (the output head is created with flags = 0),
            // computes the largest per-layer rope dimension, then for every layer creates attention tensors
            // sized from per-layer hparams, an optional head-wise gate, and dense / MoE / shared-expert FFN
            // tensors that are all requested with TENSOR_NOT_REQUIRED.
            case LLM_ARCH_STEP35:
7881
0
                {
7882
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7883
7884
                    // output norm and output head
7885
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7886
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
7887
7888
                    // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
7889
                    // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
                    // n_rot_max is the maximum of hparams.n_rot(i) over all layers, falling back to n_rot when that maximum is 0.
7890
0
                    uint32_t n_rot_max = 0;
7891
0
                    for (int i = 0; i < n_layer; ++i) {
7892
0
                        n_rot_max = std::max(n_rot_max, hparams.n_rot(i));
7893
0
                    }
7894
0
                    if (n_rot_max == 0) {
7895
0
                        n_rot_max = n_rot;
7896
0
                    }
7897
7898
0
                    for (int i = 0; i < n_layer; ++i) {
7899
0
                        auto & layer = layers[i];
7900
                        // per-layer sizes read from hparams for layer i
7901
0
                        const uint32_t n_head_l      = hparams.n_head(i);
7902
0
                        const uint32_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
7903
0
                        const uint32_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
7904
                        // attention norm is created with flags = 0; the Q/K norms are not required
7905
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7906
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
7907
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
7908
7909
                        // optional rope factors (llama3) / longrope tensors, sized with n_rot_max/2;
                        // layers after the first request them with TENSOR_DUPLICATED
7910
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
7911
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7912
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7913
0
                        } else {
7914
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7915
0
                        }
7916
                        // attention projections: Q uses n_embd_head_k * n_head_l, the output projection uses n_embd_head_v * n_head_l
7917
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
7918
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
7919
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
7920
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
7921
7922
                        // head-wise attention gate (Step35 self_attn.g_proj): one output per head, not required
7923
0
                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
7924
7925
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7926
7927
                        // dense MLP (leading dense blocks); requested with TENSOR_NOT_REQUIRED on every layer
7928
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
7929
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, TENSOR_NOT_REQUIRED);
7930
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
7931
7932
                        // MoE routed experts + selection bias (router_bias); expert weights are sized with hparams.n_ff_exp
7933
0
                        const int64_t n_ff_exp = hparams.n_ff_exp;
7934
0
                        layer.ffn_gate_inp      = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7935
0
                        layer.ffn_gate_exps     = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
7936
0
                        layer.ffn_down_exps     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7937
0
                        layer.ffn_up_exps       = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
7938
0
                        layer.ffn_exp_probs_b   = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
7939
7940
                        // shared expert MLP, sized with hparams.n_ff_shexp (not required)
7941
0
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
7942
0
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
7943
0
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
7944
0
                    }
7945
0
                } break;
7946
0
            case LLM_ARCH_MAINCODER:
7947
0
                {
7948
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7949
7950
                    // output
7951
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7952
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
7953
                    // if output is NULL, init from the input tok embed
7954
0
                    if (output == NULL) {
7955
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
7956
0
                    }
7957
7958
0
                    for (int i = 0; i < n_layer; ++i) {
7959
0
                        auto & layer = layers[i];
7960
7961
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7962
7963
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
7964
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
7965
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
7966
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
7967
7968
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
7969
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
7970
7971
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7972
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
7973
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
7974
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
7975
0
                    }
7976
0
                } break;
7977
0
            default:
7978
0
                throw std::runtime_error("unknown architecture");
7979
0
        }
7980
7981
        // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
7982
        // this avoids having to add scale loading to every architecture
7983
0
        for (int i = 0; i < n_layer; ++i) {
7984
0
            auto & layer = layers[i];
7985
7986
            // attention weight scales (per-tensor, shape {1})
7987
0
            if (!layer.wq_s && layer.wq) {
7988
0
                layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale", i), {1}, TENSOR_NOT_REQUIRED);
7989
0
            }
7990
0
            if (!layer.wk_s && layer.wk) {
7991
0
                layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale", i), {1}, TENSOR_NOT_REQUIRED);
7992
0
            }
7993
0
            if (!layer.wv_s && layer.wv) {
7994
0
                layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale", i), {1}, TENSOR_NOT_REQUIRED);
7995
0
            }
7996
0
            if (!layer.wo_s && layer.wo) {
7997
0
                layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7998
0
            }
7999
0
            if (!layer.wqkv_s && layer.wqkv) {
8000
0
                layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8001
0
            }
8002
0
            if (!layer.wqkv_gate_s && layer.wqkv_gate) {
8003
0
                layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8004
0
            }
8005
8006
            // dense FFN weight scales (per-tensor, shape {1})
8007
0
            if (!layer.ffn_gate_s && layer.ffn_gate) {
8008
0
                layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8009
0
            }
8010
0
            if (!layer.ffn_down_s && layer.ffn_down) {
8011
0
                layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8012
0
            }
8013
0
            if (!layer.ffn_up_s && layer.ffn_up) {
8014
0
                layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8015
0
            }
8016
0
            if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) {
8017
0
                layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8018
0
            }
8019
0
            if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) {
8020
0
                layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8021
0
            }
8022
0
            if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) {
8023
0
                layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8024
0
            }
8025
8026
            // MoE expert weight scales (per-expert, shape {n_expert})
8027
0
            if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
8028
0
                layer.ffn_gate_exps_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
8029
0
            }
8030
0
            if (!layer.ffn_down_exps_s && layer.ffn_down_exps) {
8031
0
                layer.ffn_down_exps_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
8032
0
            }
8033
0
            if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
8034
0
                layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
8035
0
            }
8036
8037
            // recurrent / linear-attention weight scales (per-tensor, shape {1})
8038
0
            if (!layer.ssm_in_s && layer.ssm_in) {
8039
0
                layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8040
0
            }
8041
0
            if (!layer.ssm_out_s && layer.ssm_out) {
8042
0
                layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8043
0
            }
8044
0
            if (!layer.ssm_alpha_s && layer.ssm_alpha) {
8045
0
                layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8046
0
            }
8047
0
            if (!layer.ssm_beta_s && layer.ssm_beta) {
8048
0
                layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
8049
0
            }
8050
8051
            // input scales
8052
0
            if (!layer.wq_in_s && layer.wq) {
8053
0
                layer.wq_in_s = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8054
0
            }
8055
0
            if (!layer.wk_in_s && layer.wk) {
8056
0
                layer.wk_in_s = create_tensor(tn(LLM_TENSOR_ATTN_K,   "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8057
0
            }
8058
0
            if (!layer.wv_in_s && layer.wv) {
8059
0
                layer.wv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_V,   "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8060
0
            }
8061
0
            if (!layer.wo_in_s && layer.wo) {
8062
0
                layer.wo_in_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8063
0
            }
8064
0
            if (!layer.wqkv_in_s && layer.wqkv) {
8065
0
                layer.wqkv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8066
0
            }
8067
0
            if (!layer.wqkv_gate_in_s && layer.wqkv_gate) {
8068
0
                layer.wqkv_gate_in_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8069
0
            }
8070
0
            if (!layer.ffn_gate_in_s && layer.ffn_gate) {
8071
0
                layer.ffn_gate_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8072
0
            }
8073
0
            if (!layer.ffn_down_in_s && layer.ffn_down) {
8074
0
                layer.ffn_down_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8075
0
            }
8076
0
            if (!layer.ffn_up_in_s && layer.ffn_up) {
8077
0
                layer.ffn_up_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8078
0
            }
8079
0
            if (!layer.ffn_gate_exps_in_s && layer.ffn_gate_exps) {
8080
0
                layer.ffn_gate_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
8081
0
            }
8082
0
            if (!layer.ffn_down_exps_in_s && layer.ffn_down_exps) {
8083
0
                layer.ffn_down_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
8084
0
            }
8085
0
            if (!layer.ffn_up_exps_in_s && layer.ffn_up_exps) {
8086
0
                layer.ffn_up_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
8087
0
            }
8088
0
            if (!layer.ffn_gate_shexp_in_s && layer.ffn_gate_shexp) {
8089
0
                layer.ffn_gate_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8090
0
            }
8091
0
            if (!layer.ffn_down_shexp_in_s && layer.ffn_down_shexp) {
8092
0
                layer.ffn_down_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8093
0
            }
8094
0
            if (!layer.ffn_up_shexp_in_s && layer.ffn_up_shexp) {
8095
0
                layer.ffn_up_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8096
0
            }
8097
0
            if (!layer.ssm_in_in_s && layer.ssm_in) {
8098
0
                layer.ssm_in_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8099
0
            }
8100
0
            if (!layer.ssm_out_in_s && layer.ssm_out) {
8101
0
                layer.ssm_out_in_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8102
0
            }
8103
0
            if (!layer.ssm_alpha_in_s && layer.ssm_alpha) {
8104
0
                layer.ssm_alpha_in_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8105
0
            }
8106
0
            if (!layer.ssm_beta_in_s && layer.ssm_beta) {
8107
0
                layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
8108
0
            }
8109
0
        }
8110
0
    }
8111
8112
0
    ml.done_getting_tensors();
8113
8114
    // populate tensors_by_name
8115
0
    for (auto & [_, ctx_ptr] : ml.ctx_map) {
8116
0
        for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
8117
0
            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
8118
0
        }
8119
0
    }
8120
8121
0
    ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
8122
0
    pimpl->mappings.reserve(ml.mappings.size());
8123
8124
    // create the backend buffers
8125
0
    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
8126
0
    ctx_buf_maps.reserve(ml.ctx_map.size());
8127
8128
    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
8129
0
    const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size();
8130
0
    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
8131
8132
0
    for (auto & [buft, ctx_ptr] : ml.ctx_map) {
8133
0
        ggml_context * ctx = ctx_ptr.get();
8134
8135
        // skip contexts without tensors
8136
0
        if (ggml_get_first_tensor(ctx) == nullptr) {
8137
0
            continue;
8138
0
        }
8139
8140
0
        llama_buf_map buf_map;
8141
0
        buf_map.reserve(n_max_backend_buffer);
8142
8143
        // check if it is possible to use buffer_from_host_ptr with this buffer type
8144
0
        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
8145
0
        if (!dev) {
8146
            // FIXME: workaround for CPU backend buft having a NULL device
8147
0
            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
8148
0
            if (!dev) {
8149
0
                throw std::runtime_error(format("%s: no CPU backend found", __func__));
8150
0
            }
8151
0
        }
8152
0
        ggml_backend_dev_props props;
8153
0
        ggml_backend_dev_get_props(dev, &props);
8154
0
        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
8155
0
        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
8156
8157
0
        std::vector<ggml_backend_buffer_ptr> bufs;
8158
0
        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
8159
0
            GGML_ASSERT(!ml.no_alloc);
8160
0
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
8161
                // only the mmap region containing the tensors in the model is mapped to the backend buffer
8162
                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
8163
                //     then we could just use metal for all layers
8164
                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
8165
0
                void * addr = nullptr;
8166
0
                size_t first, last; // NOLINT
8167
0
                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
8168
0
                if (first >= last) {
8169
0
                    continue;
8170
0
                }
8171
0
                const size_t max_size = ggml_get_max_tensor_size(ctx);
8172
0
                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
8173
0
                if (buf == nullptr) {
8174
0
                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
8175
0
                }
8176
0
                bufs.emplace_back(buf);
8177
0
                buf_map.emplace(idx, buf);
8178
0
            }
8179
0
        } else {
8180
0
            ggml_backend_buffer_t buf;
8181
0
            if (ml.no_alloc) {
8182
0
                buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
8183
0
                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
8184
0
                    t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
8185
0
                }
8186
0
            } else {
8187
0
                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
8188
0
            }
8189
0
            if (buf == nullptr) {
8190
0
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
8191
0
            }
8192
0
            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
8193
0
                pimpl->mlock_bufs.emplace_back(new llama_mlock);
8194
0
                auto & mlock_buf = pimpl->mlock_bufs.back();
8195
0
                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
8196
0
                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
8197
0
            }
8198
0
            bufs.emplace_back(buf);
8199
0
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
8200
0
                buf_map.emplace(idx, buf);
8201
0
            }
8202
0
        }
8203
8204
0
        for (auto & buf : bufs) {
8205
            // indicate that this buffer contains weights
8206
            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
8207
0
            ggml_backend_buffer_set_usage(buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
8208
0
        }
8209
8210
0
        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
8211
8212
0
        ctx_buf_maps.emplace_back(ctx, buf_map);
8213
0
    }
8214
8215
0
    if (llama_supports_gpu_offload()) {
8216
0
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
8217
8218
0
        int n_repeating = n_gpu;
8219
0
        if (n_repeating > 0) {
8220
0
            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
8221
0
            n_repeating--;
8222
0
        }
8223
0
        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
8224
8225
0
        const int max_backend_supported_layers = hparams.n_layer + 1;
8226
0
        const int max_offloadable_layers       = hparams.n_layer + 1;
8227
8228
0
        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
8229
0
    }
8230
8231
    // print memory requirements per buffer type
8232
0
    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
8233
0
        for (auto & buf: bufs) {
8234
0
            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
8235
0
                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
8236
0
        }
8237
0
    }
8238
8239
0
    if (ml.no_alloc) {
8240
0
        return true;
8241
0
    }
8242
8243
    // load tensor data
8244
0
    for (auto & [ctx, buf_map] : ctx_buf_maps) {
8245
0
        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
8246
0
            return false;
8247
0
        }
8248
0
    }
8249
8250
0
    if (use_mmap_buffer) {
8251
0
        for (auto & mapping : ml.mappings) {
8252
0
            pimpl->mappings.emplace_back(std::move(mapping));
8253
0
        }
8254
0
    }
8255
8256
0
    return true;
8257
0
}
8258
8259
0
std::string llama_model::arch_name() const {
8260
0
    return llm_arch_name(arch);
8261
0
}
8262
8263
0
std::string llama_model::type_name() const {
8264
0
    return llm_type_name(type);
8265
0
}
8266
8267
0
std::string llama_model::desc() const {
8268
0
    return pimpl->desc_str;
8269
0
}
8270
8271
0
size_t llama_model::size() const {
8272
0
    return pimpl->n_bytes;
8273
0
}
8274
8275
0
size_t llama_model::n_tensors() const {
8276
0
    return tensors_by_name.size();
8277
0
}
8278
8279
0
size_t llama_model::n_devices() const {
8280
0
    return devices.size();
8281
0
}
8282
8283
0
const float * llama_model::tensor_split() const {
8284
0
    return params.tensor_split;
8285
0
}
8286
8287
0
uint32_t llama_model::n_gpu_layers() const {
8288
0
    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
8289
0
}
8290
8291
0
// Returns the split mode stored in the model params.
llama_split_mode llama_model::split_mode() const {
    const llama_split_mode mode = params.split_mode;
    return mode;
}
8294
8295
0
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
8296
0
    std::map<ggml_backend_buffer_type_t, size_t> ret;
8297
0
    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
8298
0
        if (hparams.no_alloc) {
8299
0
            GGML_ASSERT(bufs.size() == 1);
8300
0
            ggml_backend_buffer_t buf = bufs[0].get();
8301
0
            GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
8302
0
            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
8303
0
            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
8304
0
        } else {
8305
0
            for (const auto & buf : bufs) {
8306
                // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
8307
0
                ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
8308
0
            }
8309
0
        }
8310
0
    }
8311
0
    return ret;
8312
0
}
8313
8314
0
uint64_t llama_model::n_elements() const {
8315
0
    return pimpl->n_elements;
8316
0
}
8317
8318
0
void llama_model::print_info() const {
8319
0
    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
8320
8321
0
    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
8322
0
        bool is_var = false;
8323
8324
0
        std::vector<uint32_t> v;
8325
0
        for (uint32_t i = 0; i < n; ++i) {
8326
0
            v.push_back(f(i));
8327
0
            if (v[i] != v[0]) {
8328
0
                is_var = true;
8329
0
            }
8330
0
        }
8331
8332
0
        std::stringstream ss;
8333
8334
0
        if (is_var) {
8335
0
            ss << "[";
8336
0
            for (uint32_t i = 0; i < n; ++i) {
8337
0
                ss << v[i];
8338
0
                if (i < n - 1) {
8339
0
                    ss << ", ";
8340
0
                }
8341
0
            }
8342
0
            ss << "]";
8343
0
        } else {
8344
0
            ss << v[0];
8345
0
        }
8346
8347
0
        return ss.str();
8348
0
    };
8349
8350
    // hparams
8351
0
    LLAMA_LOG_INFO("%s: arch                  = %s\n",     __func__, arch_name().c_str());
8352
0
    LLAMA_LOG_INFO("%s: vocab_only            = %d\n",     __func__, hparams.vocab_only);
8353
0
    LLAMA_LOG_INFO("%s: no_alloc              = %d\n",     __func__, hparams.no_alloc);
8354
8355
0
    if (!hparams.vocab_only) {
8356
0
        LLAMA_LOG_INFO("%s: n_ctx_train           = %u\n",     __func__, hparams.n_ctx_train);
8357
0
        LLAMA_LOG_INFO("%s: n_embd                = %u\n",     __func__, hparams.n_embd);
8358
0
        LLAMA_LOG_INFO("%s: n_embd_inp            = %u\n",     __func__, hparams.n_embd_inp());
8359
0
        LLAMA_LOG_INFO("%s: n_layer               = %u\n",     __func__, hparams.n_layer);
8360
0
        LLAMA_LOG_INFO("%s: n_head                = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
8361
0
        LLAMA_LOG_INFO("%s: n_head_kv             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
8362
0
        LLAMA_LOG_INFO("%s: n_rot                 = %u\n",     __func__, hparams.n_rot_full);
8363
0
        LLAMA_LOG_INFO("%s: n_swa                 = %u\n",     __func__, hparams.n_swa);
8364
0
        LLAMA_LOG_INFO("%s: is_swa_any            = %u\n",     __func__, hparams.is_swa_any());
8365
0
        LLAMA_LOG_INFO("%s: n_embd_head_k         = %u\n",     __func__, hparams.n_embd_head_k_full);
8366
0
        LLAMA_LOG_INFO("%s: n_embd_head_v         = %u\n",     __func__, hparams.n_embd_head_v_full);
8367
0
        LLAMA_LOG_INFO("%s: n_gqa                 = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
8368
0
        LLAMA_LOG_INFO("%s: n_embd_k_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
8369
0
        LLAMA_LOG_INFO("%s: n_embd_v_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
8370
0
        LLAMA_LOG_INFO("%s: f_norm_eps            = %.1e\n",   __func__, hparams.f_norm_eps);
8371
0
        LLAMA_LOG_INFO("%s: f_norm_rms_eps        = %.1e\n",   __func__, hparams.f_norm_rms_eps);
8372
0
        LLAMA_LOG_INFO("%s: f_clamp_kqv           = %.1e\n",   __func__, hparams.f_clamp_kqv);
8373
0
        LLAMA_LOG_INFO("%s: f_max_alibi_bias      = %.1e\n",   __func__, hparams.f_max_alibi_bias);
8374
0
        LLAMA_LOG_INFO("%s: f_logit_scale         = %.1e\n",   __func__, hparams.f_logit_scale);
8375
0
        LLAMA_LOG_INFO("%s: f_attn_scale          = %.1e\n",   __func__, hparams.f_attention_scale);
8376
0
        LLAMA_LOG_INFO("%s: n_ff                  = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
8377
0
        LLAMA_LOG_INFO("%s: n_expert              = %u\n",     __func__, hparams.n_expert);
8378
0
        LLAMA_LOG_INFO("%s: n_expert_used         = %u\n",     __func__, hparams.n_expert_used);
8379
0
        LLAMA_LOG_INFO("%s: n_expert_groups       = %d\n",     __func__, hparams.n_expert_groups);
8380
0
        LLAMA_LOG_INFO("%s: n_group_used          = %d\n",     __func__, hparams.n_group_used);
8381
0
        LLAMA_LOG_INFO("%s: causal attn           = %d\n",     __func__, hparams.causal_attn);
8382
0
        LLAMA_LOG_INFO("%s: pooling type          = %d\n",     __func__, hparams.pooling_type);
8383
0
        LLAMA_LOG_INFO("%s: rope type             = %d\n",     __func__, hparams.rope_type);
8384
0
        LLAMA_LOG_INFO("%s: rope scaling          = %s\n",     __func__, rope_scaling_type.c_str());
8385
0
        LLAMA_LOG_INFO("%s: freq_base_train       = %.1f\n",   __func__, hparams.rope_freq_base_train);
8386
0
        LLAMA_LOG_INFO("%s: freq_scale_train      = %g\n",     __func__, hparams.rope_freq_scale_train);
8387
0
        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
8388
0
            LLAMA_LOG_INFO("%s: freq_base_swa         = %.1f\n",   __func__, hparams.rope_freq_base_train_swa);
8389
0
            LLAMA_LOG_INFO("%s: freq_scale_swa        = %g\n",     __func__, hparams.rope_freq_scale_train_swa);
8390
0
            LLAMA_LOG_INFO("%s: n_embd_head_k_swa     = %u\n",     __func__, hparams.n_embd_head_k_swa);
8391
0
            LLAMA_LOG_INFO("%s: n_embd_head_v_swa     = %u\n",     __func__, hparams.n_embd_head_v_swa);
8392
0
            LLAMA_LOG_INFO("%s: n_rot_swa             = %u\n",     __func__, hparams.n_rot_swa);
8393
0
        }
8394
0
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn       = %u\n",     __func__, hparams.n_ctx_orig_yarn);
8395
0
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul     = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
8396
0
        LLAMA_LOG_INFO("%s: rope_finetuned        = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
8397
        // MRoPE (Multi-axis Rotary Position Embedding) sections
8398
0
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
8399
0
            LLAMA_LOG_INFO("%s: mrope sections        = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
8400
0
        }
8401
0
        if (!classifier_labels.empty()) {
8402
0
            LLAMA_LOG_INFO("%s: n_cls_out             = %u\n", __func__, hparams.n_cls_out);
8403
8404
0
            size_t i = 0;
8405
0
            for (auto label : classifier_labels) {
8406
0
                LLAMA_LOG_INFO("%s: cls_label[%2zu]         = %s\n", __func__, i++, label.c_str());
8407
0
            }
8408
0
        }
8409
0
    }
8410
8411
0
    if (arch == LLM_ARCH_MAMBA ||
8412
0
        arch == LLM_ARCH_MAMBA2 ||
8413
0
        arch == LLM_ARCH_JAMBA ||
8414
0
        arch == LLM_ARCH_FALCON_H1 ||
8415
0
        arch == LLM_ARCH_PLAMO2 ||
8416
0
        arch == LLM_ARCH_GRANITE_HYBRID ||
8417
0
        arch == LLM_ARCH_QWEN3NEXT ||
8418
0
        arch == LLM_ARCH_QWEN35 ||
8419
0
        arch == LLM_ARCH_QWEN35MOE ||
8420
0
        arch == LLM_ARCH_NEMOTRON_H ||
8421
0
        arch == LLM_ARCH_NEMOTRON_H_MOE) {
8422
0
        LLAMA_LOG_INFO("%s: ssm_d_conv            = %u\n",     __func__, hparams.ssm_d_conv);
8423
0
        LLAMA_LOG_INFO("%s: ssm_d_inner           = %u\n",     __func__, hparams.ssm_d_inner);
8424
0
        LLAMA_LOG_INFO("%s: ssm_d_state           = %u\n",     __func__, hparams.ssm_d_state);
8425
0
        LLAMA_LOG_INFO("%s: ssm_dt_rank           = %u\n",     __func__, hparams.ssm_dt_rank);
8426
0
        LLAMA_LOG_INFO("%s: ssm_n_group           = %u\n",     __func__, hparams.ssm_n_group);
8427
0
        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms        = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
8428
0
    }
8429
8430
0
    LLAMA_LOG_INFO("%s: model type            = %s\n",     __func__, type_name().c_str());
8431
0
    if (pimpl->n_elements >= 1e12) {
8432
0
        LLAMA_LOG_INFO("%s: model params          = %.2f T\n", __func__, pimpl->n_elements*1e-12);
8433
0
    } else if (pimpl->n_elements >= 1e9) {
8434
0
        LLAMA_LOG_INFO("%s: model params          = %.2f B\n", __func__, pimpl->n_elements*1e-9);
8435
0
    } else if (pimpl->n_elements >= 1e6) {
8436
0
        LLAMA_LOG_INFO("%s: model params          = %.2f M\n", __func__, pimpl->n_elements*1e-6);
8437
0
    } else {
8438
0
        LLAMA_LOG_INFO("%s: model params          = %.2f K\n", __func__, pimpl->n_elements*1e-3);
8439
0
    }
8440
8441
    // general kv
8442
0
    LLAMA_LOG_INFO("%s: general.name          = %s\n",    __func__, name.c_str());
8443
8444
0
    if (arch == LLM_ARCH_DEEPSEEK) {
8445
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
8446
0
        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
8447
0
        LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
8448
0
        LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
8449
0
    }
8450
8451
0
    if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
8452
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
8453
0
        LLAMA_LOG_INFO("%s: n_lora_q              = %d\n",     __func__, hparams.n_lora_q);
8454
0
        LLAMA_LOG_INFO("%s: n_lora_kv             = %d\n",     __func__, hparams.n_lora_kv);
8455
0
        LLAMA_LOG_INFO("%s: n_embd_head_k_mla     = %d\n",     __func__, hparams.n_embd_head_k_mla());
8456
0
        LLAMA_LOG_INFO("%s: n_embd_head_v_mla     = %d\n",     __func__, hparams.n_embd_head_v_mla());
8457
0
        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
8458
0
        LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
8459
0
        LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
8460
0
        LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
8461
0
        LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
8462
0
    }
8463
8464
0
    if (arch == LLM_ARCH_QWEN2MOE) {
8465
0
        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
8466
0
        LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
8467
0
    }
8468
8469
0
    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
8470
0
        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
8471
0
    }
8472
8473
0
    if (arch == LLM_ARCH_MINICPM ||
8474
0
        arch == LLM_ARCH_GRANITE ||
8475
0
        arch == LLM_ARCH_GRANITE_MOE ||
8476
0
        arch == LLM_ARCH_GRANITE_HYBRID ||
8477
0
        arch == LLM_ARCH_NEMOTRON_H_MOE) {
8478
0
        LLAMA_LOG_INFO("%s: f_embedding_scale     = %f\n", __func__, hparams.f_embedding_scale);
8479
0
        LLAMA_LOG_INFO("%s: f_residual_scale      = %f\n", __func__, hparams.f_residual_scale);
8480
0
        LLAMA_LOG_INFO("%s: f_attention_scale     = %f\n", __func__, hparams.f_attention_scale);
8481
0
        LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n", __func__, hparams.n_ff_shexp);
8482
0
    }
8483
8484
0
    if (arch == LLM_ARCH_BAILINGMOE) {
8485
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
8486
0
        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
8487
0
        LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
8488
0
        LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
8489
0
        LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
8490
0
    }
8491
8492
0
    if (arch == LLM_ARCH_BAILINGMOE2) {
8493
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
8494
0
        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
8495
0
        LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
8496
0
        LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
8497
0
        LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
8498
0
        LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
8499
0
        LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
8500
0
        LLAMA_LOG_INFO("%s: nextn_predict_layers  = %d\n",     __func__, hparams.nextn_predict_layers);
8501
0
    }
8502
8503
0
    if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
8504
0
        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
8505
0
        LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
8506
0
    }
8507
8508
0
    if (arch == LLM_ARCH_GROVEMOE) {
8509
0
        LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
8510
0
        LLAMA_LOG_INFO("%s: n_ff_chexp            = %d\n",     __func__, hparams.n_ff_chexp);
8511
0
        LLAMA_LOG_INFO("%s: n_group_experts       = %d\n",     __func__, hparams.n_group_experts);
8512
0
        LLAMA_LOG_INFO("%s: expert_group_scale    = %.2f\n",   __func__, hparams.expert_group_scale);
8513
0
    }
8514
8515
0
    vocab.print_info();
8516
0
}
8517
8518
0
// Returns the device recorded for layer `il`.
// The lookup uses .at(il), so it is bounds-checked by the container.
ggml_backend_dev_t llama_model::dev_layer(int il) const {
    const auto & entry = pimpl->dev_layer.at(il);
    return entry.dev;
}
8521
8522
0
// Returns the device recorded for the output layer.
ggml_backend_dev_t llama_model::dev_output() const {
    const auto & entry = pimpl->dev_output;
    return entry.dev;
}
8525
8526
template<typename F>
8527
0
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
8528
0
    ggml_init_params params = {
8529
0
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
8530
0
        /*.mem_buffer =*/ NULL,
8531
0
        /*.no_alloc   =*/ true,
8532
0
    };
8533
8534
0
    ggml_context_ptr ctx { ggml_init(params) };
8535
0
    if (!ctx) {
8536
0
        throw std::runtime_error(format("failed to create ggml context"));
8537
0
    }
8538
8539
0
    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
8540
0
    ggml_tensor * op_tensor = fn(ctx.get());
8541
0
    for (int i = 0; i < GGML_MAX_SRC; i++) {
8542
0
        if (op_tensor->src[i] != nullptr) {
8543
0
            assert(op_tensor->src[i]->buffer == nullptr);
8544
0
            op_tensor->src[i]->buffer = buf.get();
8545
0
        }
8546
0
    }
8547
8548
0
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
8549
8550
0
    return op_supported;
8551
0
}
8552
8553
template<typename F>
8554
0
static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
8555
0
    for (const auto & cur : buft_list) {
8556
0
        ggml_backend_dev_t cur_dev = cur.first;
8557
0
        ggml_backend_buffer_type_t cur_buft = cur.second;
8558
0
        if (buft_supported(cur_buft, cur_dev, fn)) {
8559
0
            return cur_buft;
8560
0
        }
8561
0
    }
8562
8563
0
    throw std::runtime_error(format("no suitable buffer type found"));
8564
0
}
8565
8566
0
// Picks a buffer type for layer `il` from that layer's buffer type list.
// The probe op is an element-wise add of two F32 vectors with n_embd elements each.
ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
    const auto & layer_bufts = *pimpl->dev_layer.at(il).buft_list;

    const auto build_probe_op = [&](ggml_context * ctx) {
        ggml_tensor * lhs = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        ggml_tensor * rhs = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        return ggml_add(ctx, lhs, rhs);
    };

    return ::select_buft(layer_bufts, build_probe_op);
}
8575
8576
0
bool llama_model::has_tensor_overrides() const {
8577
0
    return pimpl->has_tensor_overrides;
8578
0
}
8579
8580
0
// Looks up a tensor by its exact name in tensors_by_name (linear scan, first match wins).
//
// Returns nullptr when no tensor has that name. A null `name` also yields nullptr:
// comparing a std::string against a null `const char *` is undefined behaviour,
// so it is rejected before the scan.
const ggml_tensor * llama_model::get_tensor(const char * name) const {
    if (name == nullptr) {
        return nullptr;
    }

    for (const auto & entry : tensors_by_name) {
        if (entry.first == name) {
            return entry.second;
        }
    }

    return nullptr;
}
8591
8592
0
float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
8593
0
    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
8594
0
}
8595
8596
0
float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
8597
0
    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
8598
0
}
8599
8600
0
// Returns the RoPE frequency-factor tensor to use for layer `il`.
//
// A per-layer rope_freqs tensor takes precedence when it is set. Otherwise the
// long or short factors are chosen by comparing the per-sequence context size
// with hparams.n_ctx_orig_yarn. `il` is used as an index without a range check.
ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
    const auto & layer = layers[il];

    if (layer.rope_freqs != nullptr) {
        return layer.rope_freqs;
    }

    // choose long/short freq factors based on the context size
    const uint32_t n_ctx_seq = cparams.n_ctx_seq;
    const bool     use_long  = n_ctx_seq > hparams.n_ctx_orig_yarn;

    return use_long ? layer.rope_long : layer.rope_short;
}
8614
8615
0
// Creates the memory module that matches this model's architecture.
//
// Returns nullptr for the architectures listed in the first switch. All other
// architectures get a recurrent, hybrid or KV-cache object allocated with `new`
// and returned as a raw pointer (presumably owned by the caller - confirm at the call site).
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
    // Models that need specific instantiation are handled in this switch
    switch (arch) {
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_EUROBERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_RND1:
            return nullptr;
        default:
            break;
    }

    // Everything else is routed by the recurrent/hybrid checks below

    if (llm_arch_is_recurrent(arch)) {
        return new llama_memory_recurrent(
                *this,
                GGML_TYPE_F32,
                GGML_TYPE_F32,
                cparams.offload_kqv,
                std::max((uint32_t) 1, cparams.n_seq_max),
                cparams.n_seq_max,
                nullptr);
    }

    const bool has_swa = hparams.swa_type != LLAMA_SWA_TYPE_NONE;

    if (llm_arch_is_hybrid(arch)) {
        // The main difference between hybrid architectures is the layer
        // filters, so select the pair that belongs to this architecture
        llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
        llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;

        switch (arch) {
            case LLM_ARCH_FALCON_H1:
                // both filters accept every layer
                filter_attn = [](int32_t) { return true; };
                filter_recr = [](int32_t) { return true; };
                break;
            case LLM_ARCH_NEMOTRON_H:
            case LLM_ARCH_NEMOTRON_H_MOE:
                // only layers with n_ff == 0 are accepted, split by is_recurrent()
                filter_attn = [this](int32_t il) {
                    return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                };
                filter_recr = [this](int32_t il) {
                    return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                };
                break;
            default:
                break;
        }

        if (has_swa) {
            // Use hybrid-iswa for hybrid models with SWA
            return new llama_memory_hybrid_iswa(
                /* model             */ *this,
                /* attn_type_k       */ params.type_k,
                /* attn_type_v       */ params.type_v,
                /* attn_v_trans      */ !cparams.flash_attn,
                /* attn_swa_full     */ params.swa_full,
                /* attn_kv_size      */ cparams.n_ctx_seq,
                /* attn_n_ubatch     */ cparams.n_ubatch,
                /* attn_n_pad        */ 1,
                /* recurrent_type_r  */ GGML_TYPE_F32,
                /* recurrent_type_s  */ GGML_TYPE_F32,
                /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                /* n_seq_max         */ cparams.n_seq_max,
                /* offload           */ cparams.offload_kqv,
                /* unified           */ cparams.kv_unified,
                /* filter_attn       */ std::move(filter_attn),
                /* filter_recr       */ std::move(filter_recr));
        }

        return new llama_memory_hybrid(
            /* model             */ *this,
            /* attn_type_k       */ params.type_k,
            /* attn_type_v       */ params.type_v,
            /* attn_v_trans      */ !cparams.flash_attn,
            /* attn_kv_size      */ cparams.n_ctx_seq,
            /* attn_n_pad        */ 1,
            /* attn_n_swa        */ hparams.n_swa,
            /* attn_swa_type     */ hparams.swa_type,
            /* recurrent_type_k  */ GGML_TYPE_F32,
            /* recurrent_type_v  */ GGML_TYPE_F32,
            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
            /* n_seq_max         */ cparams.n_seq_max,
            /* offload           */ cparams.offload_kqv,
            /* unified           */ cparams.kv_unified,
            /* filter_attn       */ std::move(filter_attn),
            /* filter_recr       */ std::move(filter_recr));
    }

    // neither recurrent nor hybrid: KV cache, with or without SWA

    llama_memory_i::layer_reuse_cb reuse = nullptr;

    if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
        // layers at or past n_layer_kv_from_start map to an earlier layer index
        // (2 back for is_swa() layers, 1 back otherwise); all other layers map to -1
        reuse = [this](int32_t il) {
            const int32_t n_kv_from_start = (int32_t) hparams.n_layer_kv_from_start;

            return il >= n_kv_from_start
                ? n_kv_from_start - (hparams.is_swa(il) ? 2 : 1)
                : -1;
        };
    }

    if (has_swa) {
        GGML_ASSERT(hparams.is_swa_any());

        return new llama_kv_cache_iswa(
                *this,
                params.type_k,
                params.type_v,
                !cparams.flash_attn,
                cparams.offload_kqv,
                params.swa_full,
                cparams.kv_unified,
                cparams.n_ctx_seq,
                cparams.n_seq_max,
                cparams.n_ubatch,
                1,
                nullptr,
                reuse);
    }

    GGML_ASSERT(!hparams.is_swa_any());

    // note: the reuse callback is not forwarded on this path
    return new llama_kv_cache(
            *this,
            params.type_k,
            params.type_v,
            !cparams.flash_attn,
            cparams.offload_kqv,
            cparams.kv_unified,
            cparams.n_ctx_seq,
            cparams.n_seq_max,
            1,
            hparams.n_swa,
            hparams.swa_type,
            nullptr,
            nullptr);
}
8760
8761
0
// Builds the compute graph for this model.
//
// The architecture selects which llm_build_* graph builder is instantiated; several
// architectures additionally pick a template variant based on hparams.swa_type, and T5
// picks encoder vs decoder based on params.gtype. An unknown architecture (or an invalid
// T5 graph type) aborts via GGML_ABORT. After the builder has run, pooling, sampling and
// dense output layers are appended and the resulting graph is returned.
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    std::unique_ptr<llm_graph_context> llm;

    // the two SWA predicates used by the variant selection below
    const bool swa_none     = hparams.swa_type == LLAMA_SWA_TYPE_NONE;
    const bool swa_standard = hparams.swa_type == LLAMA_SWA_TYPE_STANDARD;

    switch (arch) {
        case LLM_ARCH_LLAMA:
            llm = std::make_unique<llm_build_llama<false>>(*this, params);
            break;
        case LLM_ARCH_LLAMA4:
            if (swa_none) {
                llm = std::make_unique<llm_build_llama<false>>(*this, params);
            } else {
                llm = std::make_unique<llm_build_llama_iswa>(*this, params);
            }
            break;
        case LLM_ARCH_LLAMA_EMBED:
            llm = std::make_unique<llm_build_llama<true>>(*this, params);
            break;
        case LLM_ARCH_MAINCODER:
            llm = std::make_unique<llm_build_maincoder>(*this, params);
            break;
        case LLM_ARCH_DECI:
            llm = std::make_unique<llm_build_deci>(*this, params);
            break;
        case LLM_ARCH_BAICHUAN:
            llm = std::make_unique<llm_build_baichuan>(*this, params);
            break;
        case LLM_ARCH_FALCON:
            llm = std::make_unique<llm_build_falcon>(*this, params);
            break;
        case LLM_ARCH_GROK:
            llm = std::make_unique<llm_build_grok>(*this, params);
            break;
        case LLM_ARCH_STARCODER:
            llm = std::make_unique<llm_build_starcoder>(*this, params);
            break;
        case LLM_ARCH_REFACT:
            llm = std::make_unique<llm_build_refact>(*this, params);
            break;
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
            llm = std::make_unique<llm_build_bert>(*this, params);
            break;
        case LLM_ARCH_MODERN_BERT:
            llm = std::make_unique<llm_build_modern_bert>(*this, params);
            break;
        case LLM_ARCH_NEO_BERT:
            llm = std::make_unique<llm_build_neo_bert>(*this, params);
            break;
        case LLM_ARCH_EUROBERT:
            llm = std::make_unique<llm_build_eurobert>(*this, params);
            break;
        case LLM_ARCH_BLOOM:
            llm = std::make_unique<llm_build_bloom>(*this, params);
            break;
        case LLM_ARCH_MPT:
            llm = std::make_unique<llm_build_mpt>(*this, params);
            break;
        case LLM_ARCH_STABLELM:
            llm = std::make_unique<llm_build_stablelm>(*this, params);
            break;
        case LLM_ARCH_QWEN:
            llm = std::make_unique<llm_build_qwen>(*this, params);
            break;
        case LLM_ARCH_QWEN2:
            llm = std::make_unique<llm_build_qwen2>(*this, params);
            break;
        case LLM_ARCH_DREAM:
            llm = std::make_unique<llm_build_dream>(*this, params);
            break;
        case LLM_ARCH_LLADA:
            llm = std::make_unique<llm_build_llada>(*this, params);
            break;
        case LLM_ARCH_LLADA_MOE:
            llm = std::make_unique<llm_build_llada_moe>(*this, params);
            break;
        case LLM_ARCH_RND1:
            llm = std::make_unique<llm_build_rnd1>(*this, params);
            break;
        case LLM_ARCH_QWEN2VL:
            llm = std::make_unique<llm_build_qwen2vl>(*this, params);
            break;
        case LLM_ARCH_QWEN2MOE:
            llm = std::make_unique<llm_build_qwen2moe>(*this, params);
            break;
        case LLM_ARCH_QWEN3:
            llm = std::make_unique<llm_build_qwen3>(*this, params);
            break;
        case LLM_ARCH_QWEN3MOE:
            llm = std::make_unique<llm_build_qwen3moe>(*this, params);
            break;
        case LLM_ARCH_QWEN3VL:
            llm = std::make_unique<llm_build_qwen3vl>(*this, params);
            break;
        case LLM_ARCH_QWEN3VLMOE:
            llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
            break;
        case LLM_ARCH_PHI2:
            llm = std::make_unique<llm_build_phi2>(*this, params);
            break;
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
            if (swa_none) {
                llm = std::make_unique<llm_build_phi3<false>>(*this, params);
            } else {
                llm = std::make_unique<llm_build_phi3<true>> (*this, params);
            }
            break;
        case LLM_ARCH_PLAMO:
            llm = std::make_unique<llm_build_plamo>(*this, params);
            break;
        case LLM_ARCH_PLAMO2:
            llm = std::make_unique<llm_build_plamo2>(*this, params);
            break;
        case LLM_ARCH_PLAMO3:
            if (swa_none) {
                llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
            } else {
                llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
            }
            break;
        case LLM_ARCH_GPT2:
            llm = std::make_unique<llm_build_gpt2>(*this, params);
            break;
        case LLM_ARCH_CODESHELL:
            llm = std::make_unique<llm_build_codeshell>(*this, params);
            break;
        case LLM_ARCH_ORION:
            llm = std::make_unique<llm_build_orion>(*this, params);
            break;
        case LLM_ARCH_INTERNLM2:
            llm = std::make_unique<llm_build_internlm2>(*this, params);
            break;
        case LLM_ARCH_MINICPM3:
            llm = std::make_unique<llm_build_minicpm3>(*this, params);
            break;
        case LLM_ARCH_GEMMA:
            llm = std::make_unique<llm_build_gemma>(*this, params);
            break;
        case LLM_ARCH_GEMMA2:
            llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
            break;
        case LLM_ARCH_GEMMA3:
            if (swa_standard) {
                llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
            } else {
                llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
            }
            break;
        case LLM_ARCH_GEMMA3N:
            llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
            break;
        case LLM_ARCH_GEMMA4:
            llm = std::make_unique<llm_build_gemma4_iswa>(*this, params);
            break;
        case LLM_ARCH_GEMMA_EMBEDDING:
            llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
            break;
        case LLM_ARCH_STARCODER2:
            llm = std::make_unique<llm_build_starcoder2>(*this, params);
            break;
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
            llm = std::make_unique<llm_build_mamba>(*this, params);
            break;
        case LLM_ARCH_JAMBA:
            llm = std::make_unique<llm_build_jamba>(*this, params);
            break;
        case LLM_ARCH_XVERSE:
            llm = std::make_unique<llm_build_xverse>(*this, params);
            break;
        case LLM_ARCH_COMMAND_R:
            llm = std::make_unique<llm_build_command_r>(*this, params);
            break;
        case LLM_ARCH_COHERE2:
            llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
            break;
        case LLM_ARCH_DBRX:
            llm = std::make_unique<llm_build_dbrx>(*this, params);
            break;
        case LLM_ARCH_OLMO:
            llm = std::make_unique<llm_build_olmo>(*this, params);
            break;
        case LLM_ARCH_OLMO2:
            if (swa_standard) {
                llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
            } else {
                llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
            }
            break;
        case LLM_ARCH_OLMOE:
            llm = std::make_unique<llm_build_olmoe>(*this, params);
            break;
        case LLM_ARCH_OPENELM:
            llm = std::make_unique<llm_build_openelm>(*this, params);
            break;
        case LLM_ARCH_GPTNEOX:
            llm = std::make_unique<llm_build_gptneox>(*this, params);
            break;
        case LLM_ARCH_ARCTIC:
            llm = std::make_unique<llm_build_arctic>(*this, params);
            break;
        case LLM_ARCH_DEEPSEEK:
            llm = std::make_unique<llm_build_deepseek>(*this, params);
            break;
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_DEEPSEEK2OCR:
        case LLM_ARCH_GLM_DSA:
        case LLM_ARCH_MISTRAL4:
            llm = std::make_unique<llm_build_deepseek2>(*this, params);
            break;
        case LLM_ARCH_CHATGLM:
            llm = std::make_unique<llm_build_chatglm>(*this, params);
            break;
        case LLM_ARCH_GLM4:
            llm = std::make_unique<llm_build_glm4>(*this, params);
            break;
        case LLM_ARCH_GLM4_MOE:
            llm = std::make_unique<llm_build_glm4_moe>(*this, params);
            break;
        case LLM_ARCH_BITNET:
            llm = std::make_unique<llm_build_bitnet>(*this, params);
            break;
        case LLM_ARCH_T5:
            // encoder or decoder graph, depending on the requested graph type
            switch (params.gtype) {
                case LLM_GRAPH_TYPE_ENCODER:
                    llm = std::make_unique<llm_build_t5_enc>(*this, params);
                    break;
                case LLM_GRAPH_TYPE_DEFAULT:
                case LLM_GRAPH_TYPE_DECODER:
                    llm = std::make_unique<llm_build_t5_dec>(*this, params);
                    break;
                default:
                    GGML_ABORT("invalid graph type");
            }
            break;
        case LLM_ARCH_T5ENCODER:
            llm = std::make_unique<llm_build_t5_enc>(*this, params);
            break;
        case LLM_ARCH_JAIS:
            llm = std::make_unique<llm_build_jais>(*this, params);
            break;
        case LLM_ARCH_JAIS2:
            llm = std::make_unique<llm_build_jais2>(*this, params);
            break;
        case LLM_ARCH_NEMOTRON:
            llm = std::make_unique<llm_build_nemotron>(*this, params);
            break;
        case LLM_ARCH_NEMOTRON_H:
        case LLM_ARCH_NEMOTRON_H_MOE:
            llm = std::make_unique<llm_build_nemotron_h>(*this, params);
            break;
        case LLM_ARCH_EXAONE:
            llm = std::make_unique<llm_build_exaone>(*this, params);
            break;
        case LLM_ARCH_EXAONE4:
            if (swa_standard) {
                llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
            } else {
                llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
            }
            break;
        case LLM_ARCH_EXAONE_MOE:
            llm = std::make_unique<llm_build_exaone_moe>(*this, params);
            break;
        case LLM_ARCH_RWKV6:
            llm = std::make_unique<llm_build_rwkv6>(*this, params);
            break;
        case LLM_ARCH_RWKV6QWEN2:
            llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
            break;
        case LLM_ARCH_RWKV7:
            llm = std::make_unique<llm_build_rwkv7>(*this, params);
            break;
        case LLM_ARCH_ARWKV7:
            llm = std::make_unique<llm_build_arwkv7>(*this, params);
            break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MINICPM:
            llm = std::make_unique<llm_build_granite>(*this, params);
            break;
        case LLM_ARCH_GRANITE_HYBRID:
            llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
            break;
        case LLM_ARCH_CHAMELEON:
            llm = std::make_unique<llm_build_chameleon>(*this, params);
            break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
            break;
        case LLM_ARCH_PLM:
            llm = std::make_unique<llm_build_plm>(*this, params);
            break;
        case LLM_ARCH_BAILINGMOE:
            llm = std::make_unique<llm_build_bailingmoe>(*this, params);
            break;
        case LLM_ARCH_BAILINGMOE2:
            llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
            break;
        case LLM_ARCH_SEED_OSS:
            llm = std::make_unique<llm_build_seed_oss>(*this, params);
            break;
        case LLM_ARCH_DOTS1:
            llm = std::make_unique<llm_build_dots1>(*this, params);
            break;
        case LLM_ARCH_ARCEE:
            llm = std::make_unique<llm_build_arcee>(*this, params);
            break;
        case LLM_ARCH_AFMOE:
            llm = std::make_unique<llm_build_afmoe>(*this, params);
            break;
        case LLM_ARCH_ERNIE4_5:
            llm = std::make_unique<llm_build_ernie4_5>(*this, params);
            break;
        case LLM_ARCH_ERNIE4_5_MOE:
            llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
            break;
        case LLM_ARCH_PADDLEOCR:
            llm = std::make_unique<llm_build_paddleocr>(*this, params);
            break;
        case LLM_ARCH_HUNYUAN_MOE:
            llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            break;
        case LLM_ARCH_HUNYUAN_DENSE:
            llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
            break;
        case LLM_ARCH_SMOLLM3:
            llm = std::make_unique<llm_build_smollm3>(*this, params);
            break;
        case LLM_ARCH_OPENAI_MOE:
            llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
            break;
        case LLM_ARCH_FALCON_H1:
            llm = std::make_unique<llm_build_falcon_h1>(*this, params);
            break;
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
            if (swa_standard) {
                llm = std::make_unique<llm_build_lfm2<true>>(*this, params);
            } else {
                llm = std::make_unique<llm_build_lfm2<false>>(*this, params);
            }
            break;
        case LLM_ARCH_SMALLTHINKER:
            if (swa_standard) {
                llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
            } else {
                llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
            }
            break;
        case LLM_ARCH_GROVEMOE:
            llm = std::make_unique<llm_build_grovemoe>(*this, params);
            break;
        case LLM_ARCH_APERTUS:
            llm = std::make_unique<llm_build_apertus>(*this, params);
            break;
        case LLM_ARCH_MINIMAX_M2:
            llm = std::make_unique<llm_build_minimax_m2>(*this, params);
            break;
        case LLM_ARCH_COGVLM:
            llm = std::make_unique<llm_build_cogvlm>(*this, params);
            break;
        case LLM_ARCH_PANGU_EMBED:
            llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
            break;
        case LLM_ARCH_QWEN3NEXT:
            llm = std::make_unique<llm_build_qwen3next>(*this, params);
            break;
        case LLM_ARCH_QWEN35:
            llm = std::make_unique<llm_build_qwen35>(*this, params);
            break;
        case LLM_ARCH_QWEN35MOE:
            llm = std::make_unique<llm_build_qwen35moe>(*this, params);
            break;
        case LLM_ARCH_MISTRAL3:
            llm = std::make_unique<llm_build_mistral3>(*this, params);
            break;
        case LLM_ARCH_MIMO2:
            llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
            break;
        case LLM_ARCH_KIMI_LINEAR:
            llm = std::make_unique<llm_build_kimi_linear>(*this, params);
            break;
        case LLM_ARCH_STEP35:
            llm = std::make_unique<llm_build_step35_iswa>(*this, params);
            break;
        default:
            GGML_ABORT("fatal error");
    }

    // append the pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);

    // append backend sampling layers (if any)
    llm->build_sampling();

    // models converted with --sentence-transformers-dense-modules carry two extra
    // dense projection layers; those linear projections are applied after pooling
    // TODO: move reranking logic here and generalize
    llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);

    llm->res->set_outputs();

    return llm->res->get_gf();
}
9285
9286
9287
//
9288
// interface implementation
9289
//
9290
9291
4.14k
// Builds the default set of model-loading parameters.
//
// The struct is filled positionally; the inline comment in front of each value names the
// field that value is intended for.
llama_model_params llama_model_default_params() {
    return llama_model_params {
        /*.devices                     =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
        /*.n_gpu_layers                =*/ -1,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_direct_io               =*/ false,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
        /*.use_extra_bufts             =*/ true,
        /*.no_host                     =*/ false,
        /*.no_alloc                    =*/ false,
    };
}
9314
9315
0
// Returns a pointer to the vocabulary stored inside the model (model->vocab).
// The pointer refers to a member of `model`, so it is not a separate allocation.
const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    const llama_vocab * vocab = &model->vocab;
    return vocab;
}
9318
9319
0
// Older spelling of llama_model_free(); simply forwards to it.
void llama_free_model(llama_model * model) {
    llama_model_free(model);
}
9322
9323
3.90k
// Destroys the model object with `delete`.
// Passing a null pointer is a no-op, as with any delete-expression.
void llama_model_free(llama_model * model) {
    delete model;
}
9326
9327
0
int32_t llama_model_n_ctx_train(const llama_model * model) {
9328
0
    return model->hparams.n_ctx_train;
9329
0
}
9330
9331
0
int32_t llama_model_n_embd(const llama_model * model) {
9332
0
    return model->hparams.n_embd;
9333
0
}
9334
9335
0
int32_t llama_model_n_embd_inp(const llama_model * model) {
9336
0
    return model->hparams.n_embd_inp();
9337
0
}
9338
9339
0
int32_t llama_model_n_embd_out(const llama_model * model) {
9340
0
    return model->hparams.n_embd_out();
9341
0
}
9342
9343
0
int32_t llama_model_n_layer(const llama_model * model) {
9344
0
    return model->hparams.n_layer;
9345
0
}
9346
9347
0
int32_t llama_model_n_head(const llama_model * model) {
9348
0
    return model->hparams.n_head();
9349
0
}
9350
9351
0
int32_t llama_model_n_head_kv(const llama_model * model) {
9352
0
    return model->hparams.n_head_kv();
9353
0
}
9354
9355
0
int32_t llama_model_n_swa(const llama_model * model) {
9356
0
    return model->hparams.n_swa;
9357
0
}
9358
9359
0
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
9360
0
    return model->hparams.n_cls_out;
9361
0
}
9362
9363
0
const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
9364
0
    if (i < model->classifier_labels.size()) {
9365
0
        return model->classifier_labels[i].c_str();
9366
0
    }
9367
9368
0
    return nullptr;
9369
0
}
9370
9371
// deprecated
9372
0
int32_t llama_n_ctx_train(const llama_model * model) {
9373
0
    return llama_model_n_ctx_train(model);
9374
0
}
9375
9376
// deprecated
9377
0
int32_t llama_n_embd(const llama_model * model) {
9378
0
    return llama_model_n_embd(model);
9379
0
}
9380
9381
// deprecated
9382
0
int32_t llama_n_layer(const llama_model * model) {
9383
0
    return llama_model_n_layer(model);
9384
0
}
9385
9386
// deprecated
9387
0
int32_t llama_n_head(const llama_model * model) {
9388
0
    return llama_model_n_head(model);
9389
0
}
9390
9391
0
// Maps the model architecture (model->arch) to the RoPE variant it uses.
//
// The switch deliberately has no `default` label: every architecture is meant to be listed
// explicitly. LLM_ARCH_UNKNOWN aborts; the trailing return is only reached for an arch value
// that matches none of the labels.
llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // architectures without RoPE
        case LLM_ARCH_CLIP:
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
        case LLM_ARCH_NEMOTRON_H_MOE:
        case LLM_ARCH_KIMI_LINEAR:
            {
                return LLAMA_ROPE_TYPE_NONE;
            }

        // "normal" RoPE: rotates pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_DEEPSEEK2OCR:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_MISTRAL3:
        case LLM_ARCH_MISTRAL4:
        case LLM_ARCH_LLAMA_EMBED:
        case LLM_ARCH_MAINCODER:
        case LLM_ARCH_GLM_DSA:
            {
                return LLAMA_ROPE_TYPE_NORM;
            }

        // NeoX-style RoPE: the paired head values are n_rot/2 apart
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_EUROBERT:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_RND1:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_PLAMO3:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA4:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_EXAONE_MOE:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_JAIS2:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_SEED_OSS:
        case LLM_ARCH_GROVEMOE:
        case LLM_ARCH_APERTUS:
        case LLM_ARCH_MINIMAX_M2:
        case LLM_ARCH_COGVLM:
        case LLM_ARCH_PANGU_EMBED:
        case LLM_ARCH_AFMOE:
        case LLM_ARCH_QWEN3NEXT:
        case LLM_ARCH_MIMO2:
        case LLM_ARCH_STEP35:
            {
                return LLAMA_ROPE_TYPE_NEOX;
            }

        // architectures that always use M-RoPE
        case LLM_ARCH_QWEN2VL:
        case LLM_ARCH_PADDLEOCR:
            {
                return LLAMA_ROPE_TYPE_MROPE;
            }

        // architectures that always use the IMROPE variant
        case LLM_ARCH_QWEN3VL:
        case LLM_ARCH_QWEN3VLMOE:
        case LLM_ARCH_QWEN35:
        case LLM_ARCH_QWEN35MOE:
            {
                return LLAMA_ROPE_TYPE_IMROPE;
            }

        // GLM4: M-RoPE when the hparams ask for it, normal RoPE otherwise
        case LLM_ARCH_GLM4:
            {
                if (model->hparams.use_mrope()) {
                    return LLAMA_ROPE_TYPE_MROPE;
                }
                return LLAMA_ROPE_TYPE_NORM;
            }

        // GLM4 MoE: M-RoPE when the hparams ask for it, NeoX-style RoPE otherwise
        case LLM_ARCH_GLM4_MOE:
            {
                if (model->hparams.use_mrope()) {
                    return LLAMA_ROPE_TYPE_MROPE;
                }
                return LLAMA_ROPE_TYPE_NEOX;
            }

        // every model arch is expected to appear explicitly in this switch
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}
9540
9541
0
float llama_model_rope_freq_scale_train(const llama_model * model) {
9542
0
    return model->hparams.rope_freq_scale_train;
9543
0
}
9544
9545
0
int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
9546
0
    const auto & it = model->gguf_kv.find(key);
9547
0
    if (it == model->gguf_kv.end()) {
9548
0
        if (buf_size > 0) {
9549
0
            buf[0] = '\0';
9550
0
        }
9551
0
        return -1;
9552
0
    }
9553
0
    return snprintf(buf, buf_size, "%s", it->second.c_str());
9554
0
}
9555
9556
0
int32_t llama_model_meta_count(const llama_model * model) {
9557
0
    return (int)model->gguf_kv.size();
9558
0
}
9559
9560
0
// Translates a llama_model_meta_key enumerator into its metadata key string.
// Returns nullptr for any value that has no entry in the table below.
const char * llama_model_meta_key_str(llama_model_meta_key key) {
    const char * name = nullptr;

    switch (key) {
        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE:        name = "general.sampling.sequence";        break;
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K:           name = "general.sampling.top_k";           break;
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P:           name = "general.sampling.top_p";           break;
        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P:           name = "general.sampling.min_p";           break;
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: name = "general.sampling.xtc_probability"; break;
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD:   name = "general.sampling.xtc_threshold";   break;
        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP:            name = "general.sampling.temp";            break;
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N:  name = "general.sampling.penalty_last_n";  break;
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT:  name = "general.sampling.penalty_repeat";  break;
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT:        name = "general.sampling.mirostat";        break;
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU:    name = "general.sampling.mirostat_tau";    break;
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA:    name = "general.sampling.mirostat_eta";    break;
        default:                                            break; // unknown key -> nullptr
    }

    return name;
}
9577
9578
0
int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
9579
0
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
9580
0
        if (buf_size > 0) {
9581
0
            buf[0] = '\0';
9582
0
        }
9583
0
        return -1;
9584
0
    }
9585
0
    auto it = model->gguf_kv.begin();
9586
0
    std::advance(it, i);
9587
0
    return snprintf(buf, buf_size, "%s", it->first.c_str());
9588
0
}
9589
9590
0
int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
9591
0
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
9592
0
        if (buf_size > 0) {
9593
0
            buf[0] = '\0';
9594
0
        }
9595
0
        return -1;
9596
0
    }
9597
0
    auto it = model->gguf_kv.begin();
9598
0
    std::advance(it, i);
9599
0
    return snprintf(buf, buf_size, "%s", it->second.c_str());
9600
0
}
9601
9602
0
int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
9603
0
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
9604
0
}
9605
9606
0
uint64_t llama_model_size(const llama_model * model) {
9607
0
    return model->size();
9608
0
}
9609
9610
0
const char * llama_model_chat_template(const llama_model * model, const char * name) {
9611
0
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
9612
0
        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
9613
0
    const auto & it = model->gguf_kv.find(key);
9614
0
    if (it == model->gguf_kv.end()) {
9615
        // one-off fix for very popular models (so we are not flooded with issues)
9616
        // do not extend this list unless absolutely necessary
9617
        // Mistral-Small-2503 does not have built-in chat template
9618
0
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
9619
0
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
9620
0
            return "mistral-v7-tekken";
9621
0
        }
9622
9623
0
        return nullptr;
9624
0
    }
9625
9626
0
    return it->second.c_str();
9627
0
}
9628
9629
0
uint64_t llama_model_n_params(const llama_model * model) {
9630
0
    return model->n_elements();
9631
0
}
9632
9633
0
bool llama_model_has_encoder(const llama_model * model) {
9634
0
    switch (model->arch) {
9635
0
        case LLM_ARCH_T5:        return true;
9636
0
        case LLM_ARCH_T5ENCODER: return true;
9637
0
        default:                 return false;
9638
0
    }
9639
0
}
9640
9641
0
bool llama_model_has_decoder(const llama_model * model) {
9642
0
    switch (model->arch) {
9643
0
        case LLM_ARCH_T5ENCODER: return false;
9644
0
        default:                 return true;
9645
0
    }
9646
0
}
9647
9648
0
// Returns hparams.dec_start_token_id of the model.
llama_token llama_model_decoder_start_token(const llama_model * model) {
    const llama_hparams & hparams = model->hparams;
    return hparams.dec_start_token_id;
}
9651
9652
0
bool llama_model_is_recurrent(const llama_model * model) {
9653
0
    return llm_arch_is_recurrent(model->arch);
9654
0
}
9655
9656
0
bool llama_model_is_hybrid(const llama_model * model) {
9657
0
    return llm_arch_is_hybrid(model->arch);
9658
0
}
9659
9660
0
bool llama_model_is_diffusion(const llama_model * model) {
9661
0
    return llm_arch_is_diffusion(model->arch);
9662
0
}
9663
9664
0
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
9665
0
    return model->tensors_by_name;
9666
0
}