Coverage Report

Created: 2025-12-14 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/src/llama-model.cpp
Line
Count
Source
1
#include "llama-model.h"
2
3
#include "llama-impl.h"
4
#include "llama-mmap.h"
5
#include "llama-cparams.h"
6
#include "llama-model-loader.h"
7
8
#include "llama-kv-cache.h"
9
#include "llama-kv-cache-iswa.h"
10
#include "llama-memory-hybrid.h"
11
#include "llama-memory-recurrent.h"
12
13
#include "ggml-cpp.h"
14
15
#include "models/models.h"
16
17
#include <algorithm>
18
#include <cassert>
19
#include <cfloat>
20
#include <cstring>
21
#include <cmath>
22
#include <functional>
23
#include <map>
24
#include <regex>
25
#include <sstream>
26
#include <stdexcept>
27
28
0
// Returns the short, human-readable label for a model type enumerator.
// Enumerators without an explicit entry below are reported as "?B".
// The returned pointer refers to a string literal and must not be freed.
const char * llm_type_name(llm_type type) {
    switch (type) {
        // "M"-suffixed labels
        case LLM_TYPE_14M:  return "14M";
        case LLM_TYPE_17M:  return "17M";
        case LLM_TYPE_22M:  return "22M";
        case LLM_TYPE_33M:  return "33M";
        case LLM_TYPE_60M:  return "60M";
        case LLM_TYPE_70M:  return "70M";
        case LLM_TYPE_80M:  return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_140M: return "140M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_190M: return "190M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_256M: return "256M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_350M: return "350M";
        case LLM_TYPE_360M: return "360M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_475M: return "475M";
        case LLM_TYPE_558M: return "558M";
        case LLM_TYPE_700M: return "700M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_950M: return "950M";

        // "B"-suffixed labels
        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B:   return "1B";
        case LLM_TYPE_1_2B: return "1.2B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_1_7B: return "1.7B";
        case LLM_TYPE_1_8B: return "1.8B";
        case LLM_TYPE_2B:   return "2B";
        case LLM_TYPE_2_6B: return "2.6B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_2_9B: return "2.9B";
        case LLM_TYPE_3B:   return "3B";
        case LLM_TYPE_4B:   return "4B";
        case LLM_TYPE_6B:   return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B:   return "7B";
        case LLM_TYPE_8B:   return "8B";
        case LLM_TYPE_9B:   return "9B";
        case LLM_TYPE_11B:  return "11B";
        case LLM_TYPE_12B:  return "12B";
        case LLM_TYPE_13B:  return "13B";
        case LLM_TYPE_14B:  return "14B";
        case LLM_TYPE_15B:  return "15B";
        case LLM_TYPE_16B:  return "16B";
        case LLM_TYPE_20B:  return "20B";
        case LLM_TYPE_26B:  return "26B";
        case LLM_TYPE_27B:  return "27B";
        case LLM_TYPE_30B:  return "30B";
        case LLM_TYPE_32B:  return "32B";
        case LLM_TYPE_34B:  return "34B";
        case LLM_TYPE_35B:  return "35B";
        case LLM_TYPE_36B:  return "36B";
        case LLM_TYPE_40B:  return "40B";
        case LLM_TYPE_65B:  return "65B";
        case LLM_TYPE_70B:  return "70B";
        case LLM_TYPE_120B: return "120B";
        case LLM_TYPE_142B: return "142B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_290B: return "290B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_405B: return "405B";
        case LLM_TYPE_671B: return "671B";

        // symbolic size names
        case LLM_TYPE_SMALL:  return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE:  return "0.8B";
        case LLM_TYPE_XL:     return "1.5B";

        // composite labels
        case LLM_TYPE_A1_7B:         return "A1.7B";
        case LLM_TYPE_A2_7B:         return "A2.7B";
        case LLM_TYPE_8x7B:          return "8x7B";
        case LLM_TYPE_8x22B:         return "8x22B";
        case LLM_TYPE_16x12B:        return "16x12B";
        case LLM_TYPE_16x3_8B:       return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B:      return "57B.A14B";
        case LLM_TYPE_17B_16E:       return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E:      return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B:          return "A13B";
        case LLM_TYPE_7B_A1B:        return "7B.A1B";
        case LLM_TYPE_8B_A1B:        return "8B.A1B";
        case LLM_TYPE_16B_A1B:       return "16B.A1B";
        case LLM_TYPE_21B_A3B:       return "21B.A3B";
        case LLM_TYPE_30B_A3B:       return "30B.A3B";
        case LLM_TYPE_80B_A3B:       return "80B.A3B";
        case LLM_TYPE_100B_A6B:      return "100B.A6B";
        case LLM_TYPE_106B_A12B:     return "106B.A12B";
        case LLM_TYPE_230B_A10B:     return "230B.A10B";
        case LLM_TYPE_235B_A22B:     return "235B.A22B";
        case LLM_TYPE_300B_A47B:     return "300B.A47B";
        case LLM_TYPE_355B_A32B:     return "355B.A32B";
        case LLM_TYPE_E2B:           return "E2B";
        case LLM_TYPE_E4B:           return "E4B";

        default:
            break;
    }

    // no explicit label for this type
    return "?B";
}
135
136
0
// Returns the printable name of an expert gating function type.
// Only softmax and sigmoid have names; any other value yields "unknown".
static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    if (type == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
        return "softmax";
    }

    if (type == LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID) {
        return "sigmoid";
    }

    return "unknown";
}
143
144
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
145
    { LLAMA_ROPE_SCALING_TYPE_NONE,       "none"       },
146
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,     "linear"     },
147
    { LLAMA_ROPE_SCALING_TYPE_YARN,       "yarn"       },
148
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE,   "longrope"   },
149
};
150
151
0
// Returns the string name registered for a rope scaling type in LLAMA_ROPE_SCALING_TYPES.
// throws std::out_of_range (from std::map::at) if the type has no entry in the table
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    const char * label = LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);

    return std::string(label);
}
154
155
0
// Reverse lookup in LLAMA_ROPE_SCALING_TYPES: finds the rope scaling type whose name equals `name`.
// returns LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED when no entry has that name
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    const auto match = std::find_if(
        LLAMA_ROPE_SCALING_TYPES.begin(),
        LLAMA_ROPE_SCALING_TYPES.end(),
        [&name](const std::pair<const llama_rope_scaling_type, const char *> & entry) {
            // std::string == const char * compares contents, not pointers
            return name == entry.second;
        });

    if (match == LLAMA_ROPE_SCALING_TYPES.end()) {
        return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    }

    return match->first;
}
164
165
// checks if the weight tensor can be used with the specified buffer type and device
166
0
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
167
0
    GGML_ASSERT(w != nullptr);
168
169
0
    if (op == GGML_OP_NONE) {
170
0
        return true;
171
0
    }
172
173
0
    ggml_init_params params = {
174
0
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
175
0
        /*.mem_buffer =*/ NULL,
176
0
        /*.no_alloc   =*/ true,
177
0
    };
178
0
    ggml_context_ptr ctx_ptr { ggml_init(params) };
179
0
    if (!ctx_ptr) {
180
0
        throw std::runtime_error(format("failed to create ggml context"));
181
0
    }
182
0
    ggml_context * ctx = ctx_ptr.get();
183
184
0
    ggml_tensor * op_tensor = nullptr;
185
186
0
    switch (op) {
187
0
        case GGML_OP_GET_ROWS:
188
0
            {
189
0
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
190
0
                op_tensor = ggml_get_rows(ctx, w, b);
191
0
            } break;
192
0
        case GGML_OP_MUL_MAT:
193
0
            {
194
0
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
195
0
                op_tensor = ggml_mul_mat(ctx, w, b);
196
0
            } break;
197
0
        case GGML_OP_MUL_MAT_ID:
198
0
            {
199
0
                int n_expert_used = hparams.n_expert_used;
200
0
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
201
0
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
202
0
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
203
0
            } break;
204
0
        case GGML_OP_ADD:
205
0
            {
206
0
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
207
0
                op_tensor = ggml_add(ctx, a, w);
208
0
            } break;
209
0
        case GGML_OP_ADD_ID:
210
0
            {
211
0
                int n_expert_used = hparams.n_expert_used;
212
0
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
213
0
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
214
0
                op_tensor = ggml_add_id(ctx, a, w, c);
215
0
            } break;
216
0
        case GGML_OP_MUL:
217
0
            {
218
0
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
219
0
                op_tensor = ggml_mul(ctx, a, w);
220
0
            } break;
221
0
        case GGML_OP_DIV:
222
0
            {
223
0
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
224
0
                op_tensor = ggml_div(ctx, a, w);
225
0
            } break;
226
0
        case GGML_OP_ROPE:
227
0
            {
228
0
                int n_embd_head = hparams.n_embd_head_v;
229
0
                int n_head = hparams.n_head();
230
0
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
231
0
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
232
0
                op_tensor = ggml_rope_ext(
233
0
                    ctx, a, b, w,
234
0
                    0, 0, 0, 0, 0,
235
0
                    0, 0, 0, 0
236
0
                );
237
238
0
            } break;
239
0
        case GGML_OP_SSM_CONV:
240
0
            {
241
0
                const int64_t n_seq_tokens = 512;
242
0
                const int64_t n_seqs       = 3;
243
0
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
244
0
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
245
0
            } break;
246
0
        case GGML_OP_SSM_SCAN:
247
0
            {
248
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
249
0
                const int64_t d_state      = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
250
0
                const int64_t n_head       = w->ne[1];
251
0
                const int64_t head_dim     = hparams.ssm_d_inner / n_head;
252
0
                const int64_t n_group      = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
253
0
                const int64_t n_seq_tokens = 512;
254
0
                const int64_t n_seqs       = 3;
255
0
                ggml_tensor * s   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
256
0
                ggml_tensor * x   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
257
0
                ggml_tensor * dt  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
258
0
                ggml_tensor * B   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
259
0
                ggml_tensor * C   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
260
0
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
261
0
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
262
0
            } break;
263
0
        case GGML_OP_RWKV_WKV6:
264
0
            {
265
                // FIXME
266
0
                const int64_t S = 123;
267
0
                const int64_t H = 123;
268
0
                const int64_t n_tokens = 123;
269
0
                const int64_t n_seqs = 123;
270
0
                ggml_tensor  * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
271
0
                ggml_tensor  * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
272
0
                ggml_tensor  * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
273
0
                ggml_tensor  * tf = w;
274
0
                ggml_tensor  * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
275
0
                ggml_tensor  * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
276
0
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
277
0
            } break;
278
0
        case GGML_OP_IM2COL:
279
0
            {
280
0
                const int n_embd_inp = hparams.n_embd_inp();
281
0
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
282
0
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
283
0
            } break;
284
0
        case GGML_OP_SCALE:
285
0
            {
286
0
                op_tensor = ggml_scale(ctx, w, 1.0f);
287
0
            } break;
288
0
        default:
289
0
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
290
0
    }
291
292
    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
293
0
    GGML_ASSERT(w->buffer == nullptr);
294
0
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
295
0
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
296
0
    ggml_backend_buffer_free(w->buffer);
297
0
    w->buffer = nullptr;
298
299
0
    return op_supported;
300
0
}
301
302
// lists of buffer types used for each layer
303
// each entry is a (device, buffer type) pair; select_weight_buft walks a list front to back and
// returns the first buffer type that passes the support check, so earlier entries take priority
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
304
305
// find the first buffer type in the list that can use the tensor
306
0
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
307
0
    GGML_ASSERT(!buft_list.empty());
308
0
    for (const auto & cur : buft_list) {
309
0
        ggml_backend_dev_t cur_dev = cur.first;
310
0
        ggml_backend_buffer_type_t cur_buft = cur.second;
311
0
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
312
0
            return cur_buft;
313
0
        }
314
0
    }
315
316
0
    return nullptr;
317
0
}
318
319
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
320
0
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
321
0
    buft_list_t buft_list;
322
323
    // add ACCEL buffer types
324
0
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
325
0
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
326
0
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
327
0
            auto * buft = ggml_backend_dev_buffer_type(dev);
328
            // skip
329
0
            if (buft != ggml_backend_cpu_buffer_type()) {
330
0
                buft_list.emplace_back(dev, buft);
331
0
            }
332
0
        }
333
0
    }
334
335
    // add a host buffer type
336
    // storing the tensors in a host buffer is useful when the processing of large batches
337
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
338
    // generally, this will be done using the first device in the list
339
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
340
    // function of the device to determine if it would benefit from being stored in a host buffer
341
0
    if (!no_host) {
342
0
        for (auto * dev : devices) {
343
0
            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
344
0
            if (buft) {
345
0
                buft_list.emplace_back(dev, buft);
346
0
                break;
347
0
            }
348
0
        }
349
0
    }
350
351
    // add extra buffer types
352
0
    if (use_extra_bufts) {
353
0
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
354
0
        if (cpu_dev == nullptr) {
355
0
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
356
0
        }
357
358
0
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
359
0
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
360
0
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
361
0
        if (ggml_backend_dev_get_extra_bufts_fn) {
362
0
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
363
0
            while (extra_bufts && *extra_bufts) {
364
0
                buft_list.emplace_back(cpu_dev, *extra_bufts);
365
0
                ++extra_bufts;
366
0
            }
367
0
        }
368
0
    }
369
370
    // add the CPU buffer type
371
0
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
372
0
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
373
0
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
374
0
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
375
0
        }
376
0
    }
377
378
0
    return buft_list;
379
0
}
380
381
// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
382
0
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
383
0
    buft_list_t buft_list;
384
385
    // add the device split buffer type if requested and available
386
0
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
387
0
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
388
0
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
389
0
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
390
0
        if (ggml_backend_split_buffer_type_fn) {
391
0
            size_t dev_index = [&]() {
392
0
                auto * reg = ggml_backend_dev_backend_reg(dev);
393
0
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
394
0
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
395
0
                        return i;
396
0
                    }
397
0
                }
398
0
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
399
0
            }();
400
0
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
401
0
            if (buft != nullptr) {
402
0
                buft_list.emplace_back(dev, buft);
403
0
            }
404
0
        }
405
0
    }
406
407
    // add the device default buffer type
408
0
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
409
410
    // add the device extra buffer type (if any)
411
0
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
412
0
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
413
0
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
414
415
0
    if (ggml_backend_dev_get_extra_bufts_fn) {
416
0
        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
417
0
        while (extra_bufts && *extra_bufts) {
418
0
            buft_list.emplace_back(dev, *extra_bufts);
419
0
            ++extra_bufts;
420
0
        }
421
0
    }
422
423
0
    return buft_list;
424
0
}
425
426
// Private state of llama_model (the object behind llama_model::pimpl).
struct llama_model::impl {
    impl() = default;
    ~impl() = default;

    // totals copied from the model loader by llama_model::load_stats
    uint64_t n_elements = 0;

    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;

    // prioritized buffer type lists: one for the CPU side and one per device
    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    // a device together with the buffer type list to use for it
    struct layer_dev {
        ggml_backend_dev_t dev;
        // non-owning; presumably points at cpu_buft_list or an entry of gpu_buft_list - TODO confirm
        buft_list_t * buft_list;
    };

    // NOTE(review): by their names these hold the placement of the input, the output and
    // each layer - confirm against the code that fills them in
    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    // assigned by the llama_model constructor: true when params.tensor_buft_overrides is set
    // and its first entry has a pattern.
    // default-initialized so the flag is never indeterminate if an impl is created elsewhere
    bool has_tensor_overrides = false;
};
460
461
1.03k
// Stores a copy of the model parameters, creates the private implementation object and records
// whether tensor buffer type overrides were supplied.
llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
    // overrides count as present only if the array exists and its first entry has a pattern
    bool overrides_present = false;
    if (params.tensor_buft_overrides) {
        overrides_present = params.tensor_buft_overrides[0].pattern ? true : false;
    }

    pimpl->has_tensor_overrides = overrides_present;
}
464
465
965
// Defaulted here, in the translation unit that defines llama_model::impl.
// NOTE(review): presumably kept out of the header so that the pimpl member can destroy impl
// where it is a complete type - the member's declaration is not visible in this file.
llama_model::~llama_model() = default;
466
467
0
// Copies the element and byte totals from the model loader into the private state.
void llama_model::load_stats(llama_model_loader & ml) {
    impl & state = *pimpl;

    state.n_elements = ml.n_elements;
    state.n_bytes    = ml.n_bytes;
}
471
472
221
// Reads the architecture from the model loader and stores it in `arch`.
// throws std::runtime_error if the loader reports LLM_ARCH_UNKNOWN; the message includes
// the architecture name reported by the loader
void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();

    if (arch != LLM_ARCH_UNKNOWN) {
        return;
    }

    throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
}
478
479
44
void llama_model::load_hparams(llama_model_loader & ml) {
480
44
    const gguf_context * ctx = ml.meta.get();
481
482
    // get metadata as string
483
439
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
484
395
        gguf_type type = gguf_get_kv_type(ctx, i);
485
395
        if (type == GGUF_TYPE_ARRAY) {
486
6
            continue;
487
6
        }
488
389
        const char * name = gguf_get_key(ctx, i);
489
389
        const std::string value = gguf_kv_to_str(ctx, i);
490
389
        gguf_kv.emplace(name, value);
491
389
    }
492
493
    // get general kv
494
44
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
495
496
    // everything past this point is not vocab-related
497
    // for CLIP models, we only need to load tensors, no hparams
498
44
    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
499
0
        return;
500
0
    }
501
502
44
    ml.get_key(LLM_KV_CONTEXT_LENGTH,          hparams.n_ctx_train);
503
44
    ml.get_key(LLM_KV_EMBEDDING_LENGTH,        hparams.n_embd);
504
44
    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer);
505
44
    ml.get_key(LLM_KV_EXPERT_COUNT,            hparams.n_expert,        false);
506
44
    ml.get_key(LLM_KV_EXPERT_USED_COUNT,       hparams.n_expert_used,   false);
507
44
    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
508
44
    ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);
509
510
44
    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
511
0
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
512
513
0
        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
514
0
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);
515
516
0
        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
517
0
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
518
0
    }
519
520
44
    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
521
44
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
522
44
    if (hparams.n_expert > 0) {
523
0
        GGML_ASSERT(hparams.n_expert_used > 0);
524
0
        GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
525
0
        if (hparams.n_expert_groups > 1) {
526
0
            GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
527
0
            GGML_ASSERT(hparams.n_group_used > 0);
528
0
            GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
529
0
        }
530
44
    } else {
531
44
        GGML_ASSERT(hparams.n_expert_used == 0);
532
44
        GGML_ASSERT(hparams.n_expert_groups == 0);
533
44
    }
534
535
44
    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
536
44
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
537
44
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
538
44
    std::fill(
539
44
        hparams.recurrent_layer_arr.begin(),
540
44
        hparams.recurrent_layer_arr.end(),
541
44
        llm_arch_is_recurrent(ml.get_arch()));
542
543
44
    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
544
44
    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
545
546
44
    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
547
44
    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
548
44
    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
549
44
    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
550
551
44
    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
552
44
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
553
554
    // n_head_kv is optional, default to n_head
555
44
    hparams.n_head_kv_arr = hparams.n_head_arr;
556
557
44
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
558
559
44
    bool rope_finetuned = false;
560
44
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
561
44
    hparams.rope_finetuned = rope_finetuned;
562
563
44
    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
564
44
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
565
566
    // rope_freq_base (optional)
567
44
    hparams.rope_freq_base_train = 10000.0f;
568
44
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
569
570
44
    std::string rope_scaling("linear");
571
44
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
572
44
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
573
44
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
574
575
    // rope_freq_scale (inverse of the kv) is optional
576
44
    float ropescale = 0.0f;
577
44
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
578
        // try the old key name
579
0
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
580
0
    }
581
44
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
582
583
    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
584
44
    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
585
44
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
586
587
44
    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
588
589
    // non-transformer models do not have attention heads
590
44
    if (hparams.n_head() > 0) {
591
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
592
        // gpt-j n_rot = rotary_dim
593
594
0
        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
595
0
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
596
597
0
        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
598
0
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
599
600
        // sanity check for n_rot (optional)
601
0
        hparams.n_rot = hparams.n_embd_head_k;
602
603
0
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
604
605
0
        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
606
0
            if (hparams.n_rot != hparams.n_embd_head_k) {
607
0
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
608
0
            }
609
0
        }
610
44
    } else {
611
44
        hparams.n_rot = 0;
612
44
        hparams.n_embd_head_k = 0;
613
44
        hparams.n_embd_head_v = 0;
614
44
    }
615
616
    // for differentiating model types
617
44
    uint32_t n_vocab = 0;
618
44
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
619
620
    // for classifier models
621
44
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
622
44
    if (!classifier_labels.empty()) {
623
0
        hparams.n_cls_out = classifier_labels.size();
624
0
    }
625
626
    // arch-specific KVs
627
44
    switch (arch) {
628
0
        case LLM_ARCH_LLAMA:
629
0
            {
630
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
631
632
0
                if (hparams.n_expert == 8) {
633
0
                    switch (hparams.n_layer) {
634
0
                        case 32: type = LLM_TYPE_8x7B; break;
635
0
                        case 56: type = LLM_TYPE_8x22B; break;
636
0
                        default: type = LLM_TYPE_UNKNOWN;
637
0
                    }
638
0
                } else {
639
0
                    switch (hparams.n_layer) {
640
0
                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
641
0
                        case 22: type = LLM_TYPE_1B; break;
642
0
                        case 26: type = LLM_TYPE_3B; break;
643
0
                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
644
0
                        case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
645
                        // granite uses a vocab with len 49152
646
0
                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
647
0
                        case 36: type = LLM_TYPE_8B; break; // granite
648
0
                        case 40: type = LLM_TYPE_13B; break;
649
0
                        case 48: type = LLM_TYPE_34B; break;
650
0
                        case 60: type = LLM_TYPE_30B; break;
651
0
                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
652
0
                        default: type = LLM_TYPE_UNKNOWN;
653
0
                    }
654
0
                }
655
0
            } break;
656
0
        case LLM_ARCH_LLAMA4:
657
0
            {
658
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
659
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
660
0
                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
661
662
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
663
0
                if (found_swa && hparams.n_swa == 0) {
664
0
                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
665
0
                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
666
0
                } else {
667
0
                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
668
0
                    hparams.n_swa                   = 8192;
669
0
                    hparams.n_attn_temp_floor_scale = 8192;
670
0
                    hparams.f_attn_temp_scale       = 0.1f;
671
0
                    hparams.set_swa_pattern(4);   // pattern: 3 chunked - 1 full
672
0
                }
673
674
0
                switch (hparams.n_expert) {
675
0
                    case 0: {
676
                        // MobileLLM (no MoE)
677
0
                        switch (hparams.n_embd) {
678
0
                            case 2048: type = LLM_TYPE_140M; break;
679
0
                            case 4096: type = LLM_TYPE_360M; break;
680
0
                            case 6144: type = LLM_TYPE_950M; break;
681
0
                            default:   type = LLM_TYPE_UNKNOWN;
682
0
                        }
683
0
                    } break;
684
0
                    case 16:  type = LLM_TYPE_17B_16E; break;
685
0
                    case 128: type = LLM_TYPE_17B_128E; break;
686
0
                    default:  type = LLM_TYPE_UNKNOWN;
687
0
                }
688
689
0
                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
690
0
            } break;
691
0
        case LLM_ARCH_ARCEE:
692
0
            {
693
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
694
695
                // Arcee uses the same structure as Llama
696
0
                switch (hparams.n_layer) {
697
0
                    case 36: type = LLM_TYPE_4B; break;
698
0
                    default: type = LLM_TYPE_UNKNOWN;
699
0
                }
700
0
            } break;
701
0
        case LLM_ARCH_AFMOE:
702
0
            {
703
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
704
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
705
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
706
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
707
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
708
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
709
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
710
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
711
712
                // Set up interleaved sliding window attention (ISWA)
713
                // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
714
0
                if (hparams.n_swa > 0) {
715
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
716
0
                    hparams.set_swa_pattern(4);
717
0
                } else {
718
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
719
0
                }
720
721
                // Default to sigmoid if not set
722
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
723
0
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
724
0
                }
725
726
0
                switch (hparams.n_layer) {
727
0
                    case 56: type = LLM_TYPE_6B; break;
728
0
                    case 32: type = LLM_TYPE_26B; break;
729
0
                    default: type = LLM_TYPE_UNKNOWN;
730
0
                }
731
0
            } break;
732
0
        case LLM_ARCH_DECI:
733
0
            {
734
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
735
0
                switch (hparams.n_layer) {
736
0
                    case 32: type = LLM_TYPE_7B; break;
737
0
                    case 80: type = LLM_TYPE_70B; break;
738
0
                    case 162: type = LLM_TYPE_405B; break;
739
0
                    default: type = LLM_TYPE_UNKNOWN;
740
0
                }
741
0
            } break;
742
0
        case LLM_ARCH_MINICPM:
743
0
            {
744
                // Backward-compatible defaults for older MiniCPM GGUFs
745
0
                hparams.f_embedding_scale = 12.0f;
746
0
                hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer));
747
0
                hparams.f_logit_scale     = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
748
749
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
750
751
                // Optional KV reads, override defaults if present in newer GGUF exports
752
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
753
0
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
754
0
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
755
756
                // MiniCPM uses rope by default, unlike Granite which uses it as a switch
757
0
                hparams.rope_finetuned = true;
758
759
0
                switch (hparams.n_layer) {
760
0
                    case 52: type = LLM_TYPE_1B; break;
761
0
                    case 40: type = LLM_TYPE_2B; break;
762
0
                    default: type = LLM_TYPE_UNKNOWN;
763
0
                }
764
0
            } break;
765
0
        case LLM_ARCH_MINICPM3:
766
0
            {
767
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
768
0
                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
769
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
770
771
0
                switch (hparams.n_layer) {
772
0
                    case 62: type = LLM_TYPE_4B; break;
773
0
                    default: type = LLM_TYPE_UNKNOWN;
774
0
                }
775
0
            } break;
776
0
        case LLM_ARCH_GROK:
777
0
            {
778
                // defaults for old GGUFs
779
0
                hparams.yarn_beta_fast = 8.0f;
780
0
                hparams.f_logit_scale = 0.5773502691896257f;
781
0
                hparams.f_embedding_scale = 78.38367176906169f;
782
0
                hparams.f_attn_out_scale = 0.08838834764831845f;
783
0
                hparams.f_attn_logit_softcapping = 30.0f;
784
0
                hparams.f_router_logit_softcapping = 30.0f;
785
                // no final_logit_softcapping in grok-1
786
0
                hparams.f_final_logit_softcapping = 0.0f;
787
788
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
789
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp, false);
790
0
                ml.get_key(LLM_KV_LOGIT_SCALE,                  hparams.f_logit_scale, false);
791
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE,              hparams.f_embedding_scale, false);
792
0
                ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE,       hparams.f_attn_out_scale, false);
793
0
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,       hparams.f_attn_logit_softcapping, false);
794
0
                ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING,     hparams.f_router_logit_softcapping, false);
795
0
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,      hparams.f_final_logit_softcapping, false);
796
797
0
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH,  hparams.attn_temp_length, false);
798
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,  hparams.yarn_ext_factor, false);
799
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
800
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
801
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
802
803
0
                switch (hparams.n_layer) {
804
0
                    case 64: type = LLM_TYPE_314B; break;
805
0
                    default: type = LLM_TYPE_UNKNOWN;
806
0
                }
807
0
            } break;
808
0
        case LLM_ARCH_FALCON:
809
0
            {
810
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
811
812
0
                switch (hparams.n_layer) {
813
0
                    case 32: type = LLM_TYPE_7B; break;
814
0
                    case 60: type = LLM_TYPE_40B; break;
815
0
                    default: type = LLM_TYPE_UNKNOWN;
816
0
                }
817
0
            } break;
818
0
        case LLM_ARCH_BAICHUAN:
819
0
            {
820
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
821
0
                switch (hparams.n_layer) {
822
0
                    case 32: type = LLM_TYPE_7B; break;
823
0
                    case 40: type = LLM_TYPE_13B; break;
824
0
                    default: type = LLM_TYPE_UNKNOWN;
825
0
                }
826
827
0
                if (type == LLM_TYPE_13B) {
828
                    // TODO: become GGUF KV parameter
829
0
                    hparams.f_max_alibi_bias = 8.0f;
830
0
                }
831
0
            } break;
832
0
        case LLM_ARCH_STARCODER:
833
0
            {
834
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
835
0
                switch (hparams.n_layer) {
836
0
                    case 24: type = LLM_TYPE_1B; break;
837
0
                    case 36: type = LLM_TYPE_3B; break;
838
0
                    case 42: type = LLM_TYPE_7B; break;
839
0
                    case 40: type = LLM_TYPE_15B; break;
840
0
                    default: type = LLM_TYPE_UNKNOWN;
841
0
                }
842
0
            } break;
843
0
        case LLM_ARCH_REFACT:
844
0
            {
845
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
846
0
                switch (hparams.n_layer) {
847
0
                    case 32: type = LLM_TYPE_1B; break;
848
0
                    default: type = LLM_TYPE_UNKNOWN;
849
0
                }
850
851
                // TODO: become GGUF KV parameter
852
0
                hparams.f_max_alibi_bias = 8.0f;
853
0
            } break;
854
0
        case LLM_ARCH_BERT:
855
0
            {
856
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
857
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
858
0
                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
859
860
0
                switch (hparams.n_layer) {
861
0
                    case 3:
862
0
                        type = LLM_TYPE_17M; break; // bge-micro
863
0
                    case 6:
864
0
                        type = LLM_TYPE_22M; break; // MiniLM-L6
865
0
                    case 12:
866
0
                        switch (hparams.n_embd) {
867
0
                            case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
868
0
                            case 768: type = LLM_TYPE_109M; break; // bge-base
869
0
                            default: type = LLM_TYPE_UNKNOWN;
870
0
                        } break;
871
0
                    case 24:
872
0
                        type = LLM_TYPE_335M; break; // bge-large
873
0
                    default: type = LLM_TYPE_UNKNOWN;
874
0
                }
875
0
            } break;
876
0
        case LLM_ARCH_JINA_BERT_V2:
877
0
            {
878
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
879
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
880
0
                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
881
0
                hparams.f_max_alibi_bias = 8.0f;
882
883
0
                switch (hparams.n_layer) {
884
0
                    case 4:  type = LLM_TYPE_33M;  break; // jina-embeddings-small
885
0
                    case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
886
0
                    default: type = LLM_TYPE_UNKNOWN;
887
0
                }
888
0
            } break;
889
0
        case LLM_ARCH_JINA_BERT_V3:
890
0
            {
891
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
892
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
893
0
                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
894
895
0
                switch (hparams.n_layer) {
896
0
                    case 24:
897
0
                        type = LLM_TYPE_558M; break;
898
0
                    default: type = LLM_TYPE_UNKNOWN;
899
0
                }
900
0
            } break;
901
0
        case LLM_ARCH_NOMIC_BERT:
902
0
        case LLM_ARCH_NOMIC_BERT_MOE:
903
0
            {
904
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
905
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
906
0
                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
907
0
                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
908
909
0
                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
910
0
                    if (arch == LLM_ARCH_NOMIC_BERT) {
911
0
                        type = LLM_TYPE_137M;
912
0
                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
913
0
                        type = LLM_TYPE_475M;
914
0
                    }
915
0
                }
916
0
            } break;
917
0
        case LLM_ARCH_NEO_BERT:
918
0
            {
919
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
920
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,            hparams.causal_attn);
921
0
                ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type);
922
923
0
                if (hparams.n_layer == 28) {
924
0
                    type = LLM_TYPE_250M;
925
0
                }
926
0
            } break;
927
0
        case LLM_ARCH_BLOOM:
928
0
            {
929
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
930
931
0
                switch (hparams.n_layer) {
932
0
                    case 24: type = LLM_TYPE_1B; break;
933
0
                    case 30:
934
0
                        switch (hparams.n_embd) {
935
0
                            case 2560: type = LLM_TYPE_3B; break;
936
0
                            case 4096: type = LLM_TYPE_7B; break;
937
0
                            default: type = LLM_TYPE_UNKNOWN;
938
0
                        } break;
939
0
                    default: type = LLM_TYPE_UNKNOWN;
940
0
                }
941
942
                // TODO: become GGUF KV parameter
943
0
                hparams.f_max_alibi_bias = 8.0f;
944
0
            } break;
945
0
        case LLM_ARCH_MPT:
946
0
            {
947
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
948
0
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
949
0
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
950
951
0
                switch (hparams.n_layer) {
952
0
                    case 32: type = LLM_TYPE_7B; break;
953
0
                    case 48: type = LLM_TYPE_30B; break;
954
0
                    default: type = LLM_TYPE_UNKNOWN;
955
0
                }
956
0
            } break;
957
0
        case LLM_ARCH_STABLELM:
958
0
            {
959
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
960
961
0
                switch (hparams.n_layer) {
962
0
                    case 24: type = LLM_TYPE_1B; break;
963
0
                    case 32: type = LLM_TYPE_3B; break;
964
0
                    case 40: type = LLM_TYPE_12B; break;
965
0
                    default: type = LLM_TYPE_UNKNOWN;
966
0
               }
967
0
            } break;
968
0
        case LLM_ARCH_QWEN:
969
0
            {
970
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
971
972
0
                switch (hparams.n_layer) {
973
0
                    case 32: type = LLM_TYPE_7B; break;
974
0
                    case 40: type = LLM_TYPE_13B; break;
975
0
                    default: type = LLM_TYPE_UNKNOWN;
976
0
                }
977
0
            } break;
978
0
        case LLM_ARCH_QWEN2VL:
979
0
            {
980
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
981
0
            }
982
            // fall through
983
0
        case LLM_ARCH_QWEN2:
984
0
            {
985
0
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
986
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
987
0
                switch (hparams.n_layer) {
988
0
                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
989
0
                    case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
990
0
                    case 32: type = LLM_TYPE_7B; break;
991
0
                    case 36: type = LLM_TYPE_3B; break;
992
0
                    case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
993
0
                    case 48: type = LLM_TYPE_14B; break;
994
0
                    case 64: type = LLM_TYPE_32B; break;
995
0
                    case 80: type = LLM_TYPE_70B; break;
996
0
                    default: type = LLM_TYPE_UNKNOWN;
997
0
                }
998
0
            } break;
999
0
        case LLM_ARCH_DREAM:
1000
0
            {
1001
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1002
                // Dream models are primarily 7B with 28 layers
1003
0
                switch (hparams.n_layer) {
1004
0
                    case 28:
1005
0
                        type = LLM_TYPE_7B;
1006
0
                        break;
1007
0
                    default:
1008
0
                        type = LLM_TYPE_UNKNOWN;
1009
0
                }
1010
                // Set non-causal attention for diffusion models
1011
0
                hparams.causal_attn = false;
1012
0
            }
1013
0
            break;
1014
0
        case LLM_ARCH_LLADA:
1015
0
            {
1016
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1017
                // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
1018
0
                switch (hparams.n_layer) {
1019
0
                    case 32:
1020
0
                        type = LLM_TYPE_8B;
1021
0
                        break;
1022
0
                    default:
1023
0
                        type = LLM_TYPE_UNKNOWN;
1024
0
                }
1025
                // Set non-causal attention for diffusion models
1026
0
                hparams.causal_attn = false;
1027
0
            }
1028
0
            break;
1029
0
        case LLM_ARCH_LLADA_MOE:
1030
0
            {
1031
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1032
1033
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1034
                // diffusion language model uses non-causal attention
1035
0
                hparams.causal_attn = false;
1036
0
                switch (hparams.n_layer) {
1037
0
                    case 16: type = LLM_TYPE_A1_7B; break;
1038
0
                    default: type = LLM_TYPE_UNKNOWN;
1039
0
                }
1040
0
            } break;
1041
0
        case LLM_ARCH_RND1:
1042
0
            {
1043
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1044
1045
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1046
0
                switch (hparams.n_layer) {
1047
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
1048
0
                    default: type = LLM_TYPE_UNKNOWN;
1049
0
                }
1050
                // Set non-causal attention for diffusion models
1051
0
                hparams.causal_attn = false;
1052
0
            } break;
1053
0
        case LLM_ARCH_QWEN2MOE:
1054
0
            {
1055
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
1056
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1057
1058
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1059
0
                switch (hparams.n_layer) {
1060
0
                    case 24: type = LLM_TYPE_A2_7B; break;
1061
0
                    case 28: type = LLM_TYPE_57B_A14B; break;
1062
0
                    default: type = LLM_TYPE_UNKNOWN;
1063
0
                }
1064
0
            } break;
1065
0
        case LLM_ARCH_QWEN3:
1066
0
            {
1067
0
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
1068
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1069
0
                switch (hparams.n_layer) {
1070
0
                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
1071
0
                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1072
0
                    case 40: type = LLM_TYPE_14B; break;
1073
0
                    case 64: type = LLM_TYPE_32B; break;
1074
0
                    default: type = LLM_TYPE_UNKNOWN;
1075
0
                }
1076
0
            } break;
1077
0
        case LLM_ARCH_QWEN3VL:
1078
0
            {
1079
0
                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
1080
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1081
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1082
0
                switch (hparams.n_layer) {
1083
0
                    case 28: type = LLM_TYPE_1_7B; break;
1084
0
                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1085
0
                    case 64: type = LLM_TYPE_32B; break;
1086
0
                    default: type = LLM_TYPE_UNKNOWN;
1087
0
                }
1088
0
            } break;
1089
0
        case LLM_ARCH_QWEN3MOE:
1090
0
            {
1091
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
1092
1093
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1094
0
                switch (hparams.n_layer) {
1095
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
1096
0
                    case 94: type = LLM_TYPE_235B_A22B; break;
1097
0
                    default: type = LLM_TYPE_UNKNOWN;
1098
0
                }
1099
0
            } break;
1100
0
        case LLM_ARCH_QWEN3VLMOE:
1101
0
            {
1102
0
                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
1103
0
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
1104
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1105
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1106
0
                switch (hparams.n_layer) {
1107
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
1108
0
                    case 94: type = LLM_TYPE_235B_A22B; break;
1109
0
                    default: type = LLM_TYPE_UNKNOWN;
1110
0
                }
1111
0
            } break;
1112
0
        case LLM_ARCH_PHI2:
1113
0
            {
1114
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1115
1116
0
                switch (hparams.n_layer) {
1117
0
                    case 24: type = LLM_TYPE_1B; break;
1118
0
                    case 32: type = LLM_TYPE_3B; break;
1119
0
                    default: type = LLM_TYPE_UNKNOWN;
1120
0
                }
1121
0
            } break;
1122
0
        case LLM_ARCH_PHI3:
1123
0
            {
1124
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1125
1126
0
                switch (hparams.n_layer) {
1127
0
                    case 24: type = LLM_TYPE_1B; break;
1128
0
                    case 32: type = LLM_TYPE_3B; break;
1129
0
                    case 40: type = LLM_TYPE_14B; break;
1130
0
                    default: type = LLM_TYPE_UNKNOWN;
1131
0
                }
1132
1133
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1134
1135
0
                if (found_swa && hparams.n_swa > 0) {
1136
0
                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
1137
0
                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
1138
1139
                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
1140
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1141
1142
0
                    hparams.n_swa         = 0;
1143
0
                    hparams.set_swa_pattern(1);
1144
0
                }
1145
0
            } break;
1146
0
        case LLM_ARCH_PHIMOE:
1147
0
            {
1148
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1149
1150
0
                switch (hparams.n_layer) {
1151
0
                    case 32: type = LLM_TYPE_16x3_8B; break;
1152
0
                    default: type = LLM_TYPE_UNKNOWN;
1153
0
                }
1154
0
            } break;
1155
0
        case LLM_ARCH_PLAMO:
1156
0
            {
1157
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1158
1159
0
                switch (hparams.n_layer) {
1160
0
                    case 40: type = LLM_TYPE_13B; break;
1161
0
                    default: type = LLM_TYPE_UNKNOWN;
1162
0
               }
1163
0
            } break;
1164
0
        case LLM_ARCH_PLAMO2:
1165
0
            {
1166
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1167
1168
                // Load Mamba SSM parameters
1169
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1170
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1171
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1172
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1173
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1174
1175
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1176
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1177
0
                }
1178
1179
0
                switch (hparams.n_layer) {
1180
0
                    case 16: type = LLM_TYPE_1B; break;
1181
0
                    case 32:
1182
0
                        if (hparams.n_embd == 2048) {
1183
0
                            type = LLM_TYPE_2B;
1184
0
                        } else if (hparams.n_embd == 4096) {
1185
0
                            type = LLM_TYPE_8B;
1186
0
                        }
1187
0
                        break;
1188
0
                    default: type = LLM_TYPE_UNKNOWN;
1189
0
                }
1190
1191
                // Load attention parameters
1192
0
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH,   hparams.n_embd_head_k, false);
1193
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
1194
0
            } break;
1195
0
        case LLM_ARCH_GPT2:
1196
0
            {
1197
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1198
0
                switch (hparams.n_layer) {
1199
0
                    case 12: type = LLM_TYPE_SMALL; break;
1200
0
                    case 24: type = LLM_TYPE_MEDIUM; break;
1201
0
                    case 36: type = LLM_TYPE_LARGE; break;
1202
0
                    case 48: type = LLM_TYPE_XL; break;
1203
0
                    default: type = LLM_TYPE_UNKNOWN;
1204
0
                }
1205
0
            } break;
1206
0
        case LLM_ARCH_CODESHELL:
1207
0
            {
1208
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1209
0
                switch (hparams.n_layer) {
1210
0
                    case 42: type = LLM_TYPE_7B; break;
1211
0
                    default: type = LLM_TYPE_UNKNOWN;
1212
0
                }
1213
0
            } break;
1214
0
        case LLM_ARCH_ORION:
1215
0
            {
1216
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1217
1218
0
                switch (hparams.n_layer) {
1219
0
                    case 40: type = LLM_TYPE_14B; break;
1220
0
                    default: type = LLM_TYPE_UNKNOWN;
1221
0
                }
1222
0
            } break;
1223
0
        case LLM_ARCH_INTERNLM2:
1224
0
            {
1225
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1226
0
                switch (hparams.n_layer) {
1227
0
                    case 32: type = LLM_TYPE_7B; break;
1228
0
                    case 48: type = LLM_TYPE_20B; break;
1229
0
                    default: type = LLM_TYPE_UNKNOWN;
1230
0
                }
1231
0
            } break;
1232
0
        case LLM_ARCH_GEMMA:
1233
0
            {
1234
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1235
1236
0
                switch (hparams.n_layer) {
1237
0
                    case 18: type = LLM_TYPE_2B; break;
1238
0
                    case 28: type = LLM_TYPE_7B; break;
1239
0
                    default: type = LLM_TYPE_UNKNOWN;
1240
0
               }
1241
0
            } break;
1242
0
        case LLM_ARCH_GEMMA2:
1243
0
            {
1244
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1245
0
                hparams.n_swa = 4096; // default value of gemma 2
1246
0
                hparams.set_swa_pattern(2);
1247
0
                hparams.attn_soft_cap = true;
1248
1249
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
1250
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1251
0
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
1252
0
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
1253
1254
0
                switch (hparams.n_layer) {
1255
0
                    case 26: type = LLM_TYPE_2B; break;
1256
0
                    case 42: type = LLM_TYPE_9B; break;
1257
0
                    case 46: type = LLM_TYPE_27B; break;
1258
0
                    default: type = LLM_TYPE_UNKNOWN;
1259
0
               }
1260
1261
                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
1262
0
                hparams.f_attention_scale = type == LLM_TYPE_27B
1263
0
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1264
0
                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1265
0
            } break;
1266
0
        case LLM_ARCH_GEMMA3:
1267
0
            {
1268
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1269
0
                if (found_swa && hparams.n_swa > 0) {
1270
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1271
0
                    hparams.set_swa_pattern(6);
1272
1273
0
                    hparams.rope_freq_base_train_swa  = 10000.0f;
1274
0
                    hparams.rope_freq_scale_train_swa = 1.0f;
1275
0
                } else {
1276
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1277
0
                }
1278
1279
0
                hparams.f_final_logit_softcapping = 0.0f;
1280
0
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
1281
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1282
1283
0
                switch (hparams.n_layer) {
1284
0
                    case 18: type = LLM_TYPE_270M; break;
1285
0
                    case 26: type = LLM_TYPE_1B; break;
1286
0
                    case 32: type = LLM_TYPE_8B; break; // Rnj-1
1287
0
                    case 34: type = LLM_TYPE_4B; break;
1288
0
                    case 48: type = LLM_TYPE_12B; break;
1289
0
                    case 62: type = LLM_TYPE_27B; break;
1290
0
                    default: type = LLM_TYPE_UNKNOWN;
1291
0
                }
1292
1293
                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
1294
0
                hparams.f_attention_scale = type == LLM_TYPE_27B
1295
0
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1296
0
                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1297
0
            } break;
1298
0
        case LLM_ARCH_GEMMA3N:
1299
0
            {
1300
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1301
0
                hparams.set_swa_pattern(5);
1302
1303
0
                hparams.n_layer_kv_from_start     = 20;
1304
0
                hparams.rope_freq_base_train_swa  = 10000.0f;
1305
0
                hparams.rope_freq_scale_train_swa = 1.0f;
1306
0
                hparams.f_attention_scale         = 1.0f;
1307
1308
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
1309
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1310
1311
0
                switch (hparams.n_layer) {
1312
0
                    case 30: type = LLM_TYPE_E2B; break;
1313
0
                    case 35: type = LLM_TYPE_E4B; break;
1314
0
                    default: type = LLM_TYPE_UNKNOWN;
1315
0
                }
1316
0
            } break;
1317
0
        case LLM_ARCH_GEMMA_EMBEDDING:
1318
0
            {
1319
0
                hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
1320
0
                hparams.set_swa_pattern(6);
1321
1322
0
                hparams.causal_attn = false; // embeddings do not use causal attention
1323
0
                hparams.rope_freq_base_train_swa = 10000.0f;
1324
0
                hparams.rope_freq_scale_train_swa = 1.0f;
1325
1326
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1327
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1328
0
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
1329
1330
                //applied only if model converted with --sentence-transformers-dense-modules
1331
0
                ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
1332
0
                ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
1333
0
                ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
1334
0
                ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
1335
1336
0
                GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
1337
0
                GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
1338
1339
0
                switch (hparams.n_layer) {
1340
0
                    case 24: type = LLM_TYPE_0_3B; break;
1341
0
                    default: type = LLM_TYPE_UNKNOWN;
1342
0
                }
1343
0
                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1344
1345
0
            } break;
1346
0
        case LLM_ARCH_STARCODER2:
1347
0
            {
1348
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1349
0
                switch (hparams.n_layer) {
1350
0
                    case 30: type = LLM_TYPE_3B; break;
1351
0
                    case 32: type = LLM_TYPE_7B; break;
1352
0
                    case 40: type = LLM_TYPE_15B; break;
1353
0
                    case 52: type = LLM_TYPE_20B; break; // granite
1354
0
                    case 88: type = LLM_TYPE_34B; break; // granite
1355
0
                    default: type = LLM_TYPE_UNKNOWN;
1356
0
                }
1357
0
            } break;
1358
0
        case LLM_ARCH_MAMBA:
1359
0
            {
1360
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1361
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1362
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1363
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1364
0
                ml.get_key(LLM_KV_SSM_DT_B_C_RMS,     hparams.ssm_dt_b_c_rms, false);
1365
1366
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1367
1368
0
                switch (hparams.n_layer) {
1369
0
                    case 24:
1370
0
                        switch (hparams.n_embd) {
1371
0
                            case 768: type = LLM_TYPE_SMALL; break;
1372
0
                            default: type = LLM_TYPE_UNKNOWN;
1373
0
                        } break;
1374
0
                    case 48:
1375
0
                        switch (hparams.n_embd) {
1376
0
                            case 1024: type = LLM_TYPE_MEDIUM; break;
1377
0
                            case 1536: type = LLM_TYPE_LARGE; break;
1378
0
                            case 2048: type = LLM_TYPE_XL; break;
1379
0
                            default:   type = LLM_TYPE_UNKNOWN;
1380
0
                        } break;
1381
0
                    case 64:
1382
0
                        switch (hparams.n_embd) {
1383
0
                            case 2560: type = LLM_TYPE_3B; break;
1384
0
                            default: type = LLM_TYPE_UNKNOWN;
1385
0
                        } break;
1386
0
                    default: type = LLM_TYPE_UNKNOWN;
1387
0
                }
1388
0
            } break;
1389
0
        case LLM_ARCH_MAMBA2:
1390
0
            {
1391
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1392
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1393
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1394
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1395
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1396
1397
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1398
1399
0
                switch (hparams.n_layer) {
1400
0
                    case 24:
1401
0
                        switch (hparams.n_embd) {
1402
0
                            case 768: type = LLM_TYPE_SMALL; break;
1403
0
                            default: type = LLM_TYPE_UNKNOWN;
1404
0
                        } break;
1405
0
                    case 48:
1406
0
                        switch (hparams.n_embd) {
1407
0
                            case 1024: type = LLM_TYPE_MEDIUM; break;
1408
0
                            case 1536: type = LLM_TYPE_LARGE; break;
1409
0
                            case 2048: type = LLM_TYPE_XL; break;
1410
0
                            default: type = LLM_TYPE_UNKNOWN;
1411
0
                        } break;
1412
0
                    case 64:
1413
0
                        switch (hparams.n_embd) {
1414
0
                            case 2560: type = LLM_TYPE_3B; break;
1415
0
                            case 4096: type = LLM_TYPE_7B; break;
1416
0
                            default: type = LLM_TYPE_UNKNOWN;
1417
0
                        } break;
1418
0
                    default: type = LLM_TYPE_UNKNOWN;
1419
0
                }
1420
0
            } break;
1421
0
        case LLM_ARCH_JAMBA:
1422
0
            {
1423
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1424
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1425
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1426
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1427
1428
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1429
1430
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1431
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1432
0
                }
1433
1434
0
                switch (hparams.n_layer) {
1435
                    // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
1436
0
                    case 12: // 900M  8x???M
1437
0
                    case 32: // 51B  16x?B
1438
0
                    default: type = LLM_TYPE_UNKNOWN;
1439
0
                }
1440
0
            } break;
1441
0
        case LLM_ARCH_XVERSE:
1442
0
            {
1443
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1444
0
                switch (hparams.n_layer) {
1445
0
                    case 32: type = LLM_TYPE_7B; break;
1446
0
                    case 40: type = LLM_TYPE_13B; break;
1447
0
                    case 80: type = LLM_TYPE_65B; break;
1448
0
                    default: type = LLM_TYPE_UNKNOWN;
1449
0
                }
1450
0
            } break;
1451
0
        case LLM_ARCH_COMMAND_R:
1452
0
            {
1453
0
                ml.get_key(LLM_KV_LOGIT_SCALE,             hparams.f_logit_scale);
1454
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1455
0
                switch (hparams.n_layer) {
1456
0
                    case 40: type = LLM_TYPE_35B; break;
1457
0
                    default: type = LLM_TYPE_UNKNOWN;
1458
0
                }
1459
0
            } break;
1460
0
        case LLM_ARCH_COHERE2:
1461
0
            {
1462
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1463
0
                hparams.set_swa_pattern(4);
1464
1465
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1466
0
                ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
1467
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
1468
0
                switch (hparams.n_layer) {
1469
0
                    case 32: type = LLM_TYPE_8B; break;
1470
0
                    default: type = LLM_TYPE_UNKNOWN;
1471
0
                }
1472
0
            } break;
1473
0
        case LLM_ARCH_DBRX:
1474
0
        {
1475
0
            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1476
0
            ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
1477
1478
0
            switch (hparams.n_layer) {
1479
0
                case 40: type = LLM_TYPE_16x12B; break;
1480
0
                default: type = LLM_TYPE_UNKNOWN;
1481
0
            }
1482
0
        } break;
1483
0
        case LLM_ARCH_OLMO:
1484
0
            {
1485
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1486
0
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);
1487
1488
0
                switch (hparams.n_layer) {
1489
0
                    case 22: type = LLM_TYPE_1B; break;
1490
0
                    case 32: type = LLM_TYPE_7B; break;
1491
0
                    case 80: type = LLM_TYPE_70B; break;
1492
0
                    default: type = LLM_TYPE_UNKNOWN;
1493
0
                }
1494
0
            } break;
1495
0
        case LLM_ARCH_OLMO2:
1496
0
            {
1497
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1498
1499
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1500
0
                if (found_swa && hparams.n_swa > 0) {
1501
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1502
0
                    hparams.set_swa_pattern(4);
1503
0
                } else {
1504
0
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1505
0
                }
1506
1507
0
                switch (hparams.n_layer) {
1508
0
                    case 16: type = LLM_TYPE_1B; break;
1509
0
                    case 32: type = LLM_TYPE_7B; break;
1510
0
                    case 40: type = LLM_TYPE_13B; break;
1511
0
                    case 64: type = LLM_TYPE_32B; break;
1512
0
                    default: type = LLM_TYPE_UNKNOWN;
1513
0
                }
1514
0
            } break;
1515
0
        case LLM_ARCH_SEED_OSS:
1516
0
            {
1517
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1518
0
                switch (hparams.n_layer) {
1519
0
                    case 64: type = LLM_TYPE_36B; break;
1520
0
                    default: type = LLM_TYPE_UNKNOWN;
1521
0
                }
1522
0
            } break;
1523
0
        case LLM_ARCH_OLMOE:
1524
0
            {
1525
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1526
0
                switch (hparams.n_layer) {
1527
0
                    case 16: type = LLM_TYPE_A1_7B; break;
1528
0
                    default: type = LLM_TYPE_UNKNOWN;
1529
0
                }
1530
0
            } break;
1531
0
        case LLM_ARCH_OPENELM:
1532
0
            {
1533
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1534
1535
0
                switch (hparams.n_layer) {
1536
0
                case 16: type = LLM_TYPE_270M; break;
1537
0
                case 20: type = LLM_TYPE_450M; break;
1538
0
                case 28: type = LLM_TYPE_1B; break;
1539
0
                case 36: type = LLM_TYPE_3B; break;
1540
0
                default: type = LLM_TYPE_UNKNOWN;
1541
0
                }
1542
0
            } break;
1543
0
        case LLM_ARCH_GPTNEOX:
1544
0
            {
1545
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1546
0
                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL,   hparams.use_par_res);
1547
0
                switch (hparams.n_layer) {
1548
0
                    case 6:
1549
0
                        switch (hparams.n_ff()) {
1550
0
                            case 512:  type = LLM_TYPE_14M; break;
1551
0
                            case 2048: type = LLM_TYPE_70M; break;
1552
0
                            default:   type = LLM_TYPE_UNKNOWN;
1553
0
                        } break;
1554
0
                    case 12:
1555
0
                        switch (hparams.n_ff()) {
1556
0
                            case 3072: type = LLM_TYPE_160M; break;
1557
0
                            default: type = LLM_TYPE_UNKNOWN;
1558
0
                        } break;
1559
0
                    case 16:
1560
0
                        switch (hparams.n_ff()) {
1561
0
                            case 8192: type = LLM_TYPE_1B; break;
1562
0
                            default: type = LLM_TYPE_UNKNOWN;
1563
0
                        } break;
1564
0
                    case 24:
1565
0
                        switch (hparams.n_ff()) {
1566
0
                            case 4096: type = LLM_TYPE_410M; break;
1567
0
                            case 8192: type = LLM_TYPE_1_4B; break;
1568
0
                            default: type = LLM_TYPE_UNKNOWN;
1569
0
                        } break;
1570
0
                    case 32:
1571
0
                        switch (hparams.n_ff()) {
1572
0
                            case 10240: type = LLM_TYPE_2_8B; break;
1573
0
                            case 16384: type = LLM_TYPE_6_9B; break;
1574
0
                            default: type = LLM_TYPE_UNKNOWN;
1575
0
                        } break;
1576
0
                    case 36:
1577
0
                        switch (hparams.n_ff()) {
1578
0
                            case 20480: type = LLM_TYPE_12B; break;
1579
0
                            default: type = LLM_TYPE_UNKNOWN;
1580
0
                        } break;
1581
0
                    case 44:
1582
0
                        switch (hparams.n_ff()) {
1583
0
                            case 24576: type = LLM_TYPE_20B; break;
1584
0
                            default: type = LLM_TYPE_UNKNOWN;
1585
0
                        } break;
1586
0
                    default: type = LLM_TYPE_UNKNOWN;
1587
0
                }
1588
0
            } break;
1589
0
        case LLM_ARCH_ARCTIC:
1590
0
            {
1591
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1592
1593
0
                if (hparams.n_expert == 128) {
1594
0
                    switch (hparams.n_layer) {
1595
0
                        case 35: type = LLM_TYPE_10B_128x3_66B; break;
1596
0
                        default: type = LLM_TYPE_UNKNOWN;
1597
0
                    }
1598
0
                } else {
1599
0
                    type = LLM_TYPE_UNKNOWN;
1600
0
                }
1601
0
            } break;
1602
0
        case LLM_ARCH_DEEPSEEK:
1603
0
            {
1604
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1605
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
1606
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
1607
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
1608
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
1609
1610
0
                switch (hparams.n_ff_exp) {
1611
0
                    case 1408: type = LLM_TYPE_16B; break;
1612
0
                    case 1792: type = LLM_TYPE_20B; break;
1613
0
                    default: type = LLM_TYPE_UNKNOWN;
1614
0
                }
1615
0
            } break;
1616
0
        case LLM_ARCH_DEEPSEEK2:
1617
0
            {
1618
                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
1619
0
                bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
1620
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1621
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
1622
0
                if (!is_lite) {
1623
0
                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
1624
0
                }
1625
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
1626
0
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla, false);
1627
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
1628
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1629
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
1630
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale);
1631
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,        hparams.expert_weights_norm, false);
1632
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,         hparams.expert_gating_func, false);
1633
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1634
                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
1635
                    // that have no expert_gating_func model parameter set
1636
0
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
1637
0
                }
1638
1639
0
                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
1640
                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
1641
                    // cancel the factor from the convert script
1642
0
                    hparams.rope_yarn_log_mul /= 0.1f;
1643
0
                }
1644
1645
                // (optional) temperature tuning - used by mistral-large
1646
0
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
1647
0
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
1648
1649
0
                switch (hparams.n_layer) {
1650
0
                    case 27: type = LLM_TYPE_16B; break;
1651
0
                    case 60: type = LLM_TYPE_236B; break;
1652
0
                    case 61: type = LLM_TYPE_671B; break;
1653
0
                    default: type = LLM_TYPE_UNKNOWN;
1654
0
                }
1655
0
            } break;
1656
0
        case LLM_ARCH_PLM:
1657
0
            {
1658
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1659
0
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
1660
0
                switch (hparams.n_layer) {
1661
0
                    case 32: type = LLM_TYPE_1_8B; break;
1662
0
                    default: type = LLM_TYPE_UNKNOWN;
1663
0
                }
1664
0
            } break;
1665
0
        case LLM_ARCH_CHATGLM:
1666
0
            {
1667
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1668
0
                switch (hparams.n_layer) {
1669
0
                    case 28: {
1670
0
                        if (hparams.n_head(0) == 16) {
1671
0
                            type = LLM_TYPE_1_5B;
1672
0
                        } else {
1673
0
                            type = LLM_TYPE_6B;
1674
0
                        }
1675
0
                    } break;
1676
0
                    case 40: {
1677
0
                        if (hparams.n_head(0) == 24) {
1678
0
                            type = LLM_TYPE_4B;
1679
0
                        } else {
1680
0
                            type = LLM_TYPE_9B;
1681
0
                        }
1682
0
                    } break;
1683
0
                    default: type = LLM_TYPE_UNKNOWN;
1684
0
                }
1685
0
            } break;
1686
0
        case LLM_ARCH_GLM4:
1687
0
            {
1688
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1689
0
                switch (hparams.n_layer) {
1690
0
                    case 40: type = LLM_TYPE_9B; break;
1691
0
                    case 61: type = LLM_TYPE_32B; break;
1692
0
                    default: type = LLM_TYPE_UNKNOWN;
1693
0
                }
1694
0
            } break;
1695
0
        case LLM_ARCH_GLM4_MOE:
1696
0
            {
1697
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
1698
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1699
1700
                // MoE parameters
1701
0
                ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
1702
0
                ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
1703
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
1704
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
1705
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
1706
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
1707
1708
                // Expert gating function (GLM-4.5 uses sigmoid)
1709
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
1710
0
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1711
0
                    hparams.expert_gating_func =  LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1712
0
                }
1713
1714
                // NextN/MTP parameters
1715
0
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
1716
1717
                // TODO: when MTP is implemented, this should probably be updated if needed
1718
0
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1719
1720
0
                switch (hparams.n_layer) {
1721
0
                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
1722
0
                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
1723
0
                    default: type = LLM_TYPE_UNKNOWN;
1724
0
                }
1725
0
            } break;
1726
0
        case LLM_ARCH_BITNET:
1727
0
            {
1728
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1729
1730
0
                switch (hparams.n_layer) {
1731
0
                    case 26: type = LLM_TYPE_3B; break;
1732
0
                    default: type = LLM_TYPE_UNKNOWN;
1733
0
                }
1734
0
            } break;
1735
0
        case LLM_ARCH_T5:
1736
0
            {
1737
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      hparams.f_norm_rms_eps);
1738
0
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
1739
1740
0
                uint32_t dec_start_token_id;
1741
0
                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
1742
0
                    hparams.dec_start_token_id = dec_start_token_id;
1743
0
                }
1744
1745
0
                hparams.dec_n_layer = hparams.n_layer;
1746
0
                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
1747
1748
0
                switch (hparams.n_layer) {
1749
0
                    case 6:  type = LLM_TYPE_60M;  break; // t5-small
1750
0
                    case 8:  type = LLM_TYPE_80M;  break; // flan-t5-small
1751
0
                    case 12:
1752
0
                        switch (hparams.n_ff()) {
1753
0
                            case 3072: type = LLM_TYPE_220M; break; // t5-base
1754
0
                            case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
1755
0
                            default: type = LLM_TYPE_UNKNOWN;
1756
0
                        } break;
1757
0
                    case 24:
1758
0
                        switch (hparams.n_ff()) {
1759
0
                            case 4096:  type = LLM_TYPE_770M; break; // t5-large
1760
0
                            case 2816:  type = LLM_TYPE_780M; break; // flan-t5-large
1761
0
                            case 16384: type = LLM_TYPE_3B;   break; // t5-3b
1762
0
                            case 5120:  type = LLM_TYPE_3B;   break; // flan-t5-xl
1763
0
                            case 65536: type = LLM_TYPE_11B;  break; // t5-11b
1764
0
                            case 10240: type = LLM_TYPE_11B;  break; // flan-t5-xxl
1765
0
                            default: type = LLM_TYPE_UNKNOWN;
1766
0
                        } break;
1767
0
                    default: type = LLM_TYPE_UNKNOWN;
1768
0
               }
1769
0
            } break;
1770
0
        case LLM_ARCH_T5ENCODER:
1771
0
            {
1772
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1773
0
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
1774
0
                type = LLM_TYPE_UNKNOWN;
1775
0
            } break;
1776
0
        case LLM_ARCH_JAIS:
1777
0
            {
1778
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1779
0
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
1780
1781
0
                switch (hparams.n_layer) {
1782
0
                    case 24: type = LLM_TYPE_1_3B; break;
1783
0
                    case 40: type = LLM_TYPE_13B; break;
1784
                    /* TODO: add variants */
1785
0
                    default: type = LLM_TYPE_UNKNOWN;
1786
0
                }
1787
0
            } break;
1788
0
        case LLM_ARCH_NEMOTRON:
1789
0
            {
1790
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1791
0
                switch (hparams.n_layer) {
1792
0
                    case 32: type = LLM_TYPE_4B; break;
1793
0
                    default: type = LLM_TYPE_UNKNOWN;
1794
0
                }
1795
0
            } break;
1796
0
        case LLM_ARCH_NEMOTRON_H:
1797
0
            {
1798
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1799
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1800
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1801
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1802
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1803
1804
                // A layer is recurrent IFF the n_head_kv value is set to 0 and
1805
                // the n_ff value is set to 0
1806
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1807
0
                    hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
1808
0
                }
1809
1810
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1811
1812
0
                switch (hparams.n_layer) {
1813
0
                    case 56: type = LLM_TYPE_9B; break;
1814
0
                    default: type = LLM_TYPE_UNKNOWN;
1815
0
                }
1816
0
            } break;
1817
0
        case LLM_ARCH_EXAONE:
1818
0
            {
1819
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1820
1821
0
                switch (hparams.n_layer) {
1822
0
                    case 32: type = LLM_TYPE_8B; break;
1823
0
                    default: type = LLM_TYPE_UNKNOWN;
1824
0
                }
1825
0
            } break;
1826
0
        case LLM_ARCH_EXAONE4:
1827
0
            {
1828
0
                if (hparams.n_layer == 64) {    // 32B
1829
0
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1830
0
                    hparams.n_swa = 4096;
1831
0
                    hparams.set_swa_pattern(4);
1832
0
                }
1833
1834
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
1835
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1836
1837
0
                switch (hparams.n_layer) {
1838
0
                    case 30: type = LLM_TYPE_1_2B; break;
1839
0
                    case 64: type = LLM_TYPE_32B; break;
1840
0
                    default: type = LLM_TYPE_UNKNOWN;
1841
0
                }
1842
0
            } break;
1843
0
        case LLM_ARCH_RWKV6:
1844
0
        case LLM_ARCH_RWKV6QWEN2:
1845
0
            {
1846
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps, false);
1847
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
1848
0
                ml.get_key(LLM_KV_WKV_HEAD_SIZE,               hparams.wkv_head_size);
1849
0
                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM,          hparams.time_mix_extra_dim);
1850
0
                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM,        hparams.time_decay_extra_dim);
1851
0
                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
1852
0
                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);
1853
1854
0
                switch (hparams.n_layer) {
1855
0
                    case 24: type = LLM_TYPE_1_6B; break;
1856
0
                    case 32:
1857
0
                        switch (hparams.n_embd) {
1858
0
                            case 2560: type = LLM_TYPE_3B; break;
1859
0
                            case 4096: type = LLM_TYPE_7B; break;
1860
0
                            default: type = LLM_TYPE_UNKNOWN;
1861
0
                        } break;
1862
0
                    case 61: type = LLM_TYPE_14B; break;
1863
0
                    case 64: type = LLM_TYPE_32B; break;
1864
0
                    default: type = LLM_TYPE_UNKNOWN;
1865
0
                }
1866
0
            } break;
1867
0
        case LLM_ARCH_RWKV7:
1868
0
        case LLM_ARCH_ARWKV7:
1869
0
            {
1870
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,                hparams.f_norm_eps, false);
1871
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,            hparams.f_norm_rms_eps, false);
1872
0
                ml.get_key(LLM_KV_WKV_HEAD_SIZE,                          hparams.wkv_head_size);
1873
0
                ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK,              hparams.n_lora_decay);
1874
0
                ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK,               hparams.n_lora_iclr);
1875
0
                ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
1876
0
                ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
1877
0
                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);
1878
1879
0
                switch (hparams.n_layer) {
1880
0
                    case 12:
1881
0
                        switch (hparams.n_embd) {
1882
0
                            case 768: type = LLM_TYPE_190M; break;
1883
0
                            default: type = LLM_TYPE_UNKNOWN;
1884
0
                        } break;
1885
0
                    case 24:
1886
0
                        switch (hparams.n_embd) {
1887
0
                            case 1024: type = LLM_TYPE_450M; break;
1888
0
                            case 2048: type = LLM_TYPE_1_5B; break;
1889
0
                            default: type = LLM_TYPE_UNKNOWN;
1890
0
                        } break;
1891
0
                    case 28:
1892
0
                        switch (hparams.n_embd) {
1893
0
                            case 1536: type = LLM_TYPE_1_5B; break;
1894
0
                            case 3584: type = LLM_TYPE_7B; break;
1895
0
                            default: type = LLM_TYPE_UNKNOWN;
1896
0
                        } break;
1897
0
                    case 32:
1898
0
                        switch (hparams.n_embd) {
1899
0
                            case 2560: type = LLM_TYPE_2_9B; break;
1900
0
                            case 4096: type = LLM_TYPE_7B; break;
1901
0
                            default: type = LLM_TYPE_UNKNOWN;
1902
0
                        } break;
1903
0
                    case 61:
1904
0
                        switch (hparams.n_embd) {
1905
0
                            case 4096: type = LLM_TYPE_14B; break;
1906
0
                            default: type = LLM_TYPE_UNKNOWN;
1907
0
                        } break;
1908
0
                    default: type = LLM_TYPE_UNKNOWN;
1909
0
                }
1910
0
            } break;
1911
0
        case LLM_ARCH_GRANITE:
1912
0
        case LLM_ARCH_GRANITE_MOE:
1913
0
            {
1914
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1915
0
                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
1916
0
                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale);
1917
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale);
1918
0
                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale);
1919
1920
                // Granite uses rope_finetuned as a switch for rope, so default to true
1921
0
                bool rope_finetuned = true;
1922
0
                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
1923
0
                hparams.rope_finetuned = rope_finetuned;
1924
1925
0
                switch (hparams.n_layer) {
1926
0
                    case 32: type = LLM_TYPE_3B; break;
1927
0
                    case 40: type = LLM_TYPE_3B; break;
1928
                    // Add additional layer/vocab/etc checks here for other model sizes
1929
0
                    default: type = LLM_TYPE_UNKNOWN;
1930
0
                }
1931
1932
                // For Granite MoE Shared
1933
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
1934
0
            } break;
1935
0
        case LLM_ARCH_GRANITE_HYBRID:
1936
0
            {
1937
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1938
0
                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale, /* required */ false);
1939
0
                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, /* required */ false);
1940
0
                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, /* required */ false);
1941
0
                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, /* required */ false);
1942
1943
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
1944
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
1945
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
1946
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1947
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
1948
1949
                // Granite uses rope_finetuned as a switch for rope, so default to true
1950
0
                bool rope_finetuned = true;
1951
0
                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
1952
0
                hparams.rope_finetuned = rope_finetuned;
1953
1954
                // A layer is recurrent IFF the n_head_kv value is set to 0
1955
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1956
0
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1957
0
                }
1958
1959
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1960
1961
0
                switch (hparams.n_embd) {
1962
0
                    case 768: type = LLM_TYPE_350M; break;
1963
0
                    case 1536: type = (hparams.n_embd == 2048 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break;
1964
0
                    case 2048: case 2560: type = LLM_TYPE_3B; break;
1965
0
                    case 4096: type = LLM_TYPE_32B; break;
1966
0
                    default: type = LLM_TYPE_UNKNOWN;
1967
0
                }
1968
1969
                // For Granite MoE Shared
1970
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
1971
0
            } break;
1972
0
        case LLM_ARCH_CHAMELEON:
1973
0
            {
1974
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1975
0
                hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
1976
0
                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
1977
1978
0
                switch (hparams.n_layer) {
1979
0
                    case 32: type = LLM_TYPE_7B; break;
1980
0
                    case 48: type = LLM_TYPE_34B; break;
1981
0
                    default: type = LLM_TYPE_UNKNOWN;
1982
0
               }
1983
0
            } break;
1984
0
        case LLM_ARCH_WAVTOKENIZER_DEC:
1985
0
            {
1986
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
1987
0
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS,    hparams.f_norm_group_eps);
1988
0
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
1989
0
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
1990
0
            } break;
1991
0
        case LLM_ARCH_BAILINGMOE:
1992
0
            {
1993
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1994
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
1995
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
1996
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
1997
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
1998
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
1999
2000
0
                switch (hparams.n_layer) {
2001
0
                    case 28: type = LLM_TYPE_16B; break;
2002
0
                    case 88: type = LLM_TYPE_290B; break;
2003
0
                    default: type = LLM_TYPE_UNKNOWN;
2004
0
                }
2005
0
            } break;
2006
0
        case LLM_ARCH_BAILINGMOE2:
2007
0
            {
2008
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2009
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
2010
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2011
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
2012
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
2013
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale);
2014
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
2015
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
2016
0
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
2017
2018
                // TODO: when MTP is implemented, this should probably be updated if needed
2019
0
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
2020
2021
0
                switch (hparams.n_layer) {
2022
0
                    case 20: type = LLM_TYPE_16B_A1B; break;
2023
0
                    case 21: type = LLM_TYPE_16B_A1B; break;
2024
0
                    case 32: type = LLM_TYPE_100B_A6B; break;
2025
0
                    case 33: type = LLM_TYPE_100B_A6B; break;
2026
0
                    default: type = LLM_TYPE_UNKNOWN;
2027
0
                }
2028
0
            } break;
2029
0
        case LLM_ARCH_DOTS1:
2030
0
            {
2031
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2032
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
2033
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2034
0
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
2035
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
2036
0
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
2037
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2038
0
                switch (hparams.n_layer) {
2039
0
                    case 62: type = LLM_TYPE_142B; break;
2040
0
                    default: type = LLM_TYPE_UNKNOWN;
2041
0
                }
2042
0
            } break;
2043
0
        case LLM_ARCH_ERNIE4_5:
2044
0
        case LLM_ARCH_ERNIE4_5_MOE:
2045
0
            {
2046
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2047
0
                if (arch == LLM_ARCH_ERNIE4_5_MOE) {
2048
0
                    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2049
0
                    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2050
0
                    ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         hparams.n_moe_layer_step);
2051
0
                    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
2052
0
                }
2053
2054
0
                switch (hparams.n_layer) {
2055
0
                    case 18: type = LLM_TYPE_0_3B; break;
2056
0
                    case 28: type = LLM_TYPE_21B_A3B; break;
2057
0
                    case 54: type = LLM_TYPE_300B_A47B; break;
2058
0
                    default: type = LLM_TYPE_UNKNOWN;
2059
0
                }
2060
0
            } break;
2061
0
        case LLM_ARCH_FALCON_H1:
2062
0
            {
2063
                // Common parameters
2064
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2065
2066
                // SSM parameters
2067
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2068
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2069
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2070
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2071
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2072
2073
0
                std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
2074
2075
0
                switch (hparams.n_layer) {
2076
0
                    case 36:
2077
0
                        type = LLM_TYPE_0_5B; break;
2078
0
                    case 24:
2079
0
                        type = LLM_TYPE_1_5B; break;
2080
0
                    case 66:
2081
0
                        type = LLM_TYPE_1B; break;
2082
0
                    case 32:
2083
0
                        type = LLM_TYPE_3B; break;
2084
0
                    case 44:
2085
0
                        type = LLM_TYPE_7B; break;
2086
0
                    case 72:
2087
0
                        type = LLM_TYPE_34B; break;
2088
0
                    default:
2089
0
                        type = LLM_TYPE_UNKNOWN;
2090
0
                }
2091
0
            } break;
2092
0
        case LLM_ARCH_HUNYUAN_MOE:
2093
0
            {
2094
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2095
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2096
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
2097
2098
0
                switch (hparams.n_layer) {
2099
0
                    case 32: type = LLM_TYPE_A13B; break;
2100
0
                    default: type = LLM_TYPE_UNKNOWN;
2101
0
                }
2102
0
            } break;
2103
0
        case LLM_ARCH_HUNYUAN_DENSE:
2104
0
            {
2105
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2106
2107
0
                switch (hparams.n_embd) {
2108
0
                    case 1024: type = LLM_TYPE_0_5B; break;
2109
0
                    case 2048: type = LLM_TYPE_1_8B; break;
2110
0
                    case 3072: type = LLM_TYPE_4B; break;
2111
0
                    case 4096: type = LLM_TYPE_7B; break;
2112
0
                    default: type = LLM_TYPE_UNKNOWN;
2113
0
                }
2114
0
            } break;
2115
0
        case LLM_ARCH_SMOLLM3:
2116
0
            {
2117
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2118
0
                hparams.n_no_rope_layer_step = 4;
2119
2120
0
                switch (hparams.n_layer) {
2121
0
                    case 36: type = LLM_TYPE_3B; break;
2122
0
                    default: type = LLM_TYPE_UNKNOWN;
2123
0
                }
2124
0
            } break;
2125
0
        case LLM_ARCH_OPENAI_MOE:
2126
0
            {
2127
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2128
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2129
0
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
2130
2131
0
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2132
0
                hparams.set_swa_pattern(2);
2133
2134
0
                switch (hparams.n_layer) {
2135
0
                    case 24: type = LLM_TYPE_20B; break;
2136
0
                    case 36: type = LLM_TYPE_120B; break;
2137
0
                    default: type = LLM_TYPE_UNKNOWN;
2138
0
                }
2139
0
            } break;
2140
0
        case LLM_ARCH_LFM2:
2141
0
            {
2142
0
                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
2143
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2144
0
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2145
0
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2146
0
                }
2147
0
                hparams.n_layer_dense_lead = hparams.n_layer;
2148
0
                switch (hparams.n_ff()) {
2149
0
                    case  4608: type = LLM_TYPE_350M; break;
2150
0
                    case  6912: type = LLM_TYPE_700M; break;
2151
0
                    case  8192: type = LLM_TYPE_1_2B; break;
2152
0
                    case 10752: type = LLM_TYPE_2_6B; break;
2153
0
                    default:    type = LLM_TYPE_UNKNOWN;
2154
0
                }
2155
0
            } break;
2156
0
        case LLM_ARCH_LFM2MOE:
2157
0
            {
2158
0
                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
2159
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2160
0
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
2161
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
2162
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
2163
2164
0
                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2165
0
                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2166
0
                }
2167
2168
0
                type = LLM_TYPE_8B_A1B;
2169
0
            } break;
2170
0
        case LLM_ARCH_SMALLTHINKER:
2171
0
            {
2172
0
                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
2173
2174
0
                if (found_swa && hparams.n_swa > 0) {
2175
0
                    hparams.swa_type      = LLAMA_SWA_TYPE_STANDARD;
2176
0
                    hparams.n_swa         = 4096;
2177
0
                    hparams.set_swa_pattern(4, true);
2178
0
                } else {
2179
0
                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
2180
0
                    hparams.n_no_rope_layer_step = hparams.n_layer;
2181
0
                }
2182
2183
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
2184
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2185
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
2186
2187
0
                switch (hparams.n_layer) {
2188
0
                    case 32: type = LLM_TYPE_4B;  break;
2189
0
                    case 52: type = LLM_TYPE_20B; break;
2190
0
                    default: type = LLM_TYPE_UNKNOWN;
2191
0
                }
2192
0
            } break;
2193
0
        case LLM_ARCH_GROVEMOE:
2194
0
            {
2195
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
2196
0
                ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  hparams.n_ff_chexp);
2197
0
                ml.get_key(LLM_KV_EXPERT_GROUP_SCALE,                hparams.expert_group_scale);
2198
0
                ml.get_key(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
2199
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2200
2201
0
                switch (hparams.n_layer) {
2202
0
                    case 48: type = LLM_TYPE_30B_A3B; break;
2203
0
                    default: type = LLM_TYPE_UNKNOWN;
2204
0
                }
2205
0
            } break;
2206
0
        case LLM_ARCH_APERTUS:
2207
0
            {
2208
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2209
0
                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N,        hparams.xielu_alpha_n, hparams.n_layer);
2210
0
                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P,        hparams.xielu_alpha_p, hparams.n_layer);
2211
0
                ml.get_key_or_arr(LLM_KV_XIELU_BETA,           hparams.xielu_beta,    hparams.n_layer);
2212
0
                ml.get_key_or_arr(LLM_KV_XIELU_EPS,            hparams.xielu_eps,     hparams.n_layer);
2213
2214
0
                switch (hparams.n_layer) {
2215
0
                    case 32: type = LLM_TYPE_8B; break;
2216
0
                    default: type = LLM_TYPE_UNKNOWN;
2217
0
                }
2218
0
            } break;
2219
0
        case LLM_ARCH_MINIMAX_M2:
2220
0
            {
2221
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
2222
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp);
2223
0
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,           hparams.expert_gating_func, false);
2224
2225
0
                switch (hparams.n_layer) {
2226
0
                    case 62: type = LLM_TYPE_230B_A10B; break;
2227
0
                    default: type = LLM_TYPE_UNKNOWN;
2228
0
                }
2229
0
            } break;
2230
0
        case LLM_ARCH_COGVLM:
2231
0
            {
2232
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2233
0
                switch (hparams.n_layer) {
2234
0
                    case 32: type = LLM_TYPE_13B; break;
2235
0
                    default: type = LLM_TYPE_UNKNOWN;
2236
0
                }
2237
0
            } break;
2238
0
        case LLM_ARCH_PANGU_EMBED:
2239
0
            {
2240
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2241
0
                switch (hparams.n_layer) {
2242
0
                    case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
2243
0
                    case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
2244
0
                    default: type = LLM_TYPE_UNKNOWN;
2245
0
                }
2246
0
            } break;
2247
0
        case LLM_ARCH_QWEN3NEXT:
2248
0
            {
2249
0
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
2250
0
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2251
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
2252
2253
                // Load linear attention (gated delta net) parameters
2254
0
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
2255
0
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
2256
0
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
2257
0
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2258
0
                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
2259
2260
                // Mark recurrent layers (linear attention layers)
2261
0
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2262
0
                    hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
2263
0
                }
2264
2265
0
                switch (hparams.n_layer) {
2266
0
                    case 48: type = LLM_TYPE_80B_A3B; break;
2267
0
                    default: type = LLM_TYPE_UNKNOWN;
2268
0
                }
2269
0
            } break;
2270
0
        case LLM_ARCH_MISTRAL3:
2271
0
            {
2272
0
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2273
0
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
2274
2275
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast,    false);
2276
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow,    false);
2277
0
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, 0.0f);
2278
2279
                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
2280
0
                if (hparams.f_attn_temp_scale != 0.0f) {
2281
0
                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
2282
0
                    if (hparams.n_attn_temp_floor_scale == 0) {
2283
0
                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
2284
0
                    }
2285
0
                }
2286
2287
0
                switch (hparams.n_layer) {
2288
0
                    case 26: type = LLM_TYPE_3B; break;
2289
0
                    case 34: type = LLM_TYPE_8B; break;
2290
0
                    case 40: type = LLM_TYPE_14B; break;
2291
0
                    default: type = LLM_TYPE_UNKNOWN;
2292
0
                }
2293
0
            } break;
2294
0
        default: throw std::runtime_error("unsupported model architecture");
2295
44
    }
2296
2297
    // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
2298
0
    if (hparams.rope_yarn_log_mul != 0.0f) {
2299
0
        const float factor = 1.0f / hparams.rope_freq_scale_train;
2300
2301
        // note: here we assume `mscale == 1.0f`
2302
        // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
2303
0
              float mscale          = 1.0f;
2304
0
        const float mscale_all_dims = hparams.rope_yarn_log_mul;
2305
2306
        // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
2307
        // special-case DEEPSEEK v2:
2308
        // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
2309
0
        if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
2310
0
            mscale = mscale_all_dims;
2311
0
        }
2312
2313
0
        static auto get_mscale = [](float scale, float mscale) {
2314
0
            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
2315
0
        };
2316
2317
0
        hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
2318
2319
0
        LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
2320
0
                __func__, hparams.yarn_attn_factor, mscale, mscale_all_dims);
2321
0
    }
2322
2323
0
    pimpl->n_bytes = ml.n_bytes;
2324
2325
0
    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
2326
2327
0
    if (hparams.f_max_alibi_bias > 0.0f) {
2328
0
        hparams.use_alibi = true;
2329
0
    }
2330
2331
0
    hparams.rope_type = llama_model_rope_type(this);
2332
0
}
2333
2334
0
// Populate this model's vocabulary from the given loader.
//
// ml: model loader forwarded unchanged to vocab.load().
//
// The second argument is an LLM_KV value built from this model's `arch`
// (presumably the per-architecture metadata key helper - it is declared
// outside this file section, confirm there).
void llama_model::load_vocab(llama_model_loader & ml) {
    vocab.load(ml, LLM_KV(arch));
}
2339
2340
0
bool llama_model::load_tensors(llama_model_loader & ml) {
2341
0
    const auto & split_mode   = params.split_mode;
2342
0
    const auto & n_gpu_layers = params.n_gpu_layers;
2343
0
    const auto & use_mlock    = params.use_mlock;
2344
0
    const auto & tensor_split = params.tensor_split;
2345
2346
0
    const int n_layer = hparams.n_layer;
2347
2348
0
    const bool use_mmap_buffer = true;
2349
2350
0
    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
2351
2352
    // build a list of buffer types for the CPU and GPU devices
2353
0
    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
2354
0
    for (auto * dev : devices) {
2355
0
        buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
2356
        // add CPU buffer types as a fallback
2357
0
        buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
2358
0
        pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
2359
0
    }
2360
2361
    // calculate the split points
2362
0
    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
2363
0
    std::vector<float> splits(n_devices());
2364
0
    if (all_zero) {
2365
        // default split, by free memory
2366
0
        for (size_t i = 0; i < n_devices(); ++i) {
2367
0
            ggml_backend_dev_t dev = devices[i];
2368
0
            size_t total;
2369
0
            size_t free;
2370
0
            ggml_backend_dev_memory(dev, &free, &total);
2371
0
            splits[i] = free;
2372
0
        }
2373
0
    } else {
2374
0
        std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
2375
0
    }
2376
2377
    // sum and normalize the splits to get the split points
2378
0
    float split_sum = 0.0f;
2379
0
    for (size_t i = 0; i < n_devices(); ++i) {
2380
0
        split_sum += splits[i];
2381
0
        splits[i] = split_sum;
2382
0
    }
2383
0
    for (size_t i = 0; i < n_devices(); ++i) {
2384
0
        splits[i] /= split_sum;
2385
0
    }
2386
2387
0
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2388
0
    if (cpu_dev == nullptr) {
2389
0
        throw std::runtime_error(format("%s: no CPU backend found", __func__));
2390
0
    }
2391
0
    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
2392
0
    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
2393
0
    // Pick the device and buffer type list for layer index `il`.
    // Layers in [i_gpu_start, i_gpu_start + act_gpu_layers) go to one of `devices`, chosen by looking up the
    // relative position of the layer in the cumulative `splits`; every other layer stays on the CPU device.
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
        const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);

        const int  il_rel     = il - i_gpu_start;
        const bool offloaded  = il >= i_gpu_start && il_rel < act_gpu_layers;

        if (offloaded) {
            // first split point that is strictly greater than the relative layer position
            const float pos       = float(il_rel)/act_gpu_layers;
            const auto  split_end = splits.begin() + n_devices();
            const int   layer_gpu = std::upper_bound(splits.begin(), split_end, pos) - splits.begin();

            auto * dev = devices.at(layer_gpu);
            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
            return {dev, &pimpl->gpu_buft_list.at(dev)};
        }

        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
        return {cpu_dev, &pimpl->cpu_buft_list};
    };
2404
2405
    // assign the input layer
2406
    // there is very little benefit to offloading the input layer, so always keep it on the CPU
2407
0
    pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
2408
2409
    // assign the repeating layers to the devices according to the splits
2410
0
    pimpl->dev_layer.resize(n_layer);
2411
0
    for (int il = 0; il < n_layer; ++il) {
2412
0
        pimpl->dev_layer[il] = get_layer_buft_list(il);
2413
0
    }
2414
2415
    // assign the output layer
2416
0
    pimpl->dev_output = get_layer_buft_list(n_layer);
2417
2418
    // one ggml context per buffer type
2419
0
    int max_n_tensors = ml.n_tensors;
2420
0
    max_n_tensors += 1;         // duplicated output tensor
2421
0
    max_n_tensors += n_layer*2; // duplicated rope freq tensors
2422
0
    const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
2423
2424
    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
2425
0
    struct ggml_backend_buft_comparator {
2426
0
        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
2427
0
            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
2428
0
        }
2429
0
    };
2430
0
    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
2431
2432
0
    // Return the ggml context associated with `buft`, creating and registering it in `ctx_map` on first use.
    // Throws std::runtime_error if the context cannot be created.
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        const auto found = ctx_map.find(buft);
        if (found != ctx_map.end()) {
            return found->second.get();
        }

        // new context: `ctx_size` bytes, no external buffer, no_alloc set
        ggml_init_params init_params = {
            /*.mem_size   =*/ ctx_size,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };

        ggml_context * ctx = ggml_init(init_params);
        if (!ctx) {
            throw std::runtime_error(format("failed to create ggml context"));
        }

        ctx_map.emplace(buft, ctx);

        return ctx;
    };
2452
2453
0
    const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
2454
0
    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
2455
0
    const auto TENSOR_SKIP         = llama_model_loader::TENSOR_SKIP;
2456
2457
    // create tensors for the weights
2458
0
    {
2459
        // note: cast to int64_t since we will use these for the tensor dimensions
2460
0
        const int64_t n_head        = hparams.n_head();
2461
0
        const int64_t n_head_kv     = hparams.n_head_kv();
2462
0
        const int64_t n_embd        = hparams.n_embd;
2463
0
        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
2464
0
        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
2465
0
        const int64_t n_embd_head_k = hparams.n_embd_head_k;
2466
0
        const int64_t n_embd_head_v = hparams.n_embd_head_v;
2467
0
        const int64_t n_ff          = hparams.n_ff();
2468
0
        const int64_t n_embd_gqa    = n_embd_v_gqa;
2469
0
        const int64_t n_vocab       = vocab.n_tokens();
2470
0
        const int64_t n_token_types = vocab.n_token_types();
2471
0
        const int64_t n_rot         = hparams.n_rot;
2472
0
        const int64_t n_expert      = hparams.n_expert;
2473
0
        const int64_t n_expert_used = hparams.n_expert_used;
2474
0
        const int64_t n_ctx_train   = hparams.n_ctx_train;
2475
2476
0
        if (n_expert > 0 && hparams.n_expert_used == 0) {
2477
0
            throw std::runtime_error("model has expert layers but no expert layers are used");
2478
0
        }
2479
2480
0
        int n_moved_tensors = 0;
2481
0
        ggml_tensor * first_moved_tensor = nullptr;
2482
0
        ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
2483
0
        ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
2484
2485
0
        // Create the weight tensor named by `tn` with shape `ne` in the context of the buffer type selected for it.
        //   flags: combination of TENSOR_NOT_REQUIRED / TENSOR_DUPLICATED / TENSOR_SKIP
        // Returns nullptr when the tensor is missing but not required, or when it is unused/skipped.
        // Throws std::runtime_error when a required tensor, its tensor info mapping, a compatible buffer type
        // or the CPU backend cannot be found.
        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
            const std::string name = tn.str();

            ggml_tensor * meta = ml.get_tensor_meta(name.c_str());
            if (meta == nullptr) {
                if ((flags & TENSOR_NOT_REQUIRED) == 0) {
                    throw std::runtime_error(format("missing tensor '%s'", name.c_str()));
                }
                return nullptr;
            }

            // some models use the token embedding tensor as the output, but since these are used in different layers
            // and with different ops the tensor is duplicated
            // a duplicated token embedding is therefore assumed to be loaded as the output tensor
            const bool       dup_tok_embd = tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED);
            const llm_tensor tn_tensor    = dup_tok_embd ? LLM_TENSOR_OUTPUT : tn.tensor;

            llm_tensor_info info;
            try {
                info = llm_tensor_info_for(tn_tensor);
            } catch (const std::out_of_range &) {
                throw std::runtime_error(format("missing tensor info mapping for %s", name.c_str()));
            }

            // skip unused tensors (still counted as created, and their size is removed from the loader's data size)
            if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
                const size_t nbytes = ggml_nbytes(meta);
                LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", name.c_str(), nbytes);

                ml.size_data -= nbytes;
                ml.n_created++;

                return nullptr;
            }

            // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
            const bool    is_bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
            const ggml_op op      = !is_bias                       ? info.op
                                  : info.op == GGML_OP_MUL_MAT_ID ? GGML_OP_ADD_ID
                                                                  : GGML_OP_ADD;

            // sanity checks: input/output tensors must not have a layer number, repeating tensors must have one
            const bool is_io_layer = info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT;
            if (is_io_layer && tn.bid != -1) {
                GGML_ABORT("input/output layer tensor %s used with a layer number", name.c_str());
            }
            if (!is_io_layer && tn.bid == -1) {
                GGML_ABORT("repeating layer tensor %s used without a layer number", name.c_str());
            }

            // select the buffer type list for this tensor
            buft_list_t * buft_list;
            switch (info.layer) {
                case LLM_TENSOR_LAYER_INPUT:
                    buft_list = pimpl->dev_input.buft_list;
                    break;
                case LLM_TENSOR_LAYER_OUTPUT:
                    buft_list = pimpl->dev_output.buft_list;
                    break;
                case LLM_TENSOR_LAYER_REPEATING:
                    buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
                    break;
                default:
                    GGML_ABORT("invalid layer %d for tensor %s", info.layer, name.c_str());
            }

            ggml_backend_buffer_type_t buft = nullptr;

            // check overrides: the first pattern that matches the tensor name is used
            if (ml.tensor_buft_overrides) {
                for (const auto * ov = ml.tensor_buft_overrides; ov->pattern != nullptr; ++ov) {
                    const std::regex pattern(ov->pattern);
                    if (!std::regex_search(name, pattern)) {
                        continue;
                    }

                    // when overriding to a CPU buffer, consider the extra buffer types
                    buft = ov->buft == ggml_backend_cpu_buffer_type()
                        ? select_weight_buft(hparams, meta, op, pimpl->cpu_buft_list)
                        : ov->buft;

                    LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                            name.c_str(),
                            ggml_nbytes(meta) / 1024 / 1024, ggml_type_name(meta->type),
                            ggml_backend_buft_name(buft));
                    break;
                }
            }

            // no override applied: pick from the list selected above
            if (!buft) {
                buft = select_weight_buft(hparams, meta, op, *buft_list);
                if (!buft) {
                    throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", name.c_str()));
                }
            }

            // avoid using a host buffer when using mmap
            auto * buft_dev = ggml_backend_buft_get_device(buft);
            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
                auto * dev_cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!dev_cpu) {
                    throw std::runtime_error("no CPU backend found");
                }
                buft = ggml_backend_dev_buffer_type(dev_cpu);
            }

            // keep track of tensors that did not end up in the first buffer type of their list
            const auto buft_first = buft_list->front().second;
            if (buft != buft_first) {
                n_moved_tensors++;
                if (!first_moved_tensor) {
                    first_moved_tensor    = meta;
                    first_moved_from_buft = buft_first;
                    first_moved_to_buft   = buft;
                }
            }

            ggml_context * ctx = ctx_for_buft(buft);

            // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
            if (flags & TENSOR_DUPLICATED) {
                if (ggml_tensor * t = ggml_get_tensor(ctx, name.c_str())) {
                    return t;
                }
            }

            return ml.create_tensor(ctx, tn, ne, flags);
        };
2622
2623
0
        layers.resize(n_layer);
2624
2625
        // TODO: move to a separate function
2626
0
        const auto tn = LLM_TN(arch);
2627
0
        switch (arch) {
2628
0
            case LLM_ARCH_LLAMA:
2629
0
            case LLM_ARCH_REFACT:
2630
0
            case LLM_ARCH_MINICPM:
2631
0
            case LLM_ARCH_GRANITE:
2632
0
            case LLM_ARCH_GRANITE_MOE:
2633
0
            case LLM_ARCH_MISTRAL3:
2634
0
                {
2635
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2636
2637
                    // output
2638
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2639
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2640
2641
                    // if output is NULL, init from the input tok embed
2642
0
                    if (output == NULL) {
2643
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2644
0
                    }
2645
2646
0
                    for (int i = 0; i < n_layer; ++i) {
2647
0
                        auto & layer = layers[i];
2648
2649
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2650
2651
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2652
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
2653
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
2654
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2655
2656
                        // optional bias tensors
2657
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
2658
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2659
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2660
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
2661
2662
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2663
2664
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
2665
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2666
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2667
0
                        }
2668
0
                        else {
2669
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2670
0
                        }
2671
2672
0
                        if (n_expert == 0) {
2673
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
2674
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2675
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2676
2677
                            // optional MLP bias
2678
0
                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2679
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2680
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2681
0
                        } else {
2682
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
2683
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
2684
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
2685
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
2686
2687
                            // For Granite MoE Shared
2688
0
                            if (hparams.n_ff_shexp > 0) {
2689
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
2690
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
2691
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
2692
0
                            }
2693
0
                        }
2694
0
                    }
2695
0
                } break;
2696
0
            case LLM_ARCH_LLADA:
2697
0
                {
2698
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
2699
2700
                    // output
2701
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
2702
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
2703
2704
                    // if output is NULL, init from the input tok embed
2705
0
                    if (output == NULL) {
2706
0
                        output =
2707
0
                            create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
2708
0
                    }
2709
2710
0
                    for (int i = 0; i < n_layer; ++i) {
2711
0
                        auto & layer = layers[i];
2712
2713
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
2714
2715
                        // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
2716
0
                        layer.wq =
2717
0
                            create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
2718
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
2719
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
2720
                        // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
2721
0
                        layer.wo =
2722
0
                            create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
2723
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2724
2725
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
2726
2727
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
2728
0
                                                         TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2729
2730
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
2731
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
2732
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
2733
2734
                        // optional MLP bias
2735
0
                        layer.ffn_gate_b =
2736
0
                            create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2737
0
                        layer.ffn_down_b =
2738
0
                            create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2739
0
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2740
0
                    }
2741
0
                }
2742
0
                break;
2743
0
            case LLM_ARCH_LLADA_MOE:
2744
0
                {
2745
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2746
2747
                    // output
2748
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2749
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
2750
2751
0
                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
2752
0
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
2753
2754
0
                    for (int i = 0; i < n_layer; ++i) {
2755
0
                        auto & layer = layers[i];
2756
2757
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2758
2759
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
2760
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
2761
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
2762
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2763
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
2764
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
2765
2766
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2767
2768
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2769
2770
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
2771
2772
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
2773
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
2774
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
2775
0
                    }
2776
0
                } break;
2777
0
            case LLM_ARCH_LLAMA4:
2778
0
                {
2779
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2780
2781
                    // output
2782
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2783
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2784
2785
                    // if output is NULL, init from the input tok embed
2786
0
                    if (output == NULL) {
2787
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2788
0
                    }
2789
2790
0
                    for (int i = 0; i < n_layer; ++i) {
2791
0
                        bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
2792
2793
0
                        auto & layer = layers[i];
2794
2795
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2796
2797
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2798
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
2799
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
2800
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2801
2802
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2803
2804
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2805
2806
0
                        if (is_moe_layer) {
2807
0
                            int n_ff_exp = hparams.n_ff_exp;
2808
2809
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
2810
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
2811
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
2812
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
2813
2814
                            // Shared expert
2815
0
                            const int64_t n_ff_shexp = n_ff_exp;
2816
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
2817
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd    }, 0);
2818
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
2819
0
                        } else {
2820
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
2821
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2822
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2823
0
                        }
2824
0
                    }
2825
0
                } break;
2826
0
            case LLM_ARCH_DECI:
2827
0
                {
2828
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2829
2830
                    // output
2831
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2832
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2833
2834
                    // if output is NULL, init from the input tok embed
2835
0
                    if (output == NULL) {
2836
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2837
0
                    }
2838
2839
0
                    for (int i = 0; i < n_layer; ++i) {
2840
0
                        auto & layer = layers[i];
2841
0
                        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
2842
0
                        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
2843
0
                        const int64_t n_embd_gqa    = hparams.n_embd_v_gqa(i);
2844
0
                        const int64_t n_ff          = hparams.n_ff(i);
2845
0
                        const int64_t n_head        = hparams.n_head(i);
2846
0
                        const int64_t n_head_kv     = hparams.n_head_kv(i);
2847
2848
0
                        if (n_head_kv == 0 && n_head > 0) {
2849
                            // linear attention for DeciLMCausalModel
2850
0
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2851
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2852
0
                        }
2853
0
                        else if (n_head_kv > 0) {
2854
0
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2855
2856
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2857
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
2858
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
2859
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2860
0
                        }
2861
2862
                        // optional bias tensors
2863
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
2864
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2865
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2866
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
2867
2868
0
                        if (n_ff > 0) {
2869
0
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2870
0
                        }
2871
2872
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
2873
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2874
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2875
0
                        }
2876
0
                        else {
2877
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2878
0
                        }
2879
2880
0
                        if (n_ff > 0) {
2881
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
2882
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2883
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2884
0
                        }
2885
2886
                        // optional MLP bias
2887
0
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2888
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2889
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2890
0
                    }
2891
0
                } break;
2892
0
            case LLM_ARCH_MINICPM3:
2893
0
                {
2894
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
2895
0
                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
2896
2897
0
                    const int64_t q_lora_rank  = hparams.n_lora_q;
2898
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
2899
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2900
2901
                    // output
2902
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2903
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2904
2905
                    // if output is NULL, init from the input tok embed
2906
0
                    if (output == NULL) {
2907
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2908
0
                    }
2909
2910
0
                    for (int i = 0; i < n_layer; ++i) {
2911
0
                        auto & layer = layers[i];
2912
2913
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2914
0
                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
2915
2916
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
2917
2918
0
                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
2919
0
                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
2920
2921
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
2922
0
                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
2923
0
                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
2924
2925
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2926
2927
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
2928
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
2929
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
2930
2931
0
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2932
0
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2933
0
                    }
2934
0
                } break;
2935
0
            case LLM_ARCH_GROK:
2936
0
                {
2937
0
                    if (n_expert == 0) {
2938
0
                        throw std::runtime_error("Grok model cannot have zero experts");
2939
0
                    }
2940
2941
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2942
2943
                    // output
2944
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2945
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2946
2947
                    // if output is NULL, init from the input tok embed
2948
0
                    if (output == NULL) {
2949
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2950
0
                    }
2951
2952
0
                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
2953
0
                    for (int i = 0; i < n_layer; ++i) {
2954
0
                        auto & layer = layers[i];
2955
2956
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2957
2958
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
2959
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
2960
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
2961
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2962
2963
0
                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
2964
2965
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2966
2967
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2968
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff,   n_embd}, TENSOR_NOT_REQUIRED);
2969
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2970
2971
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
2972
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
2973
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd,   n_expert}, 0);
2974
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
2975
2976
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
2977
0
                        if (!layer.ffn_post_norm) {
2978
0
                            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
2979
0
                        }
2980
0
                    }
2981
0
                } break;
2982
0
            case LLM_ARCH_DBRX:
2983
0
                {
2984
0
                    if (n_expert == 0) {
2985
0
                        throw std::runtime_error("DBRX model cannot have zero experts");
2986
0
                    }
2987
2988
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2989
2990
                    // output
2991
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2992
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
2993
2994
0
                    for (int i = 0; i < n_layer; ++i) {
2995
0
                        auto & layer = layers[i];
2996
2997
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2998
2999
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3000
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3001
3002
0
                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3003
3004
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
3005
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
3006
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
3007
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
3008
0
                    }
3009
0
                } break;
3010
0
            case LLM_ARCH_BAICHUAN:
3011
0
                {
3012
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3013
0
                    {
3014
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3015
0
                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3016
0
                    }
3017
3018
0
                    for (int i = 0; i < n_layer; ++i) {
3019
0
                        auto & layer = layers[i];
3020
3021
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3022
3023
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3024
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3025
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3026
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3027
3028
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3029
3030
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3031
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3032
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3033
0
                    }
3034
0
                } break;
3035
0
            case LLM_ARCH_FALCON:
3036
0
                {
3037
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3038
3039
                    // output
3040
0
                    {
3041
0
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3042
0
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3043
3044
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3045
0
                        if (!output) {
3046
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
3047
0
                        }
3048
0
                    }
3049
3050
0
                    for (int i = 0; i < n_layer; ++i) {
3051
0
                        auto & layer = layers[i];
3052
3053
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3054
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3055
3056
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3057
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3058
3059
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3060
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3061
3062
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3063
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3064
0
                    }
3065
0
                } break;
3066
0
            case LLM_ARCH_STARCODER:
3067
0
                {
3068
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3069
0
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
3070
3071
                    // output
3072
0
                    {
3073
0
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3074
0
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3075
0
                        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3076
0
                        if (!output) {
3077
                            // needs to be on GPU
3078
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3079
0
                        }
3080
3081
0
                    }
3082
3083
0
                    for (int i = 0; i < n_layer; ++i) {
3084
0
                        auto & layer = layers[i];
3085
3086
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3087
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3088
3089
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3090
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
3091
3092
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3093
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
3094
3095
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3096
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
3097
3098
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3099
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
3100
3101
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
3102
0
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
3103
0
                    }
3104
0
                } break;
3105
0
            case LLM_ARCH_BERT:
3106
0
            case LLM_ARCH_NOMIC_BERT:
3107
0
            case LLM_ARCH_NOMIC_BERT_MOE:
3108
0
            case LLM_ARCH_JINA_BERT_V3:
3109
0
                {
3110
0
                    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
3111
0
                    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
3112
3113
0
                    if (arch == LLM_ARCH_BERT) {
3114
0
                        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
3115
3116
0
                        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3117
0
                        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
3118
3119
0
                        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3120
0
                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3121
0
                    }
3122
3123
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
3124
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
3125
3126
0
                    for (int i = 0; i < n_layer; ++i) {
3127
0
                        auto & layer = layers[i];
3128
3129
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3130
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3131
3132
0
                        if (!layer.wqkv) {
3133
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3134
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i),   {n_embd}, 0);
3135
3136
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3137
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i),   {n_embd_gqa}, 0);
3138
3139
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3140
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i),   {n_embd_gqa}, 0);
3141
0
                        }
3142
3143
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_embd, n_embd}, 0);
3144
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3145
3146
0
                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3147
0
                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
3148
3149
0
                        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
3150
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
3151
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
3152
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
3153
0
                        } else {
3154
0
                            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3155
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
3156
0
                            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3157
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3158
3159
0
                            if (arch == LLM_ARCH_NOMIC_BERT) {
3160
0
                                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3161
0
                            }
3162
0
                        }
3163
3164
0
                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3165
0
                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
3166
0
                    }
3167
0
                } break;
3168
0
            case LLM_ARCH_NEO_BERT:
3169
0
                {
3170
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
3171
3172
0
                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3173
0
                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
3174
3175
0
                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3176
0
                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
3177
3178
0
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
3179
3180
0
                    for (int i = 0; i < n_layer; ++i) {
3181
0
                        auto & layer = layers[i];
3182
3183
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3184
3185
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3186
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3187
3188
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3189
3190
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff*2}, 0);
3191
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3192
0
                    }
3193
0
                } break;
3194
0
            case LLM_ARCH_JINA_BERT_V2:
3195
0
                {
3196
0
                    tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
3197
0
                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
3198
3199
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
3200
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0); //LayerNorm bias
3201
3202
0
                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
3203
0
                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {1},         TENSOR_NOT_REQUIRED);
3204
0
                    for (int i = 0; i < n_layer; ++i) {
3205
0
                        auto & layer = layers[i]; // JinaBertLayer
3206
3207
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3208
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
3209
3210
0
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3211
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3212
3213
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3214
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
3215
3216
0
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3217
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3218
3219
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3220
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
3221
3222
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
3223
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0); //output_dens
3224
3225
0
                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
3226
0
                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);
3227
3228
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3229
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3230
3231
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
3232
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
3233
3234
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3235
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
3236
3237
0
                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3238
0
                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd}, 0);
3239
0
                    }
3240
0
                } break;
3241
0
            case LLM_ARCH_BLOOM:
3242
0
                {
3243
0
                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
3244
0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
3245
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
3246
3247
                    // output
3248
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3249
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3250
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3251
3252
                    // if output is NULL, init from the input tok embed
3253
0
                    if (output == NULL) {
3254
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3255
0
                    }
3256
3257
0
                    for (int i = 0; i < n_layer; ++i) {
3258
0
                        auto & layer = layers[i];
3259
3260
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3261
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
3262
3263
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3264
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, 0);
3265
3266
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3267
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);
3268
3269
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3270
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
3271
3272
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3273
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
3274
3275
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3276
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, 0);
3277
0
                    }
3278
0
                } break;
3279
0
            case LLM_ARCH_MPT:
3280
0
                {
3281
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3282
0
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
3283
3284
                    // output
3285
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3286
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
3287
3288
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3289
0
                    if (!output) {
3290
0
                        output    = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
3291
0
                    }
3292
3293
0
                    for (int i = 0; i < n_layer; ++i) {
3294
0
                        auto & layer = layers[i];
3295
3296
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3297
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3298
3299
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3300
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3301
3302
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3303
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3304
3305
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3306
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3307
3308
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3309
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3310
3311
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3312
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
3313
3314
0
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3315
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3316
3317
0
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3318
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
3319
3320
                        // AWQ ScaleActivation layer
3321
0
                        layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
3322
0
                    }
3323
0
                } break;
3324
0
            case LLM_ARCH_STABLELM:
3325
0
                {
3326
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3327
3328
                    // output
3329
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3330
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3331
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3332
3333
0
                    for (int i = 0; i < n_layer; ++i) {
3334
0
                        auto & layer = layers[i];
3335
3336
0
                        layer.attn_norm =   create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3337
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3338
3339
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3340
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3341
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3342
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3343
3344
                        // optional bias tensors, present in Stable LM 2 1.6B
3345
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
3346
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3347
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3348
3349
                        // optional q and k layernorms, present in StableLM 2 12B
3350
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    TENSOR_NOT_REQUIRED);
3351
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
3352
3353
                        // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
3354
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3355
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
3356
3357
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3358
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3359
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3360
0
                    }
3361
0
                } break;
3362
0
            case LLM_ARCH_QWEN:
3363
0
                {
3364
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3365
3366
                    // output
3367
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3368
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3369
3370
0
                    for (int i = 0; i < n_layer; ++i) {
3371
0
                        auto & layer = layers[i];
3372
3373
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3374
3375
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
3376
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd*3}, 0);
3377
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3378
3379
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3380
3381
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
3382
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
3383
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2}, 0);
3384
0
                    }
3385
0
                } break;
3386
0
            case LLM_ARCH_QWEN2:
3387
0
            case LLM_ARCH_QWEN2VL:
3388
0
            case LLM_ARCH_DREAM:
3389
0
                {
3390
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3391
3392
                    // output
3393
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3394
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3395
0
                    output_b    = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, TENSOR_NOT_REQUIRED);
3396
                    // if output is NULL, init from the input tok embed
3397
0
                    if (output == NULL) {
3398
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3399
0
                    }
3400
3401
0
                    for (int i = 0; i < n_layer; ++i) {
3402
0
                        auto & layer = layers[i];
3403
3404
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3405
3406
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3407
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3408
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3409
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3410
3411
                        // bias tensors (created with flags 0, i.e. not marked TENSOR_NOT_REQUIRED)
3412
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
3413
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
3414
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
3415
3416
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3417
3418
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3419
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3420
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3421
0
                    }
3422
0
                } break;
3423
0
            case LLM_ARCH_QWEN2MOE:
3424
0
                {
3425
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3426
3427
                    // output
3428
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3429
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3430
3431
0
                    for (int i = 0; i < n_layer; ++i) {
3432
0
                        auto & layer = layers[i];
3433
3434
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3435
3436
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3437
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3438
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3439
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3440
3441
                        // optional bias tensors
3442
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3443
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3444
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3445
3446
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3447
3448
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3449
3450
0
                        if (n_expert == 0) {
3451
0
                            throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
3452
0
                        }
3453
0
                        if (n_expert_used == 0) {
3454
0
                            throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
3455
0
                        }
3456
3457
                        // MoE branch
3458
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3459
3460
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3461
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
3462
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3463
3464
                        // Shared expert branch
3465
0
                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
3466
3467
0
                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
3468
0
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
3469
0
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd}, 0);
3470
0
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
3471
0
                    }
3472
0
                } break;
3473
0
            case LLM_ARCH_QWEN3:
3474
0
            case LLM_ARCH_QWEN3VL:
3475
0
                {
3476
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3477
3478
                    // output
3479
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3480
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3481
                    // if output is NULL, init from the input tok embed
3482
0
                    if (output == NULL) {
3483
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3484
0
                    }
3485
3486
                    // output rerank head
3487
0
                    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3488
3489
0
                    for (int i = 0; i < n_layer; ++i) {
3490
0
                        auto & layer = layers[i];
3491
3492
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3493
3494
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3495
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3496
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3497
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3498
3499
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3500
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3501
3502
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3503
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3504
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3505
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3506
0
                    }
3507
0
                } break;
3508
0
            case LLM_ARCH_QWEN3MOE:
3509
0
            case LLM_ARCH_QWEN3VLMOE:
3510
0
            case LLM_ARCH_RND1:
3511
0
                {
3512
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3513
3514
                    // output
3515
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3516
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3517
                    // if output is NULL, init from the input tok embed
3518
0
                    if (output == NULL) {
3519
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3520
0
                    }
3521
3522
0
                    for (int i = 0; i < n_layer; ++i) {
3523
0
                        auto & layer = layers[i];
3524
3525
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3526
3527
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3528
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3529
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3530
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3531
3532
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3533
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3534
3535
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3536
3537
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3538
3539
0
                        if (n_expert == 0) {
3540
0
                            throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
3541
0
                        }
3542
0
                        if (n_expert_used == 0) {
3543
0
                            throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
3544
0
                        }
3545
3546
                        // MoE branch
3547
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3548
3549
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3550
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
3551
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
3552
0
                    }
3553
0
                } break;
3554
0
            case LLM_ARCH_PHI2:
3555
0
                {
3556
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3557
3558
                    // output
3559
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3560
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3561
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3562
0
                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);
3563
3564
0
                    for (int i = 0; i < n_layer; ++i) {
3565
0
                        auto & layer = layers[i];
3566
3567
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3568
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3569
3570
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3571
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3572
3573
0
                        if (layer.wqkv == nullptr) {
3574
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3575
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
3576
3577
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3578
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa}, 0);
3579
3580
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3581
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa}, 0);
3582
0
                        }
3583
3584
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3585
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
3586
3587
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3588
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
3589
3590
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3591
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
3592
0
                    }
3593
0
                } break;
3594
0
            case LLM_ARCH_PHI3:
3595
0
                {
3596
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
3597
3598
                    // output
3599
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
3600
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3601
3602
                    // if output is NULL, init from the input tok embed
3603
0
                    if (output == NULL) {
3604
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3605
0
                    }
3606
3607
0
                    for (int i = 0; i < n_layer; ++i) {
3608
0
                        auto & layer = layers[i];
3609
3610
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
3611
3612
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
3613
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
3614
3615
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
3616
3617
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
3618
0
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
3619
3620
0
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3621
0
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3622
0
                    }
3623
0
                } break;
3624
0
            case LLM_ARCH_PHIMOE:
3625
0
                {
3626
0
                    const int64_t n_embd_head = n_embd / n_head;
3627
3628
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
3629
3630
                    // output
3631
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
3632
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3633
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
3634
0
                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);
3635
3636
0
                    for (int i = 0; i < n_layer; ++i) {
3637
0
                        auto & layer = layers[i];
3638
3639
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
3640
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);
3641
3642
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
3643
0
                        if (layer.wqkv == nullptr) {
3644
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3645
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd}, 0);
3646
3647
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3648
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
3649
3650
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3651
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
3652
0
                        }
3653
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
3654
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);
3655
3656
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
3657
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);
3658
3659
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert},         0);
3660
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
3661
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
3662
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
3663
3664
0
                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3665
0
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3666
0
                     }
3667
0
                } break;
3668
0
            case LLM_ARCH_PLAMO:
3669
0
                {
3670
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3671
3672
                    // output
3673
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3674
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3675
3676
0
                    for (int i = 0; i < n_layer; ++i) {
3677
0
                        auto & layer = layers[i];
3678
3679
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3680
3681
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3682
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3683
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3684
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3685
3686
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3687
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3688
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3689
0
                    }
3690
0
                } break;
3691
0
            case LLM_ARCH_PLAMO2:
3692
0
                {
3693
                    // mamba parameters
3694
0
                    const uint32_t d_conv             = hparams.ssm_d_conv;
3695
0
                    const uint32_t d_state            = hparams.ssm_d_state;
3696
0
                    const uint32_t num_heads          = hparams.ssm_dt_rank;
3697
0
                    const uint32_t intermediate_size  = hparams.ssm_d_inner;
3698
0
                    const int64_t dt_dim              = std::max(64, int(hparams.n_embd / 16));
3699
3700
                    // attention parameters
3701
0
                    const uint32_t qk_dim = hparams.n_embd_head_k;
3702
0
                    const uint32_t v_dim  = hparams.n_embd_head_v;
3703
3704
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3705
3706
                    // output
3707
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3708
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3709
                    // if output is NULL, init from the input tok embed
3710
0
                    if (output == NULL) {
3711
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3712
0
                    }
3713
3714
0
                    for (int i = 0; i < n_layer; ++i) {
3715
0
                        auto & layer = layers[i];
3716
0
                        bool is_mamba_layer = hparams.is_recurrent(i);
3717
3718
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3719
3720
0
                        if (is_mamba_layer) {
3721
0
                            layer.ssm_in       = create_tensor(tn(LLM_TENSOR_SSM_IN,     "weight", i), {n_embd, 2 * intermediate_size}, 0);
3722
0
                            layer.ssm_conv1d   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
3723
3724
0
                            layer.ssm_x    = create_tensor(tn(LLM_TENSOR_SSM_X,  "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
3725
0
                            layer.ssm_dt   = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
3726
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
3727
3728
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
3729
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
3730
3731
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
3732
3733
0
                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
3734
0
                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
3735
0
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
3736
0
                        } else {
3737
0
                            const int64_t num_attention_heads = hparams.n_head(i);
3738
0
                            const int64_t q_num_heads         = num_attention_heads;
3739
0
                            const int64_t num_key_value_heads = hparams.n_head_kv(i);
3740
0
                            const int64_t k_num_heads         = num_key_value_heads;
3741
0
                            const int64_t v_num_heads         = num_key_value_heads;
3742
0
                            const int64_t q_proj_dim          = q_num_heads * qk_dim;
3743
0
                            const int64_t k_proj_dim          = k_num_heads * qk_dim;
3744
0
                            const int64_t v_proj_dim          = v_num_heads * v_dim;
3745
3746
0
                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
3747
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
3748
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
3749
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
3750
0
                        }
3751
3752
                        // All layers have post-attention norm, FFN norm, and FFN tensors
3753
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
3754
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3755
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3756
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
3757
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
3758
0
                    }
3759
0
                } break;
3760
0
            case LLM_ARCH_GPT2:
3761
0
                {
3762
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3763
0
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
3764
3765
                    // output
3766
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3767
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3768
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3769
3770
                    // if output is NULL, init from the input tok embed
3771
0
                    if (output == NULL) {
3772
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3773
0
                    }
3774
3775
0
                    for (int i = 0; i < n_layer; ++i) {
3776
0
                        auto & layer = layers[i];
3777
3778
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
3779
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
3780
3781
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3782
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
3783
3784
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3785
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
3786
3787
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3788
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
3789
3790
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3791
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
3792
3793
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
3794
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
3795
0
                    }
3796
0
                } break;
3797
0
            case LLM_ARCH_CODESHELL:
3798
0
                {
3799
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3800
3801
                    // if tok embd is NULL, init from output
3802
0
                    if (tok_embd == NULL) {
3803
0
                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3804
0
                    }
3805
3806
                    // output
3807
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3808
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3809
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3810
3811
0
                    for (int i = 0; i < n_layer; ++i) {
3812
0
                        auto & layer = layers[i];
3813
3814
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3815
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3816
3817
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3818
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
3819
3820
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3821
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
3822
3823
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3824
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
3825
3826
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3827
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
3828
3829
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
3830
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
3831
0
                    }
3832
0
                } break;
3833
0
            case LLM_ARCH_ORION:
3834
0
                {
3835
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3836
3837
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3838
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
3839
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3840
3841
0
                    for (int i = 0; i < n_layer; ++i) {
3842
0
                        auto & layer = layers[i];
3843
3844
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3845
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
3846
3847
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3848
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3849
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3850
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3851
3852
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3853
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
3854
3855
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3856
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3857
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3858
0
                    }
3859
0
                } break;
3860
0
            case LLM_ARCH_INTERNLM2:
3861
0
                {
3862
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3863
3864
                    // output
3865
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3866
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
3867
3868
0
                    for (int i = 0; i < n_layer; ++i) {
3869
0
                        auto & layer = layers[i];
3870
3871
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3872
                        // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3873
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
3874
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
3875
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
3876
3877
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3878
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3879
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3880
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3881
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3882
0
                    }
3883
0
                } break;
3884
0
            case LLM_ARCH_GEMMA:
3885
0
                {
3886
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3887
3888
                    // output
3889
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3890
0
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
3891
3892
0
                    for (int i = 0; i < n_layer; ++i) {
3893
0
                        auto & layer = layers[i];
3894
3895
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3896
3897
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3898
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3899
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3900
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3901
3902
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3903
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3904
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3905
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3906
0
                    }
3907
0
                } break;
3908
0
            case LLM_ARCH_GEMMA2:
3909
0
                {
3910
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3911
3912
                    // output
3913
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3914
0
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
3915
3916
0
                    for (int i = 0; i < n_layer; ++i) {
3917
0
                        auto & layer = layers[i];
3918
3919
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3920
3921
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3922
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3923
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3924
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3925
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3926
3927
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3928
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3929
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3930
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3931
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3932
0
                    }
3933
0
                } break;
3934
0
            case LLM_ARCH_GEMMA3:
3935
0
            case LLM_ARCH_GEMMA_EMBEDDING:
3936
0
                {
3937
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3938
3939
                    // output
3940
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3941
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3942
3943
                    // if output is NULL, init from the input tok embed
3944
0
                    if (output == NULL) {
3945
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,   "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3946
0
                    }
3947
3948
                    // Dense linear weights
3949
0
                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
3950
0
                    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
3951
3952
3953
0
                    for (int i = 0; i < n_layer; ++i) {
3954
0
                        auto & layer = layers[i];
3955
3956
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3957
3958
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3959
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
3960
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
3961
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3962
3963
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3964
0
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
3965
0
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
3966
3967
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3968
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
3969
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
3970
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
3971
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3972
0
                    }
3973
0
                } break;
3974
0
            case LLM_ARCH_GEMMA3N:
3975
0
                {
3976
0
                    const int64_t n_altup      = hparams.n_altup;
3977
0
                    const int64_t laurel_rank  = hparams.laurel_rank;
3978
0
                    const int64_t n_embd_altup = hparams.n_embd_altup;
3979
3980
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3981
                    // if output is NULL, init from the input tok embed
3982
0
                    if (output == NULL) {
3983
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3984
0
                    }
3985
3986
0
                    tok_embd           = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,           "weight"), {n_embd, n_vocab}, 0);
3987
0
                    tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
3988
3989
0
                    altup_proj           = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ,           "weight"), {n_embd, n_embd, n_altup - 1}, 0);
3990
0
                    altup_unembd_proj    = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ,    "weight"), {n_embd, n_embd, n_altup - 1}, 0);
3991
0
                    per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
3992
0
                    per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM,  "weight"), {n_embd_altup}, 0);
3993
3994
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3995
3996
0
                    for (int i = 0; i < n_layer; ++i) {
3997
0
                        auto & layer = layers[i];
3998
3999
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4000
4001
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4002
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4003
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4004
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4005
4006
0
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
4007
0
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
4008
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4009
4010
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4011
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4012
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4013
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4014
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4015
4016
                        // altup & laurel
4017
0
                        layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE,  "weight", i), {n_embd, n_embd_altup}, 0);
4018
0
                        layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ,      "weight", i), {n_embd_altup, n_embd}, 0);
4019
0
                        layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
4020
0
                        layer.altup_correct_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF,  "weight", i), {n_altup, n_altup}, 0);
4021
0
                        layer.altup_correct_scale  = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
4022
0
                        layer.altup_predict_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF,  "weight", i), {n_altup, n_altup * n_altup}, 0);
4023
0
                        layer.altup_router         = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER,        "weight", i), {n_embd, n_altup}, 0);
4024
0
                        layer.altup_router_norm    = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM,   "weight", i), {n_embd}, 0);
4025
0
                        layer.laurel_l             = create_tensor(tn(LLM_TENSOR_LAUREL_L,            "weight", i), {n_embd, laurel_rank}, 0);
4026
0
                        layer.laurel_r             = create_tensor(tn(LLM_TENSOR_LAUREL_R,            "weight", i), {laurel_rank, n_embd}, 0);
4027
0
                        layer.laurel_post_norm     = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM,    "weight", i), {n_embd}, 0);
4028
0
                    }
4029
0
                } break;
4030
0
            case LLM_ARCH_STARCODER2:
4031
0
                {
4032
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4033
4034
                    // output
4035
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4036
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4037
4038
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4039
                    // if output is NULL, init from the input tok embed
4040
0
                    if (output == NULL) {
4041
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4042
0
                    }
4043
4044
0
                    for (int i = 0; i < n_layer; ++i) {
4045
0
                        auto & layer = layers[i];
4046
4047
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4048
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4049
4050
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4051
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4052
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4053
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4054
4055
                        // optional bias tensors
4056
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
4057
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
4058
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
4059
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
4060
4061
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4062
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4063
4064
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4065
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4066
4067
                        // optional bias tensors
4068
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
4069
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff}, 0);
4070
0
                    }
4071
0
                } break;
4072
0
            case LLM_ARCH_MAMBA:
4073
0
                {
4074
0
                    const int64_t d_conv  = hparams.ssm_d_conv;
4075
0
                    const int64_t d_inner = hparams.ssm_d_inner;
4076
0
                    const int64_t d_state = hparams.ssm_d_state;
4077
0
                    const int64_t dt_rank = hparams.ssm_dt_rank;
4078
4079
                    // only an expansion factor of 2 is supported for now
4080
0
                    if (2 * n_embd != d_inner) {
4081
0
                        throw std::runtime_error("only an expansion factor of 2 is supported for now");
4082
0
                    }
4083
4084
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4085
4086
                    // output
4087
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4088
4089
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4090
                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
4091
0
                    if (output == NULL) {
4092
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4093
0
                    }
4094
4095
0
                    for (int i = 0; i < n_layer; ++i) {
4096
0
                        auto & layer = layers[i];
4097
4098
                        // norm
4099
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4100
4101
0
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
4102
4103
0
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
4104
0
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
4105
4106
0
                        layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
4107
4108
0
                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
4109
0
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
4110
4111
                        // no "weight" suffix for these
4112
0
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
4113
0
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
4114
4115
                        // out_proj
4116
0
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4117
0
                    }
4118
0
                } break;
4119
0
            case LLM_ARCH_MAMBA2:
4120
0
                {
4121
0
                    const int64_t d_conv  = hparams.ssm_d_conv;
4122
0
                    const int64_t d_inner = hparams.ssm_d_inner;
4123
0
                    const int64_t d_state = hparams.ssm_d_state;
4124
0
                    const int64_t n_head  = hparams.ssm_dt_rank;
4125
0
                    const int64_t n_group = hparams.ssm_n_group;
4126
0
                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
4127
4128
                    // only an expansion factor of 2 is supported for now
4129
0
                    GGML_ASSERT(2 * n_embd == d_inner);
4130
4131
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4132
4133
                    // output
4134
0
                    {
4135
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4136
4137
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4138
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
4139
0
                        if (output == NULL) {
4140
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4141
0
                        }
4142
0
                    }
4143
4144
0
                    for (int i = 0; i < n_layer; ++i) {
4145
0
                        auto & layer = layers[i];
4146
4147
                        // norm
4148
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4149
4150
0
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4151
4152
0
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
4153
0
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
4154
4155
0
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
4156
4157
                        // no "weight" suffix for these
4158
0
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
4159
0
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
4160
4161
0
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4162
4163
                        // out_proj
4164
0
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4165
0
                    }
4166
0
                } break;
4167
0
            case LLM_ARCH_JAMBA:
4168
0
                {
4169
0
                    const int64_t d_conv  = hparams.ssm_d_conv;
4170
0
                    const int64_t d_inner = hparams.ssm_d_inner;
4171
0
                    const int64_t d_state = hparams.ssm_d_state;
4172
0
                    const int64_t dt_rank = hparams.ssm_dt_rank;
4173
4174
                    // only an expansion factor of 2 is supported for now
4175
0
                    GGML_ASSERT(2 * n_embd == d_inner);
4176
4177
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4178
4179
                    // output
4180
0
                    {
4181
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4182
4183
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4184
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
4185
0
                        if (output == NULL) {
4186
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4187
0
                        }
4188
0
                    }
4189
4190
0
                    for (int i = 0; i < n_layer; ++i) {
4191
0
                        const int64_t n_head_kv = hparams.n_head_kv(i);
4192
0
                        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
4193
4194
0
                        auto & layer = layers[i];
4195
4196
                        // norm
4197
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4198
4199
0
                        if (n_head_kv == 0) {
4200
                            // Mamba layer
4201
0
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
4202
4203
0
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
4204
0
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
4205
4206
0
                            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
4207
4208
0
                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
4209
4210
0
                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
4211
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
4212
4213
0
                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
4214
0
                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
4215
4216
                            // no "weight" suffix for these
4217
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
4218
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
4219
4220
                            // out_proj
4221
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4222
0
                        } else {
4223
                            // Attention layers
4224
4225
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4226
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4227
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4228
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4229
0
                        }
4230
4231
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4232
4233
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
4234
4235
0
                        if (layer.ffn_gate_inp) {
4236
                            // MoE
4237
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4238
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
4239
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
4240
0
                        } else {
4241
                            // FFN (no MoE)
4242
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4243
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4244
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4245
0
                        }
4246
0
                    }
4247
0
                } break;
4248
0
            case LLM_ARCH_GRANITE_HYBRID:
4249
0
                {
4250
                    // mamba2 Mixer SSM params
4251
                    // NOTE: int64_t for tensor dimensions
4252
0
                    const int64_t d_conv     = hparams.ssm_d_conv;
4253
0
                    const int64_t d_inner    = hparams.ssm_d_inner;
4254
0
                    const int64_t d_state    = hparams.ssm_d_state;
4255
0
                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
4256
0
                    const int64_t n_group    = hparams.ssm_n_group;
4257
0
                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
4258
4259
                    // only an expansion factor of 2 is supported for now
4260
0
                    GGML_ASSERT(2 * n_embd == d_inner);
4261
4262
                    // embeddings
4263
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4264
4265
                    // output
4266
0
                    {
4267
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4268
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4269
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
4270
0
                        if (output == NULL) {
4271
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4272
0
                        }
4273
0
                    }
4274
4275
0
                    for (int i = 0; i < n_layer; ++i) {
4276
0
                        auto & layer = layers[i];
4277
4278
                        // norm
4279
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4280
4281
0
                        if (hparams.is_recurrent(i)) {
4282
                            // ssm layers
4283
0
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4284
4285
0
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
4286
0
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
4287
4288
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
4289
4290
                            // no "weight" suffix for these
4291
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
4292
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
4293
4294
0
                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4295
4296
                            // out_proj
4297
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4298
0
                        } else {
4299
                            // attention layers (with optional bias)
4300
0
                            const int64_t n_head_i = hparams.n_head(i);
4301
0
                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
4302
0
                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
4303
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
4304
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
4305
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
4306
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
4307
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},         TENSOR_NOT_REQUIRED);
4308
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
4309
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
4310
0
                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},         TENSOR_NOT_REQUIRED);
4311
0
                        }
4312
4313
                        // feed forward (w/ optional biases)
4314
0
                        if (n_expert > 0) {
4315
                            // MoE FFN
4316
0
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4317
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4318
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
4319
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
4320
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
4321
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
4322
4323
                            // For Granite MoE Shared
4324
0
                            if (hparams.n_ff_shexp > 0) {
4325
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4326
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4327
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
4328
0
                            }
4329
0
                        } else {
4330
0
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4331
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4332
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4333
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4334
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4335
0
                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
4336
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4337
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
4338
0
                        }
4339
0
                    }
4340
0
                } break;
4341
0
            case LLM_ARCH_XVERSE:
4342
0
                {
4343
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4344
4345
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4346
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4347
4348
0
                    for (int i = 0; i < n_layer; ++i) {
4349
0
                        auto & layer = layers[i];
4350
4351
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4352
4353
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4354
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4355
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4356
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4357
4358
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4359
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4360
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4361
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4362
0
                    }
4363
0
                } break;
4364
0
            case LLM_ARCH_COMMAND_R:
4365
0
                {
4366
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4367
4368
                    // output
4369
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4370
                    // init output from the input tok embed
4371
0
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4372
4373
0
                    for (int i = 0; i < n_layer; ++i) {
4374
0
                        auto & layer = layers[i];
4375
4376
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4377
4378
0
                        if (n_layer >= 64){
4379
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
4380
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
4381
0
                        }
4382
4383
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4384
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4385
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4386
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4387
4388
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4389
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4390
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4391
0
                    }
4392
0
                } break;
4393
0
            case LLM_ARCH_COHERE2:
4394
0
                {
4395
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4396
4397
                    // output
4398
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4399
                    // init output from the input tok embed
4400
0
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
4401
0
                                                      TENSOR_DUPLICATED);
4402
4403
0
                    for (int i = 0; i < n_layer; ++i) {
4404
0
                        auto & layer = layers[i];
4405
4406
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
4407
4408
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
4409
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
4410
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
4411
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
4412
4413
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
4414
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
4415
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
4416
0
                    }
4417
0
                }
4418
0
                break;
4419
0
            case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed
4420
0
                {
4421
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4422
4423
                    // output
4424
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4425
                    // if output is NULL, init from the input tok embed
4426
0
                    if (output == NULL) {
4427
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4428
0
                    }
4429
4430
0
                    for (int i = 0; i < n_layer; ++i) {
4431
0
                        auto & layer = layers[i];
4432
4433
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4434
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4435
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4436
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4437
4438
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4439
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4440
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4441
0
                    }
4442
0
                } break;
4443
0
            case LLM_ARCH_OLMO2:
4444
0
                {
4445
0
                    const int64_t n_embd_head = n_embd / n_head;
4446
4447
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4448
4449
                    // output
4450
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4451
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4452
4453
0
                    for (int i = 0; i < n_layer; ++i) {
4454
0
                        auto & layer = layers[i];
4455
4456
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4457
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4458
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4459
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4460
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
4461
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
4462
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4463
4464
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4465
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4466
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4467
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4468
0
                    }
4469
0
                } break;
4470
0
            case LLM_ARCH_SEED_OSS:
4471
0
                {
4472
0
                    const uint32_t head_dim             = hparams.n_embd_head_k;
4473
0
                    const int64_t n_qo_dim              = n_head * head_dim;
4474
0
                    const int64_t n_kv_dim              = n_head_kv * head_dim;
4475
4476
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4477
4478
                    // output
4479
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4480
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4481
                    // if output is NULL, init from the input tok embed
4482
0
                    if (output == NULL) {
4483
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4484
0
                    }
4485
4486
0
                    for (int i = 0; i < n_layer; ++i) {
4487
0
                        auto & layer = layers[i];
4488
4489
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_qo_dim}, 0);
4490
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_kv_dim}, 0);
4491
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_kv_dim}, 0);
4492
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
4493
4494
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_qo_dim},   TENSOR_NOT_REQUIRED);
4495
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_kv_dim},   TENSOR_NOT_REQUIRED);
4496
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_kv_dim},   TENSOR_NOT_REQUIRED);
4497
4498
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4499
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4500
4501
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4502
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4503
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4504
0
                    }
4505
0
                } break;
4506
4507
0
            case LLM_ARCH_OLMOE:
4508
0
                {
4509
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4510
4511
                    // output
4512
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4513
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4514
4515
0
                    for (int i = 0; i < n_layer; ++i) {
4516
0
                        auto & layer = layers[i];
4517
4518
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4519
4520
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4521
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4522
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4523
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4524
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
4525
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
4526
4527
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4528
4529
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4530
4531
0
                        if (n_expert == 0) {
4532
0
                            throw std::runtime_error("n_expert must be > 0");
4533
0
                        }
4534
0
                        if (n_expert_used == 0) {
4535
0
                            throw std::runtime_error("n_expert_used must be > 0");
4536
0
                        }
4537
4538
                        // MoE branch
4539
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
4540
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
4541
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
4542
0
                    }
4543
0
                } break;
4544
0
            case LLM_ARCH_OPENELM:
4545
0
                {
4546
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4547
4548
                    // output
4549
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4550
                    // init output from the input tok embed
4551
0
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4552
4553
0
                    for (int i = 0; i < n_layer; ++i) {
4554
0
                        const int64_t n_head      =   hparams.n_head(i);
4555
0
                        const int64_t n_head_qkv  = 2*hparams.n_head_kv(i) + n_head;
4556
0
                        const int64_t n_ff        =   hparams.n_ff(i);
4557
4558
0
                        auto & layer = layers[i];
4559
4560
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4561
4562
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
4563
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4564
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4565
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
4566
4567
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4568
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4569
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4570
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4571
0
                    }
4572
0
                } break;
4573
0
            case LLM_ARCH_GPTNEOX:
4574
0
                {
4575
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4576
4577
                    // output
4578
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4579
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4580
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4581
4582
0
                    for (int i = 0; i < n_layer; ++i) {
4583
0
                        auto & layer = layers[i];
4584
4585
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4586
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
4587
4588
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4589
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
4590
4591
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4592
0
                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4593
4594
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4595
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4596
4597
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4598
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4599
4600
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4601
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
4602
0
                    }
4603
0
                } break;
4604
0
            case LLM_ARCH_ARCTIC:
4605
0
                {
4606
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4607
4608
                    // output
4609
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4610
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4611
4612
                    // if output is NULL, init from the input tok embed
4613
0
                    if (output == NULL) {
4614
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4615
0
                    }
4616
4617
0
                    for (int i = 0; i < n_layer; ++i) {
4618
0
                        auto & layer = layers[i];
4619
4620
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4621
4622
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4623
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4624
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4625
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4626
4627
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4628
4629
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
4630
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
4631
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);
4632
4633
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4634
0
                        layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
4635
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
4636
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
4637
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
4638
0
                    }
4639
0
                } break;
4640
0
            case LLM_ARCH_DEEPSEEK:
4641
0
                {
4642
4643
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
4644
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
4645
4646
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4647
4648
                    // output
4649
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4650
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4651
4652
0
                    for (int i = 0; i < n_layer; ++i) {
4653
0
                        auto & layer = layers[i];
4654
4655
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4656
4657
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4658
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4659
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4660
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4661
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4662
4663
0
                        if (i < (int) hparams.n_layer_dense_lead) {
4664
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4665
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4666
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4667
0
                        } else {
4668
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4669
4670
0
                            if (n_expert == 0) {
4671
0
                                throw std::runtime_error("n_expert must be > 0");
4672
0
                            }
4673
0
                            if (n_expert_used == 0) {
4674
0
                                throw std::runtime_error("n_expert_used must be > 0");
4675
0
                            }
4676
4677
                            // MoE branch
4678
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4679
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
4680
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4681
4682
                            // Shared expert branch
4683
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4684
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
4685
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4686
0
                        }
4687
0
                    }
4688
0
                } break;
4689
0
            case LLM_ARCH_DEEPSEEK2:
4690
0
                {
4691
                    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
4692
0
                    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
4693
4694
0
                    const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
4695
4696
                    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
4697
0
                    const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
4698
0
                    const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
4699
4700
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
4701
0
                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
4702
4703
0
                    const int64_t q_lora_rank  = hparams.n_lora_q;
4704
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
4705
4706
0
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
4707
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
4708
4709
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4710
4711
                    // output
4712
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4713
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4714
4715
0
                    for (int i = 0; i < n_layer; ++i) {
4716
0
                        auto & layer = layers[i];
4717
4718
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4719
0
                        if (!is_lite) {
4720
0
                            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
4721
0
                        }
4722
4723
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
4724
4725
0
                        if (!is_lite) {
4726
0
                            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
4727
0
                            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
4728
0
                        } else {
4729
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
4730
0
                        }
4731
4732
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
4733
4734
                        // note: only old legacy GGUF files will have the unsplit wkv_b tensor (loaded in the non-MLA branch below)
4735
0
                        if (is_mla) {
4736
0
                            layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
4737
0
                            layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
4738
0
                        } else {
4739
0
                            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
4740
0
                        }
4741
4742
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
4743
4744
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4745
4746
0
                        if (i < (int) hparams.n_layer_dense_lead) {
4747
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
4748
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4749
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4750
0
                        } else {
4751
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4752
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
4753
4754
0
                            if (n_expert == 0) {
4755
0
                                throw std::runtime_error("n_expert must be > 0");
4756
0
                            }
4757
0
                            if (n_expert_used == 0) {
4758
0
                                throw std::runtime_error("n_expert_used must be > 0");
4759
0
                            }
4760
4761
                            // MoE branch
4762
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4763
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
4764
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
4765
4766
                            // Shared expert branch
4767
0
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4768
0
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
4769
0
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4770
0
                        }
4771
0
                    }
4772
0
                } break;
4773
0
            case LLM_ARCH_PLM:
4774
0
                {
4775
0
                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
4776
0
                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
4777
0
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
4778
4779
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4780
4781
                    // output
4782
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4783
                    // output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4784
0
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4785
4786
0
                    for (int i = 0; i < n_layer; ++i) {
4787
0
                        auto & layer = layers[i];
4788
4789
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4790
4791
0
                        layer.wq        = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4792
0
                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
4793
0
                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
4794
0
                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
4795
0
                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
4796
4797
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4798
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4799
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4800
0
                    }
4801
0
                } break;
4802
0
            case LLM_ARCH_BITNET:
4803
0
                {
4804
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4805
4806
                    // output
4807
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4808
4809
0
                    for (int i = 0; i < n_layer; ++i) {
4810
0
                        auto & layer = layers[i];
4811
4812
0
                        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
4813
0
                        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
4814
4815
0
                        layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
4816
0
                        layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4817
0
                        layer.wk       = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
4818
0
                        layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4819
0
                        layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
4820
0
                        layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4821
0
                        layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4822
0
                        layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4823
4824
0
                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);
4825
0
                        layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
4826
4827
0
                        layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4828
0
                        layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4829
0
                        layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4830
0
                        layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4831
0
                        layer.ffn_up         = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4832
0
                        layer.ffn_up_scale   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
4833
0
                    }
4834
0
                } break;
4835
0
            case LLM_ARCH_T5:
4836
0
                {
4837
0
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
4838
4839
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4840
4841
                    // output
4842
0
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
4843
0
                    output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
4844
4845
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4846
                    // if output is NULL, init from the input tok embed
4847
0
                    if (output == NULL) {
4848
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4849
0
                    }
4850
4851
                    // n_layer:     number of encoder_layers
4852
                    // dec_n_layer: number of decoder_layers
4853
0
                    const int dec_n_layer = hparams.dec_n_layer;
4854
0
                    if (dec_n_layer > n_layer) {
4855
0
                        layers.resize(dec_n_layer);
4856
0
                    }
4857
4858
                    // load encoder layers
4859
0
                    for (int i = 0; i < n_layer; ++i) {
4860
0
                        auto & layer = layers[i];
4861
4862
0
                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
4863
0
                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4864
4865
0
                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4866
0
                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4867
0
                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4868
0
                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4869
4870
0
                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
4871
0
                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
4872
0
                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4873
0
                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4874
0
                    }
4875
4876
                    // load decoder layers
4877
0
                    for (int i = 0; i < dec_n_layer; ++i) {
4878
0
                        auto & layer = layers[i];
4879
4880
0
                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
4881
0
                        layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4882
4883
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4884
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4885
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4886
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4887
4888
0
                        layer.attn_norm_cross  = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd}, 0);
4889
                        // this tensor seems to be unused in the HF transformers implementation
4890
0
                        layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4891
4892
0
                        layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4893
0
                        layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4894
0
                        layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4895
0
                        layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4896
4897
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
4898
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
4899
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4900
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4901
0
                    }
4902
0
                } break;
4903
0
            case LLM_ARCH_T5ENCODER:
4904
0
                {
4905
0
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
4906
4907
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4908
4909
                    // output
4910
0
                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
4911
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4912
                    // if output is NULL, init from the input tok embed
4913
0
                    if (output == NULL) {
4914
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4915
0
                    }
4916
4917
0
                    for (int i = 0; i < n_layer; ++i) {
4918
0
                        auto & layer = layers[i];
4919
4920
0
                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
4921
0
                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4922
4923
0
                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4924
0
                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4925
0
                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4926
0
                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4927
4928
0
                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
4929
0
                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
4930
0
                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
4931
0
                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
4932
0
                    }
4933
0
                } break;
4934
0
            case LLM_ARCH_JAIS:
4935
0
                {
4936
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4937
4938
                    // output
4939
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4940
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
4941
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
4942
4943
0
                    for (int i = 0; i < n_layer; ++i) {
4944
0
                        auto & layer = layers[i];
4945
4946
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
4947
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
4948
4949
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4950
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
4951
4952
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4953
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
4954
4955
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4956
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
4957
4958
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4959
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
4960
4961
0
                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff}, 0);
4962
0
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff}, 0);
4963
4964
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
4965
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
4966
0
                    }
4967
0
                } break;
4968
0
            case LLM_ARCH_CHATGLM:
4969
0
                {
4970
0
                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
4971
4972
                    // output
4973
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4974
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4975
                    // if output is NULL, init from the input tok embed
4976
0
                    if (output == NULL) {
4977
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4978
0
                    }
4979
4980
0
                    for (int i = 0; i < n_layer; ++i) {
4981
0
                        auto & layer = layers[i];
4982
4983
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4984
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4985
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4986
4987
0
                        if (layer.wqkv == nullptr) {
4988
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4989
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
4990
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
4991
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4992
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4993
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4994
0
                        }
4995
4996
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4997
4998
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4999
5000
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
5001
5002
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5003
0
                    }
5004
0
                } break;
5005
0
            case LLM_ARCH_GLM4:
5006
0
                {
5007
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5008
5009
                    // output
5010
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5011
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5012
                    // if output is NULL, init from the input tok embed
5013
0
                    if (output == NULL) {
5014
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5015
0
                    }
5016
5017
0
                    for (int i = 0; i < n_layer; ++i) {
5018
0
                        auto & layer = layers[i];
5019
5020
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5021
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5022
0
                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5023
5024
0
                        if (layer.wqkv == nullptr) {
5025
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5026
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5027
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5028
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5029
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5030
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5031
0
                        }
5032
5033
0
                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5034
5035
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5036
5037
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5038
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5039
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
5040
5041
0
                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5042
0
                    }
5043
0
                } break;
5044
0
            case LLM_ARCH_GLM4_MOE:
5045
0
                {
5046
0
                    const int64_t n_expert        = hparams.n_expert;
5047
0
                    const int64_t n_expert_used   = hparams.n_expert_used;
5048
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5049
5050
0
                    GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
5051
0
                    GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
5052
5053
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
5054
5055
                    // output
5056
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
5057
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
5058
                    // if output is NULL, init from the input tok embed
5059
0
                    if (output == NULL) {
5060
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
5061
0
                    }
5062
5063
                    // Load ALL tensors including NextN layer to satisfy total tensor count
5064
                    // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
5065
0
                    for (int i = 0; i < n_layer; ++i) {
5066
0
                        int flags = 0;
5067
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5068
                            // skip all tensors in the NextN layers
5069
0
                            flags |= TENSOR_SKIP;
5070
0
                        }
5071
5072
0
                        auto & layer = layers[i];
5073
5074
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
5075
5076
                        // GLM-style attention with bias terms
5077
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
5078
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
5079
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
5080
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
5081
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
5082
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
5083
5084
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
5085
5086
                        // K/Q norm tensors (optional for GLM-4.5 355B variant)
5087
0
                        layer.attn_q_norm = create_tensor(
5088
0
                            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
5089
0
                        layer.attn_k_norm = create_tensor(
5090
0
                            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
5091
5092
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
5093
5094
                        // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
5095
                        // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
5096
0
                        const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
5097
5098
0
                        if (use_moe) {
5099
                            // MoE layers
5100
0
                            layer.ffn_gate_inp =
5101
0
                                create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
5102
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
5103
5104
                            // MoE branch
5105
0
                            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
5106
5107
0
                            layer.ffn_gate_exps = create_tensor(
5108
0
                                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
5109
0
                            layer.ffn_down_exps = create_tensor(
5110
0
                                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
5111
0
                            layer.ffn_up_exps = create_tensor(
5112
0
                                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
5113
5114
                            // Shared expert
5115
0
                            if (n_expert_shared > 0) {
5116
0
                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
5117
0
                                layer.ffn_gate_shexp = create_tensor(
5118
0
                                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
5119
0
                                layer.ffn_down_shexp = create_tensor(
5120
0
                                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
5121
0
                                layer.ffn_up_shexp = create_tensor(
5122
0
                                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
5123
0
                            }
5124
0
                        } else {
5125
                            // Dense layers (first k layers) - GLM uses separate gate/up projections
5126
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
5127
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
5128
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, flags);
5129
0
                        }
5130
5131
                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5132
0
                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5133
0
                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5134
0
                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5135
0
                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5136
5137
                            // Optional tensors
5138
0
                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5139
0
                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5140
0
                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
5141
0
                        }
5142
0
                    }
5143
0
                }
5144
0
                break;
5145
0
            case LLM_ARCH_NEMOTRON:
5146
0
                {
5147
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5148
5149
                    // output
5150
0
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5151
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5152
0
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5153
5154
0
                    for (int i = 0; i < n_layer; ++i) {
5155
0
                        auto & layer = layers[i];
5156
5157
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5158
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
5159
5160
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
5161
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5162
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5163
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5164
5165
                        // optional bias tensors
5166
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
5167
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5168
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5169
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
5170
5171
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5172
0
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
5173
5174
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5175
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5176
5177
                        // optional MLP bias
5178
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5179
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
5180
0
                    }
5181
0
                } break;
5182
0
            case LLM_ARCH_NEMOTRON_H:
5183
0
                {
5184
                    // mamba2 Mixer SSM params
5185
                    // NOTE: int64_t for tensor dimensions
5186
0
                    const int64_t d_conv     = hparams.ssm_d_conv;
5187
0
                    const int64_t d_inner    = hparams.ssm_d_inner;
5188
0
                    const int64_t d_state    = hparams.ssm_d_state;
5189
0
                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
5190
0
                    const int64_t n_group    = hparams.ssm_n_group;
5191
0
                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
5192
5193
                    // embeddings
5194
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5195
5196
                    // output
5197
0
                    {
5198
0
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5199
0
                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5200
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
5201
0
                        if (output == NULL) {
5202
0
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5203
0
                        }
5204
0
                    }
5205
5206
0
                    for (int i = 0; i < n_layer; ++i) {
5207
0
                        auto & layer = layers[i];
5208
5209
                        // all blocks use the attn norm
5210
0
                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5211
5212
0
                        if (hparams.is_recurrent(i)) {
5213
                            // ssm layers
5214
0
                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
5215
5216
0
                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
5217
0
                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
5218
5219
0
                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
5220
5221
                            // no "weight" suffix for these
5222
0
                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
5223
0
                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
5224
5225
0
                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
5226
5227
                            // out_proj
5228
0
                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
5229
0
                        } else if (hparams.n_ff(i) == 0) {
5230
                            // attention layers (with optional bias)
5231
0
                            const int64_t n_head_i = hparams.n_head(i);
5232
0
                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
5233
0
                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
5234
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
5235
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
5236
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
5237
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
5238
0
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias",   i), {n_embd},         TENSOR_NOT_REQUIRED);
5239
0
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias",   i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
5240
0
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias",   i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
5241
0
                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd},         TENSOR_NOT_REQUIRED);
5242
0
                        } else {
5243
                            // mlp layers
5244
0
                            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  hparams.n_ff(i), n_embd}, 0);
5245
0
                            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   hparams.n_ff(i)}, 0);
5246
0
                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
5247
0
                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias",   i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
5248
0
                        }
5249
0
                    }
5250
0
                } break;
5251
0
            case LLM_ARCH_EXAONE:
5252
0
                {
5253
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5254
5255
                    // output
5256
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5257
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5258
5259
                    // if output is NULL, init from the input tok embed
5260
0
                    if (output == NULL) {
5261
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5262
0
                    }
5263
5264
0
                    for (int i = 0; i < n_layer; ++i) {
5265
0
                        auto & layer = layers[i];
5266
5267
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5268
5269
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5270
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5271
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5272
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5273
5274
0
                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM,   "weight", i), {n_embd}, 0);
5275
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5276
0
                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd,   n_ff}, 0);
5277
0
                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN,   "weight", i), {  n_ff, n_embd}, 0);
5278
0
                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,     "weight", i), {n_embd,   n_ff}, 0);
5279
0
                    }
5280
0
                } break;
5281
0
            case LLM_ARCH_EXAONE4:
5282
0
                {
5283
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5284
5285
                    // output
5286
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5287
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5288
5289
                    // if output is NULL, init from the input tok embed
5290
0
                    if (output == NULL) {
5291
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5292
0
                    }
5293
5294
0
                    for (int i = 0; i < n_layer; ++i) {
5295
0
                        auto & layer = layers[i];
5296
5297
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5298
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5299
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5300
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5301
5302
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5303
5304
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5305
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5306
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5307
5308
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5309
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5310
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5311
0
                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5312
0
                    }
5313
0
                } break;
5314
0
            case LLM_ARCH_RWKV6:
5315
0
                {
5316
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5317
5318
                    // Block 0, LN0
5319
0
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
5320
0
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
5321
5322
                    // output
5323
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5324
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5325
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5326
5327
0
                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
5328
0
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
5329
0
                    const int head_size = hparams.wkv_head_size;
5330
0
                    const int attn_hidden_size = n_embd;
5331
0
                    const int ffn_size = hparams.n_ff_arr[0];
5332
5333
0
                    for (int i = 0; i < n_layer; ++i) {
5334
0
                        auto & layer = layers[i];
5335
5336
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5337
0
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
5338
5339
0
                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
5340
0
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
5341
5342
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
5343
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
5344
5345
0
                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
5346
0
                        layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5347
0
                        layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5348
0
                        layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5349
0
                        layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5350
0
                        layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5351
0
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
5352
0
                        GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
5353
5354
0
                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
5355
0
                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
5356
0
                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
5357
0
                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
5358
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
5359
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
5360
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5361
0
                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
5362
5363
0
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
5364
0
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
5365
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5366
5367
0
                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
5368
0
                        layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
5369
5370
0
                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
5371
0
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
5372
0
                        layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
5373
0
                    }
5374
5375
0
                } break;
5376
0
            case LLM_ARCH_RWKV6QWEN2:
5377
0
                {
5378
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5379
5380
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5381
0
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
5382
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5383
5384
0
                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
5385
0
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
5386
0
                    const int head_size = hparams.wkv_head_size;
5387
0
                    const int attn_hidden_size = n_embd;
5388
0
                    const int n_head_kv = hparams.n_head_kv();
5389
0
                    int attn_key_value_size;
5390
0
                    if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
5391
0
                        attn_key_value_size = attn_hidden_size;
5392
0
                    } else {
5393
0
                        attn_key_value_size = n_head_kv * head_size;
5394
0
                    }
5395
5396
0
                    for (int i = 0; i < n_layer; ++i) {
5397
0
                        auto & layer = layers[i];
5398
5399
0
                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5400
5401
0
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
5402
0
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
5403
5404
0
                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
5405
0
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
5406
5407
0
                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
5408
0
                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
5409
0
                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
5410
0
                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
5411
0
                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
5412
0
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
5413
0
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5414
0
                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
5415
                        // optional bias tensors
5416
0
                        layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
5417
0
                        layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
5418
0
                        layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
5419
5420
0
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5421
5422
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5423
5424
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
5425
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5426
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5427
0
                    }
5428
0
                } break;
5429
0
            case LLM_ARCH_RWKV7:
                {
                    // RWKV7: token embedding with its own norm, a normed output head, and
                    // per-layer time-mix / channel-mix weights. Every tensor is created with flags 0.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // Block 0, LN0
                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);

                    // output
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    // inner dimensions of the (w1,w2), (a1,a2), (v1,v2) and (g1,g2) matrix pairs
                    const int rank_decay   = hparams.n_lora_decay;
                    const int rank_iclr    = hparams.n_lora_iclr;
                    const int rank_v_mix   = hparams.n_lora_value_res_mix;
                    const int rank_gate    = hparams.n_lora_gate;
                    const int time_mix_dim = n_embd;
                    // channel-mix width is taken from the first layer's FFN size
                    const int channel_dim  = hparams.n_ff_arr[0];

                    for (int il = 0; il < n_layer; ++il) {
                        auto & lyr = layers[il];

                        lyr.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), {n_embd}, 0);
                        lyr.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   il), {n_embd}, 0);

                        lyr.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", il), {n_embd}, 0);
                        lyr.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   il), {n_embd}, 0);

                        lyr.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", il), {n_embd}, 0);
                        lyr.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", il), {n_embd, rank_decay}, 0);
                        lyr.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", il), {rank_decay, n_embd}, 0);

                        lyr.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", il), {n_embd}, 0);
                        lyr.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", il), {n_embd, rank_iclr}, 0);
                        lyr.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", il), {rank_iclr, n_embd}, 0);

                        // Layer 0's v tensors are actually not used, but they are still loaded and
                        // are sized with n_lora_iclr; every other layer uses n_lora_value_res_mix.
                        const int rank_v = (il == 0) ? rank_iclr : rank_v_mix;
                        lyr.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", il), {n_embd}, 0);
                        lyr.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", il), {n_embd, rank_v}, 0);
                        lyr.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", il), {rank_v, n_embd}, 0);

                        lyr.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", il), {n_embd, rank_gate}, 0);
                        lyr.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", il), {rank_gate, n_embd}, 0);

                        // six lerp vectors stored in a single 4-D tensor
                        lyr.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", il), {n_embd, 1, 1, 6}, 0);

                        lyr.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", il), {time_mix_dim}, 0);
                        lyr.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", il), {time_mix_dim}, 0);
                        lyr.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", il), {time_mix_dim}, 0);

                        lyr.time_mix_key        = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY,        "weight", il), {time_mix_dim, n_embd}, 0);
                        lyr.time_mix_value      = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE,      "weight", il), {time_mix_dim, n_embd}, 0);
                        lyr.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", il), {time_mix_dim, n_embd}, 0);

                        lyr.time_mix_ln     = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN,     "weight", il), {n_embd}, 0);
                        lyr.time_mix_ln_b   = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN,     "bias",   il), {n_embd}, 0);
                        lyr.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", il), {n_embd, time_mix_dim}, 0);

                        lyr.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", il), {n_embd, 1, 1}, 0);

                        lyr.channel_mix_key   = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY,   "weight", il), {n_embd, channel_dim}, 0);
                        lyr.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", il), {channel_dim, n_embd}, 0);
                    }

                } break;
5501
0
            case LLM_ARCH_ARWKV7:
                {
                    // ARWKV7: RWKV7-style time-mix weights combined with a gated FFN
                    // (ffn_gate / ffn_down / ffn_up). Gate and time-mix LN tensors are optional.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    // inner dimensions of the (w1,w2), (a1,a2), (v1,v2) and (g1,g2) matrix pairs
                    const int n_lora_decay = hparams.n_lora_decay;
                    const int n_lora_iclr = hparams.n_lora_iclr;
                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
                    const int n_lora_gate = hparams.n_lora_gate;
                    const int attn_hidden_size = n_embd;

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);

                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);

                        if (i == 0) {
                            // actually not used
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
                        } else {
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
                        }

                        // gate projections are optional for this architecture
                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);

                        // Try the 6-slice fused lerp layout first and fall back to 5 slices if that
                        // request is rejected. Presumably the 6th slice belongs to the gate - TODO confirm.
                        try {
                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
                        } catch (const std::runtime_error &) {
                            // ARWKV models may not have gate tensors
                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
                        }

                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);

                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);

                        // the time-mix LayerNorm is optional here
                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }

                } break;
5569
0
            case LLM_ARCH_CHAMELEON:
                {
                    // Chameleon: attention with per-head Q/K norms (optional biases) and a gated FFN.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    if (output == nullptr) {
                        // no dedicated output matrix: reuse the token embedding, flagged as a duplicate
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int il = 0; il < n_layer; ++il) {
                        auto & lyr = layers[il];

                        lyr.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", il), {n_embd}, 0);
                        lyr.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), {n_embd_head_k, n_head}, 0);
                        lyr.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), {n_embd_head_k, n_head_kv}, 0);
                        // the Q/K norm biases may be absent
                        lyr.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   il), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
                        lyr.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   il), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);

                        lyr.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", il), {n_embd, n_embd}, 0);
                        lyr.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", il), {n_embd, n_embd_gqa}, 0);
                        lyr.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", il), {n_embd, n_embd_gqa}, 0);
                        lyr.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), {n_embd, n_embd}, 0);

                        lyr.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", il), {n_embd}, 0);

                        lyr.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd,   n_ff}, 0);
                        lyr.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), {  n_ff, n_embd}, 0);
                        lyr.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", il), {n_embd,   n_ff}, 0);
                    }
                } break;
5602
0
            case LLM_ARCH_WAVTOKENIZER_DEC:
                {
                    // WavTokenizer decoder: feature embedding -> conv1d -> posnet stack -> norm ->
                    // convnext stack -> output projection. posnet and convnext each have their own
                    // width in hparams; the code asserts below that the two are equal.
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);

                    conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
                    conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {1, hparams.posnet.n_embd}, 0);

                    // posnet
                    {
                        // width of the posnet stack (distinct from the model-level n_embd)
                        const int64_t n_embd_pos = hparams.posnet.n_embd;

                        for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
                            auto & pn = layers[il].posnet;

                            // posnet layer order:
                            //
                            //  0 - resnet
                            //  1 - resnet
                            //  2 - attn
                            //  3 - resnet
                            //  4 - resnet
                            //  5 - norm
                            //
                            switch (il) {
                                case 0:
                                case 1:
                                case 3:
                                case 4:
                                    {
                                        // resnet layer: two (norm, conv) pairs
                                        pn.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", il), {1, n_embd_pos}, 0);
                                        pn.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   il), {1, n_embd_pos}, 0);

                                        pn.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", il), {3, n_embd_pos, n_embd_pos}, 0);
                                        pn.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   il), {1, n_embd_pos}, 0);

                                        pn.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", il), {1, n_embd_pos}, 0);
                                        pn.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   il), {1, n_embd_pos}, 0);

                                        pn.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", il), {3, n_embd_pos, n_embd_pos}, 0);
                                        pn.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   il), {1, n_embd_pos}, 0);
                                    } break;
                                case 2:
                                    {
                                        // attention layer: norm plus q/k/v/out projections, each with a bias
                                        pn.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", il), {1, n_embd_pos}, 0);
                                        pn.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   il), {1, n_embd_pos}, 0);

                                        pn.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "weight", il), {1, n_embd_pos, n_embd_pos}, 0);
                                        pn.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "bias",   il), {1, n_embd_pos}, 0);

                                        pn.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "weight", il), {1, n_embd_pos, n_embd_pos}, 0);
                                        pn.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "bias",   il), {1, n_embd_pos}, 0);

                                        pn.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "weight", il), {1, n_embd_pos, n_embd_pos}, 0);
                                        pn.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "bias",   il), {1, n_embd_pos}, 0);

                                        pn.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "weight", il), {1, n_embd_pos, n_embd_pos}, 0);
                                        pn.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "bias",   il), {1, n_embd_pos}, 0);
                                    } break;
                                case 5:
                                    {
                                        // final norm layer; it is looked up under the ATTN_NORM tensor id
                                        pn.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", il), {1, n_embd_pos}, 0);
                                        pn.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   il), {1, n_embd_pos}, 0);
                                    } break;
                                default: GGML_ABORT("unknown posnet layer");
                            }
                        }
                    }

                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);

                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {hparams.posnet.n_embd}, 0);

                    // convnext
                    {
                        // width of the convnext stack (distinct from the model-level n_embd)
                        const int64_t n_embd_cnx = hparams.convnext.n_embd;

                        for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
                            auto & cnx = layers[il].convnext;

                            cnx.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", il), {7, 1, n_embd_cnx}, 0);
                            cnx.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   il), {1, n_embd_cnx}, 0);

                            cnx.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", il), {n_embd_cnx}, 0);
                            cnx.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   il), {n_embd_cnx}, 0);

                            cnx.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", il), {n_embd_cnx, n_ff}, 0);
                            cnx.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   il), {n_ff}, 0);

                            cnx.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", il), {n_ff, n_embd_cnx}, 0);
                            cnx.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   il), {n_embd_cnx}, 0);

                            cnx.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", il), {n_embd_cnx}, 0);
                        }

                        // output norm uses the convnext width
                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_cnx}, 0);
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd_cnx}, 0);
                    }

                    // final projection from the convnext width to the model-level n_embd
                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
                } break;
5705
0
            case LLM_ARCH_BAILINGMOE:
                {
                    // BailingMoE: every layer has attention plus a routed-expert FFN with shared experts.
                    // Throws std::runtime_error if a layer is loaded while n_expert or n_expert_used is 0.
                    const int64_t n_ff_exp            = hparams.n_ff_exp;
                    const int64_t n_expert_shared     = hparams.n_expert_shared;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // attention projections are sized by n_rot per head
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_head * n_rot}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        // Validate the expert counts BEFORE n_expert is used as a tensor dimension,
                        // so a zero count is reported by these messages rather than reaching
                        // create_tensor with a zero-sized dimension first.
                        if (n_expert == 0) {
                            throw std::runtime_error("n_expert must be > 0");
                        }
                        if (n_expert_used == 0) {
                            throw std::runtime_error("n_expert_used must be > 0");
                        }

                        // router
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

                        // routed experts (expert index is the last dimension)
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);

                        // shared experts, fused along the FFN dimension
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                    }
                } break;
5745
0
            case LLM_ARCH_BAILINGMOE2:
                {
                    // BailingMoE2: fused QKV attention with Q/K norms, leading dense FFN layers followed
                    // by MoE layers, and optional trailing NextN layers whose tensors are flagged TENSOR_SKIP.
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");

                    for (int il = 0; il < n_layer; ++il) {
                        // true for the last nextn_predict_layers layers
                        const bool is_nextn_layer =
                            hparams.nextn_predict_layers > 0 &&
                            static_cast<uint32_t>(il) >= n_layer - hparams.nextn_predict_layers;

                        // skip all tensors in the NextN layers
                        int flags = 0;
                        if (is_nextn_layer) {
                            flags |= TENSOR_SKIP;
                        }

                        auto & lyr = layers[il];

                        lyr.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), {n_embd}, flags);

                        lyr.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), {n_embd, n_embd + 2*n_embd_gqa}, flags);
                        lyr.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), {n_embd_head_k * n_head, n_embd}, flags);

                        lyr.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), {n_embd_head_k}, flags);
                        lyr.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), {n_embd_head_k}, flags);

                        lyr.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", il), {n_embd}, flags);

                        if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
                            // Dense layers
                            lyr.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd,   n_ff}, flags);
                            lyr.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), {  n_ff, n_embd}, flags);
                            lyr.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", il), {n_embd,   n_ff}, flags);
                        } else {
                            // MoE layers
                            // shared-expert width: hparams.n_ff_shexp if non-zero, otherwise n_ff_exp,
                            // multiplied by the number of shared experts
                            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;

                            lyr.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,    "weight", il), {n_embd, n_expert}, flags);
                            lyr.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias",   il), {n_expert}, TENSOR_NOT_REQUIRED | flags);

                            lyr.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", il), {  n_embd, n_ff_exp, n_expert}, flags);
                            lyr.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), {n_ff_exp,   n_embd, n_expert}, flags);
                            lyr.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", il), {  n_embd, n_ff_exp, n_expert}, flags);

                            lyr.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), {n_embd, n_ff_shexp}, flags);
                            lyr.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), {n_ff_shexp, n_embd}, flags);
                            lyr.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", il), {n_embd, n_ff_shexp}, flags);
                        }

                        // NextN/MTP tensors (preserved but unused) - only requested for the last nextn_predict_layers
                        if (is_nextn_layer) {
                            lyr.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, flags);
                            lyr.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
                            lyr.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, flags);
                            lyr.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, flags);
                            lyr.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
                            lyr.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED | flags);
                            lyr.layer_out_norm         = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", il), {n_embd}, flags);
                        }
                    }
                } break;
5809
0
            case LLM_ARCH_DOTS1:
                {
                    // dots1: attention with Q/K norms; the first n_layer_dense_lead layers use a dense
                    // gated FFN, the remaining layers use routed experts plus shared experts.
                    // Throws std::runtime_error if a MoE layer is loaded while n_expert or n_expert_used is 0.
                    const int64_t n_ff_exp        = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        // Q, K and V are all sized with n_head here (not n_head_kv)
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        if (i < (int) hparams.n_layer_dense_lead) {
                            // Dense branch
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        } else {
                            // Validate the expert counts BEFORE n_expert is used as a tensor dimension,
                            // so a zero count is reported by these messages rather than reaching
                            // create_tensor with a zero-sized dimension first. The checks stay inside
                            // the MoE branch so that dense layers never trigger them.
                            if (n_expert == 0) {
                                throw std::runtime_error("n_expert must be > 0");
                            }
                            if (n_expert_used == 0) {
                                throw std::runtime_error("n_expert_used must be > 0");
                            }

                            // router and optional expert-probability bias
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);

                            // MoE branch
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);

                            // Shared expert branch
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                        }
                    }
                } break;
5861
0
            case LLM_ARCH_ARCEE:
5862
0
                {
5863
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5864
5865
                    // output
5866
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5867
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5868
5869
                    // if output is NULL, init from the input tok embed
5870
0
                    if (output == NULL) {
5871
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5872
0
                    }
5873
5874
0
                    for (int i = 0; i < n_layer; ++i) {
5875
0
                        auto & layer = layers[i];
5876
5877
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5878
5879
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5880
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5881
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5882
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5883
5884
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5885
5886
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5887
5888
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
5889
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
5890
0
                    }
5891
0
                } break;
5892
0
            case LLM_ARCH_AFMOE:
5893
0
                {
5894
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5895
5896
                    // output
5897
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5898
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5899
5900
                    // if output is NULL, init from the input tok embed
5901
0
                    if (output == NULL) {
5902
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5903
0
                    }
5904
5905
0
                    const int64_t n_ff_exp = hparams.n_ff_exp;
5906
0
                    const int64_t n_expert_shared = hparams.n_expert_shared;
5907
5908
0
                    for (int i = 0; i < n_layer; ++i) {
5909
0
                        auto & layer = layers[i];
5910
5911
                        // dual attention normalization
5912
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
5913
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5914
5915
                        // attention projections
5916
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5917
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
5918
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
5919
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5920
5921
                        // Q/K normalization
5922
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5923
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5924
5925
                        // attention gating
5926
0
                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5927
5928
                        // dual ffn normalization
5929
0
                        layer.ffn_norm      = create_tensor(tn(LLM_TENSOR_FFN_NORM,      "weight", i), {n_embd}, 0);
5930
0
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5931
5932
0
                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
5933
                            // MoE layers
5934
0
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5935
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
5936
5937
                            // grouped expert weights
5938
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
5939
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5940
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
5941
5942
                            // shared expert
5943
0
                            if (n_expert_shared > 0) {
5944
0
                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
5945
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
5946
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
5947
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
5948
0
                            }
5949
0
                        } else {
5950
                            // Dense layers
5951
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5952
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5953
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
5954
0
                        }
5955
0
                    }
5956
0
                } break;
5957
0
            case LLM_ARCH_ERNIE4_5:
5958
0
            case LLM_ARCH_ERNIE4_5_MOE:
5959
0
                {
5960
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5961
5962
                    // output
5963
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5964
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5965
                    // if output is NULL, init from the input tok embed
5966
0
                    if (output == NULL) {
5967
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5968
0
                    }
5969
5970
0
                    for (int i = 0; i < n_layer; ++i) {
5971
0
                        auto & layer = layers[i];
5972
5973
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5974
5975
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5976
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
5977
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
5978
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5979
5980
                        // optional bias tensors
5981
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
5982
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5983
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5984
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
5985
5986
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5987
5988
0
                        if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
5989
0
                            int n_ff_exp = hparams.n_ff_exp;
5990
5991
0
                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
5992
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5993
0
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
5994
0
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
5995
0
                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
5996
5997
                            // Shared expert (if present)
5998
0
                            if (hparams.n_ff_shexp > 0) {
5999
0
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
6000
0
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd    }, 0);
6001
0
                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
6002
0
                            }
6003
0
                        } else { // Dense layers
6004
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6005
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6006
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6007
0
                        }
6008
0
                    }
6009
0
                } break;
6010
0
            case LLM_ARCH_FALCON_H1:
6011
0
                {
6012
                    // Common
6013
0
                    const int64_t hidden_size = hparams.n_embd; // hidden_size
6014
6015
                    // mamba2 Mixer SSM params
6016
0
                    const int64_t ssm_conv_kernel_size  = hparams.ssm_d_conv; // ssm_conv_kernel_size
6017
0
                    const int64_t ssm_n_groups          = hparams.ssm_n_group; // ssm_n_groups
6018
0
                    const int64_t ssm_state_size        = hparams.ssm_d_state; // ssm_state_size
6019
0
                    const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
6020
0
                    const int64_t ssm_num_heads         = hparams.ssm_dt_rank; // ssm_num_heads
6021
0
                    const int64_t ssm_conv_dim          = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
6022
0
                    const int64_t ssm_projection_size   = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
6023
6024
                    // attn params
6025
0
                    const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
6026
0
                    const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
6027
6028
                    // ffn params
6029
0
                    const int64_t ffn_intermediate_size = hparams.n_ff(0);
6030
6031
                    // embeddings
6032
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
6033
6034
                    // output
6035
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
6036
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
6037
6038
                    // if output is NULL, init from the input tok embed
6039
0
                    if (output == NULL) {
6040
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
6041
0
                    }
6042
6043
0
                    for (int i = 0; i < n_layer; ++i) {
6044
0
                        auto & layer = layers[i];
6045
6046
                        /*SSM LAYERS*/
6047
                        // ssm in
6048
0
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
6049
                        // ssm 1d conv
6050
0
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
6051
0
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
6052
                        // ssm_dt
6053
0
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
6054
                        // no "weight" suffix for these
6055
0
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
6056
0
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
6057
                        // ssm_norm
6058
0
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
6059
                        // out_proj
6060
0
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
6061
6062
                        /*ATTENTION LAYERS*/
6063
                        // attention layers (with optional bias)
6064
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
6065
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
6066
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
6067
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
6068
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
6069
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
6070
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
6071
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
6072
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
6073
6074
6075
                        // feed forward (w/ optional biases)
6076
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
6077
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6078
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
6079
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  ffn_intermediate_size, hidden_size}, 0);
6080
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
6081
6082
0
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
6083
0
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
6084
0
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
6085
0
                    }
6086
0
                } break;
6087
0
            case LLM_ARCH_HUNYUAN_MOE:
6088
0
                {
6089
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6090
6091
                    // output
6092
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6093
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6094
                    // if output is NULL, init from the input tok embed
6095
0
                    if (output == NULL) {
6096
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6097
0
                    }
6098
6099
0
                    for (int i = 0; i < n_layer; ++i) {
6100
0
                        auto & layer = layers[i];
6101
6102
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6103
6104
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6105
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6106
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6107
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6108
6109
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6110
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6111
6112
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6113
6114
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
6115
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, 0);
6116
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
6117
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
6118
6119
0
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
6120
0
                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
6121
0
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
6122
0
                    }
6123
0
                } break;
6124
0
            case LLM_ARCH_HUNYUAN_DENSE:
6125
0
                {
6126
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6127
6128
                    // output
6129
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6130
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6131
                    // if output is NULL, init from the input tok embed
6132
0
                    if (output == NULL) {
6133
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6134
0
                    }
6135
6136
0
                    for (int i = 0; i < n_layer; ++i) {
6137
0
                        auto & layer = layers[i];
6138
6139
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6140
6141
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6142
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6143
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6144
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6145
6146
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6147
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6148
6149
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6150
6151
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6152
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6153
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6154
6155
0
                    }
6156
0
                } break;
6157
0
            case LLM_ARCH_SMOLLM3:
6158
0
                {
6159
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6160
6161
                    // output
6162
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6163
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6164
6165
                    // if output is NULL, init from the input tok embed
6166
0
                    if (output == NULL) {
6167
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6168
0
                    }
6169
6170
0
                    for (int i = 0; i < n_layer; ++i) {
6171
0
                        auto & layer = layers[i];
6172
6173
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6174
6175
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6176
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6177
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6178
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6179
6180
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6181
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6182
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6183
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6184
0
                    }
6185
0
                } break;
6186
0
            case LLM_ARCH_OPENAI_MOE:
6187
0
                {
6188
0
                    const int64_t n_ff_exp = hparams.n_ff_exp;
6189
6190
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6191
6192
                    // output
6193
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6194
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6195
6196
0
                    for (int i = 0; i < n_layer; ++i) {
6197
0
                        auto & layer = layers[i];
6198
6199
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
6200
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
6201
6202
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_head * n_rot}, 0);
6203
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6204
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6205
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
6206
6207
0
                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
6208
6209
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {  n_embd, n_expert}, 0);
6210
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6211
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
6212
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6213
6214
                        // bias
6215
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_head * n_rot}, 0);
6216
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_head_kv * n_rot}, 0);
6217
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_head_kv * n_rot}, 0);
6218
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
6219
6220
0
                        layer.ffn_gate_inp_b  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
6221
0
                        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
6222
0
                        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0);
6223
0
                        layer.ffn_up_exps_b   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0);
6224
0
                    }
6225
0
                } break;
6226
0
            case LLM_ARCH_LFM2:
6227
0
            case LLM_ARCH_LFM2MOE:
6228
0
                {
6229
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6230
6231
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6232
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6233
6234
0
                    if (output == NULL) {
6235
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6236
0
                    }
6237
6238
0
                    for (int i = 0; i < n_layer; ++i) {
6239
0
                        auto & layer = layers[i];
6240
6241
0
                        const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
6242
6243
                        // ffn/moe is same for transformer and conv layers
6244
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6245
0
                        if (is_moe_layer) {
6246
0
                            GGML_ASSERT(n_expert && n_expert_used);
6247
0
                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),  {n_embd, n_expert}, 0);
6248
0
                            layer.ffn_gate_exps   = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
6249
0
                            layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp,   n_embd, n_expert}, 0);
6250
0
                            layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),   {n_embd, hparams.n_ff_exp, n_expert}, 0);
6251
0
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6252
0
                        } else {  // dense
6253
0
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6254
0
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6255
0
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6256
0
                        }
6257
6258
                        // for operator_norm
6259
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6260
6261
0
                        if (!hparams.is_recurrent(i)) {
6262
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6263
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6264
0
                            GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
6265
6266
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
6267
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
6268
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
6269
6270
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
6271
0
                        } else {
6272
0
                            layer.shortconv.conv     = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV,    "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
6273
0
                            layer.shortconv.in_proj  = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ,  "weight", i), {n_embd, 3 * n_embd}, 0);
6274
0
                            layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
6275
0
                        }
6276
0
                    }
6277
0
                } break;
6278
0
            case LLM_ARCH_SMALLTHINKER:
6279
0
                {
6280
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6281
6282
                    // output
6283
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
6284
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6285
6286
                    // if output is NULL, init from the input tok embed
6287
0
                    if (output == NULL) {
6288
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6289
0
                    }
6290
6291
0
                    for (int i = 0; i < n_layer; ++i) {
6292
0
                        auto & layer = layers[i];
6293
6294
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
6295
6296
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6297
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6298
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6299
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6300
6301
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
6302
6303
0
                        GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
6304
0
                        GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
6305
6306
                        // MoE branch
6307
0
                        const int64_t n_ff_exp = hparams.n_ff_exp;
6308
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
6309
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6310
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
6311
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6312
0
                    }
6313
0
                } break;
6314
0
            case LLM_ARCH_GROVEMOE:
6315
0
                {
6316
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6317
6318
                    // output
6319
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6320
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6321
                    // if output is NULL, init from the input tok embed
6322
0
                    if (output == NULL) {
6323
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6324
0
                    }
6325
6326
0
                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
6327
0
                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
6328
0
                    GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
6329
6330
0
                    for (int i = 0; i < n_layer; ++i) {
6331
0
                        auto & layer = layers[i];
6332
6333
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6334
6335
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6336
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
6337
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
6338
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6339
6340
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6341
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6342
6343
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6344
6345
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6346
6347
                        // MoE branch
6348
0
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
6349
0
                        const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
6350
0
                        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
6351
6352
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6353
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
6354
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
6355
6356
0
                        layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
6357
0
                        layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp,   n_embd, n_chunk_expert}, 0);
6358
0
                        layer.ffn_up_chexps   = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS,   "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
6359
0
                    }
6360
0
                } break;
6361
0
            case LLM_ARCH_APERTUS:
6362
0
                {
6363
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6364
6365
                    // output
6366
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
6367
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
6368
6369
0
                    for (int i = 0; i < n_layer; ++i) {
6370
0
                        auto & layer = layers[i];
6371
6372
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
6373
6374
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
6375
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6376
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6377
0
                        } else {
6378
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6379
0
                        }
6380
6381
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6382
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_gqa }, 0);
6383
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_gqa }, 0);
6384
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6385
6386
                        // optional bias tensors
6387
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);
6388
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
6389
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
6390
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);
6391
6392
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
6393
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
6394
0
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
6395
6396
                        // Q and K layernorms for Apertus
6397
0
                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
6398
0
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
6399
0
                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
6400
0
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
6401
0
                    }
6402
0
                } break;
6403
0
            case LLM_ARCH_MINIMAX_M2:
6404
0
                {
6405
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6406
6407
                    // output
6408
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6409
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
6410
6411
0
                    for (int i = 0; i < n_layer; ++i) {
6412
0
                        auto & layer = layers[i];
6413
6414
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6415
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6416
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6417
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6418
6419
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6420
0
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
6421
0
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
6422
6423
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6424
6425
0
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6426
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
6427
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
6428
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
6429
0
                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6430
0
                    }
6431
0
                } break;
6432
0
            case LLM_ARCH_COGVLM:
6433
0
                {
6434
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6435
6436
                    // output
6437
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6438
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6439
6440
                    // if output is NULL, init from the input tok embed
6441
0
                    if (output == NULL) {
6442
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6443
0
                    }
6444
6445
0
                    for (int i = 0; i < n_layer; ++i) {
6446
0
                        auto & layer = layers[i];
6447
6448
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6449
0
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
6450
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6451
6452
0
                        layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
6453
0
                        layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6454
6455
0
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6456
6457
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6458
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6459
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6460
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6461
6462
0
                        layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6463
0
                        layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6464
0
                        layer.visexp_ffn_up   = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6465
0
                    }
6466
0
                } break;
6467
0
            case LLM_ARCH_PANGU_EMBED:
6468
0
                {
6469
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6470
6471
                    // output
6472
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6473
0
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6474
6475
                    // if output is NULL, init from the input tok embed
6476
0
                    if (output == NULL) {
6477
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6478
0
                    }
6479
6480
0
                    for (int i = 0; i < n_layer; ++i) {
6481
0
                        auto & layer = layers[i];
6482
6483
0
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6484
6485
                        // weight tensors
6486
0
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6487
0
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
6488
0
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
6489
0
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6490
6491
                        // bias tensors
6492
0
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd_head_k * n_head}, 0);
6493
0
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
6494
0
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
6495
0
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
6496
6497
0
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6498
6499
0
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
6500
0
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6501
0
                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6502
0
                        } else {
6503
0
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6504
0
                        }
6505
6506
0
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
6507
0
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
6508
0
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
6509
0
                    }
6510
0
                } break;
6511
0
            case LLM_ARCH_QWEN3NEXT:
6512
0
                {
6513
0
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6514
6515
                    // output
6516
0
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
6517
0
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
6518
6519
                    // if output is NULL, init from the input tok embed
6520
0
                    if (output == NULL) {
6521
0
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
6522
0
                    }
6523
6524
0
                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
6525
6526
                    // Calculate dimensions from hyperparameters
6527
0
                    const int64_t head_k_dim = hparams.ssm_d_state;
6528
0
                    const int64_t head_v_dim = hparams.ssm_d_state;
6529
0
                    const int64_t n_k_heads  = hparams.ssm_n_group;
6530
0
                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
6531
0
                    const int64_t key_dim    = head_k_dim * n_k_heads;
6532
0
                    const int64_t value_dim  = head_v_dim * n_v_heads;
6533
0
                    const int64_t conv_dim   = key_dim * 2 + value_dim;
6534
6535
                    // Calculate projection sizes
6536
0
                    const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
6537
0
                    const int64_t ba_dim   = n_v_heads * 2;
6538
6539
0
                    for (int i = 0; i < n_layer; ++i) {
6540
0
                        auto & layer = layers[i];
6541
6542
0
                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
6543
0
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
6544
6545
0
                        if (!hparams.is_recurrent(i)) {
6546
                            // Attention layers
6547
0
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
6548
0
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_k_gqa }, 0);
6549
0
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_v_gqa }, 0);
6550
0
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6551
6552
                            // Q/K normalization for attention layers
6553
0
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
6554
0
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
6555
0
                        } else {
6556
                            // Linear attention (gated delta net) specific tensors
6557
                            // Create tensors with calculated dimensions
6558
0
                            layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN,         "weight", i), { n_embd, qkvz_dim }, 0);
6559
0
                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
6560
0
                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
6561
0
                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
6562
0
                            layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
6563
0
                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
6564
0
                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
6565
0
                        }
6566
6567
0
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
6568
0
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6569
0
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
6570
0
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6571
6572
                        // Shared experts
6573
0
                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
6574
0
                        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
6575
0
                        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
6576
0
                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
6577
0
                    }
6578
0
                } break;
6579
0
            default:
6580
0
                throw std::runtime_error("unknown architecture");
6581
0
        }
6582
6583
0
        if (n_moved_tensors > 0) {
6584
0
            LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
6585
0
                __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
6586
0
                ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
6587
0
        }
6588
0
    }
6589
6590
0
    ml.done_getting_tensors();
6591
6592
0
    ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
6593
0
    pimpl->mappings.reserve(ml.mappings.size());
6594
6595
    // create the backend buffers
6596
0
    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
6597
0
    ctx_buf_maps.reserve(ctx_map.size());
6598
6599
    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
6600
0
    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
6601
0
    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
6602
6603
0
    for (auto & [buft, ctx_ptr] : ctx_map) {
6604
0
        ggml_context * ctx = ctx_ptr.get();
6605
6606
        // skip contexts without tensors
6607
0
        if (ggml_get_first_tensor(ctx) == nullptr) {
6608
0
            continue;
6609
0
        }
6610
6611
0
        llama_buf_map buf_map;
6612
0
        buf_map.reserve(n_max_backend_buffer);
6613
6614
        // check if it is possible to use buffer_from_host_ptr with this buffer type
6615
0
        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
6616
0
        if (!dev) {
6617
            // FIXME: workaround for CPU backend buft having a NULL device
6618
0
            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
6619
0
            if (!dev) {
6620
0
                throw std::runtime_error(format("%s: no CPU backend found", __func__));
6621
0
            }
6622
0
        }
6623
0
        ggml_backend_dev_props props;
6624
0
        ggml_backend_dev_get_props(dev, &props);
6625
0
        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
6626
0
        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
6627
6628
0
        std::vector<ggml_backend_buffer_ptr> bufs;
6629
0
        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
6630
0
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
6631
                // only the mmap region containing the tensors in the model is mapped to the backend buffer
6632
                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
6633
                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
6634
0
                void * addr = nullptr;
6635
0
                size_t first, last; // NOLINT
6636
0
                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
6637
0
                if (first >= last) {
6638
0
                    continue;
6639
0
                }
6640
0
                const size_t max_size = ggml_get_max_tensor_size(ctx);
6641
0
                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
6642
0
                if (buf == nullptr) {
6643
0
                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
6644
0
                }
6645
0
                bufs.emplace_back(buf);
6646
0
                buf_map.emplace(idx, buf);
6647
0
            }
6648
0
        }
6649
0
        else {
6650
0
            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
6651
0
            if (buf == nullptr) {
6652
0
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
6653
0
            }
6654
0
            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
6655
0
                pimpl->mlock_bufs.emplace_back(new llama_mlock);
6656
0
                auto & mlock_buf = pimpl->mlock_bufs.back();
6657
0
                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
6658
0
                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
6659
0
            }
6660
0
            bufs.emplace_back(buf);
6661
0
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
6662
0
                buf_map.emplace(idx, buf);
6663
0
            }
6664
0
        }
6665
0
        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
6666
6667
0
        for (auto & buf : buf_map) {
6668
            // indicate that this buffer contains weights
6669
            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
6670
0
            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
6671
0
        }
6672
6673
0
        ctx_buf_maps.emplace_back(ctx, buf_map);
6674
0
    }
6675
6676
0
    if (llama_supports_gpu_offload()) {
6677
0
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
6678
6679
0
        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
6680
0
        if (n_gpu_layers > (int) hparams.n_layer) {
6681
0
            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
6682
0
        }
6683
6684
0
        const int max_backend_supported_layers = hparams.n_layer + 1;
6685
0
        const int max_offloadable_layers       = hparams.n_layer + 1;
6686
6687
0
        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
6688
0
    }
6689
6690
    // print memory requirements per buffer type
6691
0
    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
6692
0
        for (auto & buf: bufs) {
6693
0
            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
6694
0
                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
6695
0
        }
6696
0
    }
6697
6698
    // populate tensors_by_name
6699
0
    for (auto & [ctx, _] : pimpl->ctxs_bufs) {
6700
0
        for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
6701
0
            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
6702
0
        }
6703
0
    }
6704
6705
    // load tensor data
6706
0
    for (auto & [ctx, buf_map] : ctx_buf_maps) {
6707
0
        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
6708
0
            return false;
6709
0
        }
6710
0
    }
6711
6712
0
    if (use_mmap_buffer) {
6713
0
        for (auto & mapping : ml.mappings) {
6714
0
            pimpl->mappings.emplace_back(std::move(mapping));
6715
0
        }
6716
0
    }
6717
6718
0
    return true;
6719
0
}
6720
6721
0
std::string llama_model::arch_name() const {
6722
0
    return llm_arch_name(arch);
6723
0
}
6724
6725
0
std::string llama_model::type_name() const {
6726
0
    return llm_type_name(type);
6727
0
}
6728
6729
0
std::string llama_model::desc() const {
6730
0
    return pimpl->desc_str;
6731
0
}
6732
6733
0
size_t llama_model::size() const {
6734
0
    return pimpl->n_bytes;
6735
0
}
6736
6737
0
size_t llama_model::n_tensors() const {
6738
0
    return tensors_by_name.size();
6739
0
}
6740
6741
0
size_t llama_model::n_devices() const {
6742
0
    return devices.size();
6743
0
}
6744
6745
0
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
6746
0
    std::map<ggml_backend_buffer_type_t, size_t> ret;
6747
0
    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
6748
0
        for (const auto & buf : bufs) {
6749
0
            ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
6750
0
        }
6751
0
    }
6752
0
    return ret;
6753
0
}
6754
6755
0
uint64_t llama_model::n_elements() const {
6756
0
    return pimpl->n_elements;
6757
0
}
6758
6759
0
void llama_model::print_info() const {
6760
0
    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
6761
6762
0
    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
6763
0
        bool is_var = false;
6764
6765
0
        std::vector<uint32_t> v;
6766
0
        for (uint32_t i = 0; i < n; ++i) {
6767
0
            v.push_back(f(i));
6768
0
            if (v[i] != v[0]) {
6769
0
                is_var = true;
6770
0
            }
6771
0
        }
6772
6773
0
        std::stringstream ss;
6774
6775
0
        if (is_var) {
6776
0
            ss << "[";
6777
0
            for (uint32_t i = 0; i < n; ++i) {
6778
0
                ss << v[i];
6779
0
                if (i < n - 1) {
6780
0
                    ss << ", ";
6781
0
                }
6782
0
            }
6783
0
            ss << "]";
6784
0
        } else {
6785
0
            ss << v[0];
6786
0
        }
6787
6788
0
        return ss.str();
6789
0
    };
6790
6791
    // hparams
6792
0
    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, arch_name().c_str());
6793
0
    LLAMA_LOG_INFO("%s: vocab_only       = %d\n",     __func__, hparams.vocab_only);
6794
6795
0
    if (!hparams.vocab_only) {
6796
0
        LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);
6797
0
        LLAMA_LOG_INFO("%s: n_embd           = %u\n",     __func__, hparams.n_embd);
6798
0
        LLAMA_LOG_INFO("%s: n_embd_inp       = %u\n",     __func__, hparams.n_embd_inp());
6799
0
        LLAMA_LOG_INFO("%s: n_layer          = %u\n",     __func__, hparams.n_layer);
6800
0
        LLAMA_LOG_INFO("%s: n_head           = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
6801
0
        LLAMA_LOG_INFO("%s: n_head_kv        = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
6802
0
        LLAMA_LOG_INFO("%s: n_rot            = %u\n",     __func__, hparams.n_rot);
6803
0
        LLAMA_LOG_INFO("%s: n_swa            = %u\n",     __func__, hparams.n_swa);
6804
0
        LLAMA_LOG_INFO("%s: is_swa_any       = %u\n",     __func__, hparams.is_swa_any());
6805
0
        LLAMA_LOG_INFO("%s: n_embd_head_k    = %u\n",     __func__, hparams.n_embd_head_k);
6806
0
        LLAMA_LOG_INFO("%s: n_embd_head_v    = %u\n",     __func__, hparams.n_embd_head_v);
6807
0
        LLAMA_LOG_INFO("%s: n_gqa            = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
6808
0
        LLAMA_LOG_INFO("%s: n_embd_k_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
6809
0
        LLAMA_LOG_INFO("%s: n_embd_v_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
6810
0
        LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
6811
0
        LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
6812
0
        LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
6813
0
        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
6814
0
        LLAMA_LOG_INFO("%s: f_logit_scale    = %.1e\n",   __func__, hparams.f_logit_scale);
6815
0
        LLAMA_LOG_INFO("%s: f_attn_scale     = %.1e\n",   __func__, hparams.f_attention_scale);
6816
0
        LLAMA_LOG_INFO("%s: n_ff             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
6817
0
        LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
6818
0
        LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
6819
0
        LLAMA_LOG_INFO("%s: n_expert_groups  = %d\n",     __func__, hparams.n_expert_groups);
6820
0
        LLAMA_LOG_INFO("%s: n_group_used     = %d\n",     __func__, hparams.n_group_used);
6821
0
        LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
6822
0
        LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
6823
0
        LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
6824
0
        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type.c_str());
6825
0
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
6826
0
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
6827
0
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
6828
0
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n",   __func__, hparams.rope_yarn_log_mul);
6829
0
        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
6830
        // MRoPE (Multi-axis Rotary Position Embedding) sections
6831
0
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
6832
0
            LLAMA_LOG_INFO("%s: mrope sections   = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
6833
0
        }
6834
0
        if (!classifier_labels.empty()) {
6835
0
            LLAMA_LOG_INFO("%s: n_cls_out        = %u\n", __func__, hparams.n_cls_out);
6836
6837
0
            size_t i = 0;
6838
0
            for (auto label : classifier_labels) {
6839
0
                LLAMA_LOG_INFO("%s: cls_label[%2zu]    = %s\n", __func__, i++, label.c_str());
6840
0
            }
6841
0
        }
6842
0
    }
6843
6844
0
    if (arch == LLM_ARCH_MAMBA ||
6845
0
        arch == LLM_ARCH_MAMBA2 ||
6846
0
        arch == LLM_ARCH_JAMBA ||
6847
0
        arch == LLM_ARCH_FALCON_H1 ||
6848
0
        arch == LLM_ARCH_PLAMO2 ||
6849
0
        arch == LLM_ARCH_GRANITE_HYBRID ||
6850
0
        arch == LLM_ARCH_QWEN3NEXT ||
6851
0
        arch == LLM_ARCH_NEMOTRON_H) {
6852
0
        LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n",     __func__, hparams.ssm_d_conv);
6853
0
        LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",     __func__, hparams.ssm_d_inner);
6854
0
        LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n",     __func__, hparams.ssm_d_state);
6855
0
        LLAMA_LOG_INFO("%s: ssm_dt_rank      = %u\n",     __func__, hparams.ssm_dt_rank);
6856
0
        LLAMA_LOG_INFO("%s: ssm_n_group      = %u\n",     __func__, hparams.ssm_n_group);
6857
0
        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms   = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
6858
0
    }
6859
6860
0
    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, type_name().c_str());
6861
0
    if (pimpl->n_elements >= 1e12) {
6862
0
        LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, pimpl->n_elements*1e-12);
6863
0
    } else if (pimpl->n_elements >= 1e9) {
6864
0
        LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, pimpl->n_elements*1e-9);
6865
0
    } else if (pimpl->n_elements >= 1e6) {
6866
0
        LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, pimpl->n_elements*1e-6);
6867
0
    } else {
6868
0
        LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, pimpl->n_elements*1e-3);
6869
0
    }
6870
6871
    // general kv
6872
0
    LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, name.c_str());
6873
6874
0
    if (arch == LLM_ARCH_DEEPSEEK) {
6875
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
6876
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6877
0
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
6878
0
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
6879
0
    }
6880
6881
0
    if (arch == LLM_ARCH_DEEPSEEK2) {
6882
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
6883
0
        LLAMA_LOG_INFO("%s: n_lora_q             = %d\n",     __func__, hparams.n_lora_q);
6884
0
        LLAMA_LOG_INFO("%s: n_lora_kv            = %d\n",     __func__, hparams.n_lora_kv);
6885
0
        LLAMA_LOG_INFO("%s: n_embd_head_k_mla    = %d\n",     __func__, hparams.n_embd_head_k_mla);
6886
0
        LLAMA_LOG_INFO("%s: n_embd_head_v_mla    = %d\n",     __func__, hparams.n_embd_head_v_mla);
6887
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6888
0
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
6889
0
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
6890
0
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
6891
0
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6892
0
    }
6893
6894
0
    if (arch == LLM_ARCH_QWEN2MOE) {
6895
0
        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
6896
0
        LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n",     __func__, hparams.n_ff_shexp);
6897
0
    }
6898
6899
0
    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
6900
0
        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
6901
0
    }
6902
6903
0
    if (arch == LLM_ARCH_MINICPM ||
6904
0
        arch == LLM_ARCH_GRANITE ||
6905
0
        arch == LLM_ARCH_GRANITE_MOE ||
6906
0
        arch == LLM_ARCH_GRANITE_HYBRID) {
6907
0
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
6908
0
        LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
6909
0
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
6910
0
        LLAMA_LOG_INFO("%s: n_ff_shexp        = %d\n", __func__, hparams.n_ff_shexp);
6911
0
    }
6912
6913
0
    if (arch == LLM_ARCH_BAILINGMOE) {
6914
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
6915
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6916
0
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
6917
0
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
6918
0
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
6919
0
    }
6920
6921
0
    if (arch == LLM_ARCH_BAILINGMOE2) {
6922
0
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
6923
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6924
0
        LLAMA_LOG_INFO("%s: n_ff_shexp           = %d\n",     __func__, hparams.n_ff_shexp);
6925
0
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
6926
0
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
6927
0
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
6928
0
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6929
0
        LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n",     __func__, hparams.nextn_predict_layers);
6930
0
    }
6931
6932
0
    if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
6933
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6934
0
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6935
0
    }
6936
6937
0
    if (arch == LLM_ARCH_GROVEMOE) {
6938
0
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
6939
0
        LLAMA_LOG_INFO("%s: n_ff_chexp           = %d\n",     __func__, hparams.n_ff_chexp);
6940
0
        LLAMA_LOG_INFO("%s: n_group_experts      = %d\n",     __func__, hparams.n_group_experts);
6941
0
        LLAMA_LOG_INFO("%s: expert_group_scale   = %.2f\n",   __func__, hparams.expert_group_scale);
6942
0
    }
6943
6944
0
    vocab.print_info();
6945
0
}
6946
6947
0
// Returns the device recorded for layer `il`.
// The lookup uses at(), so an out-of-range index throws instead of reading out of bounds.
ggml_backend_dev_t llama_model::dev_layer(int il) const {
    const auto & entry = pimpl->dev_layer.at(il);
    return entry.dev;
}
6950
6951
0
// Returns the device recorded for the output layer.
ggml_backend_dev_t llama_model::dev_output() const {
    const auto & entry = pimpl->dev_output;
    return entry.dev;
}
6954
6955
template<typename F>
6956
0
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
6957
0
    ggml_init_params params = {
6958
0
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
6959
0
        /*.mem_buffer =*/ NULL,
6960
0
        /*.no_alloc   =*/ true,
6961
0
    };
6962
6963
0
    ggml_context_ptr ctx { ggml_init(params) };
6964
0
    if (!ctx) {
6965
0
        throw std::runtime_error(format("failed to create ggml context"));
6966
0
    }
6967
6968
0
    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
6969
0
    ggml_tensor * op_tensor = fn(ctx.get());
6970
0
    for (int i = 0; i < GGML_MAX_SRC; i++) {
6971
0
        if (op_tensor->src[i] != nullptr) {
6972
0
            assert(op_tensor->src[i]->buffer == nullptr);
6973
0
            op_tensor->src[i]->buffer = buf.get();
6974
0
        }
6975
0
    }
6976
6977
0
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
6978
6979
0
    return op_supported;
6980
0
}
6981
6982
template<typename F>
6983
0
static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
6984
0
    for (const auto & cur : buft_list) {
6985
0
        ggml_backend_dev_t cur_dev = cur.first;
6986
0
        ggml_backend_buffer_type_t cur_buft = cur.second;
6987
0
        if (buft_supported(cur_buft, cur_dev, fn)) {
6988
0
            return cur_buft;
6989
0
        }
6990
0
    }
6991
6992
0
    throw std::runtime_error(format("no suitable buffer type found"));
6993
0
}
6994
6995
0
// Selects a buffer type for layer `il` from that layer's buffer type list.
// The probe op is an F32 add of two 1-D tensors of n_embd elements each.
ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
    const auto build_probe_op = [&](ggml_context * ctx) {
        ggml_tensor * lhs = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        ggml_tensor * rhs = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        return ggml_add(ctx, lhs, rhs);
    };

    const buft_list_t & candidates = *pimpl->dev_layer.at(il).buft_list;

    return ::select_buft(candidates, build_probe_op);
}
7004
7005
0
bool llama_model::has_tensor_overrides() const {
7006
0
    return pimpl->has_tensor_overrides;
7007
0
}
7008
7009
0
// Looks up a tensor by name with a linear scan over tensors_by_name.
// Returns the first tensor whose name equals `name`, or nullptr when there is no match.
const ggml_tensor * llama_model::get_tensor(const char * name) const {
    for (const std::pair<std::string, ggml_tensor *> & entry : tensors_by_name) {
        if (entry.first == name) {
            return entry.second;
        }
    }

    return nullptr;
}
7020
7021
0
float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
7022
0
    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
7023
0
}
7024
7025
0
float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
7026
0
    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
7027
0
}
7028
7029
0
// Returns the RoPE frequency factors tensor to use for layer `il`.
//
// A per-layer rope_freqs tensor takes precedence when present. Otherwise the
// long factors are used when the per-sequence context exceeds n_ctx_orig_yarn,
// and the short factors are used in all remaining cases.
ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
    const auto & layer = layers[il];

    if (layer.rope_freqs != nullptr) {
        return layer.rope_freqs;
    }

    // choose long/short freq factors based on the context size
    const uint32_t n_ctx_seq = cparams.n_ctx_seq;
    const bool use_long = n_ctx_seq > hparams.n_ctx_orig_yarn;

    return use_long ? layer.rope_long : layer.rope_short;
}
7043
7044
0
// Creates the memory object (KV cache, recurrent state or hybrid of both) that
// matches this model's architecture.
//
// Returns nullptr for the architectures listed in the switch below. In every
// other case the result is allocated with `new` and returned as a raw pointer
// (presumably the caller takes ownership - verify against the call site).
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
    // Models that need specific instantiation should be handled in the
    // switch statement
    switch (arch) {
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_RND1:
            return nullptr;
        default:
            break;
    }

    // Models that need standard caching should rely on recurrent/hybrid
    // checks
    if (llm_arch_is_recurrent(arch)) {
        return new llama_memory_recurrent(
                *this,
                GGML_TYPE_F32,
                GGML_TYPE_F32,
                cparams.offload_kqv,
                std::max((uint32_t) 1, cparams.n_seq_max),
                cparams.n_seq_max,
                nullptr);
    }

    if (llm_arch_is_hybrid(arch)) {
        // The main difference between hybrid architectures is the
        // layer filters, so pick the right one here
        llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
        llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;

        switch (arch) {
            case LLM_ARCH_FALCON_H1:
                {
                    // both filters accept every layer
                    filter_attn = [](int32_t) { return true; };
                    filter_recr = [](int32_t) { return true; };
                } break;
            case LLM_ARCH_NEMOTRON_H:
                {
                    // only layers with n_ff == 0 are accepted, split by recurrence
                    filter_attn = [this](int32_t il) {
                        return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                    };
                    filter_recr = [this](int32_t il) {
                        return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                    };
                } break;
            default:
                break;
        }

        return new llama_memory_hybrid(
            /* model             */ *this,
            /* attn_type_k       */ params.type_k,
            /* attn_type_v       */ params.type_v,
            /* attn_v_trans      */ !cparams.flash_attn,
            /* attn_kv_size      */ cparams.n_ctx,
            /* attn_n_pad        */ 1,
            /* attn_n_swa        */ hparams.n_swa,
            /* attn_swa_type     */ hparams.swa_type,
            /* recurrent_type_k  */ GGML_TYPE_F32,
            /* recurrent_type_v  */ GGML_TYPE_F32,
            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
            /* n_seq_max         */ cparams.n_seq_max,
            /* offload           */ cparams.offload_kqv,
            /* unified           */ cparams.kv_unified,
            /* filter_attn       */ std::move(filter_attn),
            /* filter_recr       */ std::move(filter_recr));
    }

    if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
        GGML_ASSERT(!hparams.is_swa_any());

        return new llama_kv_cache(
                *this,
                params.type_k,
                params.type_v,
                !cparams.flash_attn,
                cparams.offload_kqv,
                cparams.kv_unified,
                cparams.n_ctx_seq,
                cparams.n_seq_max,
                1,
                hparams.n_swa,
                hparams.swa_type,
                nullptr,
                nullptr);
    }

    GGML_ASSERT(hparams.is_swa_any());

    // the layer-reuse callback is only set for GEMMA3N and only passed to the iSWA cache
    llama_memory_i::layer_reuse_cb reuse = nullptr;

    if (arch == LLM_ARCH_GEMMA3N) {
        reuse = [this](int32_t il) -> int32_t {
            const int32_t n_kv_layers = (int32_t) hparams.n_layer_kv_from_start;

            // layers below the threshold do not reuse another layer (-1)
            if (il < n_kv_layers) {
                return -1;
            }

            return n_kv_layers - (hparams.is_swa(il) ? 2 : 1);
        };
    }

    return new llama_kv_cache_iswa(
            *this,
            params.type_k,
            params.type_v,
            !cparams.flash_attn,
            cparams.offload_kqv,
            params.swa_full,
            cparams.kv_unified,
            cparams.n_ctx_seq,
            cparams.n_seq_max,
            cparams.n_ubatch,
            1,
            nullptr,
            reuse);
}
7167
7168
0
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7169
0
    std::unique_ptr<llm_graph_context> llm;
7170
7171
0
    switch (arch) {
7172
0
        case LLM_ARCH_LLAMA:
7173
0
            {
7174
0
                llm = std::make_unique<llm_build_llama>(*this, params);
7175
0
            } break;
7176
0
        case LLM_ARCH_LLAMA4:
7177
0
            {
7178
0
                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
7179
0
                    llm = std::make_unique<llm_build_llama>(*this, params);
7180
0
                } else {
7181
0
                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
7182
0
                }
7183
0
            } break;
7184
0
        case LLM_ARCH_DECI:
7185
0
            {
7186
0
                llm = std::make_unique<llm_build_deci>(*this, params);
7187
0
            } break;
7188
0
        case LLM_ARCH_BAICHUAN:
7189
0
            {
7190
0
                llm = std::make_unique<llm_build_baichuan>(*this, params);
7191
0
            } break;
7192
0
        case LLM_ARCH_FALCON:
7193
0
            {
7194
0
                llm = std::make_unique<llm_build_falcon>(*this, params);
7195
0
            } break;
7196
0
        case LLM_ARCH_GROK:
7197
0
            {
7198
0
                llm = std::make_unique<llm_build_grok>(*this, params);
7199
0
            } break;
7200
0
        case LLM_ARCH_STARCODER:
7201
0
            {
7202
0
                llm = std::make_unique<llm_build_starcoder>(*this, params);
7203
0
            } break;
7204
0
        case LLM_ARCH_REFACT:
7205
0
            {
7206
0
                llm = std::make_unique<llm_build_refact>(*this, params);
7207
0
            } break;
7208
0
        case LLM_ARCH_BERT:
7209
0
        case LLM_ARCH_JINA_BERT_V2:
7210
0
        case LLM_ARCH_JINA_BERT_V3:
7211
0
        case LLM_ARCH_NOMIC_BERT:
7212
0
        case LLM_ARCH_NOMIC_BERT_MOE:
7213
0
            {
7214
0
                llm = std::make_unique<llm_build_bert>(*this, params);
7215
0
            } break;
7216
0
        case LLM_ARCH_NEO_BERT:
7217
0
            {
7218
0
                llm = std::make_unique<llm_build_neo_bert>(*this, params);
7219
0
            } break;
7220
0
        case LLM_ARCH_BLOOM:
7221
0
            {
7222
0
                llm = std::make_unique<llm_build_bloom>(*this, params);
7223
0
            } break;
7224
0
        case LLM_ARCH_MPT:
7225
0
            {
7226
0
                llm = std::make_unique<llm_build_mpt>(*this, params);
7227
0
            } break;
7228
0
        case LLM_ARCH_STABLELM:
7229
0
            {
7230
0
                llm = std::make_unique<llm_build_stablelm>(*this, params);
7231
0
            } break;
7232
0
        case LLM_ARCH_QWEN:
7233
0
            {
7234
0
                llm = std::make_unique<llm_build_qwen>(*this, params);
7235
0
            } break;
7236
0
        case LLM_ARCH_QWEN2:
7237
0
            {
7238
0
                llm = std::make_unique<llm_build_qwen2>(*this, params);
7239
0
            } break;
7240
0
        case LLM_ARCH_DREAM:
7241
0
            {
7242
0
                llm = std::make_unique<llm_build_dream>(*this, params);
7243
0
            }
7244
0
            break;
7245
0
        case LLM_ARCH_LLADA:
7246
0
            {
7247
0
                llm = std::make_unique<llm_build_llada>(*this, params);
7248
0
            }
7249
0
            break;
7250
0
        case LLM_ARCH_LLADA_MOE:
7251
0
            {
7252
0
                llm = std::make_unique<llm_build_llada_moe>(*this, params);
7253
0
            }
7254
0
            break;
7255
0
        case LLM_ARCH_RND1:
7256
0
            {
7257
0
                llm = std::make_unique<llm_build_rnd1>(*this, params);
7258
0
            }
7259
0
            break;
7260
0
        case LLM_ARCH_QWEN2VL:
7261
0
            {
7262
0
                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
7263
0
            } break;
7264
0
        case LLM_ARCH_QWEN2MOE:
7265
0
            {
7266
0
                llm = std::make_unique<llm_build_qwen2moe>(*this, params);
7267
0
            } break;
7268
0
        case LLM_ARCH_QWEN3:
7269
0
            {
7270
0
                llm = std::make_unique<llm_build_qwen3>(*this, params);
7271
0
            } break;
7272
0
        case LLM_ARCH_QWEN3MOE:
7273
0
            {
7274
0
                llm = std::make_unique<llm_build_qwen3moe>(*this, params);
7275
0
            } break;
7276
0
        case LLM_ARCH_QWEN3VL:
7277
0
            {
7278
0
                llm = std::make_unique<llm_build_qwen3vl>(*this, params);
7279
0
            } break;
7280
0
        case LLM_ARCH_QWEN3VLMOE:
7281
0
            {
7282
0
                llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
7283
0
            } break;
7284
0
        case LLM_ARCH_PHI2:
7285
0
            {
7286
0
                llm = std::make_unique<llm_build_phi2>(*this, params);
7287
0
            } break;
7288
0
        case LLM_ARCH_PHI3:
7289
0
        case LLM_ARCH_PHIMOE:
7290
0
            {
7291
0
                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
7292
0
                    llm = std::make_unique<llm_build_phi3<true>> (*this, params);
7293
0
                } else {
7294
0
                    llm = std::make_unique<llm_build_phi3<false>>(*this, params);
7295
0
                }
7296
0
            } break;
7297
0
        case LLM_ARCH_PLAMO:
7298
0
            {
7299
0
                llm = std::make_unique<llm_build_plamo>(*this, params);
7300
0
            } break;
7301
0
        case LLM_ARCH_PLAMO2:
7302
0
            {
7303
0
                llm = std::make_unique<llm_build_plamo2>(*this, params);
7304
0
            } break;
7305
0
        case LLM_ARCH_GPT2:
7306
0
            {
7307
0
                llm = std::make_unique<llm_build_gpt2>(*this, params);
7308
0
            } break;
7309
0
        case LLM_ARCH_CODESHELL:
7310
0
            {
7311
0
                llm = std::make_unique<llm_build_codeshell>(*this, params);
7312
0
            } break;
7313
0
        case LLM_ARCH_ORION:
7314
0
            {
7315
0
                llm = std::make_unique<llm_build_orion>(*this, params);
7316
0
            } break;
7317
0
        case LLM_ARCH_INTERNLM2:
7318
0
            {
7319
0
                llm = std::make_unique<llm_build_internlm2>(*this, params);
7320
0
            } break;
7321
0
        case LLM_ARCH_MINICPM3:
7322
0
            {
7323
0
                llm = std::make_unique<llm_build_minicpm3>(*this, params);
7324
0
            } break;
7325
0
        case LLM_ARCH_GEMMA:
7326
0
            {
7327
0
                llm = std::make_unique<llm_build_gemma>(*this, params);
7328
0
            } break;
7329
0
        case LLM_ARCH_GEMMA2:
7330
0
            {
7331
0
                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
7332
0
            } break;
7333
0
        case LLM_ARCH_GEMMA3:
7334
0
            {
7335
0
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7336
0
                    llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
7337
0
                } else {
7338
0
                    llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
7339
0
                }
7340
0
            } break;
7341
0
        case LLM_ARCH_GEMMA3N:
7342
0
            {
7343
0
                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
7344
0
            } break;
7345
0
        case LLM_ARCH_GEMMA_EMBEDDING:
7346
0
            {
7347
0
                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
7348
0
            } break;
7349
0
        case LLM_ARCH_STARCODER2:
7350
0
            {
7351
0
                llm = std::make_unique<llm_build_starcoder2>(*this, params);
7352
0
            } break;
7353
0
        case LLM_ARCH_MAMBA:
7354
0
        case LLM_ARCH_MAMBA2:
7355
0
            {
7356
0
                llm = std::make_unique<llm_build_mamba>(*this, params);
7357
0
            } break;
7358
0
        case LLM_ARCH_JAMBA:
7359
0
            {
7360
0
                llm = std::make_unique<llm_build_jamba>(*this, params);
7361
0
            } break;
7362
0
        case LLM_ARCH_XVERSE:
7363
0
            {
7364
0
                llm = std::make_unique<llm_build_xverse>(*this, params);
7365
0
            } break;
7366
0
        case LLM_ARCH_COMMAND_R:
7367
0
            {
7368
0
                llm = std::make_unique<llm_build_command_r>(*this, params);
7369
0
            } break;
7370
0
        case LLM_ARCH_COHERE2:
7371
0
            {
7372
0
                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
7373
0
            } break;
7374
0
        case LLM_ARCH_DBRX:
7375
0
            {
7376
0
                llm = std::make_unique<llm_build_dbrx>(*this, params);
7377
0
            } break;
7378
0
        case LLM_ARCH_OLMO:
7379
0
            {
7380
0
                llm = std::make_unique<llm_build_olmo>(*this, params);
7381
0
            } break;
7382
0
        case LLM_ARCH_OLMO2:
7383
0
            {
7384
0
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7385
0
                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
7386
0
                } else {
7387
0
                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
7388
0
                }
7389
0
            } break;
7390
0
        case LLM_ARCH_OLMOE:
7391
0
            {
7392
0
                llm = std::make_unique<llm_build_olmoe>(*this, params);
7393
0
            } break;
7394
0
        case LLM_ARCH_OPENELM:
7395
0
            {
7396
0
                llm = std::make_unique<llm_build_openelm>(*this, params);
7397
0
            } break;
7398
0
        case LLM_ARCH_GPTNEOX:
7399
0
            {
7400
0
                llm = std::make_unique<llm_build_gptneox>(*this, params);
7401
0
            } break;
7402
0
        case LLM_ARCH_ARCTIC:
7403
0
            {
7404
0
                llm = std::make_unique<llm_build_arctic>(*this, params);
7405
0
            } break;
7406
0
        case LLM_ARCH_DEEPSEEK:
7407
0
            {
7408
0
                llm = std::make_unique<llm_build_deepseek>(*this, params);
7409
0
            } break;
7410
0
        case LLM_ARCH_DEEPSEEK2:
7411
0
            {
7412
0
                llm = std::make_unique<llm_build_deepseek2>(*this, params);
7413
0
            } break;
7414
0
        case LLM_ARCH_CHATGLM:
7415
0
            {
7416
0
                llm = std::make_unique<llm_build_chatglm>(*this, params);
7417
0
            } break;
7418
0
        case LLM_ARCH_GLM4:
7419
0
            {
7420
0
                llm = std::make_unique<llm_build_glm4>(*this, params);
7421
0
            } break;
7422
0
        case LLM_ARCH_GLM4_MOE:
7423
0
            {
7424
0
                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
7425
0
            } break;
7426
0
        case LLM_ARCH_BITNET:
7427
0
            {
7428
0
                llm = std::make_unique<llm_build_bitnet>(*this, params);
7429
0
            } break;
7430
0
        case LLM_ARCH_T5:
7431
0
            {
7432
0
                switch (params.gtype) {
7433
0
                    case LLM_GRAPH_TYPE_ENCODER:
7434
0
                        llm = std::make_unique<llm_build_t5_enc>(*this, params);
7435
0
                        break;
7436
0
                    case LLM_GRAPH_TYPE_DEFAULT:
7437
0
                    case LLM_GRAPH_TYPE_DECODER:
7438
0
                        llm = std::make_unique<llm_build_t5_dec>(*this, params);
7439
0
                        break;
7440
0
                    default:
7441
0
                        GGML_ABORT("invalid graph type");
7442
0
                };
7443
0
            } break;
7444
0
        case LLM_ARCH_T5ENCODER:
7445
0
            {
7446
0
                llm = std::make_unique<llm_build_t5_enc>(*this, params);
7447
0
            }
7448
0
            break;
7449
0
        case LLM_ARCH_JAIS:
7450
0
            {
7451
0
                llm = std::make_unique<llm_build_jais>(*this, params);
7452
0
            } break;
7453
0
        case LLM_ARCH_NEMOTRON:
7454
0
            {
7455
0
                llm = std::make_unique<llm_build_nemotron>(*this, params);
7456
0
            } break;
7457
0
        case LLM_ARCH_NEMOTRON_H:
7458
0
            {
7459
0
                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
7460
0
            } break;
7461
0
        case LLM_ARCH_EXAONE:
7462
0
            {
7463
0
                llm = std::make_unique<llm_build_exaone>(*this, params);
7464
0
            } break;
7465
0
        case LLM_ARCH_EXAONE4:
7466
0
            {
7467
0
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7468
0
                    llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
7469
0
                } else {
7470
0
                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
7471
0
                }
7472
0
            } break;
7473
0
        case LLM_ARCH_RWKV6:
7474
0
            {
7475
0
                llm = std::make_unique<llm_build_rwkv6>(*this, params);
7476
0
            } break;
7477
0
        case LLM_ARCH_RWKV6QWEN2:
7478
0
            {
7479
0
                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
7480
0
            } break;
7481
0
        case LLM_ARCH_RWKV7:
7482
0
            {
7483
0
                llm = std::make_unique<llm_build_rwkv7>(*this, params);
7484
0
            } break;
7485
0
        case LLM_ARCH_ARWKV7:
7486
0
            {
7487
0
                llm = std::make_unique<llm_build_arwkv7>(*this, params);
7488
0
            } break;
7489
0
        case LLM_ARCH_GRANITE:
7490
0
        case LLM_ARCH_GRANITE_MOE:
7491
0
        case LLM_ARCH_MINICPM:
7492
0
            {
7493
0
                llm = std::make_unique<llm_build_granite>(*this, params);
7494
0
            } break;
7495
0
        case LLM_ARCH_GRANITE_HYBRID:
7496
0
            {
7497
0
                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
7498
0
            } break;
7499
0
        case LLM_ARCH_CHAMELEON:
7500
0
            {
7501
0
                llm = std::make_unique<llm_build_chameleon>(*this, params);
7502
0
            } break;
7503
0
        case LLM_ARCH_WAVTOKENIZER_DEC:
7504
0
            {
7505
0
                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
7506
0
            } break;
7507
0
        case LLM_ARCH_PLM:
7508
0
            {
7509
0
                llm = std::make_unique<llm_build_plm>(*this, params);
7510
0
            } break;
7511
0
        case LLM_ARCH_BAILINGMOE:
7512
0
            {
7513
0
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
7514
0
            } break;
7515
0
        case LLM_ARCH_BAILINGMOE2:
7516
0
            {
7517
0
                llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
7518
0
            } break;
7519
0
        case LLM_ARCH_SEED_OSS:
7520
0
            {
7521
0
                llm = std::make_unique<llm_build_seed_oss>(*this, params);
7522
0
            } break;
7523
0
        case LLM_ARCH_DOTS1:
7524
0
            {
7525
0
                llm = std::make_unique<llm_build_dots1>(*this, params);
7526
0
            } break;
7527
0
        case LLM_ARCH_ARCEE:
7528
0
            {
7529
0
                llm = std::make_unique<llm_build_arcee>(*this, params);
7530
0
            } break;
7531
0
        case LLM_ARCH_AFMOE:
7532
0
            {
7533
0
                llm = std::make_unique<llm_build_afmoe>(*this, params);
7534
0
            } break;
7535
0
        case LLM_ARCH_ERNIE4_5:
7536
0
            {
7537
0
                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
7538
0
            } break;
7539
0
        case LLM_ARCH_ERNIE4_5_MOE:
7540
0
            {
7541
0
                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
7542
0
            } break;
7543
0
        case LLM_ARCH_HUNYUAN_MOE:
7544
0
            {
7545
0
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
7546
0
            } break;
7547
0
        case LLM_ARCH_HUNYUAN_DENSE:
7548
0
            {
7549
0
                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
7550
0
            } break;
7551
0
        case LLM_ARCH_SMOLLM3:
7552
0
            {
7553
0
                llm = std::make_unique<llm_build_smollm3>(*this, params);
7554
0
            } break;
7555
0
        case LLM_ARCH_OPENAI_MOE:
7556
0
            {
7557
0
                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
7558
0
            } break;
7559
0
        case LLM_ARCH_FALCON_H1:
7560
0
            {
7561
0
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
7562
0
            } break;
7563
0
        case LLM_ARCH_LFM2:
7564
0
        case LLM_ARCH_LFM2MOE:
7565
0
            {
7566
0
                llm = std::make_unique<llm_build_lfm2>(*this, params);
7567
0
            } break;
7568
0
        case LLM_ARCH_SMALLTHINKER:
7569
0
            {
7570
0
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7571
0
                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
7572
0
                } else {
7573
0
                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
7574
0
                }
7575
0
            } break;
7576
0
        case LLM_ARCH_GROVEMOE:
7577
0
            {
7578
0
                llm = std::make_unique<llm_build_grovemoe>(*this, params);
7579
0
            } break;
7580
0
        case LLM_ARCH_APERTUS:
7581
0
            {
7582
0
                llm = std::make_unique<llm_build_apertus>(*this, params);
7583
0
            } break;
7584
0
        case LLM_ARCH_MINIMAX_M2:
7585
0
            {
7586
0
                llm = std::make_unique<llm_build_minimax_m2>(*this, params);
7587
0
            } break;
7588
0
        case LLM_ARCH_COGVLM:
7589
0
            {
7590
0
                llm = std::make_unique<llm_build_cogvlm>(*this, params);
7591
0
            } break;
7592
0
        case LLM_ARCH_PANGU_EMBED:
7593
0
            {
7594
0
                llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
7595
0
            } break;
7596
0
        case LLM_ARCH_QWEN3NEXT:
7597
0
            {
7598
0
                llm = std::make_unique<llm_build_qwen3next>(*this, params);
7599
0
            } break;
7600
0
        case LLM_ARCH_MISTRAL3:
7601
0
            {
7602
0
                llm = std::make_unique<llm_build_mistral3>(*this, params);
7603
0
            } break;
7604
0
        default:
7605
0
            GGML_ABORT("fatal error");
7606
0
    }
7607
7608
    // add on pooling layer
7609
0
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
7610
7611
    // if the gguf model was converted with --sentence-transformers-dense-modules
7612
    // there will be two additional dense projection layers
7613
    // dense linear projections are applied after pooling
7614
    // TODO: move reranking logic here and generalize
7615
0
    llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
7616
7617
0
    return llm->res->get_gf();
7618
0
}
7619
7620
7621
//
7622
// interface implementation
7623
//
7624
7625
1.03k
// Builds the default model-loading parameters.
// Every pointer field starts out null, n_gpu_layers is 999, layer split mode
// is selected, and mmap and extra buffer types are enabled by default.
llama_model_params llama_model_default_params() {
    return llama_model_params {
        /*.devices                     =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
        /*.n_gpu_layers                =*/ 999,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
        /*.use_extra_bufts             =*/ true,
        /*.no_host                     =*/ false,
    };
}
7646
7647
0
// Returns a pointer to the vocabulary stored inside the model object.
// The pointer refers to a member of *model, so it is only usable while the
// model itself is alive.
const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    const llama_vocab & vocab = model->vocab;
    return &vocab;
}
7650
7651
0
// Alternate entry point for releasing a model; simply forwards to
// llama_model_free().
void llama_free_model(llama_model * model) {
    llama_model_free(model);
}
7654
7655
965
// Destroys a model via `delete`. Passing nullptr is harmless because
// deleting a null pointer is a no-op in C++.
void llama_model_free(llama_model * model) {
    delete model;
}
7658
7659
0
int32_t llama_model_n_ctx_train(const llama_model * model) {
7660
0
    return model->hparams.n_ctx_train;
7661
0
}
7662
7663
0
int32_t llama_model_n_embd(const llama_model * model) {
7664
0
    return model->hparams.n_embd;
7665
0
}
7666
7667
0
int32_t llama_model_n_embd_inp(const llama_model * model) {
7668
0
    return model->hparams.n_embd_inp();
7669
0
}
7670
7671
0
int32_t llama_model_n_layer(const llama_model * model) {
7672
0
    return model->hparams.n_layer;
7673
0
}
7674
7675
0
int32_t llama_model_n_head(const llama_model * model) {
7676
0
    return model->hparams.n_head();
7677
0
}
7678
7679
0
int32_t llama_model_n_head_kv(const llama_model * model) {
7680
0
    return model->hparams.n_head_kv();
7681
0
}
7682
7683
0
int32_t llama_model_n_swa(const llama_model * model) {
7684
0
    return model->hparams.n_swa;
7685
0
}
7686
7687
0
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
7688
0
    return model->hparams.n_cls_out;
7689
0
}
7690
7691
0
const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
7692
0
    if (i < model->classifier_labels.size()) {
7693
0
        return model->classifier_labels[i].c_str();
7694
0
    }
7695
7696
0
    return nullptr;
7697
0
}
7698
7699
// deprecated
7700
0
int32_t llama_n_ctx_train(const llama_model * model) {
7701
0
    return llama_model_n_ctx_train(model);
7702
0
}
7703
7704
// deprecated
7705
0
int32_t llama_n_embd(const llama_model * model) {
7706
0
    return llama_model_n_embd(model);
7707
0
}
7708
7709
// deprecated
7710
0
int32_t llama_n_layer(const llama_model * model) {
7711
0
    return llama_model_n_layer(model);
7712
0
}
7713
7714
// deprecated
7715
0
int32_t llama_n_head(const llama_model * model) {
7716
0
    return llama_model_n_head(model);
7717
0
}
7718
7719
0
// Maps the model architecture to the RoPE variant it uses.
//
// The switch deliberately has no `default` label: every architecture is meant
// to be listed explicitly. LLM_ARCH_UNKNOWN aborts; any value that matches no
// label leaves the initial LLAMA_ROPE_TYPE_NONE in place.
llama_rope_type llama_model_rope_type(const llama_model * model) {
    llama_rope_type rope = LLAMA_ROPE_TYPE_NONE;

    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_CLIP:
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
            rope = LLAMA_ROPE_TYPE_NONE;
            break;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_MISTRAL3:
            rope = LLAMA_ROPE_TYPE_NORM;
            break;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_RND1:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
        case LLM_ARCH_GROVEMOE:
        case LLM_ARCH_APERTUS:
        case LLM_ARCH_MINIMAX_M2:
        case LLM_ARCH_COGVLM:
        case LLM_ARCH_PANGU_EMBED:
        case LLM_ARCH_AFMOE:
        case LLM_ARCH_QWEN3NEXT:
            rope = LLAMA_ROPE_TYPE_NEOX;
            break;

        case LLM_ARCH_QWEN2VL:
            rope = LLAMA_ROPE_TYPE_MROPE;
            break;

        case LLM_ARCH_QWEN3VL:
        case LLM_ARCH_QWEN3VLMOE:
            rope = LLAMA_ROPE_TYPE_IMROPE;
            break;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return rope;
}
7847
7848
0
float llama_model_rope_freq_scale_train(const llama_model * model) {
7849
0
    return model->hparams.rope_freq_scale_train;
7850
0
}
7851
7852
0
int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
7853
0
    const auto & it = model->gguf_kv.find(key);
7854
0
    if (it == model->gguf_kv.end()) {
7855
0
        if (buf_size > 0) {
7856
0
            buf[0] = '\0';
7857
0
        }
7858
0
        return -1;
7859
0
    }
7860
0
    return snprintf(buf, buf_size, "%s", it->second.c_str());
7861
0
}
7862
7863
0
int32_t llama_model_meta_count(const llama_model * model) {
7864
0
    return (int)model->gguf_kv.size();
7865
0
}
7866
7867
0
// Translates a llama_model_meta_key enumerator into its metadata key string
// ("general.sampling.*"). Returns nullptr for any value not listed below.
const char * llama_model_meta_key_str(llama_model_meta_key key) {
    const char * name = nullptr;

    switch (key) {
        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE:        name = "general.sampling.sequence";        break;
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K:           name = "general.sampling.top_k";           break;
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P:           name = "general.sampling.top_p";           break;
        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P:           name = "general.sampling.min_p";           break;
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: name = "general.sampling.xtc_probability"; break;
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD:   name = "general.sampling.xtc_threshold";   break;
        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP:            name = "general.sampling.temp";            break;
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N:  name = "general.sampling.penalty_last_n";  break;
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT:  name = "general.sampling.penalty_repeat";  break;
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT:        name = "general.sampling.mirostat";        break;
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU:    name = "general.sampling.mirostat_tau";    break;
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA:    name = "general.sampling.mirostat_eta";    break;
        default:                                            name = nullptr;                            break;
    }

    return name;
}
7884
7885
0
int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
7886
0
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
7887
0
        if (buf_size > 0) {
7888
0
            buf[0] = '\0';
7889
0
        }
7890
0
        return -1;
7891
0
    }
7892
0
    auto it = model->gguf_kv.begin();
7893
0
    std::advance(it, i);
7894
0
    return snprintf(buf, buf_size, "%s", it->first.c_str());
7895
0
}
7896
7897
0
int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
7898
0
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
7899
0
        if (buf_size > 0) {
7900
0
            buf[0] = '\0';
7901
0
        }
7902
0
        return -1;
7903
0
    }
7904
0
    auto it = model->gguf_kv.begin();
7905
0
    std::advance(it, i);
7906
0
    return snprintf(buf, buf_size, "%s", it->second.c_str());
7907
0
}
7908
7909
0
int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
7910
0
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
7911
0
}
7912
7913
0
uint64_t llama_model_size(const llama_model * model) {
7914
0
    return model->size();
7915
0
}
7916
7917
0
const char * llama_model_chat_template(const llama_model * model, const char * name) {
7918
0
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
7919
0
        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
7920
0
    const auto & it = model->gguf_kv.find(key);
7921
0
    if (it == model->gguf_kv.end()) {
7922
        // one-off fix for very popular models (so we are not flooded with issues)
7923
        // do not extend this list unless absolutely necessary
7924
        // Mistral-Small-2503 does not have built-in chat template
7925
0
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
7926
0
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
7927
0
            return "mistral-v7-tekken";
7928
0
        }
7929
7930
0
        return nullptr;
7931
0
    }
7932
7933
0
    return it->second.c_str();
7934
0
}
7935
7936
0
uint64_t llama_model_n_params(const llama_model * model) {
7937
0
    return model->n_elements();
7938
0
}
7939
7940
0
bool llama_model_has_encoder(const llama_model * model) {
7941
0
    switch (model->arch) {
7942
0
        case LLM_ARCH_T5:        return true;
7943
0
        case LLM_ARCH_T5ENCODER: return true;
7944
0
        default:                 return false;
7945
0
    }
7946
0
}
7947
7948
0
bool llama_model_has_decoder(const llama_model * model) {
7949
0
    switch (model->arch) {
7950
0
        case LLM_ARCH_T5ENCODER: return false;
7951
0
        default:                 return true;
7952
0
    }
7953
0
}
7954
7955
0
// Returns hparams.dec_start_token_id for the given model.
llama_token llama_model_decoder_start_token(const llama_model * model) {
    const auto & hp = model->hparams;
    return hp.dec_start_token_id;
}
7958
7959
0
bool llama_model_is_recurrent(const llama_model * model) {
7960
0
    return llm_arch_is_recurrent(model->arch);
7961
0
}
7962
7963
0
bool llama_model_is_hybrid(const llama_model * model) {
7964
0
    return llm_arch_is_hybrid(model->arch);
7965
0
}
7966
7967
0
bool llama_model_is_diffusion(const llama_model * model) {
7968
0
    return llm_arch_is_diffusion(model->arch);
7969
0
}
7970
7971
0
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
7972
0
    return model->tensors_by_name;
7973
0
}